// MarketAlly.AIPlugin.Extensions/MarketAlly.AIPlugin.Learning/Plugins/TerminologyAnalysisPlugin.cs
//
// 399 lines
// 17 KiB
// C#
// Executable File

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using MarketAlly.AIPlugin;
namespace MarketAlly.AIPlugin.Learning.Plugins
{
/// <summary>
/// Advanced terminology analysis plugin that uses LLM capabilities to intelligently
/// distinguish between technical terminology and common words in documentation
/// </summary>
[AIPlugin("TerminologyAnalysis", "Intelligent terminology analysis using LLM to distinguish technical terms from common words")]
public class TerminologyAnalysisPlugin : IAIPlugin
{
/// <summary>Optional logger; every use in this class goes through <c>?.</c>, so a null logger disables logging.</summary>
private readonly ILogger<TerminologyAnalysisPlugin>? _logger;
/// <summary>Optional plugin registry; the LLM-based classification path is only attempted when this is non-null.</summary>
private readonly IAIPluginRegistry? _pluginRegistry;
/// <summary>
/// Text content to analyze for terminology (declared as required).
/// Note: <c>ExecuteAsync</c> in this class reads the <c>content</c> entry of its parameters dictionary,
/// not this property.
/// </summary>
[AIParameter("Text content to analyze for terminology", required: true)]
public string Content { get; set; } = string.Empty;
/// <summary>Description of the kind of document being analyzed; defaults to "software documentation".</summary>
[AIParameter("Context description (e.g., 'software documentation', 'API reference')", required: false)]
public string Context { get; set; } = "software documentation";
/// <summary>Domain or industry of the content; defaults to "software development".</summary>
[AIParameter("Domain or industry (e.g., 'web development', 'machine learning')", required: false)]
public string Domain { get; set; } = "software development";
/// <summary>Analysis mode, either 'extract' or 'validate'; defaults to "extract".</summary>
[AIParameter("Analysis mode: 'extract' or 'validate'", required: false)]
public string Mode { get; set; } = "extract";
/// <summary>Known terminology for validation mode, supplied as a JSON array of strings; null when not provided.</summary>
[AIParameter("Known terminology list for validation mode (JSON array)", required: false)]
public string? KnownTerminology { get; set; }
/// <summary>
/// Single shared parameter-name to type map; built once instead of on every property access.
/// </summary>
private static readonly IReadOnlyDictionary<string, Type> ParameterTypes = new Dictionary<string, Type>
{
    ["content"] = typeof(string),
    ["context"] = typeof(string),
    ["domain"] = typeof(string),
    ["mode"] = typeof(string),
    ["knownTerminology"] = typeof(string)
};

/// <summary>
/// Names and CLR types of the parameters that <c>ExecuteAsync</c> reads from its parameters dictionary.
/// All five entries are strings; keys are compared with the default (case-sensitive) comparer.
/// </summary>
public IReadOnlyDictionary<string, Type> SupportedParameters => ParameterTypes;
/// <summary>
/// Creates the plugin. Both dependencies are optional and default to null.
/// </summary>
/// <param name="logger">Logger used for progress and error messages, or null for no logging.</param>
/// <param name="pluginRegistry">Registry that enables the LLM classification path when non-null.</param>
public TerminologyAnalysisPlugin(ILogger<TerminologyAnalysisPlugin>? logger = null, IAIPluginRegistry? pluginRegistry = null)
    => (_logger, _pluginRegistry) = (logger, pluginRegistry);
/// <summary>
/// Entry point of the plugin: reads the analysis inputs from <paramref name="parameters"/>, dispatches to
/// extraction or validation depending on the mode, and wraps the outcome in an <see cref="AIPluginResult"/>.
/// </summary>
/// <param name="parameters">
/// Input values keyed by name. <c>content</c> is read with the indexer, so a missing key ends in the
/// failure result produced by the catch block. <c>context</c>, <c>domain</c>, <c>mode</c> and
/// <c>knownTerminology</c> are optional; absent or null values fall back to their defaults.
/// </param>
/// <returns>
/// A result wrapping the analysis dictionary on success, or a result built from the caught exception
/// (including the <see cref="ArgumentException"/> raised for an unknown mode) on failure.
/// </returns>
public async Task<AIPluginResult> ExecuteAsync(IReadOnlyDictionary<string, object> parameters)
{
    // Optional parameters: a missing key and a null value both resolve to the fallback.
    string ReadOptional(string key, string fallback) =>
        parameters.TryGetValue(key, out var raw) ? raw?.ToString() ?? fallback : fallback;

    try
    {
        // Extract parameters
        var content = parameters["content"].ToString() ?? string.Empty;
        var context = ReadOptional("context", "software documentation");
        var domain = ReadOptional("domain", "software development");

        // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a dotless 'ı' under tr-TR,
        // which would make "VALIDATE" an unknown mode.
        var mode = ReadOptional("mode", "extract").Trim().ToLowerInvariant();
        var knownTerminologyJson = parameters.TryGetValue("knownTerminology", out var kt) ? kt?.ToString() : null;

        // The placeholder is named after the value actually bound to it (the domain).
        _logger?.LogInformation("Starting terminology analysis for {ContentLength} characters in {Domain} domain",
            content.Length, domain);

        if (string.IsNullOrWhiteSpace(content))
        {
            // NOTE(review): this path uses the data-object constructor while the catch block below uses the
            // exception constructor; whether this result is flagged as a failure cannot be told from this
            // file — confirm against AIPluginResult.
            return new AIPluginResult(new { Error = "Content cannot be empty" }, "Content is required for analysis");
        }

        var result = mode switch
        {
            "extract" => await ExtractTerminologyAsync(content, context, domain),
            "validate" => await ValidateTerminologyAsync(content, context, domain, knownTerminologyJson),
            _ => throw new ArgumentException($"Unknown mode: {mode}. Use 'extract' or 'validate'")
        };

        // Extraction reports its count under "totalTerms", validation under "totalTermsAnalyzed".
        object termCount = 0;
        if (result.TryGetValue("totalTerms", out var extractedCount))
        {
            termCount = extractedCount;
        }
        else if (result.TryGetValue("totalTermsAnalyzed", out var validatedCount))
        {
            termCount = validatedCount;
        }

        _logger?.LogInformation("Terminology analysis completed successfully with {TermCount} terms analyzed",
            termCount);

        return new AIPluginResult(result, "Terminology analysis completed successfully");
    }
    catch (Exception ex)
    {
        _logger?.LogError(ex, "Error during terminology analysis");
        return new AIPluginResult(ex, "Terminology analysis failed");
    }
}
/// <summary>
/// Extract mode: pulls candidate terms out of <paramref name="content"/> and classifies them as
/// technical terms or common words.
/// </summary>
/// <param name="content">Text to scan for candidate terms.</param>
/// <param name="context">Document context forwarded to the LLM classification step.</param>
/// <param name="domain">Domain forwarded to both the LLM step and the heuristic fallback.</param>
/// <returns>
/// An empty "pattern-matching-only" result when no candidates are found; otherwise the LLM result when a
/// registry is configured and the LLM step yields one, else the heuristic classification.
/// </returns>
private async Task<Dictionary<string, object>> ExtractTerminologyAsync(string content, string context, string domain)
{
    var candidates = ExtractCandidateTerms(content);

    // Nothing matched the candidate pattern: report an empty classification.
    if (candidates.Count == 0)
    {
        return new Dictionary<string, object>
        {
            ["technicalTerms"] = new List<string>(),
            ["commonWords"] = new List<string>(),
            ["totalTerms"] = 0,
            ["analysisMethod"] = "pattern-matching-only"
        };
    }

    // Without a registry there is no LLM path to try.
    if (_pluginRegistry == null)
    {
        return AnalyzeTerminologyWithHeuristics(candidates, domain);
    }

    Dictionary<string, object>? llmOutcome = null;
    try
    {
        llmOutcome = await AnalyzeTerminologyWithLLM(candidates, context, domain);
    }
    catch (Exception ex)
    {
        _logger?.LogWarning(ex, "LLM analysis failed, falling back to heuristic analysis");
    }

    // A null outcome (no answer or a failure) falls back to the heuristic classifier.
    return llmOutcome ?? AnalyzeTerminologyWithHeuristics(candidates, domain);
}
/// <summary>
/// Validate mode: compares candidate terms found in <paramref name="content"/> against a known
/// terminology list and reports exact matches, variations and likely-technical terms that are not listed.
/// </summary>
/// <param name="content">Text to scan for candidate terms.</param>
/// <param name="context">Not used by this method.</param>
/// <param name="domain">Not used by this method.</param>
/// <param name="knownTerminologyJson">
/// JSON array of known terms. Null, blank or unparseable input is treated as an empty list; null or blank
/// array elements are ignored.
/// </param>
/// <returns>
/// A completed task holding the validation report: <c>isConsistent</c>, <c>consistencyScore</c>,
/// <c>consistentTerms</c>, <c>consistencyIssues</c>, <c>unmatchedTerms</c>, <c>totalTermsAnalyzed</c>
/// and <c>analysisMethod</c>.
/// </returns>
private Task<Dictionary<string, object>> ValidateTerminologyAsync(string content, string context, string domain, string? knownTerminologyJson)
{
    var candidateTerms = ExtractCandidateTerms(content);

    List<string> knownTerms;
    try
    {
        var parsed = string.IsNullOrWhiteSpace(knownTerminologyJson)
            ? null
            : JsonSerializer.Deserialize<List<string?>>(knownTerminologyJson);

        // A JSON array may legally contain nulls (e.g. ["API", null]); drop them so normalization cannot fail.
        knownTerms = parsed == null
            ? new List<string>()
            : parsed.Where(entry => !string.IsNullOrWhiteSpace(entry)).Select(entry => entry!).ToList();
    }
    catch (JsonException ex)
    {
        _logger?.LogWarning(ex, "Failed to parse known terminology JSON, using empty list");
        knownTerms = new List<string>();
    }

    // Normalize each known term once and index by normalized form (case-insensitive), instead of
    // re-normalizing the whole list for every candidate. Elements keep their original order per key.
    var knownByNormalizedForm = knownTerms.ToLookup(known => NormalizeTerm(known), StringComparer.OrdinalIgnoreCase);

    var consistencyIssues = new List<Dictionary<string, object>>();
    var unmatchedTerms = new List<string>();
    var consistentTerms = new List<string>();

    foreach (var term in candidateTerms)
    {
        var matchingKnownTerms = knownByNormalizedForm[NormalizeTerm(term)].ToList();

        if (matchingKnownTerms.Count > 0)
        {
            // Consistent only when every matching known term is spelled exactly like the found term.
            if (matchingKnownTerms.All(known => known == term))
            {
                consistentTerms.Add(term);
            }
            else
            {
                consistencyIssues.Add(new Dictionary<string, object>
                {
                    ["foundTerm"] = term,
                    ["expectedTerms"] = matchingKnownTerms,
                    ["issueType"] = "inconsistent-casing-or-variation"
                });
            }
        }
        else if (IsLikelyTechnicalTerm(term))
        {
            // Looks technical but is missing from the known terminology.
            unmatchedTerms.Add(term);
        }
    }

    // Share of candidates that matched a known term exactly; an empty candidate set counts as fully consistent.
    var consistencyScore = candidateTerms.Count > 0
        ? (decimal)consistentTerms.Count / candidateTerms.Count
        : 1.0m;

    var report = new Dictionary<string, object>
    {
        ["isConsistent"] = consistencyIssues.Count == 0 && unmatchedTerms.Count == 0,
        ["consistencyScore"] = consistencyScore,
        ["consistentTerms"] = consistentTerms,
        ["consistencyIssues"] = consistencyIssues,
        ["unmatchedTerms"] = unmatchedTerms,
        ["totalTermsAnalyzed"] = candidateTerms.Count,
        ["analysisMethod"] = "validation-with-known-terms"
    };

    // The work is synchronous; return a completed task rather than an async method with no await.
    return Task.FromResult(report);
}
/// <summary>
/// Placeholder for LLM-backed classification of candidate terms. It prepares the prompt and call
/// parameters but does not invoke any plugin yet, so it always yields null and callers fall back to
/// heuristics.
/// </summary>
/// <param name="candidateTerms">Terms to classify; only the first 50 are placed in the prompt.</param>
/// <param name="context">Document context interpolated into the prompt.</param>
/// <param name="domain">Domain interpolated into the prompt.</param>
/// <returns>A completed task holding null (no LLM result is produced by the current implementation).</returns>
private Task<Dictionary<string, object>?> AnalyzeTerminologyWithLLM(List<string> candidateTerms, string context, string domain)
{
    // Limit the number of terms to prevent token overflow.
    const int maxTermsInPrompt = 50;

    if (_pluginRegistry == null)
    {
        return Task.FromResult<Dictionary<string, object>?>(null);
    }

    try
    {
        var termList = string.Join(", ", candidateTerms.Take(maxTermsInPrompt));

        // Prepare prompt for LLM analysis. The term-limit remark used to sit inside this literal and was
        // therefore part of the prompt text itself; it now lives in the code comment above.
        var prompt = $@"Analyze the following terms extracted from {context} in the {domain} domain.
Classify each term as either 'technical' (domain-specific terminology) or 'common' (general English words).
Terms to analyze: {termList}
Respond with a JSON object in this exact format:
{{
""technicalTerms"": [""term1"", ""term2""],
""commonWords"": [""word1"", ""word2""],
""reasoning"": ""Brief explanation of classification approach""
}}
Focus on identifying terms that are:
- Technical: API names, programming concepts, domain-specific jargon, technical processes
- Common: General English words, articles, prepositions, common adjectives/verbs
Be conservative - when in doubt, classify as technical to avoid false negatives.";

        // Use a hypothetical LLM analysis plugin (this would need to be implemented)
        var llmParameters = new Dictionary<string, object>
        {
            ["prompt"] = prompt,
            ["maxTokens"] = 1000,
            ["temperature"] = 0.1 // Low temperature for consistent classification
        };

        // This is a placeholder - would need actual LLM plugin implementation
        // var llmResult = await _pluginRegistry.CallFunctionAsync("LLMAnalysis", llmParameters);
        // (restore the async modifier on this method when the call above is wired in)
        _ = llmParameters;

        // For now, return null to fall back to heuristics
        return Task.FromResult<Dictionary<string, object>?>(null);
    }
    catch (Exception ex)
    {
        _logger?.LogError(ex, "Failed to analyze terminology with LLM");
        return Task.FromResult<Dictionary<string, object>?>(null);
    }
}
/// <summary>
/// Splits the candidate terms into technical terms and common words using
/// <c>IsLikelyTechnicalTerm</c>, and attaches a confidence score for the split.
/// </summary>
/// <param name="candidateTerms">Terms to classify; input order is preserved within each group.</param>
/// <param name="domain">Not used by this method.</param>
/// <returns>
/// A dictionary with <c>technicalTerms</c>, <c>commonWords</c>, <c>totalTerms</c>,
/// <c>analysisMethod</c> ("enhanced-heuristics") and <c>confidence</c>.
/// </returns>
private Dictionary<string, object> AnalyzeTerminologyWithHeuristics(List<string> candidateTerms, string domain)
{
    // One pass over the candidates, bucketed by the heuristic's verdict.
    var byVerdict = candidateTerms.ToLookup(candidate => IsLikelyTechnicalTerm(candidate));
    var technical = byVerdict[true].ToList();
    var common = byVerdict[false].ToList();

    var outcome = new Dictionary<string, object>();
    outcome["technicalTerms"] = technical;
    outcome["commonWords"] = common;
    outcome["totalTerms"] = candidateTerms.Count;
    outcome["analysisMethod"] = "enhanced-heuristics";
    outcome["confidence"] = CalculateConfidenceScore(technical, common);
    return outcome;
}
/// <summary>
/// Finds capitalized words and runs of whitespace-separated capitalized words in
/// <paramref name="content"/> and returns them as candidate terms.
/// </summary>
/// <param name="content">Text to scan.</param>
/// <returns>
/// Distinct matches longer than two characters, in order of first appearance.
/// </returns>
private List<string> ExtractCandidateTerms(string content)
{
    // One or more capitalized, letters-only words separated by whitespace.
    const string capitalizedPhrase = @"\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b";

    var alreadySeen = new HashSet<string>();
    var terms = new List<string>();

    foreach (System.Text.RegularExpressions.Match hit in System.Text.RegularExpressions.Regex.Matches(content, capitalizedPhrase))
    {
        var candidate = hit.Value.Trim();

        // Filter out very short terms.
        if (candidate.Length <= 2)
        {
            continue;
        }

        // Keep only the first occurrence of each term.
        if (alreadySeen.Add(candidate))
        {
            terms.Add(candidate);
        }
    }

    return terms;
}
/// <summary>Common English words that are never treated as technical (matched case-insensitively against the whole term).</summary>
private static readonly HashSet<string> CommonEnglishWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "The", "This", "That", "These", "Those", "A", "An", "In", "On", "At", "By", "For", "With", "Without",
    "To", "From", "Of", "About", "Over", "Under", "Above", "Below", "Between", "Among", "Through",
    "During", "Before", "After", "While", "Since", "Until", "Although", "However", "Therefore",
    "Moreover", "Furthermore", "Nevertheless", "Otherwise", "Meanwhile", "Instead", "Rather",
    "Chapter", "Part", "Section", "Story", "Content", "Introduction", "Overview", "Summary",
    "All", "Each", "Every", "Some", "Any", "Many", "Few", "Several", "Most", "More", "Less",
    "First", "Second", "Third", "Last", "Next", "Previous", "Final", "Initial", "Primary",
    "Good", "Bad", "Better", "Best", "Worse", "Worst", "Great", "Small", "Large", "Big", "Little",
    "New", "Old", "Young", "High", "Low", "Long", "Short", "Wide", "Narrow", "Deep", "Shallow"
};

/// <summary>Substrings whose presence (ignoring case) marks a term as technical.</summary>
private static readonly string[] TechnicalIndicators =
{
    // Programming concepts
    "api", "sdk", "framework", "library", "service", "controller", "manager", "handler", "provider",
    "factory", "builder", "repository", "middleware", "plugin", "component", "module",
    // Software architecture
    "microservice", "monolith", "database", "cache", "queue", "pipeline", "workflow", "architecture",
    // Technical processes
    "authentication", "authorization", "validation", "serialization", "encryption", "deployment",
    "integration", "orchestration", "synchronization", "optimization",
    // Data concepts
    "model", "entity", "schema", "migration", "index", "query", "transaction", "connection"
};

/// <summary>Case-sensitive type-name suffixes that mark a term as technical.</summary>
private static readonly string[] TechnicalSuffixes =
{
    "Service", "Manager", "Controller", "Handler", "Provider", "Factory",
    "Builder", "Repository", "Plugin", "Component", "Module", "Helper"
};

/// <summary>Two to five uppercase letters, e.g. an acronym.</summary>
private static readonly System.Text.RegularExpressions.Regex AcronymPattern =
    new System.Text.RegularExpressions.Regex(@"^[A-Z]{2,5}$");

/// <summary>A capitalized word followed by at least one more uppercase letter (PascalCase-style identifier).</summary>
private static readonly System.Text.RegularExpressions.Regex PascalCasePattern =
    new System.Text.RegularExpressions.Regex(@"^[A-Z][a-z]+[A-Z][a-zA-Z]*$");

/// <summary>
/// Heuristically decides whether <paramref name="term"/> is technical terminology rather than a common word.
/// </summary>
/// <remarks>
/// Checks run in this order: common-word list (returns false), technical indicator substrings, technical
/// suffixes, acronym pattern, PascalCase pattern. All string comparisons are ordinal, so the verdict does
/// not depend on the current culture. Indicator matching is by substring, which means ordinary words
/// containing an indicator are classified as technical too (for example "Capital" contains "api").
/// </remarks>
/// <param name="term">Candidate term to classify.</param>
/// <returns>True when any technical check matches; false for listed common words and everything else.</returns>
private bool IsLikelyTechnicalTerm(string term)
{
    if (CommonEnglishWords.Contains(term))
    {
        return false;
    }

    // Ordinal, case-insensitive substring search replaces the culture-sensitive ToLower() + Contains().
    if (TechnicalIndicators.Any(indicator => term.IndexOf(indicator, StringComparison.OrdinalIgnoreCase) >= 0))
    {
        return true;
    }

    // Common technical naming patterns (case-sensitive suffixes).
    if (TechnicalSuffixes.Any(suffix => term.EndsWith(suffix, StringComparison.Ordinal)))
    {
        return true;
    }

    // Technical acronyms or PascalCase identifiers; anything else defaults to a common word.
    return AcronymPattern.IsMatch(term) || PascalCasePattern.IsMatch(term);
}
/// <summary>
/// Produces a comparison form of a term: surrounding whitespace is removed, internal whitespace runs
/// (spaces, tabs, line breaks) are collapsed to a single space, and one leading article
/// ("The", "A" or "An", any casing) is dropped. Letter casing of the remainder is left unchanged.
/// </summary>
/// <param name="term">Term to normalize; null or blank input yields an empty string.</param>
/// <returns>The normalized term.</returns>
private string NormalizeTerm(string term)
{
    if (string.IsNullOrWhiteSpace(term))
    {
        return string.Empty;
    }

    // Splitting on a null separator splits on any whitespace; re-joining collapses each run to one space,
    // so "The  API" and "The\tAPI" both become "The API" before the article check.
    var words = term.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
    var normalized = string.Join(" ", words);

    var articles = new[] { "The ", "A ", "An " };
    foreach (var article in articles)
    {
        if (normalized.StartsWith(article, StringComparison.OrdinalIgnoreCase))
        {
            // Only the first matching article is removed.
            normalized = normalized.Substring(article.Length);
            break;
        }
    }

    return normalized;
}
/// <summary>
/// Derives a fixed-step confidence value from how the classified terms are split between technical
/// terms and common words.
/// </summary>
/// <param name="technicalTerms">Terms classified as technical.</param>
/// <param name="commonWords">Terms classified as common words.</param>
/// <returns>
/// 1.0 when both lists are empty; 0.9 when the technical share is within [0.3, 0.7]; 0.8 when it is
/// within [0.2, 0.8]; otherwise 0.7.
/// </returns>
private decimal CalculateConfidenceScore(List<string> technicalTerms, List<string> commonWords)
{
    var technicalCount = technicalTerms.Count;
    var combinedCount = technicalCount + commonWords.Count;

    // No terms at all: nothing to be unsure about.
    if (combinedCount == 0)
    {
        return 1.0m;
    }

    // Share of technical terms; a balanced mix scores highest, a one-sided result lowest.
    var technicalShare = (decimal)technicalCount / combinedCount;

    return technicalShare switch
    {
        var share when share >= 0.3m && share <= 0.7m => 0.9m,
        var share when share >= 0.2m && share <= 0.8m => 0.8m,
        _ => 0.7m
    };
}
}
}