// File listing metadata: 399 lines, 17 KiB, C#, executable file.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using MarketAlly.AIPlugin;

namespace MarketAlly.AIPlugin.Learning.Plugins
{
    /// <summary>
    /// Advanced terminology analysis plugin that uses LLM capabilities to intelligently
    /// distinguish between technical terminology and common words in documentation
    /// </summary>
    /// <remarks>
    /// Two modes are supported: <c>extract</c> classifies capitalized candidate terms as technical
    /// or common, and <c>validate</c> compares the candidates against a caller-supplied list of
    /// known terms. The LLM call is still a placeholder that always yields no result, so
    /// classification currently always falls back to the built-in heuristics.
    /// </remarks>
    [AIPlugin("TerminologyAnalysis", "Intelligent terminology analysis using LLM to distinguish technical terms from common words")]
    public class TerminologyAnalysisPlugin : IAIPlugin
    {
        private const string DefaultContext = "software documentation";
        private const string DefaultDomain = "software development";
        private const string DefaultMode = "extract";

        /// <summary>Upper bound on the number of terms embedded in the LLM prompt (limits prompt size).</summary>
        private const int MaxTermsInPrompt = 50;

        // Built once: the parameter map never changes, so there is no need to allocate it per access.
        private static readonly IReadOnlyDictionary<string, Type> ParameterTypes = new Dictionary<string, Type>
        {
            ["content"] = typeof(string),
            ["context"] = typeof(string),
            ["domain"] = typeof(string),
            ["mode"] = typeof(string),
            ["knownTerminology"] = typeof(string)
        };

        // One or more capitalized words separated by whitespace, e.g. "Json Web Token".
        private static readonly Regex CandidateTermPattern =
            new Regex(@"\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b", RegexOptions.Compiled);

        private static readonly Regex WhitespaceRunPattern = new Regex(@"\s+", RegexOptions.Compiled);

        // Technical acronyms (2-5 uppercase letters).
        private static readonly Regex AcronymPattern = new Regex(@"^[A-Z]{2,5}$", RegexOptions.Compiled);

        // PascalCase identifiers such as "HttpClient" (likely code identifiers).
        private static readonly Regex PascalCasePattern =
            new Regex(@"^[A-Z][a-z]+[A-Z][a-zA-Z]*$", RegexOptions.Compiled);

        private static readonly string[] LeadingArticles = { "The ", "A ", "An " };

        // Common English words that should not be considered technical.
        private static readonly HashSet<string> CommonEnglishWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "The", "This", "That", "These", "Those", "A", "An", "In", "On", "At", "By", "For", "With", "Without",
            "To", "From", "Of", "About", "Over", "Under", "Above", "Below", "Between", "Among", "Through",
            "During", "Before", "After", "While", "Since", "Until", "Although", "However", "Therefore",
            "Moreover", "Furthermore", "Nevertheless", "Otherwise", "Meanwhile", "Instead", "Rather",
            "Chapter", "Part", "Section", "Story", "Content", "Introduction", "Overview", "Summary",
            "All", "Each", "Every", "Some", "Any", "Many", "Few", "Several", "Most", "More", "Less",
            "First", "Second", "Third", "Last", "Next", "Previous", "Final", "Initial", "Primary",
            "Good", "Bad", "Better", "Best", "Worse", "Worst", "Great", "Small", "Large", "Big", "Little",
            "New", "Old", "Young", "High", "Low", "Long", "Short", "Wide", "Narrow", "Deep", "Shallow"
        };

        // Substrings whose presence (ignoring case) marks a term as technical.
        private static readonly string[] TechnicalIndicators =
        {
            // Programming concepts
            "api", "sdk", "framework", "library", "service", "controller", "manager", "handler", "provider",
            "factory", "builder", "repository", "middleware", "plugin", "component", "module",

            // Software architecture
            "microservice", "monolith", "database", "cache", "queue", "pipeline", "workflow", "architecture",

            // Technical processes
            "authentication", "authorization", "validation", "serialization", "encryption", "deployment",
            "integration", "orchestration", "synchronization", "optimization",

            // Data concepts
            "model", "entity", "schema", "migration", "index", "query", "transaction", "connection"
        };

        // Common technical naming suffixes (case-sensitive). Every entry except "Helper" is also
        // covered by the case-insensitive indicator list above; the list is kept whole for clarity.
        private static readonly string[] TechnicalSuffixes =
        {
            "Service", "Manager", "Controller", "Handler", "Provider", "Factory",
            "Builder", "Repository", "Plugin", "Component", "Module", "Helper"
        };

        private readonly ILogger<TerminologyAnalysisPlugin>? _logger;
        private readonly IAIPluginRegistry? _pluginRegistry;

        /// <summary>
        /// Text content to analyze. Note that <see cref="ExecuteAsync"/> reads its inputs from the
        /// parameters dictionary it is given, not from these properties.
        /// </summary>
        [AIParameter("Text content to analyze for terminology", required: true)]
        public string Content { get; set; } = string.Empty;

        /// <summary>Free-text description of the kind of document being analyzed.</summary>
        [AIParameter("Context description (e.g., 'software documentation', 'API reference')", required: false)]
        public string Context { get; set; } = DefaultContext;

        /// <summary>Domain or industry the document belongs to.</summary>
        [AIParameter("Domain or industry (e.g., 'web development', 'machine learning')", required: false)]
        public string Domain { get; set; } = DefaultDomain;

        /// <summary>Analysis mode: <c>extract</c> or <c>validate</c>.</summary>
        [AIParameter("Analysis mode: 'extract' or 'validate'", required: false)]
        public string Mode { get; set; } = DefaultMode;

        /// <summary>JSON array of known terms, used by validate mode.</summary>
        [AIParameter("Known terminology list for validation mode (JSON array)", required: false)]
        public string? KnownTerminology { get; set; }

        /// <summary>Names and types of the parameters accepted by <see cref="ExecuteAsync"/>.</summary>
        public IReadOnlyDictionary<string, Type> SupportedParameters => ParameterTypes;

        /// <summary>Creates the plugin.</summary>
        /// <param name="logger">Optional logger; when null nothing is logged.</param>
        /// <param name="pluginRegistry">Optional plugin registry; when null the LLM path is skipped.</param>
        public TerminologyAnalysisPlugin(ILogger<TerminologyAnalysisPlugin>? logger = null, IAIPluginRegistry? pluginRegistry = null)
        {
            _logger = logger;
            _pluginRegistry = pluginRegistry;
        }

        /// <summary>
        /// Runs the analysis selected by the <c>mode</c> parameter over the <c>content</c> parameter.
        /// </summary>
        /// <param name="parameters">
        /// Keys read: <c>content</c> (required), <c>context</c>, <c>domain</c>, <c>mode</c>,
        /// <c>knownTerminology</c>. Missing or null optional values fall back to their defaults.
        /// </param>
        /// <returns>
        /// A result wrapping the analysis dictionary on success, an error object when the content is
        /// missing or blank, or the caught exception when anything else fails. Exceptions are not
        /// propagated to the caller.
        /// </returns>
        public async Task<AIPluginResult> ExecuteAsync(IReadOnlyDictionary<string, object> parameters)
        {
            try
            {
                if (parameters is null)
                {
                    throw new ArgumentNullException(nameof(parameters));
                }

                // Extract parameters. A missing "content" key is treated like empty content so the
                // caller gets the dedicated validation message instead of a KeyNotFoundException.
                var content = ReadString(parameters, "content") ?? string.Empty;
                var context = ReadString(parameters, "context") ?? DefaultContext;
                var domain = ReadString(parameters, "domain") ?? DefaultDomain;

                // Invariant casing: culture-sensitive lowering maps "VALIDATE" to "valıdate" under tr-TR.
                var mode = ReadString(parameters, "mode")?.Trim().ToLowerInvariant() ?? DefaultMode;
                var knownTerminologyJson = ReadString(parameters, "knownTerminology");

                _logger?.LogInformation("Starting terminology analysis for {ContentLength} characters in {Domain} domain",
                    content.Length, domain);

                if (string.IsNullOrWhiteSpace(content))
                {
                    return new AIPluginResult(new { Error = "Content cannot be empty" }, "Content is required for analysis");
                }

                var result = mode switch
                {
                    "extract" => await ExtractTerminologyAsync(content, context, domain),
                    "validate" => await ValidateTerminologyAsync(content, context, domain, knownTerminologyJson),
                    _ => throw new ArgumentException($"Unknown mode: {mode}. Use 'extract' or 'validate'")
                };

                _logger?.LogInformation("Terminology analysis completed successfully with {TermCount} terms analyzed",
                    ReadTermCount(result));

                return new AIPluginResult(result, "Terminology analysis completed successfully");
            }
            catch (Exception ex)
            {
                _logger?.LogError(ex, "Error during terminology analysis");
                return new AIPluginResult(ex, "Terminology analysis failed");
            }
        }

        /// <summary>Returns the string form of a parameter, or null when the key is absent or its value is null.</summary>
        private static string? ReadString(IReadOnlyDictionary<string, object> parameters, string key)
        {
            return parameters.TryGetValue(key, out var value) ? value?.ToString() : null;
        }

        /// <summary>
        /// Picks the term count out of an analysis result for logging. Extract mode stores it under
        /// <c>totalTerms</c>, validate mode under <c>totalTermsAnalyzed</c>.
        /// </summary>
        private static object ReadTermCount(Dictionary<string, object> result)
        {
            if (result.TryGetValue("totalTerms", out var count) || result.TryGetValue("totalTermsAnalyzed", out count))
            {
                return count ?? 0;
            }

            return 0;
        }

        /// <summary>
        /// Extracts candidate terms from <paramref name="content"/> and splits them into technical
        /// terms and common words, preferring the LLM classification and falling back to heuristics.
        /// </summary>
        private async Task<Dictionary<string, object>> ExtractTerminologyAsync(string content, string context, string domain)
        {
            // First, extract potential terminology using basic pattern matching
            var candidateTerms = ExtractCandidateTerms(content);

            if (candidateTerms.Count == 0)
            {
                return new Dictionary<string, object>
                {
                    ["technicalTerms"] = new List<string>(),
                    ["commonWords"] = new List<string>(),
                    ["totalTerms"] = 0,
                    ["analysisMethod"] = "pattern-matching-only"
                };
            }

            // AnalyzeTerminologyWithLLM returns null when no registry is configured or when the
            // analysis fails (it logs the failure itself), so null simply means "use heuristics".
            var llmResult = await AnalyzeTerminologyWithLLM(candidateTerms, context, domain);
            return llmResult ?? AnalyzeTerminologyWithHeuristics(candidateTerms, domain);
        }

        /// <summary>
        /// Compares the candidate terms found in <paramref name="content"/> with the known terms in
        /// <paramref name="knownTerminologyJson"/> and reports consistent terms, terms whose spelling
        /// differs from the known form, and likely-technical terms missing from the known list.
        /// </summary>
        /// <remarks>
        /// The work is synchronous; the Task-returning signature is kept so both analysis modes are
        /// awaited the same way by <see cref="ExecuteAsync"/>.
        /// </remarks>
        private Task<Dictionary<string, object>> ValidateTerminologyAsync(string content, string context, string domain, string? knownTerminologyJson)
        {
            var candidateTerms = ExtractCandidateTerms(content);

            // Normalize every known term once instead of once per candidate term.
            var knownByNormalizedForm = ParseKnownTerms(knownTerminologyJson)
                .ToLookup(known => NormalizeTerm(known), StringComparer.OrdinalIgnoreCase);

            var consistencyIssues = new List<Dictionary<string, object>>();
            var unmatchedTerms = new List<string>();
            var consistentTerms = new List<string>();

            foreach (var term in candidateTerms)
            {
                var matchingKnownTerms = knownByNormalizedForm[NormalizeTerm(term)].ToList();

                if (matchingKnownTerms.Count == 0)
                {
                    // Check if it's likely a technical term that should be in known terminology
                    if (IsLikelyTechnicalTerm(term))
                    {
                        unmatchedTerms.Add(term);
                    }

                    continue;
                }

                // Check for consistency (exact matches vs variations)
                if (matchingKnownTerms.All(known => known == term))
                {
                    consistentTerms.Add(term);
                }
                else
                {
                    consistencyIssues.Add(new Dictionary<string, object>
                    {
                        ["foundTerm"] = term,
                        ["expectedTerms"] = matchingKnownTerms,
                        ["issueType"] = "inconsistent-casing-or-variation"
                    });
                }
            }

            var consistencyScore = candidateTerms.Count > 0
                ? (decimal)consistentTerms.Count / candidateTerms.Count
                : 1.0m;

            var result = new Dictionary<string, object>
            {
                ["isConsistent"] = consistencyIssues.Count == 0 && unmatchedTerms.Count == 0,
                ["consistencyScore"] = consistencyScore,
                ["consistentTerms"] = consistentTerms,
                ["consistencyIssues"] = consistencyIssues,
                ["unmatchedTerms"] = unmatchedTerms,
                ["totalTermsAnalyzed"] = candidateTerms.Count,
                ["analysisMethod"] = "validation-with-known-terms"
            };

            return Task.FromResult(result);
        }

        /// <summary>
        /// Parses the known-terminology JSON array. Blank input and malformed JSON both yield an
        /// empty list (the latter is logged as a warning); null or blank array entries are dropped.
        /// </summary>
        private List<string> ParseKnownTerms(string? knownTerminologyJson)
        {
            if (string.IsNullOrWhiteSpace(knownTerminologyJson))
            {
                return new List<string>();
            }

            try
            {
                var parsed = JsonSerializer.Deserialize<List<string>>(knownTerminologyJson) ?? new List<string>();

                // A JSON array may legally contain nulls (["API", null]); they would otherwise
                // surface as a NullReferenceException during normalization.
                return parsed.Where(entry => !string.IsNullOrWhiteSpace(entry)).ToList();
            }
            catch (JsonException ex)
            {
                _logger?.LogWarning(ex, "Failed to parse known terminology JSON, using empty list");
                return new List<string>();
            }
        }

        /// <summary>
        /// Placeholder for LLM-backed classification. It prepares the request but does not send it yet.
        /// </summary>
        /// <returns>
        /// Always a completed task holding null today, which tells the caller to fall back to
        /// heuristics. Null is also returned when no plugin registry is configured or when
        /// preparing the request throws.
        /// </returns>
        private Task<Dictionary<string, object>?> AnalyzeTerminologyWithLLM(List<string> candidateTerms, string context, string domain)
        {
            Dictionary<string, object>? noResult = null;

            if (_pluginRegistry is null)
            {
                return Task.FromResult(noResult);
            }

            try
            {
                var llmParameters = new Dictionary<string, object>
                {
                    ["prompt"] = BuildClassificationPrompt(candidateTerms, context, domain),
                    ["maxTokens"] = 1000,
                    ["temperature"] = 0.1 // Low temperature for consistent classification
                };

                // TODO: this is a placeholder - it needs an actual LLM plugin implementation, e.g.
                //   var llmResult = await _pluginRegistry.CallFunctionAsync("LLMAnalysis", llmParameters);
                // Until then the prepared request is discarded and null is returned so the caller
                // falls back to heuristics.
                _ = llmParameters;
                return Task.FromResult(noResult);
            }
            catch (Exception ex)
            {
                _logger?.LogError(ex, "Failed to analyze terminology with LLM");
                return Task.FromResult(noResult);
            }
        }

        /// <summary>
        /// Builds the classification prompt. At most <see cref="MaxTermsInPrompt"/> terms are
        /// included to keep the prompt from growing without bound.
        /// </summary>
        private static string BuildClassificationPrompt(List<string> candidateTerms, string context, string domain)
        {
            var termList = string.Join(", ", candidateTerms.Take(MaxTermsInPrompt));

            // The literal's continuation lines start at column 0 on purpose: any indentation here
            // would become part of the prompt text.
            return $@"Analyze the following terms extracted from {context} in the {domain} domain.
Classify each term as either 'technical' (domain-specific terminology) or 'common' (general English words).

Terms to analyze: {termList}

Respond with a JSON object in this exact format:
{{
  ""technicalTerms"": [""term1"", ""term2""],
  ""commonWords"": [""word1"", ""word2""],
  ""reasoning"": ""Brief explanation of classification approach""
}}

Focus on identifying terms that are:
- Technical: API names, programming concepts, domain-specific jargon, technical processes
- Common: General English words, articles, prepositions, common adjectives/verbs

Be conservative - when in doubt, classify as technical to avoid false negatives.";
        }

        /// <summary>
        /// Classifies each candidate with <see cref="IsLikelyTechnicalTerm"/> and reports both groups
        /// together with a confidence value. <paramref name="domain"/> is currently not used.
        /// </summary>
        private static Dictionary<string, object> AnalyzeTerminologyWithHeuristics(List<string> candidateTerms, string domain)
        {
            var technicalTerms = new List<string>();
            var commonWords = new List<string>();

            foreach (var term in candidateTerms)
            {
                (IsLikelyTechnicalTerm(term) ? technicalTerms : commonWords).Add(term);
            }

            return new Dictionary<string, object>
            {
                ["technicalTerms"] = technicalTerms,
                ["commonWords"] = commonWords,
                ["totalTerms"] = candidateTerms.Count,
                ["analysisMethod"] = "enhanced-heuristics",
                ["confidence"] = CalculateConfidenceScore(technicalTerms, commonWords)
            };
        }

        /// <summary>
        /// Finds runs of capitalized words that could be technical terminology.
        /// </summary>
        /// <returns>
        /// Distinct terms in order of first appearance, with internal whitespace collapsed to single
        /// spaces. Terms of two characters or fewer (for example "AI") are dropped.
        /// </returns>
        private static List<string> ExtractCandidateTerms(string content)
        {
            return CandidateTermPattern.Matches(content)
                .Cast<Match>()
                // A multi-word match may span a line break or repeated spaces; collapsing them
                // keeps "Json\nWeb Token" and "Json Web Token" from counting as different terms.
                .Select(match => CollapseWhitespace(match.Value))
                .Where(term => term.Length > 2) // Filter out very short terms
                .Distinct()
                .ToList();
        }

        /// <summary>
        /// Heuristic test for technical terminology. A term is technical when it is not a listed
        /// common English word and it contains a technical indicator, ends with a technical suffix,
        /// is a 2-5 letter uppercase acronym, or looks like a PascalCase identifier.
        /// </summary>
        /// <remarks>
        /// Indicators are matched as substrings, so an ordinary word that merely contains one
        /// (for example "api" inside "Rapid") is also reported as technical.
        /// </remarks>
        private static bool IsLikelyTechnicalTerm(string term)
        {
            if (string.IsNullOrWhiteSpace(term) || CommonEnglishWords.Contains(term))
            {
                return false;
            }

            // Ordinal comparisons throughout: culture-sensitive casing breaks ASCII keyword
            // matching under cultures such as tr-TR, where "I" lowers to a dotless "ı".
            if (TechnicalIndicators.Any(indicator => term.IndexOf(indicator, StringComparison.OrdinalIgnoreCase) >= 0))
            {
                return true;
            }

            if (TechnicalSuffixes.Any(suffix => term.EndsWith(suffix, StringComparison.Ordinal)))
            {
                return true;
            }

            // Default to false for anything that matches neither naming pattern.
            return AcronymPattern.IsMatch(term) || PascalCasePattern.IsMatch(term);
        }

        /// <summary>
        /// Produces the form of a term used for matching: whitespace runs are collapsed to single
        /// spaces, the ends are trimmed, and one leading article ("The", "A", "An") is removed.
        /// Null or blank input yields an empty string.
        /// </summary>
        private static string NormalizeTerm(string term)
        {
            if (string.IsNullOrWhiteSpace(term))
            {
                return string.Empty;
            }

            var normalized = CollapseWhitespace(term);

            foreach (var article in LeadingArticles)
            {
                if (normalized.StartsWith(article, StringComparison.OrdinalIgnoreCase))
                {
                    return normalized.Substring(article.Length);
                }
            }

            return normalized;
        }

        /// <summary>Replaces every run of whitespace with a single space and trims both ends.</summary>
        private static string CollapseWhitespace(string text)
        {
            return WhitespaceRunPattern.Replace(text, " ").Trim();
        }

        /// <summary>
        /// Scores how trustworthy a heuristic split is, based on the share of technical terms:
        /// 0.9 for a share within [0.3, 0.7], 0.8 within [0.2, 0.8], otherwise 0.7.
        /// Returns 1.0 when there are no terms at all.
        /// </summary>
        private static decimal CalculateConfidenceScore(List<string> technicalTerms, List<string> commonWords)
        {
            var totalTerms = technicalTerms.Count + commonWords.Count;
            if (totalTerms == 0)
            {
                return 1.0m;
            }

            // Confidence is higher when we have clear separation between technical and common terms;
            // an all-technical (1.0) or all-common (0.0) result gets the lowest score.
            var technicalRatio = (decimal)technicalTerms.Count / totalTerms;

            if (technicalRatio >= 0.3m && technicalRatio <= 0.7m)
            {
                return 0.9m;
            }

            if (technicalRatio >= 0.2m && technicalRatio <= 0.8m)
            {
                return 0.8m;
            }

            return 0.7m;
        }
    }
}