// MarketAlly.AIPlugin.Extensions/MarketAlly.AIPlugin.Learning/Plugins/TerminologyAnalysisPlugin.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using MarketAlly.AIPlugin;

namespace MarketAlly.AIPlugin.Learning.Plugins
{
    /// <summary>
    /// Terminology analysis plugin that distinguishes technical terminology from common
    /// words in documentation. LLM-backed classification is planned but is currently a
    /// placeholder, so terms are classified by the built-in heuristics.
    /// </summary>
    [AIPlugin("TerminologyAnalysis", "Intelligent terminology analysis using LLM to distinguish technical terms from common words")]
    public class TerminologyAnalysisPlugin : IAIPlugin
    {
        // Optional logger. Every use in this class goes through the null-conditional
        // operator, so a null logger simply disables logging.
        private readonly ILogger<TerminologyAnalysisPlugin>? _logger;
        // Optional plugin registry. When it is null, extraction skips the LLM path and
        // classifies terms with the heuristic analyzer only.
        private readonly IAIPluginRegistry? _pluginRegistry;

        // NOTE(review): ExecuteAsync reads its inputs from the parameters dictionary it is
        // given, not from the properties below. Presumably these attribute-decorated
        // properties exist for parameter discovery by the hosting framework; confirm.

        /// <summary>Text to analyze. Defaults to an empty string.</summary>
        [AIParameter("Text content to analyze for terminology", required: true)]
        public string Content { get; set; } = string.Empty;

        /// <summary>Description of the kind of document being analyzed. Defaults to "software documentation".</summary>
        [AIParameter("Context description (e.g., 'software documentation', 'API reference')", required: false)]
        public string Context { get; set; } = "software documentation";

        /// <summary>Domain or industry the content belongs to. Defaults to "software development".</summary>
        [AIParameter("Domain or industry (e.g., 'web development', 'machine learning')", required: false)]
        public string Domain { get; set; } = "software development";

        /// <summary>Analysis mode, either 'extract' or 'validate'. Defaults to "extract".</summary>
        [AIParameter("Analysis mode: 'extract' or 'validate'", required: false)]
        public string Mode { get; set; } = "extract";

        /// <summary>Known terminology as a JSON array of strings, used by validation mode. Null when not supplied.</summary>
        [AIParameter("Known terminology list for validation mode (JSON array)", required: false)]
        public string? KnownTerminology { get; set; }

        // Built once and wrapped read-only: the map never changes, so there is no reason
        // to allocate a fresh dictionary on every property access. The keys are the names
        // ExecuteAsync looks up in its parameters dictionary.
        private static readonly IReadOnlyDictionary<string, Type> ParameterTypes =
            new System.Collections.ObjectModel.ReadOnlyDictionary<string, Type>(
                new Dictionary<string, Type>
                {
                    ["content"] = typeof(string),
                    ["context"] = typeof(string),
                    ["domain"] = typeof(string),
                    ["mode"] = typeof(string),
                    ["knownTerminology"] = typeof(string)
                });

        /// <summary>
        /// Names of the parameters this plugin accepts, mapped to their expected CLR types.
        /// </summary>
        /// <remarks>
        /// Returns the same shared, read-only instance on every access.
        /// </remarks>
        public IReadOnlyDictionary<string, Type> SupportedParameters => ParameterTypes;

        /// <summary>
        /// Creates the plugin and stores its two optional dependencies.
        /// </summary>
        /// <param name="logger">Logger for diagnostics; when null, nothing is logged.</param>
        /// <param name="pluginRegistry">Plugin registry; when null, the LLM analysis path is skipped.</param>
        public TerminologyAnalysisPlugin(ILogger<TerminologyAnalysisPlugin>? logger = null, IAIPluginRegistry? pluginRegistry = null)
            => (_logger, _pluginRegistry) = (logger, pluginRegistry);

        /// <summary>
        /// Runs terminology analysis over the supplied parameters in either
        /// 'extract' or 'validate' mode.
        /// </summary>
        /// <param name="parameters">
        /// Must contain "content". May contain "context", "domain", "mode" and
        /// "knownTerminology"; missing or null optional values fall back to their defaults.
        /// </param>
        /// <returns>
        /// A result wrapping the analysis dictionary on success. Any exception raised during
        /// analysis (missing content, unknown mode, and so on) is logged and returned as a
        /// result wrapping that exception instead of being thrown to the caller.
        /// </returns>
        public async Task<AIPluginResult> ExecuteAsync(IReadOnlyDictionary<string, object> parameters)
        {
            try
            {
                // "content" is the only required parameter. A missing or null value is turned
                // into a descriptive exception, which the catch block reports as a failure.
                if (!parameters.TryGetValue("content", out var rawContent) || rawContent is null)
                {
                    throw new ArgumentException("Required parameter 'content' is missing", nameof(parameters));
                }

                var content = rawContent.ToString() ?? string.Empty;
                var context = ReadOptionalString(parameters, "context") ?? "software documentation";
                var domain = ReadOptionalString(parameters, "domain") ?? "software development";
                // Invariant lowercasing: a culture-sensitive ToLower() maps "VALIDATE" to
                // "valıdate" under Turkish cultures and the mode would not be recognised.
                var mode = ReadOptionalString(parameters, "mode")?.ToLowerInvariant() ?? "extract";
                var knownTerminologyJson = ReadOptionalString(parameters, "knownTerminology");

                _logger?.LogInformation("Starting terminology analysis for {ContentLength} characters in {Domain} domain",
                    content.Length, domain);

                if (string.IsNullOrWhiteSpace(content))
                {
                    return new AIPluginResult(new { Error = "Content cannot be empty" }, "Content is required for analysis");
                }

                var result = mode switch
                {
                    "extract" => await ExtractTerminologyAsync(content, context, domain),
                    "validate" => await ValidateTerminologyAsync(content, context, domain, knownTerminologyJson),
                    _ => throw new ArgumentException($"Unknown mode: {mode}. Use 'extract' or 'validate'")
                };

                // Extract mode reports its count under "totalTerms", validate mode under
                // "totalTermsAnalyzed"; check both so the log line is right for either mode.
                if (!result.TryGetValue("totalTerms", out var termCount) &&
                    !result.TryGetValue("totalTermsAnalyzed", out termCount))
                {
                    termCount = 0;
                }

                _logger?.LogInformation("Terminology analysis completed successfully with {TermCount} terms analyzed",
                    termCount);

                return new AIPluginResult(result, "Terminology analysis completed successfully");
            }
            catch (Exception ex)
            {
                _logger?.LogError(ex, "Error during terminology analysis");
                return new AIPluginResult(ex, "Terminology analysis failed");
            }
        }

        // Returns the string form of an optional parameter, or null when the key is absent
        // or its value is null, so callers can apply their own default with "??".
        private static string? ReadOptionalString(IReadOnlyDictionary<string, object> parameters, string key)
        {
            return parameters.TryGetValue(key, out var value) ? value?.ToString() : null;
        }

        /// <summary>
        /// Extracts candidate terms from <paramref name="content"/> and classifies them as
        /// technical terms or common words.
        /// </summary>
        /// <param name="content">Text to scan for candidate terms.</param>
        /// <param name="context">Context description passed to the LLM analysis.</param>
        /// <param name="domain">Domain passed to both the LLM and the heuristic analysis.</param>
        /// <returns>
        /// The classification dictionary. The LLM result is used when a registry is
        /// configured and the LLM analysis yields a non-null result; otherwise the
        /// heuristic result is returned.
        /// </returns>
        private async Task<Dictionary<string, object>> ExtractTerminologyAsync(string content, string context, string domain)
        {
            var candidates = ExtractCandidateTerms(content);

            // Nothing matched the candidate pattern: report an empty classification.
            if (candidates.Count == 0)
            {
                var emptyResult = new Dictionary<string, object>();
                emptyResult.Add("technicalTerms", new List<string>());
                emptyResult.Add("commonWords", new List<string>());
                emptyResult.Add("totalTerms", 0);
                emptyResult.Add("analysisMethod", "pattern-matching-only");
                return emptyResult;
            }

            // Without a registry there is no LLM to ask, so go straight to the heuristics.
            if (_pluginRegistry is null)
            {
                return AnalyzeTerminologyWithHeuristics(candidates, domain);
            }

            Dictionary<string, object>? llmClassification = null;
            try
            {
                llmClassification = await AnalyzeTerminologyWithLLM(candidates, context, domain);
            }
            catch (Exception ex)
            {
                // A failing LLM call must not fail the analysis; the heuristics take over.
                _logger?.LogWarning(ex, "LLM analysis failed, falling back to heuristic analysis");
            }

            return llmClassification ?? AnalyzeTerminologyWithHeuristics(candidates, domain);
        }

        /// <summary>
        /// Checks the candidate terms found in <paramref name="content"/> against a list of
        /// known terminology and reports consistent terms, inconsistent variations and
        /// likely-technical terms that are missing from the known list.
        /// </summary>
        /// <param name="content">Text to scan for candidate terms.</param>
        /// <param name="context">Not used by validation; kept for signature parity with extraction.</param>
        /// <param name="domain">Not used by validation; kept for signature parity with extraction.</param>
        /// <param name="knownTerminologyJson">
        /// JSON array of known terms. Null, blank or unparsable input is treated as an empty list.
        /// </param>
        /// <returns>A completed task holding the validation report dictionary.</returns>
        /// <remarks>
        /// The work is entirely synchronous, so the method returns a completed task rather
        /// than being marked async without an await.
        /// </remarks>
        private Task<Dictionary<string, object>> ValidateTerminologyAsync(string content, string context, string domain, string? knownTerminologyJson)
        {
            var candidateTerms = ExtractCandidateTerms(content);
            var knownTerms = ParseKnownTerms(knownTerminologyJson);

            // Normalize each known term once and index by the normalized form, instead of
            // re-normalizing the whole known list for every candidate term.
            var knownByNormalizedForm = knownTerms.ToLookup(
                known => NormalizeTerm(known),
                StringComparer.OrdinalIgnoreCase);

            var consistencyIssues = new List<Dictionary<string, object>>();
            var unmatchedTerms = new List<string>();
            var consistentTerms = new List<string>();

            foreach (var term in candidateTerms)
            {
                var matchingKnownTerms = knownByNormalizedForm[NormalizeTerm(term)].ToList();

                if (matchingKnownTerms.Count > 0)
                {
                    // Consistent only when every matching known term is spelled exactly
                    // like the found term; anything else is a casing or article variation.
                    if (matchingKnownTerms.All(known => known == term))
                    {
                        consistentTerms.Add(term);
                    }
                    else
                    {
                        consistencyIssues.Add(new Dictionary<string, object>
                        {
                            ["foundTerm"] = term,
                            ["expectedTerms"] = matchingKnownTerms,
                            ["issueType"] = "inconsistent-casing-or-variation"
                        });
                    }
                }
                else if (IsLikelyTechnicalTerm(term))
                {
                    // Looks technical but is absent from the known terminology.
                    unmatchedTerms.Add(term);
                }
            }

            var consistencyScore = candidateTerms.Count > 0
                ? (decimal)(consistentTerms.Count) / candidateTerms.Count
                : 1.0m;

            return Task.FromResult(new Dictionary<string, object>
            {
                ["isConsistent"] = consistencyIssues.Count == 0 && unmatchedTerms.Count == 0,
                ["consistencyScore"] = consistencyScore,
                ["consistentTerms"] = consistentTerms,
                ["consistencyIssues"] = consistencyIssues,
                ["unmatchedTerms"] = unmatchedTerms,
                ["totalTermsAnalyzed"] = candidateTerms.Count,
                ["analysisMethod"] = "validation-with-known-terms"
            });
        }

        // Parses the known-terminology JSON array. Blank or malformed JSON yields an empty
        // list; null array entries are dropped so later normalization cannot hit a null.
        private List<string> ParseKnownTerms(string? knownTerminologyJson)
        {
            if (string.IsNullOrWhiteSpace(knownTerminologyJson))
            {
                return new List<string>();
            }

            try
            {
                var parsed = JsonSerializer.Deserialize<List<string?>>(knownTerminologyJson);
                return parsed is null
                    ? new List<string>()
                    : parsed.OfType<string>().ToList();
            }
            catch (JsonException ex)
            {
                _logger?.LogWarning(ex, "Failed to parse known terminology JSON, using empty list");
                return new List<string>();
            }
        }

        /// <summary>
        /// Placeholder for LLM-backed term classification. It builds the prompt and request
        /// parameters but does not yet call any LLM plugin.
        /// </summary>
        /// <param name="candidateTerms">Terms to classify; only the first 50 are put in the prompt.</param>
        /// <param name="context">Kind of document the terms came from, embedded in the prompt.</param>
        /// <param name="domain">Domain of the document, embedded in the prompt.</param>
        /// <returns>
        /// Always null at present, which tells the caller to fall back to heuristic analysis.
        /// </returns>
        private async Task<Dictionary<string, object>?> AnalyzeTerminologyWithLLM(List<string> candidateTerms, string context, string domain)
        {
            if (_pluginRegistry == null) return null;

            try
            {
                // Cap the number of terms sent to the model to prevent token overflow.
                // This note used to sit inside the string literal and leaked into the prompt.
                const int MaxTermsInPrompt = 50;
                var termList = string.Join(", ", candidateTerms.Take(MaxTermsInPrompt));

                // Prepare prompt for LLM analysis
                var prompt = $@"Analyze the following terms extracted from {context} in the {domain} domain.
Classify each term as either 'technical' (domain-specific terminology) or 'common' (general English words).

Terms to analyze: {termList}

Respond with a JSON object in this exact format:
{{
  ""technicalTerms"": [""term1"", ""term2""],
  ""commonWords"": [""word1"", ""word2""],
  ""reasoning"": ""Brief explanation of classification approach""
}}

Focus on identifying terms that are:
- Technical: API names, programming concepts, domain-specific jargon, technical processes
- Common: General English words, articles, prepositions, common adjectives/verbs

Be conservative - when in doubt, classify as technical to avoid false negatives.";

                // Use a hypothetical LLM analysis plugin (this would need to be implemented)
                var llmParameters = new Dictionary<string, object>
                {
                    ["prompt"] = prompt,
                    ["maxTokens"] = 1000,
                    ["temperature"] = 0.1 // Low temperature for consistent classification
                };

                // This is a placeholder - would need actual LLM plugin implementation
                // var llmResult = await _pluginRegistry.CallFunctionAsync("LLMAnalysis", llmParameters);

                // For now, return null to fall back to heuristics
                return null;
            }
            catch (Exception ex)
            {
                _logger?.LogError(ex, "Failed to analyze terminology with LLM");
                return null;
            }
        }

        /// <summary>
        /// Splits the candidate terms into technical terms and common words using
        /// <see cref="IsLikelyTechnicalTerm"/>, and attaches a confidence score.
        /// </summary>
        /// <param name="candidateTerms">Terms to classify, in the order they should be reported.</param>
        /// <param name="domain">Currently unused by the heuristic classifier.</param>
        /// <returns>
        /// A dictionary with the two term lists, the total term count, the analysis method
        /// name and the confidence score.
        /// </returns>
        private Dictionary<string, object> AnalyzeTerminologyWithHeuristics(List<string> candidateTerms, string domain)
        {
            var technical = new List<string>();
            var common = new List<string>();

            // Route each term into exactly one bucket, preserving input order.
            foreach (var candidate in candidateTerms)
            {
                var bucket = IsLikelyTechnicalTerm(candidate) ? technical : common;
                bucket.Add(candidate);
            }

            var analysis = new Dictionary<string, object>();
            analysis.Add("technicalTerms", technical);
            analysis.Add("commonWords", common);
            analysis.Add("totalTerms", candidateTerms.Count);
            analysis.Add("analysisMethod", "enhanced-heuristics");
            analysis.Add("confidence", CalculateConfidenceScore(technical, common));
            return analysis;
        }

        /// <summary>
        /// Finds capitalized words, and runs of consecutive capitalized words, that could
        /// be technical terminology.
        /// </summary>
        /// <param name="content">Text to scan.</param>
        /// <returns>
        /// The distinct candidate terms in order of first appearance. Terms of two
        /// characters or fewer are dropped.
        /// </returns>
        private List<string> ExtractCandidateTerms(string content)
        {
            // A capitalized word, optionally followed by more capitalized words.
            const string candidatePattern = @"\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b";

            return System.Text.RegularExpressions.Regex.Matches(content, candidatePattern)
                .Cast<System.Text.RegularExpressions.Match>()
                // The pattern's \s+ lets a multi-word term span line breaks or repeated
                // spaces. Collapse each whitespace run to one space so the same term is
                // not reported twice and no term carries an embedded newline.
                .Select(match => System.Text.RegularExpressions.Regex.Replace(match.Value, @"\s+", " "))
                .Where(term => term.Length > 2) // Filter out very short terms
                .Distinct()
                .ToList();
        }

        // Common English words that should not be considered technical. Matched
        // case-insensitively against the whole term. Built once rather than per call.
        private static readonly HashSet<string> CommonWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "The", "This", "That", "These", "Those", "A", "An", "In", "On", "At", "By", "For", "With", "Without",
            "To", "From", "Of", "About", "Over", "Under", "Above", "Below", "Between", "Among", "Through",
            "During", "Before", "After", "While", "Since", "Until", "Although", "However", "Therefore",
            "Moreover", "Furthermore", "Nevertheless", "Otherwise", "Meanwhile", "Instead", "Rather",
            "Chapter", "Part", "Section", "Story", "Content", "Introduction", "Overview", "Summary",
            "All", "Each", "Every", "Some", "Any", "Many", "Few", "Several", "Most", "More", "Less",
            "First", "Second", "Third", "Last", "Next", "Previous", "Final", "Initial", "Primary",
            "Good", "Bad", "Better", "Best", "Worse", "Worst", "Great", "Small", "Large", "Big", "Little",
            "New", "Old", "Young", "High", "Low", "Long", "Short", "Wide", "Narrow", "Deep", "Shallow"
        };

        // Lowercase fragments whose presence anywhere in a term marks it as technical.
        private static readonly string[] TechnicalIndicators =
        {
            // Programming concepts
            "api", "sdk", "framework", "library", "service", "controller", "manager", "handler", "provider",
            "factory", "builder", "repository", "middleware", "plugin", "component", "module",

            // Software architecture
            "microservice", "monolith", "database", "cache", "queue", "pipeline", "workflow", "architecture",

            // Technical processes
            "authentication", "authorization", "validation", "serialization", "encryption", "deployment",
            "integration", "orchestration", "synchronization", "optimization",

            // Data concepts
            "model", "entity", "schema", "migration", "index", "query", "transaction", "connection"
        };

        // Case-sensitive type-name suffixes that are common in code.
        private static readonly string[] TechnicalSuffixes =
        {
            "Service", "Manager", "Controller", "Handler", "Provider", "Factory",
            "Builder", "Repository", "Plugin", "Component", "Module", "Helper"
        };

        // Technical acronyms: 2-5 uppercase letters.
        private static readonly System.Text.RegularExpressions.Regex AcronymPattern =
            new System.Text.RegularExpressions.Regex(@"^[A-Z]{2,5}$");

        // PascalCase identifiers: a capitalized word followed by at least one more capital.
        private static readonly System.Text.RegularExpressions.Regex PascalCasePattern =
            new System.Text.RegularExpressions.Regex(@"^[A-Z][a-z]+[A-Z][a-zA-Z]*$");

        /// <summary>
        /// Heuristically decides whether a term is technical terminology.
        /// </summary>
        /// <param name="term">Term to classify.</param>
        /// <returns>
        /// False when the whole term is a listed common word. Otherwise true when the term
        /// contains a technical indicator, ends with a technical suffix, is a 2-5 letter
        /// uppercase acronym, or is PascalCase. False in every other case.
        /// </returns>
        private bool IsLikelyTechnicalTerm(string term)
        {
            if (CommonWords.Contains(term))
            {
                return false;
            }

            // Invariant lowercasing: a culture-sensitive ToLower() turns 'I' into a dotless
            // 'ı' under Turkish cultures, so "Indexing" would no longer contain "index".
            var lowerTerm = term.ToLowerInvariant();
            if (TechnicalIndicators.Any(indicator => lowerTerm.Contains(indicator)))
            {
                return true;
            }

            // Ordinal comparison: these are identifier suffixes, not linguistic text.
            if (TechnicalSuffixes.Any(suffix => term.EndsWith(suffix, StringComparison.Ordinal)))
            {
                return true;
            }

            if (AcronymPattern.IsMatch(term))
            {
                return true;
            }

            // Anything that is not PascalCase at this point is treated as a common word.
            return PascalCasePattern.IsMatch(term);
        }

        /// <summary>
        /// Produces the comparison form of a term: surrounding whitespace is trimmed and a
        /// single leading article ("The", "A" or "An", any casing) is removed.
        /// </summary>
        /// <param name="term">Term to normalize.</param>
        /// <returns>
        /// The trimmed term without its leading article. A term that is only an article,
        /// or that merely starts with the same letters (e.g. "Theory"), is returned trimmed
        /// but otherwise unchanged.
        /// </returns>
        private string NormalizeTerm(string term)
        {
            var normalized = term.Trim();
            var articles = new[] { "The", "A", "An" };

            foreach (var article in articles)
            {
                // The article must be a whole word: followed by whitespace of any kind,
                // not only a single space, so "The\tService" is handled too.
                var isLeadingArticle =
                    normalized.Length > article.Length &&
                    normalized.StartsWith(article, StringComparison.OrdinalIgnoreCase) &&
                    char.IsWhiteSpace(normalized[article.Length]);

                if (isLeadingArticle)
                {
                    // TrimStart removes the separator, including extra spaces, so
                    // "The  Service" yields "Service" rather than " Service".
                    return normalized.Substring(article.Length).TrimStart();
                }
            }

            return normalized;
        }

        /// <summary>
        /// Scores how trustworthy a heuristic classification is, based on the share of
        /// terms that were classified as technical.
        /// </summary>
        /// <param name="technicalTerms">Terms classified as technical.</param>
        /// <param name="commonWords">Terms classified as common words.</param>
        /// <returns>
        /// 1.0 when there are no terms at all; 0.9 when the technical share lies in
        /// [0.3, 0.7]; 0.8 when it lies in [0.2, 0.8]; 0.7 otherwise.
        /// </returns>
        private decimal CalculateConfidenceScore(List<string> technicalTerms, List<string> commonWords)
        {
            var classifiedCount = technicalTerms.Count + commonWords.Count;
            if (classifiedCount == 0)
            {
                return 1.0m;
            }

            var technicalShare = (decimal)technicalTerms.Count / classifiedCount;

            // A mixed result scores highest; an all-technical or all-common result is
            // treated as the least certain. The bands are checked from narrowest to widest.
            return technicalShare switch
            {
                var share when share >= 0.3m && share <= 0.7m => 0.9m,
                var share when share >= 0.2m && share <= 0.8m => 0.8m,
                _ => 0.7m
            };
        }
    }
}