367 lines
11 KiB
C#
Executable File
367 lines
11 KiB
C#
Executable File
using System.Text.RegularExpressions;
|
|
using Microsoft.Extensions.Logging;
|
|
using MarketAlly.AIPlugin.Context.Configuration;
|
|
using System.Text;
|
|
|
|
namespace MarketAlly.AIPlugin.Context.Search
|
|
{
|
|
/// <summary>
|
|
/// Enhanced search engine with semantic search and fuzzy matching capabilities
|
|
/// </summary>
|
|
public class EnhancedSearchEngine
|
|
{
|
|
private readonly ContextConfiguration _configuration;
private readonly ILogger<EnhancedSearchEngine> _logger;
private readonly SemanticSearchEnhancer? _semanticSearch;
private readonly FuzzyMatcher _fuzzyMatcher;

/// <summary>
/// Creates a search engine bound to the given configuration and logger.
/// </summary>
/// <param name="configuration">Search settings; its <c>Search.FuzzyMatchingThreshold</c> seeds the fuzzy matcher.</param>
/// <param name="logger">Logger used for search diagnostics.</param>
/// <param name="semanticSearch">Optional semantic similarity provider; when <c>null</c>, semantic scoring is skipped.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="configuration"/> or <paramref name="logger"/> is <c>null</c>.
/// </exception>
public EnhancedSearchEngine(
    ContextConfiguration configuration,
    ILogger<EnhancedSearchEngine> logger,
    SemanticSearchEnhancer? semanticSearch = null)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // on configuration.Search below, or a late failure on the first log call.
    ArgumentNullException.ThrowIfNull(configuration);
    ArgumentNullException.ThrowIfNull(logger);

    _configuration = configuration;
    _logger = logger;
    _semanticSearch = semanticSearch;
    _fuzzyMatcher = new FuzzyMatcher(configuration.Search.FuzzyMatchingThreshold);
}
|
|
|
|
/// <summary>
/// Performs enhanced search with semantic understanding and fuzzy matching.
/// </summary>
/// <param name="query">The raw user query.</param>
/// <param name="entries">Candidate entries to score; enumerated exactly once.</param>
/// <param name="cancellationToken">
/// When cancellation is requested the scan stops early and the results gathered so far are returned.
/// </param>
/// <returns>
/// Entries whose total relevance is positive, ordered by relevance and then by newest timestamp,
/// truncated to <c>Search.MaxSearchResults</c>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="query"/> or <paramref name="entries"/> is <c>null</c>.
/// </exception>
public async Task<EnhancedSearchResults> SearchAsync(
    string query,
    IEnumerable<StoredContextEntry> entries,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(query);
    ArgumentNullException.ThrowIfNull(entries);

    var startTime = DateTime.UtcNow;

    // Materialize once: the sequence is needed both for the count in the log line and for
    // the scan itself, and a lazy source would otherwise be evaluated twice.
    var entryList = entries as IReadOnlyCollection<StoredContextEntry> ?? entries.ToList();
    var queryTerms = ExtractSearchTerms(query);

    _logger.LogDebug("Starting enhanced search for query: {Query} with {EntryCount} entries",
        query, entryList.Count);

    var matches = new List<EnhancedSearchResult>();
    foreach (var entry in entryList)
    {
        // Best-effort cancellation: stop scanning and return what has been found so far.
        if (cancellationToken.IsCancellationRequested)
        {
            break;
        }

        var searchResult = await AnalyzeEntryAsync(query, queryTerms, entry, cancellationToken);
        if (searchResult.TotalRelevance > 0)
        {
            matches.Add(searchResult);
        }
    }

    // Most relevant first; ties are broken in favour of newer entries.
    var results = matches
        .OrderByDescending(r => r.TotalRelevance)
        .ThenByDescending(r => r.Entry.Timestamp)
        .Take(_configuration.Search.MaxSearchResults)
        .ToList();

    var searchDuration = DateTime.UtcNow - startTime;
    _logger.LogInformation("Enhanced search completed in {Duration}ms, found {ResultCount} relevant entries",
        searchDuration.TotalMilliseconds, results.Count);

    return new EnhancedSearchResults
    {
        Query = query,
        Results = results,
        TotalFound = results.Count,
        SearchDuration = searchDuration,
        UsedSemanticSearch = _semanticSearch != null && _configuration.Search.EnableSemanticSearch,
        UsedFuzzyMatching = _configuration.Search.EnableFuzzyMatching
    };
}
|
|
|
|
/// <summary>
/// Scores one context entry against the query and returns the per-component breakdown.
/// </summary>
/// <param name="query">The raw query text (used for fuzzy, semantic and context scoring).</param>
/// <param name="queryTerms">Terms extracted from the query (used for keyword scoring).</param>
/// <param name="entry">The entry being scored.</param>
/// <param name="cancellationToken">Forwarded to the semantic similarity call.</param>
/// <returns>A result carrying the entry, the matched terms, each score component and the total.</returns>
private async Task<EnhancedSearchResult> AnalyzeEntryAsync(
    string query,
    List<string> queryTerms,
    StoredContextEntry entry,
    CancellationToken cancellationToken)
{
    var matched = new List<string>();
    var scores = new RelevanceBreakdown();
    var analysis = new EnhancedSearchResult
    {
        Entry = entry,
        MatchedTerms = matched,
        RelevanceScores = scores
    };

    // Step 1: exact keyword hits (also fills the matched-term list).
    scores.KeywordRelevance = CalculateKeywordRelevance(queryTerms, entry, matched);

    // Step 2: fuzzy matching, only when switched on in configuration.
    if (_configuration.Search.EnableFuzzyMatching)
    {
        scores.FuzzyRelevance = CalculateFuzzyRelevance(query, entry, matched);
    }

    // Step 3: semantic similarity, only when enabled and a provider was supplied.
    if (_configuration.Search.EnableSemanticSearch && _semanticSearch is { } semanticSearch)
    {
        try
        {
            var entryText = $"{entry.Summary} {entry.Content}";
            scores.SemanticRelevance = await semanticSearch.CalculateSemanticSimilarityAsync(
                query, entryText, cancellationToken);
        }
        catch (Exception ex)
        {
            // A failing semantic provider must not abort the search; the score stays at its default.
            _logger.LogWarning(ex, "Failed to calculate semantic similarity for entry {EntryId}", entry.Id);
        }
    }

    // Steps 4 and 5: entry-type/priority weighting and the age-based boost.
    scores.ContextRelevance = CalculateContextRelevance(query, entry);
    scores.RecencyBoost = CalculateRecencyBoost(entry.Timestamp);

    // Fold every component into the single ranking score.
    analysis.TotalRelevance = CalculateTotalRelevance(scores);
    return analysis;
}
|
|
|
|
/// <summary>
/// Calculates the keyword-based relevance score for an entry.
/// </summary>
/// <param name="queryTerms">Terms to look for.</param>
/// <param name="entry">Entry whose summary, tags and content are searched.</param>
/// <param name="matchedTerms">Receives each term that matched somewhere in the entry (no duplicates added).</param>
/// <returns>
/// The sum over all terms of 3.0 for a summary hit, 2.5 for a tag hit and 1.0 for a content hit,
/// with each term's subtotal multiplied by 1.2 when the term is longer than five characters.
/// </returns>
private double CalculateKeywordRelevance(List<string> queryTerms, StoredContextEntry entry, List<string> matchedTerms)
{
    // Null-safe views of the entry so a sparsely populated entry scores 0 instead of throwing.
    var summary = entry.Summary ?? string.Empty;
    var content = entry.Content ?? string.Empty;
    var tags = entry.Tags?.Where(t => t is not null).ToList() ?? new List<string>();

    var relevance = 0.0;

    foreach (var term in queryTerms)
    {
        var termRelevance = 0.0;

        // Ordinal, case-insensitive containment: independent of the current culture
        // (no Turkish-I surprises) and no lowered copies of the entry text are allocated.

        // Summary matches (highest weight)
        if (summary.Contains(term, StringComparison.OrdinalIgnoreCase))
        {
            termRelevance += 3.0;
        }

        // Tag matches (high weight)
        if (tags.Any(tag => tag.Contains(term, StringComparison.OrdinalIgnoreCase)))
        {
            termRelevance += 2.5;
        }

        // Content matches (medium weight)
        if (content.Contains(term, StringComparison.OrdinalIgnoreCase))
        {
            termRelevance += 1.0;
        }

        if (termRelevance <= 0.0)
        {
            continue; // term not found anywhere in this entry
        }

        if (!matchedTerms.Contains(term))
        {
            matchedTerms.Add(term);
        }

        // Boost for longer terms (more specific)
        if (term.Length > 5)
        {
            termRelevance *= 1.2;
        }

        relevance += termRelevance;
    }

    return relevance;
}
|
|
|
|
/// <summary>
/// Calculates the fuzzy-matching relevance score for an entry.
/// </summary>
/// <param name="query">The raw query compared against the summary, each tag and leading content words.</param>
/// <param name="entry">Entry to compare against.</param>
/// <param name="matchedTerms">Accepted for signature parity with keyword scoring; not modified here.</param>
/// <returns>
/// The sum of similarity scores that exceed the configured fuzzy threshold, weighted 2.0 for the summary,
/// 1.5 per tag and 0.5 per content word (first 50 words only).
/// </returns>
private double CalculateFuzzyRelevance(string query, StoredContextEntry entry, List<string> matchedTerms)
{
    var threshold = _configuration.Search.FuzzyMatchingThreshold;
    var relevance = 0.0;

    // Fuzzy match against summary
    var summaryScore = _fuzzyMatcher.CalculateSimilarity(query, entry.Summary);
    if (summaryScore > threshold)
    {
        relevance += summaryScore * 2.0; // High weight for summary matches
    }

    // Fuzzy match against tags
    foreach (var tag in entry.Tags ?? Enumerable.Empty<string>())
    {
        var tagScore = _fuzzyMatcher.CalculateSimilarity(query, tag);
        if (tagScore > threshold)
        {
            relevance += tagScore * 1.5; // Medium-high weight for tag matches
        }
    }

    // Fuzzy match against content, limited to the first 50 words to bound the cost.
    // A null separator splits on every white-space character, so words separated by
    // newlines or tabs are no longer fused into a single token.
    var contentWords = (entry.Content ?? string.Empty)
        .Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)
        .Take(50);

    foreach (var word in contentWords)
    {
        var wordScore = _fuzzyMatcher.CalculateSimilarity(query, word);
        if (wordScore > threshold)
        {
            relevance += wordScore * 0.5; // Lower weight for content word matches
        }
    }

    return relevance;
}
|
|
|
|
/// <summary>
/// Calculates context-specific relevance based on entry type and priority.
/// </summary>
/// <param name="query">Not used by the current scoring rules.</param>
/// <param name="entry">Entry whose <c>Priority</c> and <c>Type</c> are scored.</param>
/// <returns>
/// The priority score (critical 2.0, high 1.5, medium 1.0, low 0.5) plus the type score
/// (decision 1.5, milestone 1.3, insight 1.2, codechange 1.0, conversation 0.8).
/// Unknown or missing values score 1.0 each.
/// </returns>
private double CalculateContextRelevance(string query, StoredContextEntry entry)
{
    // ToLowerInvariant keeps the match culture-independent: under a Turkish culture
    // ToLower() turns "HIGH" into "hıgh", which would silently fall through to the default.
    // The null-conditional lets a missing value take the default arm instead of throwing.

    // Priority-based scoring
    var priorityScore = entry.Priority?.ToLowerInvariant() switch
    {
        "critical" => 2.0,
        "high" => 1.5,
        "medium" => 1.0,
        "low" => 0.5,
        _ => 1.0
    };

    // Type-based scoring
    var typeScore = entry.Type?.ToLowerInvariant() switch
    {
        "decision" => 1.5,     // Decisions are important
        "milestone" => 1.3,    // Milestones are significant
        "insight" => 1.2,      // Insights are valuable
        "codechange" => 1.0,   // Code changes are relevant
        "conversation" => 0.8, // Conversations are less structured
        _ => 1.0
    };

    return priorityScore + typeScore;
}
|
|
|
|
/// <summary>
/// Maps the age of an entry (relative to the current UTC time) to a recency multiplier.
/// </summary>
/// <param name="timestamp">When the entry was recorded.</param>
/// <returns>
/// 1.5 for entries at most one day old, 1.2 up to seven days, 1.0 up to thirty days,
/// 0.8 up to ninety days and 0.6 for anything older.
/// </returns>
private double CalculateRecencyBoost(DateTime timestamp)
{
    var ageInDays = (DateTime.UtcNow - timestamp).TotalDays;

    // Walk the age buckets from newest to oldest; the first one that fits wins.
    if (ageInDays <= 1)
    {
        return 1.5; // within the last 24 hours
    }

    if (ageInDays <= 7)
    {
        return 1.2; // within the last week
    }

    if (ageInDays <= 30)
    {
        return 1.0; // within the last month
    }

    if (ageInDays <= 90)
    {
        return 0.8; // within the last quarter
    }

    return 0.6; // more than three months old
}
|
|
|
|
/// <summary>
/// Calculates the total relevance score from all components.
/// </summary>
/// <param name="scores">The per-component scores for one entry.</param>
/// <returns>
/// 0 when the entry has no keyword, fuzzy or semantic match; otherwise the weighted sum
/// keyword 40% + fuzzy 20% + semantic 25% + context 10% + recency 5%.
/// </returns>
private double CalculateTotalRelevance(RelevanceBreakdown scores)
{
    // Context relevance and recency boost are always positive (they depend only on the entry,
    // not on the query). Without this gate every entry would get a positive total and the
    // "TotalRelevance > 0" filter applied by the search loop could never exclude anything.
    var hasMatchSignal = scores.KeywordRelevance > 0
        || scores.FuzzyRelevance > 0
        || scores.SemanticRelevance > 0;

    if (!hasMatchSignal)
    {
        return 0.0;
    }

    var total = scores.KeywordRelevance * 0.4;  // 40% weight
    total += scores.FuzzyRelevance * 0.2;       // 20% weight
    total += scores.SemanticRelevance * 0.25;   // 25% weight
    total += scores.ContextRelevance * 0.1;     // 10% weight
    total += scores.RecencyBoost * 0.05;        // 5% weight

    return total;
}
|
|
|
|
/// <summary>
/// Extracts meaningful, lower-cased terms from the search query.
/// </summary>
/// <param name="query">The raw query; <c>null</c> or white-space yields an empty list.</param>
/// <returns>
/// Distinct terms made up of: individual words longer than two characters that are not stop words,
/// the contents of double-quoted phrases longer than two characters, and the whole trimmed query
/// when it is longer than five characters.
/// </returns>
private List<string> ExtractSearchTerms(string query)
{
    if (string.IsNullOrWhiteSpace(query))
    {
        return new List<string>();
    }

    // Invariant lowering keeps term extraction independent of the current culture.
    var normalized = query.Trim().ToLowerInvariant();
    var terms = new List<string>();

    // Individual words. The double quote is treated as a delimiter so that a quoted query such as
    // "error handling" yields the words error/handling rather than tokens with a quote stuck to them.
    var words = Regex.Split(normalized, @"[\s,;.!?""]+")
        .Where(t => t.Length > 2)       // Ignore very short terms
        .Where(t => !IsStopWord(t));
    terms.AddRange(words);

    // Quoted phrases are also kept whole for phrase matching.
    var quotedPhrases = Regex.Matches(normalized, @"""([^""]+)""")
        .Cast<Match>()
        .Select(m => m.Groups[1].Value)
        .Where(p => p.Length > 2);
    terms.AddRange(quotedPhrases);

    // Add the full query for exact phrase matching (if long enough)
    if (normalized.Length > 5)
    {
        terms.Add(normalized);
    }

    return terms.Distinct().ToList();
}
|
|
|
|
// Built once and shared: the set is read-only after construction, so there is no need
// to allocate and populate it on every lookup.
private static readonly HashSet<string> s_stopWords = new HashSet<string>
{
    "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
    "is", "are", "was", "were", "be", "been", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "can", "this", "that", "these", "those",
    "a", "an", "as", "if", "then", "than", "when", "where", "why", "how", "what", "who", "which"
};

/// <summary>
/// Checks if a word is a common stop word.
/// </summary>
/// <param name="word">The word to test; the comparison is case-sensitive against lower-case entries.</param>
/// <returns><c>true</c> when <paramref name="word"/> is in the stop-word list.</returns>
private bool IsStopWord(string word)
{
    return s_stopWords.Contains(word);
}
|
|
}
|
|
|
|
/// <summary>
/// Outcome of one enhanced search: the ranked hits plus metadata about how the search ran.
/// </summary>
public class EnhancedSearchResults
{
    /// <summary>The query text the search was run with.</summary>
    public string Query { get; set; } = string.Empty;

    /// <summary>The ranked search hits.</summary>
    public List<EnhancedSearchResult> Results { get; set; } = new List<EnhancedSearchResult>();

    /// <summary>Number of hits reported for the search.</summary>
    public int TotalFound { get; set; }

    /// <summary>Elapsed time of the search.</summary>
    public TimeSpan SearchDuration { get; set; }

    /// <summary>Whether semantic scoring was active for this search.</summary>
    public bool UsedSemanticSearch { get; set; }

    /// <summary>Whether fuzzy matching was active for this search.</summary>
    public bool UsedFuzzyMatching { get; set; }
}
|
|
|
|
/// <summary>
/// A single search hit together with the scoring details that produced its rank.
/// </summary>
public class EnhancedSearchResult
{
    /// <summary>The context entry this hit refers to.</summary>
    public StoredContextEntry Entry { get; set; } = new StoredContextEntry();

    /// <summary>Query terms that were found in the entry.</summary>
    public List<string> MatchedTerms { get; set; } = new List<string>();

    /// <summary>Per-component relevance scores.</summary>
    public RelevanceBreakdown RelevanceScores { get; set; } = new RelevanceBreakdown();

    /// <summary>Combined relevance used for ranking.</summary>
    public double TotalRelevance { get; set; }
}
|
|
|
|
/// <summary>
/// The individual score components that are combined into a hit's total relevance.
/// </summary>
public class RelevanceBreakdown
{
    /// <summary>Score from exact keyword matches.</summary>
    public double KeywordRelevance { get; set; }

    /// <summary>Score from fuzzy (approximate) matches.</summary>
    public double FuzzyRelevance { get; set; }

    /// <summary>Score from semantic similarity.</summary>
    public double SemanticRelevance { get; set; }

    /// <summary>Score derived from the entry's priority and type.</summary>
    public double ContextRelevance { get; set; }

    /// <summary>Score derived from the entry's age.</summary>
    public double RecencyBoost { get; set; }
}
|
|
} |