MarketAlly.AIPlugin.Extensions/MarketAlly.AIPlugin.Context/Search/FuzzyMatcher.cs

282 lines
7.3 KiB
C#
Executable File

using System.Text;
namespace MarketAlly.AIPlugin.Context.Search
{
/// <summary>
/// Provides fuzzy string matching capabilities for search operations
/// </summary>
public class FuzzyMatcher
{
private readonly double _threshold;
public FuzzyMatcher(double threshold = 0.7)
{
_threshold = threshold;
}
/// <summary>
/// Calculates similarity between two strings using multiple algorithms
/// </summary>
public double CalculateSimilarity(string source, string target)
{
if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
return 0.0;
// Normalize strings
var normalizedSource = NormalizeString(source);
var normalizedTarget = NormalizeString(target);
// If strings are identical after normalization, return perfect match
if (normalizedSource.Equals(normalizedTarget, StringComparison.OrdinalIgnoreCase))
return 1.0;
// Combine multiple similarity algorithms for better accuracy
var levenshteinRatio = CalculateLevenshteinRatio(normalizedSource, normalizedTarget);
var jaroWinklerScore = CalculateJaroWinkler(normalizedSource, normalizedTarget);
var tokenSetRatio = CalculateTokenSetRatio(normalizedSource, normalizedTarget);
// Weighted combination of different algorithms
var combinedScore = (levenshteinRatio * 0.4) + (jaroWinklerScore * 0.4) + (tokenSetRatio * 0.2);
return Math.Min(1.0, combinedScore);
}
/// <summary>
/// Checks if two strings are similar based on the configured threshold
/// </summary>
public bool AreSimilar(string source, string target)
{
return CalculateSimilarity(source, target) >= _threshold;
}
/// <summary>
/// Finds the best match for a query in a collection of candidates
/// </summary>
public FuzzyMatchResult FindBestMatch(string query, IEnumerable<string> candidates)
{
var bestMatch = new FuzzyMatchResult { Query = query };
foreach (var candidate in candidates)
{
var similarity = CalculateSimilarity(query, candidate);
if (similarity > bestMatch.Score)
{
bestMatch.Match = candidate;
bestMatch.Score = similarity;
}
}
bestMatch.IsMatch = bestMatch.Score >= _threshold;
return bestMatch;
}
/// <summary>
/// Finds all matches above the threshold
/// </summary>
public List<FuzzyMatchResult> FindMatches(string query, IEnumerable<string> candidates)
{
var matches = new List<FuzzyMatchResult>();
foreach (var candidate in candidates)
{
var similarity = CalculateSimilarity(query, candidate);
if (similarity >= _threshold)
{
matches.Add(new FuzzyMatchResult
{
Query = query,
Match = candidate,
Score = similarity,
IsMatch = true
});
}
}
return matches.OrderByDescending(m => m.Score).ToList();
}
/// <summary>
/// Normalizes a string for better matching
/// </summary>
private string NormalizeString(string input)
{
if (string.IsNullOrEmpty(input))
return string.Empty;
return input.ToLowerInvariant()
.Trim()
.Replace(" ", " ") // Remove double spaces
.Replace("-", " ")
.Replace("_", " ");
}
/// <summary>
/// Calculates Levenshtein distance ratio (1 - distance/max_length)
/// </summary>
private double CalculateLevenshteinRatio(string source, string target)
{
var distance = CalculateLevenshteinDistance(source, target);
var maxLength = Math.Max(source.Length, target.Length);
if (maxLength == 0)
return 1.0;
return 1.0 - (double)distance / maxLength;
}
/// <summary>
/// Calculates Levenshtein distance between two strings
/// </summary>
private int CalculateLevenshteinDistance(string source, string target)
{
if (string.IsNullOrEmpty(source))
return target?.Length ?? 0;
if (string.IsNullOrEmpty(target))
return source.Length;
var sourceLength = source.Length;
var targetLength = target.Length;
var matrix = new int[sourceLength + 1, targetLength + 1];
// Initialize first row and column
for (int i = 0; i <= sourceLength; i++)
matrix[i, 0] = i;
for (int j = 0; j <= targetLength; j++)
matrix[0, j] = j;
// Calculate distances
for (int i = 1; i <= sourceLength; i++)
{
for (int j = 1; j <= targetLength; j++)
{
var cost = source[i - 1] == target[j - 1] ? 0 : 1;
matrix[i, j] = Math.Min(
Math.Min(matrix[i - 1, j] + 1, matrix[i, j - 1] + 1),
matrix[i - 1, j - 1] + cost);
}
}
return matrix[sourceLength, targetLength];
}
/// <summary>
/// Calculates Jaro-Winkler similarity
/// </summary>
private double CalculateJaroWinkler(string source, string target)
{
var jaroScore = CalculateJaro(source, target);
if (jaroScore < 0.7) // Jaro-Winkler only applies prefix scaling if Jaro > 0.7
return jaroScore;
// Calculate common prefix (up to 4 characters)
var prefix = 0;
var minLength = Math.Min(source.Length, Math.Min(target.Length, 4));
for (int i = 0; i < minLength; i++)
{
if (source[i] == target[i])
prefix++;
else
break;
}
return jaroScore + (0.1 * prefix * (1 - jaroScore));
}
/// <summary>
/// Calculates Jaro similarity
/// </summary>
private double CalculateJaro(string source, string target)
{
if (source.Length == 0 && target.Length == 0)
return 1.0;
if (source.Length == 0 || target.Length == 0)
return 0.0;
var matchWindow = Math.Max(source.Length, target.Length) / 2 - 1;
if (matchWindow < 0)
matchWindow = 0;
var sourceMatches = new bool[source.Length];
var targetMatches = new bool[target.Length];
var matches = 0;
var transpositions = 0;
// Find matches
for (int i = 0; i < source.Length; i++)
{
var start = Math.Max(0, i - matchWindow);
var end = Math.Min(i + matchWindow + 1, target.Length);
for (int j = start; j < end; j++)
{
if (targetMatches[j] || source[i] != target[j])
continue;
sourceMatches[i] = true;
targetMatches[j] = true;
matches++;
break;
}
}
if (matches == 0)
return 0.0;
// Count transpositions
var k = 0;
for (int i = 0; i < source.Length; i++)
{
if (!sourceMatches[i])
continue;
while (!targetMatches[k])
k++;
if (source[i] != target[k])
transpositions++;
k++;
}
return (matches / (double)source.Length +
matches / (double)target.Length +
(matches - transpositions / 2.0) / matches) / 3.0;
}
/// <summary>
/// Calculates token set ratio for better handling of word order differences
/// </summary>
private double CalculateTokenSetRatio(string source, string target)
{
var sourceTokens = new HashSet<string>(source.Split(' ', StringSplitOptions.RemoveEmptyEntries));
var targetTokens = new HashSet<string>(target.Split(' ', StringSplitOptions.RemoveEmptyEntries));
if (sourceTokens.Count == 0 && targetTokens.Count == 0)
return 1.0;
if (sourceTokens.Count == 0 || targetTokens.Count == 0)
return 0.0;
var intersection = sourceTokens.Intersect(targetTokens).Count();
var union = sourceTokens.Union(targetTokens).Count();
return (double)intersection / union;
}
}
/// <summary>
/// Result of a fuzzy matching operation
/// </summary>
public class FuzzyMatchResult
{
public string Query { get; set; } = "";
public string Match { get; set; } = "";
public double Score { get; set; }
public bool IsMatch { get; set; }
}
}