282 lines
7.3 KiB
C#
Executable File
282 lines
7.3 KiB
C#
Executable File
using System.Text;
|
|
|
|
namespace MarketAlly.AIPlugin.Context.Search
|
|
{
|
|
/// <summary>
|
|
/// Provides fuzzy string matching capabilities for search operations
|
|
/// </summary>
|
|
public class FuzzyMatcher
|
|
{
|
|
private readonly double _threshold;
|
|
|
|
public FuzzyMatcher(double threshold = 0.7)
|
|
{
|
|
_threshold = threshold;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Calculates similarity between two strings using multiple algorithms
|
|
/// </summary>
|
|
public double CalculateSimilarity(string source, string target)
|
|
{
|
|
if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
|
|
return 0.0;
|
|
|
|
// Normalize strings
|
|
var normalizedSource = NormalizeString(source);
|
|
var normalizedTarget = NormalizeString(target);
|
|
|
|
// If strings are identical after normalization, return perfect match
|
|
if (normalizedSource.Equals(normalizedTarget, StringComparison.OrdinalIgnoreCase))
|
|
return 1.0;
|
|
|
|
// Combine multiple similarity algorithms for better accuracy
|
|
var levenshteinRatio = CalculateLevenshteinRatio(normalizedSource, normalizedTarget);
|
|
var jaroWinklerScore = CalculateJaroWinkler(normalizedSource, normalizedTarget);
|
|
var tokenSetRatio = CalculateTokenSetRatio(normalizedSource, normalizedTarget);
|
|
|
|
// Weighted combination of different algorithms
|
|
var combinedScore = (levenshteinRatio * 0.4) + (jaroWinklerScore * 0.4) + (tokenSetRatio * 0.2);
|
|
|
|
return Math.Min(1.0, combinedScore);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Checks if two strings are similar based on the configured threshold
|
|
/// </summary>
|
|
public bool AreSimilar(string source, string target)
|
|
{
|
|
return CalculateSimilarity(source, target) >= _threshold;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Finds the best match for a query in a collection of candidates
|
|
/// </summary>
|
|
public FuzzyMatchResult FindBestMatch(string query, IEnumerable<string> candidates)
|
|
{
|
|
var bestMatch = new FuzzyMatchResult { Query = query };
|
|
|
|
foreach (var candidate in candidates)
|
|
{
|
|
var similarity = CalculateSimilarity(query, candidate);
|
|
if (similarity > bestMatch.Score)
|
|
{
|
|
bestMatch.Match = candidate;
|
|
bestMatch.Score = similarity;
|
|
}
|
|
}
|
|
|
|
bestMatch.IsMatch = bestMatch.Score >= _threshold;
|
|
return bestMatch;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Finds all matches above the threshold
|
|
/// </summary>
|
|
public List<FuzzyMatchResult> FindMatches(string query, IEnumerable<string> candidates)
|
|
{
|
|
var matches = new List<FuzzyMatchResult>();
|
|
|
|
foreach (var candidate in candidates)
|
|
{
|
|
var similarity = CalculateSimilarity(query, candidate);
|
|
if (similarity >= _threshold)
|
|
{
|
|
matches.Add(new FuzzyMatchResult
|
|
{
|
|
Query = query,
|
|
Match = candidate,
|
|
Score = similarity,
|
|
IsMatch = true
|
|
});
|
|
}
|
|
}
|
|
|
|
return matches.OrderByDescending(m => m.Score).ToList();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Normalizes a string for better matching
|
|
/// </summary>
|
|
private string NormalizeString(string input)
|
|
{
|
|
if (string.IsNullOrEmpty(input))
|
|
return string.Empty;
|
|
|
|
return input.ToLowerInvariant()
|
|
.Trim()
|
|
.Replace(" ", " ") // Remove double spaces
|
|
.Replace("-", " ")
|
|
.Replace("_", " ");
|
|
}
|
|
|
|
/// <summary>
|
|
/// Calculates Levenshtein distance ratio (1 - distance/max_length)
|
|
/// </summary>
|
|
private double CalculateLevenshteinRatio(string source, string target)
|
|
{
|
|
var distance = CalculateLevenshteinDistance(source, target);
|
|
var maxLength = Math.Max(source.Length, target.Length);
|
|
|
|
if (maxLength == 0)
|
|
return 1.0;
|
|
|
|
return 1.0 - (double)distance / maxLength;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Calculates Levenshtein distance between two strings
|
|
/// </summary>
|
|
private int CalculateLevenshteinDistance(string source, string target)
|
|
{
|
|
if (string.IsNullOrEmpty(source))
|
|
return target?.Length ?? 0;
|
|
|
|
if (string.IsNullOrEmpty(target))
|
|
return source.Length;
|
|
|
|
var sourceLength = source.Length;
|
|
var targetLength = target.Length;
|
|
var matrix = new int[sourceLength + 1, targetLength + 1];
|
|
|
|
// Initialize first row and column
|
|
for (int i = 0; i <= sourceLength; i++)
|
|
matrix[i, 0] = i;
|
|
|
|
for (int j = 0; j <= targetLength; j++)
|
|
matrix[0, j] = j;
|
|
|
|
// Calculate distances
|
|
for (int i = 1; i <= sourceLength; i++)
|
|
{
|
|
for (int j = 1; j <= targetLength; j++)
|
|
{
|
|
var cost = source[i - 1] == target[j - 1] ? 0 : 1;
|
|
matrix[i, j] = Math.Min(
|
|
Math.Min(matrix[i - 1, j] + 1, matrix[i, j - 1] + 1),
|
|
matrix[i - 1, j - 1] + cost);
|
|
}
|
|
}
|
|
|
|
return matrix[sourceLength, targetLength];
|
|
}
|
|
|
|
/// <summary>
|
|
/// Calculates Jaro-Winkler similarity
|
|
/// </summary>
|
|
private double CalculateJaroWinkler(string source, string target)
|
|
{
|
|
var jaroScore = CalculateJaro(source, target);
|
|
|
|
if (jaroScore < 0.7) // Jaro-Winkler only applies prefix scaling if Jaro > 0.7
|
|
return jaroScore;
|
|
|
|
// Calculate common prefix (up to 4 characters)
|
|
var prefix = 0;
|
|
var minLength = Math.Min(source.Length, Math.Min(target.Length, 4));
|
|
|
|
for (int i = 0; i < minLength; i++)
|
|
{
|
|
if (source[i] == target[i])
|
|
prefix++;
|
|
else
|
|
break;
|
|
}
|
|
|
|
return jaroScore + (0.1 * prefix * (1 - jaroScore));
|
|
}
|
|
|
|
/// <summary>
|
|
/// Calculates Jaro similarity
|
|
/// </summary>
|
|
private double CalculateJaro(string source, string target)
|
|
{
|
|
if (source.Length == 0 && target.Length == 0)
|
|
return 1.0;
|
|
|
|
if (source.Length == 0 || target.Length == 0)
|
|
return 0.0;
|
|
|
|
var matchWindow = Math.Max(source.Length, target.Length) / 2 - 1;
|
|
if (matchWindow < 0)
|
|
matchWindow = 0;
|
|
|
|
var sourceMatches = new bool[source.Length];
|
|
var targetMatches = new bool[target.Length];
|
|
var matches = 0;
|
|
var transpositions = 0;
|
|
|
|
// Find matches
|
|
for (int i = 0; i < source.Length; i++)
|
|
{
|
|
var start = Math.Max(0, i - matchWindow);
|
|
var end = Math.Min(i + matchWindow + 1, target.Length);
|
|
|
|
for (int j = start; j < end; j++)
|
|
{
|
|
if (targetMatches[j] || source[i] != target[j])
|
|
continue;
|
|
|
|
sourceMatches[i] = true;
|
|
targetMatches[j] = true;
|
|
matches++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (matches == 0)
|
|
return 0.0;
|
|
|
|
// Count transpositions
|
|
var k = 0;
|
|
for (int i = 0; i < source.Length; i++)
|
|
{
|
|
if (!sourceMatches[i])
|
|
continue;
|
|
|
|
while (!targetMatches[k])
|
|
k++;
|
|
|
|
if (source[i] != target[k])
|
|
transpositions++;
|
|
|
|
k++;
|
|
}
|
|
|
|
return (matches / (double)source.Length +
|
|
matches / (double)target.Length +
|
|
(matches - transpositions / 2.0) / matches) / 3.0;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Calculates token set ratio for better handling of word order differences
|
|
/// </summary>
|
|
private double CalculateTokenSetRatio(string source, string target)
|
|
{
|
|
var sourceTokens = new HashSet<string>(source.Split(' ', StringSplitOptions.RemoveEmptyEntries));
|
|
var targetTokens = new HashSet<string>(target.Split(' ', StringSplitOptions.RemoveEmptyEntries));
|
|
|
|
if (sourceTokens.Count == 0 && targetTokens.Count == 0)
|
|
return 1.0;
|
|
|
|
if (sourceTokens.Count == 0 || targetTokens.Count == 0)
|
|
return 0.0;
|
|
|
|
var intersection = sourceTokens.Intersect(targetTokens).Count();
|
|
var union = sourceTokens.Union(targetTokens).Count();
|
|
|
|
return (double)intersection / union;
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Result of a fuzzy matching operation
|
|
/// </summary>
|
|
public class FuzzyMatchResult
|
|
{
|
|
public string Query { get; set; } = "";
|
|
public string Match { get; set; } = "";
|
|
public double Score { get; set; }
|
|
public bool IsMatch { get; set; }
|
|
}
|
|
} |