Files
allstarr/allstarr/Services/Common/FuzzyMatcher.cs
T
joshpatra 40338ce25f
Docker Build & Push / build-and-test (push) Has been cancelled
Docker Build & Push / docker (push) Has been cancelled
v1.0.0: Lots of WebUI fixes, API fixes, refactored all of caching, general bug fixes, redid all log messages
2026-02-11 23:24:40 -05:00

363 lines
13 KiB
C#

namespace allstarr.Services.Common;
/// <summary>
/// Provides fuzzy string matching for search result scoring.
/// Scoring pipeline: 1. strip decorators → 2. exact/substring matching → 3. token matching → 4. Levenshtein.
/// </summary>
public static class FuzzyMatcher
{
    /// <summary>Upper bound of the Levenshtein score band; 75-100 is reserved for substring/token matches.</summary>
    private const int MaxLevenshteinScore = 75;

    // CultureInvariant keeps case-insensitive matching independent of the host culture
    // (e.g. the Turkish dotted/dotless "i"); Compiled because these run for every candidate title.
    private const System.Text.RegularExpressions.RegexOptions DecoratorRegexOptions =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant |
        System.Text.RegularExpressions.RegexOptions.Compiled;

    // (feat. ...), (ft. ...), (with ...), (featuring ...) - brackets optional.
    // The \b stops the keyword from matching inside a word ("Left Behind", "Drift Away").
    private static readonly System.Text.RegularExpressions.Regex FeaturedArtistPattern = new(
        @"\s*[\(\[]?\s*\b(feat\.?|ft\.?|with|featuring)\s+[^\)\]]+[\)\]]?",
        DecoratorRegexOptions);

    // - From "Album Name" or - From Album Name
    private static readonly System.Text.RegularExpressions.Regex FromAlbumPattern = new(
        @"\s*-\s*from\s+[""']?[^""']+[""']?",
        DecoratorRegexOptions);

    // - Remastered, - Radio Edit, etc.
    private static readonly System.Text.RegularExpressions.Regex DashVersionPattern = new(
        @"\s*-\s*(remaster|radio edit|single version|album version|extended|original mix)[^\-]*",
        DecoratorRegexOptions);

    // [Remix], [Remaster], [Live], [Explicit], etc.
    private static readonly System.Text.RegularExpressions.Regex BracketedTagPattern = new(
        @"\s*[\[\(](remix|remaster|live|acoustic|radio edit|explicit|clean|official|audio|video|lyric)[^\]\)]*[\]\)]",
        DecoratorRegexOptions);

    // Anything that is neither a word character nor whitespace.
    private static readonly System.Text.RegularExpressions.Regex PunctuationPattern = new(
        @"[^\w\s]",
        System.Text.RegularExpressions.RegexOptions.Compiled);

    // Runs of whitespace, collapsed to a single space during normalization.
    private static readonly System.Text.RegularExpressions.Regex WhitespaceRunPattern = new(
        @"\s+",
        System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>
    /// STEP 1: Strips common decorators from track titles to improve matching.
    /// Removes: (feat. X), (with Y), (ft. Z), - From "Album", - Remastered, [Remix], etc.
    /// Keywords are only recognised at a word boundary, so titles such as "Left Behind"
    /// are left intact. An unbracketed "with ..." tail is still removed; callers that need
    /// the raw title as well should use <see cref="CalculateSimilarityAggressive"/>.
    /// </summary>
    /// <param name="title">Raw track title; may be null or blank.</param>
    /// <returns>The trimmed title without decorators, or an empty string for null/blank input.</returns>
    public static string StripDecorators(string title)
    {
        if (string.IsNullOrWhiteSpace(title))
        {
            return string.Empty;
        }

        var cleaned = FeaturedArtistPattern.Replace(title, "");
        cleaned = FromAlbumPattern.Replace(cleaned, "");
        cleaned = DashVersionPattern.Replace(cleaned, "");
        cleaned = BracketedTagPattern.Replace(cleaned, "");

        return cleaned.Trim();
    }

    /// <summary>
    /// Calculates a similarity score between two strings (decorators are NOT stripped here):
    /// 1. Normalize both sides (case, diacritics, punctuation, whitespace)
    /// 2. Exact match → 100, one side contains the other → 95
    /// 3. Token-based matching (handles word order)
    /// 4. Levenshtein distance (expensive, fuzzy)
    /// </summary>
    /// <param name="query">The text being searched for.</param>
    /// <param name="target">The candidate text to score against the query.</param>
    /// <returns>Score 0-100; 0 when either input is null/blank.</returns>
    public static int CalculateSimilarity(string query, string target)
    {
        if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(target))
        {
            return 0;
        }

        var queryNorm = NormalizeForMatching(query);
        var targetNorm = NormalizeForMatching(target);

        // Symbol-only input (e.g. "!!!") normalizes to an empty string. An empty string is
        // "contained" in every string, so without this guard it would score 95 against anything.
        // Two symbol-only strings only match when their raw text is the same.
        if (queryNorm.Length == 0 || targetNorm.Length == 0)
        {
            var bothSymbolOnly = queryNorm.Length == 0 && targetNorm.Length == 0;
            var sameRawText = string.Equals(query.Trim(), target.Trim(), StringComparison.OrdinalIgnoreCase);
            return bothSymbolOnly && sameRawText ? 100 : 0;
        }

        // STEP 2: SUBSTRING MATCHING (cheap, high-precision)
        if (queryNorm == targetNorm)
        {
            return 100;
        }

        // One string fully contains the other.
        // Example: "luther" ⊂ "luther remastered" → instant win.
        // This also covers prefix and whole-word containment, so no separate checks are needed.
        if (targetNorm.Contains(queryNorm) || queryNorm.Contains(targetNorm))
        {
            return 95;
        }

        // STEP 3: TOKEN-BASED MATCHING (handles word order)
        var queryTokens = queryNorm.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        var targetTokens = targetNorm.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        if (queryTokens.Length > 0 && targetTokens.Length > 0)
        {
            var tokenMatchScore = CalculateTokenMatchScore(queryTokens, targetTokens);

            // Very high token agreement (90%+) is returned directly.
            if (tokenMatchScore >= 90)
            {
                return (int)Math.Round(tokenMatchScore, MidpointRounding.AwayFromZero);
            }

            // Decent token agreement (70%+) acts as a floor for the Levenshtein score.
            // The fractional part is truncated by the cast.
            if (tokenMatchScore >= 70)
            {
                var levenshteinScore = CalculateLevenshteinScore(queryNorm, targetNorm);
                return (int)Math.Max(tokenMatchScore, levenshteinScore);
            }
        }

        // STEP 4: LEVENSHTEIN DISTANCE (expensive, fuzzy)
        return CalculateLevenshteinScore(queryNorm, targetNorm);
    }

    /// <summary>
    /// Greedily pairs each query token with the first unused target token that is either
    /// identical (1.0 credit) or contains / is contained in it (0.8 credit), then expresses
    /// the total credit as a percentage of the longer token list.
    /// </summary>
    /// <param name="queryTokens">Non-empty tokens of the normalized query.</param>
    /// <param name="targetTokens">Non-empty tokens of the normalized target.</param>
    /// <returns>Token match percentage in the range 0-100.</returns>
    private static double CalculateTokenMatchScore(string[] queryTokens, string[] targetTokens)
    {
        var matchedCredit = 0.0;
        var usedTargetIndexes = new HashSet<int>();

        foreach (var queryToken in queryTokens)
        {
            for (var i = 0; i < targetTokens.Length; i++)
            {
                if (usedTargetIndexes.Contains(i))
                {
                    continue;
                }

                var targetToken = targetTokens[i];

                if (queryToken == targetToken)
                {
                    // Exact token match
                    matchedCredit++;
                    usedTargetIndexes.Add(i);
                    break;
                }

                if (queryToken.Contains(targetToken) || targetToken.Contains(queryToken))
                {
                    // Partial token match (one contains the other) earns partial credit
                    matchedCredit += 0.8;
                    usedTargetIndexes.Add(i);
                    break;
                }
            }
        }

        var maxTokens = Math.Max(queryTokens.Length, targetTokens.Length);
        return (matchedCredit / maxTokens) * 100.0;
    }

    /// <summary>
    /// Calculates similarity score based on Levenshtein distance.
    /// Returns score 0-75 (75-100 is reserved for substring/token matches).
    /// Two empty strings are treated as identical and return 100.
    /// </summary>
    private static int CalculateLevenshteinScore(string str1, string str2)
    {
        var maxLength = Math.Max(str1.Length, str2.Length);
        if (maxLength == 0)
        {
            return 100;
        }

        var distance = LevenshteinDistance(str1, str2);

        // Normalize distance by length: similarity = 1 - (distance / max_length)
        var normalizedSimilarity = 1.0 - ((double)distance / maxLength);

        // Scale into the 0-75 band; the cast truncates the fractional part.
        var score = (int)(normalizedSimilarity * MaxLevenshteinScore);
        return Math.Max(0, score);
    }

    /// <summary>
    /// AGGRESSIVE matching: scores the pair twice - once with decorators stripped from both
    /// sides and once on the raw text (in case a "decorator" is part of the real title) -
    /// and returns the better of the two scores.
    /// </summary>
    /// <param name="query">The text being searched for.</param>
    /// <param name="target">The candidate text to score against the query.</param>
    /// <returns>Score 0-100; 0 when either input is null/blank.</returns>
    public static int CalculateSimilarityAggressive(string query, string target)
    {
        if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(target))
        {
            return 0;
        }

        var strippedScore = CalculateSimilarity(StripDecorators(query), StripDecorators(target));
        var rawScore = CalculateSimilarity(query, target);

        return Math.Max(rawScore, strippedScore);
    }

    /// <summary>
    /// Normalizes a string for matching by:
    /// - Converting to lowercase
    /// - Removing accents/diacritics
    /// - Converting hyphens/underscores to spaces (for word separation)
    /// - Removing other punctuation (periods, apostrophes, commas, etc.)
    /// - Collapsing and trimming whitespace
    /// The result can be empty when the input consists only of punctuation.
    /// </summary>
    private static string NormalizeForMatching(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        var normalized = RemoveDiacritics(text.ToLowerInvariant().Trim());

        // Hyphens and underscores separate words: "Dua-Lipa" becomes "Dua Lipa", not "DuaLipa".
        normalized = normalized.Replace('-', ' ').Replace('_', ' ');

        normalized = PunctuationPattern.Replace(normalized, "");
        return WhitespaceRunPattern.Replace(normalized, " ").Trim();
    }

    /// <summary>
    /// Removes diacritics (accents) from characters.
    /// Example: é -> e, ñ -> n, ü -> u
    /// </summary>
    private static string RemoveDiacritics(string text)
    {
        // Decompose so each accent becomes a separate non-spacing mark that can be dropped.
        var decomposed = text.Normalize(System.Text.NormalizationForm.FormD);
        var builder = new System.Text.StringBuilder(decomposed.Length);

        foreach (var c in decomposed)
        {
            var category = System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c);
            if (category != System.Globalization.UnicodeCategory.NonSpacingMark)
            {
                builder.Append(c);
            }
        }

        return builder.ToString().Normalize(System.Text.NormalizationForm.FormC);
    }

    /// <summary>
    /// Calculates Levenshtein distance between two strings.
    /// Keeps only two rows of the edit-distance table, so memory use is proportional
    /// to the length of <paramref name="target"/> rather than to the product of both lengths.
    /// </summary>
    private static int LevenshteinDistance(string source, string target)
    {
        if (string.IsNullOrEmpty(source))
        {
            return target?.Length ?? 0;
        }

        if (string.IsNullOrEmpty(target))
        {
            return source.Length;
        }

        var previousRow = new int[target.Length + 1];
        var currentRow = new int[target.Length + 1];

        // Distance from the empty prefix of source to each prefix of target.
        for (var j = 0; j <= target.Length; j++)
        {
            previousRow[j] = j;
        }

        for (var i = 1; i <= source.Length; i++)
        {
            currentRow[0] = i;

            for (var j = 1; j <= target.Length; j++)
            {
                var cost = source[i - 1] == target[j - 1] ? 0 : 1;
                var deletion = previousRow[j] + 1;
                var insertion = currentRow[j - 1] + 1;
                var substitution = previousRow[j - 1] + cost;
                currentRow[j] = Math.Min(Math.Min(deletion, insertion), substitution);
            }

            // The row just filled becomes the "previous" row of the next iteration.
            (previousRow, currentRow) = (currentRow, previousRow);
        }

        return previousRow[target.Length];
    }

    /// <summary>
    /// Calculates artist match score between Spotify artists and local song artists.
    /// Every artist on each side is scored against its best counterpart on the other side;
    /// the scores are averaged, and the average is cut by 30% if any single best match is below 70.
    /// Returns 0 when the artist counts differ by more than one.
    /// Null lists are treated as empty and blank artist names are ignored.
    /// </summary>
    /// <param name="spotifyArtists">Artist names from Spotify.</param>
    /// <param name="songMainArtist">Main artist of the local song.</param>
    /// <param name="songContributors">Additional artists of the local song.</param>
    /// <returns>Score 0-100.</returns>
    public static double CalculateArtistMatchScore(List<string> spotifyArtists, string songMainArtist, List<string> songContributors)
    {
        if (spotifyArtists is null || string.IsNullOrWhiteSpace(songMainArtist))
        {
            return 0;
        }

        // Blank names carry no information; scoring them would force a zero and trigger the penalty.
        var spotifyNames = spotifyArtists.Where(name => !string.IsNullOrWhiteSpace(name)).ToList();
        if (spotifyNames.Count == 0)
        {
            return 0;
        }

        // Build list of all song artists (main + contributors)
        var songNames = new List<string> { songMainArtist };
        if (songContributors is not null)
        {
            songNames.AddRange(songContributors.Where(name => !string.IsNullOrWhiteSpace(name)));
        }

        // Allow 1 artist difference (sometimes features are listed differently)
        if (Math.Abs(spotifyNames.Count - songNames.Count) > 1)
        {
            return 0;
        }

        var scores = new List<double>(spotifyNames.Count + songNames.Count);

        // Each Spotify artist must have a good match among the song artists...
        foreach (var spotifyArtist in spotifyNames)
        {
            scores.Add(songNames.Max(songArtist => CalculateSimilarity(spotifyArtist, songArtist)));
        }

        // ...and each song artist must have a good match among the Spotify artists.
        foreach (var songArtist in songNames)
        {
            scores.Add(spotifyNames.Max(spotifyArtist => CalculateSimilarity(songArtist, spotifyArtist)));
        }

        var averageScore = scores.Average();

        // 30% penalty when any individual artist match is poor (< 70)
        if (scores.Min() < 70)
        {
            averageScore *= 0.7;
        }

        return averageScore;
    }
}