mirror of
https://github.com/SoPat712/allstarr.git
synced 2026-04-27 03:53:10 -04:00
363 lines
13 KiB
C#
363 lines
13 KiB
C#
namespace allstarr.Services.Common;
|
|
|
|
/// <summary>
|
|
/// Provides fuzzy string matching for search result scoring.
|
|
/// OPTIMAL ORDER: 1. Strip decorators → 2. Substring matching → 3. Levenshtein → 4. Greedy assignment
|
|
/// </summary>
|
|
public static class FuzzyMatcher
{
    /// <summary>Score for strings that are identical after normalization.</summary>
    private const int ExactMatchScore = 100;

    /// <summary>Score when one normalized string fully contains the other.</summary>
    private const int SubstringMatchScore = 95;

    /// <summary>Upper bound of the Levenshtein tier; higher scores are reserved for substring/token matches.</summary>
    private const int MaxLevenshteinScore = 75;

    /// <summary>Token score (in percent) at or above which the token score is returned directly.</summary>
    private const double TokenScoreReturnThreshold = 90;

    /// <summary>Token score (in percent) at or above which the token score is used as a floor for Levenshtein.</summary>
    private const double TokenScoreFloorThreshold = 70;

    /// <summary>Percentage points credited for an exactly matching token.</summary>
    private const double ExactTokenCredit = 100.0;

    /// <summary>Percentage points credited for a token that only contains / is contained in its counterpart.</summary>
    private const double PartialTokenCredit = 80.0;

    /// <summary>Individual artist score below which the overall artist score is penalized.</summary>
    private const int PoorArtistMatchThreshold = 70;

    /// <summary>Multiplier applied to the average artist score when any individual match is poor.</summary>
    private const double PoorArtistMatchPenalty = 0.7;

    // CultureInvariant keeps case-insensitive matching stable regardless of the current culture
    // (e.g. "WITH" must still match "with" under Turkish casing rules).
    private const System.Text.RegularExpressions.RegexOptions DecoratorRegexOptions =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant |
        System.Text.RegularExpressions.RegexOptions.Compiled;

    // (feat. X), [ft. X], (featuring X), (with X), and un-bracketed "feat./ft./featuring X".
    // - An un-bracketed keyword must not be preceded by a word character, so the "ft" in
    //   "Thrift Shop" or "Gift" is not mistaken for a featured-artist marker.
    // - "with" is only treated as a decorator inside brackets; un-bracketed it is almost always
    //   part of the real title ("Stay With Me").
    private static readonly System.Text.RegularExpressions.Regex FeaturedArtistRegex = new(
        @"\s*(?:[\(\[]\s*(?:feat\.?|ft\.?|featuring|with)|(?<!\w)(?:feat\.?|ft\.?|featuring))\s+[^\)\]]+[\)\]]?",
        DecoratorRegexOptions);

    // - From "Album Name" or - From Album Name
    private static readonly System.Text.RegularExpressions.Regex FromAlbumRegex = new(
        @"\s*-\s*from\s+[""']?[^""']+[""']?",
        DecoratorRegexOptions);

    // - Remastered, - Radio Edit, etc.
    private static readonly System.Text.RegularExpressions.Regex DashSuffixRegex = new(
        @"\s*-\s*(remaster|radio edit|single version|album version|extended|original mix)[^\-]*",
        DecoratorRegexOptions);

    // [Remix], [Remaster], (Live ...), (Explicit), etc.
    private static readonly System.Text.RegularExpressions.Regex BracketedTagRegex = new(
        @"\s*[\[\(](remix|remaster|live|acoustic|radio edit|explicit|clean|official|audio|video|lyric)[^\]\)]*[\]\)]",
        DecoratorRegexOptions);

    // Any Unicode dash (hyphen, en dash, em dash, ...) or underscore acts as a word separator.
    private static readonly System.Text.RegularExpressions.Regex WordSeparatorRegex = new(
        @"[\p{Pd}_]",
        System.Text.RegularExpressions.RegexOptions.Compiled);

    private static readonly System.Text.RegularExpressions.Regex PunctuationRegex = new(
        @"[^\w\s]",
        System.Text.RegularExpressions.RegexOptions.Compiled);

    private static readonly System.Text.RegularExpressions.Regex WhitespaceRunRegex = new(
        @"\s+",
        System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>
    /// STEP 1: Strips common decorators from track titles to improve matching.
    /// Removes: (feat. X), (with Y), [ft. Z], un-bracketed "feat. X", - From "Album",
    /// - Remastered ..., [Remix], (Live ...), etc.
    /// </summary>
    /// <param name="title">Raw track title.</param>
    /// <returns>
    /// The title without recognized decorators, trimmed. Returns an empty string when
    /// <paramref name="title"/> is null/whitespace or consists only of decorators.
    /// </returns>
    public static string StripDecorators(string title)
    {
        if (string.IsNullOrWhiteSpace(title))
        {
            return string.Empty;
        }

        // Order matters: featured-artist credits are removed first so that a trailing
        // "- Remastered" or "[Live]" tag is evaluated against the remaining title.
        var cleaned = FeaturedArtistRegex.Replace(title, "");
        cleaned = FromAlbumRegex.Replace(cleaned, "");
        cleaned = DashSuffixRegex.Replace(cleaned, "");
        cleaned = BracketedTagRegex.Replace(cleaned, "");

        return cleaned.Trim();
    }

    /// <summary>
    /// Calculates a similarity score between two strings. Both inputs are normalized
    /// (case, diacritics, punctuation, whitespace) and then evaluated tier by tier:
    /// 1. Exact match → 100.
    /// 2. One string contains the other → 95.
    /// 3. Order-independent token matching → returned directly when ≥ 90, used as a floor when ≥ 70.
    /// 4. Levenshtein distance → 0-75.
    /// Decorators are NOT stripped here; callers do that via <see cref="StripDecorators"/>
    /// or <see cref="CalculateSimilarityAggressive"/>.
    /// </summary>
    /// <param name="query">The text being searched for.</param>
    /// <param name="target">The candidate text to compare against.</param>
    /// <returns>Score from 0 to 100; 0 when either input is null or whitespace.</returns>
    public static int CalculateSimilarity(string query, string target)
    {
        if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(target))
        {
            return 0;
        }

        var queryNorm = NormalizeForMatching(query);
        var targetNorm = NormalizeForMatching(target);

        // Punctuation-only input (e.g. the artist "!!!") normalizes to an empty string. An empty
        // string is a substring of everything, so it must not reach the substring tier; fall back
        // to comparing the raw text instead.
        if (queryNorm.Length == 0 || targetNorm.Length == 0)
        {
            return string.Equals(query.Trim(), target.Trim(), StringComparison.OrdinalIgnoreCase)
                ? ExactMatchScore
                : 0;
        }

        // STEP 2: SUBSTRING MATCHING (cheap, high-precision)
        if (string.Equals(queryNorm, targetNorm, StringComparison.Ordinal))
        {
            return ExactMatchScore;
        }

        // One string fully contains the other, e.g. "luther" ⊂ "luther remastered".
        // Prefix and whole-word matches are special cases of this check, so they need no
        // separate tier.
        if (targetNorm.Contains(queryNorm, StringComparison.Ordinal) ||
            queryNorm.Contains(targetNorm, StringComparison.Ordinal))
        {
            return SubstringMatchScore;
        }

        // STEP 3: TOKEN-BASED MATCHING (handles word order)
        var queryTokens = queryNorm.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var targetTokens = targetNorm.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var tokenMatchScore = CalculateTokenMatchScore(queryTokens, targetTokens);

        // Very high token overlap is conclusive on its own.
        if (tokenMatchScore >= TokenScoreReturnThreshold)
        {
            return (int)Math.Round(tokenMatchScore, MidpointRounding.AwayFromZero);
        }

        // Decent token overlap acts as a floor for the Levenshtein score (fraction truncated).
        if (tokenMatchScore >= TokenScoreFloorThreshold)
        {
            var levenshteinScore = CalculateLevenshteinScore(queryNorm, targetNorm);
            return (int)Math.Max(tokenMatchScore, levenshteinScore);
        }

        // STEP 4: LEVENSHTEIN DISTANCE (expensive, fuzzy)
        return CalculateLevenshteinScore(queryNorm, targetNorm);
    }

    /// <summary>
    /// Scores how well two token lists overlap, ignoring order. Each target token can be
    /// consumed by at most one query token. Exact matches are assigned before partial
    /// (containment) matches so that a partial match never steals a token that another
    /// query token matches exactly.
    /// </summary>
    /// <param name="queryTokens">Non-empty tokens of the normalized query.</param>
    /// <param name="targetTokens">Non-empty tokens of the normalized target.</param>
    /// <returns>
    /// Percentage 0-100: (100 per exact match + 80 per partial match) divided by the larger
    /// token count. Returns 0 when either list is empty.
    /// </returns>
    private static double CalculateTokenMatchScore(string[] queryTokens, string[] targetTokens)
    {
        if (queryTokens.Length == 0 || targetTokens.Length == 0)
        {
            return 0;
        }

        var queryMatched = new bool[queryTokens.Length];
        var targetUsed = new bool[targetTokens.Length];
        var exactMatches = 0;
        var partialMatches = 0;

        // Pass 1: exact token matches.
        for (var q = 0; q < queryTokens.Length; q++)
        {
            for (var t = 0; t < targetTokens.Length; t++)
            {
                if (!targetUsed[t] && string.Equals(queryTokens[q], targetTokens[t], StringComparison.Ordinal))
                {
                    queryMatched[q] = true;
                    targetUsed[t] = true;
                    exactMatches++;
                    break;
                }
            }
        }

        // Pass 2: partial matches (one token contains the other) among what is left.
        for (var q = 0; q < queryTokens.Length; q++)
        {
            if (queryMatched[q])
            {
                continue;
            }

            for (var t = 0; t < targetTokens.Length; t++)
            {
                if (targetUsed[t])
                {
                    continue;
                }

                if (queryTokens[q].Contains(targetTokens[t], StringComparison.Ordinal) ||
                    targetTokens[t].Contains(queryTokens[q], StringComparison.Ordinal))
                {
                    targetUsed[t] = true;
                    partialMatches++;
                    break;
                }
            }
        }

        // Computed from integer counts so the 90/70 thresholds are not affected by
        // accumulated floating-point error from repeatedly adding 0.8.
        var maxTokens = Math.Max(queryTokens.Length, targetTokens.Length);
        return ((exactMatches * ExactTokenCredit) + (partialMatches * PartialTokenCredit)) / maxTokens;
    }

    /// <summary>
    /// Calculates a similarity score based on Levenshtein distance, normalized by the length
    /// of the longer string.
    /// </summary>
    /// <param name="str1">First (already normalized) string.</param>
    /// <param name="str2">Second (already normalized) string.</param>
    /// <returns>
    /// Score 0-75 (75-100 is reserved for substring/token matches). Two empty strings are
    /// treated as identical and return 100.
    /// </returns>
    private static int CalculateLevenshteinScore(string str1, string str2)
    {
        var maxLength = Math.Max(str1.Length, str2.Length);
        if (maxLength == 0)
        {
            return ExactMatchScore;
        }

        var distance = LevenshteinDistance(str1, str2);

        // similarity = 1 - (distance / max_length), scaled into the 0-75 band and truncated.
        var normalizedSimilarity = 1.0 - ((double)distance / maxLength);
        var score = (int)(normalizedSimilarity * MaxLevenshteinScore);

        return Math.Max(0, score);
    }

    /// <summary>
    /// AGGRESSIVE matching: scores the inputs both with decorators stripped
    /// (see <see cref="StripDecorators"/>) and as-is, and returns the better of the two.
    /// The raw comparison covers titles where a "decorator" is part of the actual name.
    /// </summary>
    /// <param name="query">The text being searched for.</param>
    /// <param name="target">The candidate text to compare against.</param>
    /// <returns>Score from 0 to 100; 0 when either input is null or whitespace.</returns>
    public static int CalculateSimilarityAggressive(string query, string target)
    {
        if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(target))
        {
            return 0;
        }

        var strippedScore = CalculateSimilarity(StripDecorators(query), StripDecorators(target));
        var rawScore = CalculateSimilarity(query, target);

        return Math.Max(rawScore, strippedScore);
    }

    /// <summary>
    /// Normalizes a string for matching by:
    /// - Converting to lowercase (invariant culture)
    /// - Removing accents/diacritics
    /// - Converting dashes (any Unicode dash) and underscores to spaces (for word separation)
    /// - Removing other punctuation (periods, apostrophes, commas, etc.)
    /// - Collapsing runs of whitespace and trimming
    /// </summary>
    /// <param name="text">Text to normalize.</param>
    /// <returns>The normalized text; empty when the input is blank or contains only punctuation.</returns>
    private static string NormalizeForMatching(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        var normalized = RemoveDiacritics(text.ToLowerInvariant().Trim());

        // "Dua-Lipa" must become "dua lipa", not "dualipa"; the same holds for en/em dashes,
        // which would otherwise be deleted by the punctuation pass below.
        normalized = WordSeparatorRegex.Replace(normalized, " ");
        normalized = PunctuationRegex.Replace(normalized, "");

        return WhitespaceRunRegex.Replace(normalized, " ").Trim();
    }

    /// <summary>
    /// Removes diacritics (accents) from characters by decomposing the text (Form D),
    /// dropping non-spacing marks, and recomposing (Form C).
    /// Example: é -> e, ñ -> n, ü -> u
    /// </summary>
    /// <param name="text">Text to process; must not be null.</param>
    /// <returns>The text without non-spacing combining marks.</returns>
    private static string RemoveDiacritics(string text)
    {
        var decomposed = text.Normalize(System.Text.NormalizationForm.FormD);
        var builder = new System.Text.StringBuilder(decomposed.Length);

        foreach (var c in decomposed)
        {
            var category = System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c);
            if (category != System.Globalization.UnicodeCategory.NonSpacingMark)
            {
                builder.Append(c);
            }
        }

        return builder.ToString().Normalize(System.Text.NormalizationForm.FormC);
    }

    /// <summary>
    /// Calculates the Levenshtein (edit) distance between two strings: the minimum number of
    /// single-character insertions, deletions, or substitutions needed to turn one into the other.
    /// </summary>
    /// <param name="source">First string; null is treated as empty.</param>
    /// <param name="target">Second string; null is treated as empty.</param>
    /// <returns>The edit distance (0 when the strings are equal).</returns>
    private static int LevenshteinDistance(string source, string target)
    {
        if (string.IsNullOrEmpty(source))
        {
            return target?.Length ?? 0;
        }

        if (string.IsNullOrEmpty(target))
        {
            return source.Length;
        }

        // Only the previous and current rows of the DP table are ever read, so two rows suffice.
        var previousRow = new int[target.Length + 1];
        var currentRow = new int[target.Length + 1];

        for (var j = 0; j <= target.Length; j++)
        {
            previousRow[j] = j;
        }

        for (var i = 1; i <= source.Length; i++)
        {
            currentRow[0] = i;

            for (var j = 1; j <= target.Length; j++)
            {
                var substitutionCost = source[i - 1] == target[j - 1] ? 0 : 1;
                currentRow[j] = Math.Min(
                    Math.Min(previousRow[j] + 1, currentRow[j - 1] + 1),
                    previousRow[j - 1] + substitutionCost);
            }

            (previousRow, currentRow) = (currentRow, previousRow);
        }

        // After the final swap the last computed row is in previousRow.
        return previousRow[target.Length];
    }

    /// <summary>
    /// Calculates an artist match score between Spotify artists and a local song's artists
    /// (main artist plus contributors). Matching is bidirectional: every Spotify artist is
    /// scored against its best song artist and vice versa, and all scores are averaged.
    /// Blank names are ignored and duplicate names (case-insensitive) are counted once.
    /// </summary>
    /// <param name="spotifyArtists">Artist names from Spotify; null or empty yields 0.</param>
    /// <param name="songMainArtist">Main artist of the local song; null or blank yields 0.</param>
    /// <param name="songContributors">Additional artists of the local song; null is treated as empty.</param>
    /// <returns>
    /// Score 0-100. Returns 0 when the artist counts differ by more than one. The average is
    /// multiplied by 0.7 when any individual best-match score is below 70.
    /// </returns>
    public static double CalculateArtistMatchScore(List<string> spotifyArtists, string songMainArtist, List<string> songContributors)
    {
        if (spotifyArtists is null || string.IsNullOrWhiteSpace(songMainArtist))
        {
            return 0;
        }

        var spotifyNames = spotifyArtists
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        if (spotifyNames.Count == 0)
        {
            return 0;
        }

        // Main artist first, then contributors; a contributor list that repeats the main
        // artist must not inflate the artist count.
        var songNames = new List<string> { songMainArtist };
        if (songContributors is not null)
        {
            songNames.AddRange(songContributors.Where(name => !string.IsNullOrWhiteSpace(name)));
        }

        songNames = songNames.Distinct(StringComparer.OrdinalIgnoreCase).ToList();

        // Allow a difference of one artist (features are sometimes listed differently);
        // anything larger is treated as a different credit list.
        if (Math.Abs(spotifyNames.Count - songNames.Count) > 1)
        {
            return 0;
        }

        var scores = new List<double>(spotifyNames.Count + songNames.Count);

        // Each Spotify artist against its best-matching song artist...
        foreach (var spotifyArtist in spotifyNames)
        {
            scores.Add(songNames.Max(songArtist => CalculateSimilarity(spotifyArtist, songArtist)));
        }

        // ...and each song artist against its best-matching Spotify artist.
        foreach (var songArtist in songNames)
        {
            scores.Add(spotifyNames.Max(spotifyArtist => CalculateSimilarity(songArtist, spotifyArtist)));
        }

        // Averaging both directions means every artist has to match well.
        var averageScore = scores.Average();

        if (scores.Min() < PoorArtistMatchThreshold)
        {
            averageScore *= PoorArtistMatchPenalty;
        }

        return averageScore;
    }
}
|