feat: aggressive track matching with optimal order

- Strip decorators FIRST (feat, remaster, explicit, etc)
- Substring matching SECOND (cheap, high-precision)
- Levenshtein distance THIRD (expensive, fuzzy)
- Greedy assignment LAST (highest score first; each song assigned to at most one track)
- Lower threshold to 40 (was 50-70) for max coverage
- Accept artist priority matches (artist 70+, title 30+)
- Handles cases like 'luther' → 'luther (feat. sza)'
- Handles cases like 'a' → 'a-blah' with same artist
- Prevents duplicate assignments across tracks
This commit is contained in:
2026-02-06 21:22:42 -05:00
parent bb3140a247
commit a6ac0dfbd2
2 changed files with 301 additions and 96 deletions

View File

@@ -2,12 +2,64 @@ namespace allstarr.Services.Common;
/// <summary>
/// Provides fuzzy string matching for search result scoring.
/// OPTIMAL ORDER: 1. Strip decorators → 2. Substring matching → 3. Levenshtein → 4. Greedy assignment
/// </summary>
public static class FuzzyMatcher
{
/// <summary>
/// Calculates a similarity score between two strings (0-100).
/// Higher score means better match.
/// STEP 1: Strips common decorators from track titles to improve matching.
/// Removes: (feat. X), (with Y), (ft. Z), - From "Album", [Remix], etc.
/// This MUST be done first to avoid systematic noise in matching.
/// </summary>
/// <param name="title">Raw track title; may be null, empty, or whitespace.</param>
/// <returns>
/// The title with recognised decorators removed and outer whitespace trimmed,
/// or an empty string when <paramref name="title"/> is null or whitespace.
/// </returns>
public static string StripDecorators(string title)
{
    if (string.IsNullOrWhiteSpace(title))
    {
        return string.Empty;
    }

    // CultureInvariant keeps case-insensitive matching stable regardless of the
    // current culture (e.g. "EXPLICIT" / "WITH" under Turkish casing rules).
    const System.Text.RegularExpressions.RegexOptions Options =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant;

    string Remove(string input, string pattern)
    {
        return System.Text.RegularExpressions.Regex.Replace(input, pattern, "", Options);
    }

    var cleaned = title;

    // Bracketed credits: (feat. X), [ft. Y], (with Z), (featuring W).
    // The opening bracket is required so that a plain "with" inside a title
    // ("Stay with Me") is never treated as a credit.
    cleaned = Remove(cleaned, @"\s*[\(\[]\s*(?:featuring|feat\.?|ft\.?|with)\s+[^\)\]]+[\)\]]?");

    // Unbracketed credits: "Song feat. X", "Song ft. Y".
    // Leading whitespace is required so "ft" is only matched as its own word
    // and not inside words such as "Left" or "Drift".
    cleaned = Remove(cleaned, @"\s+(?:featuring|feat\.?|ft\.?)\s+[^\)\]]+[\)\]]?");

    // Remove - From "Album Name" or - From Album Name
    cleaned = Remove(cleaned, @"\s*-\s*from\s+[""']?[^""']+[""']?");

    // Remove - Remastered, - Radio Edit, etc. (up to the next hyphen or end of string)
    cleaned = Remove(cleaned, @"\s*-\s*(remaster|radio edit|single version|album version|extended|original mix)[^\-]*");

    // Remove [Remix], [Remaster], [Live], [Explicit], etc.
    cleaned = Remove(cleaned, @"\s*[\[\(](remix|remaster|live|acoustic|radio edit|explicit|clean|official|audio|video|lyric)[^\]\)]*[\]\)]");

    // Only outer whitespace is trimmed here.
    return cleaned.Trim();
}
/// <summary>
/// Calculates similarity score following OPTIMAL ORDER:
/// 1. Strip decorators (already done by caller)
/// 2. Substring matching (cheap, high-precision)
/// 3. Levenshtein distance (expensive, fuzzy)
/// Returns score 0-100.
/// </summary>
public static int CalculateSimilarity(string query, string target)
{
@@ -16,47 +68,87 @@ public static class FuzzyMatcher
return 0;
}
var queryLower = NormalizeForMatching(query);
var targetLower = NormalizeForMatching(target);
var queryNorm = NormalizeForMatching(query);
var targetNorm = NormalizeForMatching(target);
// STEP 2: SUBSTRING MATCHING (cheap, high-precision)
// Exact match
if (queryLower == targetLower)
if (queryNorm == targetNorm)
{
return 100;
}
// One string fully contains the other (substring match)
// Example: "luther" ⊂ "luther remastered" → instant win
if (targetNorm.Contains(queryNorm) || queryNorm.Contains(targetNorm))
{
return 95;
}
// Starts with query
if (targetLower.StartsWith(queryLower))
if (targetNorm.StartsWith(queryNorm) || queryNorm.StartsWith(targetNorm))
{
return 90;
}
// Contains query as whole word
if (targetLower.Contains($" {queryLower} ") ||
targetLower.StartsWith($"{queryLower} ") ||
targetLower.EndsWith($" {queryLower}"))
if (targetNorm.Contains($" {queryNorm} ") ||
targetNorm.StartsWith($"{queryNorm} ") ||
targetNorm.EndsWith($" {queryNorm}") ||
queryNorm.Contains($" {targetNorm} ") ||
queryNorm.StartsWith($"{targetNorm} ") ||
queryNorm.EndsWith($" {targetNorm}"))
{
return 80;
return 85;
}
// Contains query anywhere
if (targetLower.Contains(queryLower))
{
return 70;
}
// STEP 3: LEVENSHTEIN DISTANCE (expensive, fuzzy)
// Only use this for candidates that survived substring checks
// Calculate Levenshtein distance for fuzzy matching
var distance = LevenshteinDistance(queryLower, targetLower);
var maxLength = Math.Max(queryLower.Length, targetLower.Length);
var distance = LevenshteinDistance(queryNorm, targetNorm);
var maxLength = Math.Max(queryNorm.Length, targetNorm.Length);
if (maxLength == 0)
{
return 100;
}
// Convert distance to similarity score (0-60 range for fuzzy matches)
var similarity = (1.0 - (double)distance / maxLength) * 60;
return (int)Math.Max(0, similarity);
// Normalize distance by length: score = 1 - (distance / max_length)
var normalizedSimilarity = 1.0 - ((double)distance / maxLength);
// Convert to 0-80 range (reserve 80-100 for substring matches)
var score = (int)(normalizedSimilarity * 80);
return Math.Max(0, score);
}
/// <summary>
/// AGGRESSIVE matching that follows optimal order:
/// 1. Strip decorators FIRST
/// 2. Substring matching
/// 3. Levenshtein distance
/// Returns the best score.
/// </summary>
/// <param name="query">Title being searched for.</param>
/// <param name="target">Candidate title to compare against.</param>
/// <returns>
/// 0 when either input is null or whitespace; otherwise the higher of the
/// decorator-stripped score and the raw score from CalculateSimilarity.
/// </returns>
public static int CalculateSimilarityAggressive(string query, string target)
{
    var hasBothInputs = !string.IsNullOrWhiteSpace(query) && !string.IsNullOrWhiteSpace(target);
    if (!hasBothInputs)
    {
        return 0;
    }

    // Score with decorators removed from both sides...
    var scoreWithoutDecorators = CalculateSimilarity(
        StripDecorators(query),
        StripDecorators(target));

    // ...and with the titles exactly as given, in case a "decorator" is
    // genuinely part of the title.
    var scoreAsGiven = CalculateSimilarity(query, target);

    // Whichever interpretation matches better wins.
    return scoreAsGiven >= scoreWithoutDecorators ? scoreAsGiven : scoreWithoutDecorators;
}
/// <summary>

View File

@@ -227,6 +227,7 @@ public class SpotifyTrackMatchingService : BackgroundService
/// New matching mode that uses ISRC when available for exact matches.
/// Preserves track position for correct playlist ordering.
/// Only matches tracks that aren't already in the Jellyfin playlist.
/// Uses GREEDY ASSIGNMENT to maximize total matches.
/// </summary>
private async Task MatchPlaylistTracksWithIsrcAsync(
string playlistName,
@@ -320,7 +321,7 @@ public class SpotifyTrackMatchingService : BackgroundService
return;
}
_logger.LogInformation("Matching {ToMatch}/{Total} tracks for {Playlist} (skipping {Existing} already in Jellyfin, ISRC: {IsrcEnabled})",
_logger.LogInformation("Matching {ToMatch}/{Total} tracks for {Playlist} (skipping {Existing} already in Jellyfin, ISRC: {IsrcEnabled}, AGGRESSIVE MODE)",
tracksToMatch.Count, spotifyTracks.Count, playlistName, existingSpotifyIds.Count, _spotifyApiSettings.PreferIsrcMatching);
// Check cache - use snapshot/timestamp to detect changes
@@ -367,6 +368,9 @@ public class SpotifyTrackMatchingService : BackgroundService
var fuzzyMatches = 0;
var noMatch = 0;
// GREEDY ASSIGNMENT: Collect all possible matches first, then assign optimally
var allCandidates = new List<(SpotifyPlaylistTrack SpotifyTrack, Song MatchedSong, double Score, string MatchType)>();
// Process tracks in batches for parallel searching
var orderedTracks = tracksToMatch.OrderBy(t => t.Position).ToList();
for (int i = 0; i < orderedTracks.Count; i += BatchSize)
@@ -382,34 +386,86 @@ public class SpotifyTrackMatchingService : BackgroundService
{
try
{
Song? matchedSong = null;
var matchType = "none";
var candidates = new List<(Song Song, double Score, string MatchType)>();
// Try ISRC match first if available and enabled
if (_spotifyApiSettings.PreferIsrcMatching && !string.IsNullOrEmpty(spotifyTrack.Isrc))
{
matchedSong = await TryMatchByIsrcAsync(spotifyTrack.Isrc, metadataService);
if (matchedSong != null)
var isrcSong = await TryMatchByIsrcAsync(spotifyTrack.Isrc, metadataService);
if (isrcSong != null)
{
matchType = "isrc";
candidates.Add((isrcSong, 100.0, "isrc"));
}
}
// Fall back to fuzzy matching
if (matchedSong == null)
{
matchedSong = await TryMatchByFuzzyAsync(
// Always try fuzzy matching to get more candidates
var fuzzySongs = await TryMatchByFuzzyMultipleAsync(
spotifyTrack.Title,
spotifyTrack.Artists,
metadataService);
if (matchedSong != null)
foreach (var (song, score) in fuzzySongs)
{
matchType = "fuzzy";
candidates.Add((song, score, "fuzzy"));
}
return (spotifyTrack, candidates);
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Failed to match track: {Title} - {Artist}",
spotifyTrack.Title, spotifyTrack.PrimaryArtist);
return (spotifyTrack, new List<(Song, double, string)>());
}
}).ToList();
// Wait for all tracks in this batch to complete
var batchResults = await Task.WhenAll(batchTasks);
// Collect all candidates
foreach (var (spotifyTrack, candidates) in batchResults)
{
foreach (var (song, score, matchType) in candidates)
{
allCandidates.Add((spotifyTrack, song, score, matchType));
}
}
if (matchedSong != null)
// Rate limiting between batches
if (i + BatchSize < orderedTracks.Count)
{
await Task.Delay(DelayBetweenSearchesMs, cancellationToken);
}
}
// GREEDY ASSIGNMENT: Assign each Spotify track to its best unique match
var usedSongIds = new HashSet<string>();
var assignments = new Dictionary<string, (Song Song, double Score, string MatchType)>();
// Sort candidates by score (highest first)
var sortedCandidates = allCandidates
.OrderByDescending(c => c.Score)
.ToList();
foreach (var (spotifyTrack, song, score, matchType) in sortedCandidates)
{
// Skip if this Spotify track already has a match
if (assignments.ContainsKey(spotifyTrack.SpotifyId))
continue;
// Skip if this song is already used
if (usedSongIds.Contains(song.Id))
continue;
// Assign this match
assignments[spotifyTrack.SpotifyId] = (song, score, matchType);
usedSongIds.Add(song.Id);
}
// Build final matched tracks list
foreach (var spotifyTrack in orderedTracks)
{
if (assignments.TryGetValue(spotifyTrack.SpotifyId, out var match))
{
var matched = new MatchedTrack
{
@@ -418,54 +474,24 @@ public class SpotifyTrackMatchingService : BackgroundService
SpotifyTitle = spotifyTrack.Title,
SpotifyArtist = spotifyTrack.PrimaryArtist,
Isrc = spotifyTrack.Isrc,
MatchType = matchType,
MatchedSong = matchedSong
MatchType = match.MatchType,
MatchedSong = match.Song
};
_logger.LogDebug(" #{Position} {Title} - {Artist} → {MatchType} match: {MatchedTitle}",
spotifyTrack.Position, spotifyTrack.Title, spotifyTrack.PrimaryArtist,
matchType, matchedSong.Title);
return ((MatchedTrack?)matched, matchType);
}
else
{
_logger.LogDebug(" #{Position} {Title} - {Artist} → no match",
spotifyTrack.Position, spotifyTrack.Title, spotifyTrack.PrimaryArtist);
return ((MatchedTrack?)null, "none");
}
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Failed to match track: {Title} - {Artist}",
spotifyTrack.Title, spotifyTrack.PrimaryArtist);
return ((MatchedTrack?)null, "none");
}
}).ToList();
// Wait for all tracks in this batch to complete
var batchResults = await Task.WhenAll(batchTasks);
// Collect results
foreach (var result in batchResults)
{
var (matched, matchType) = result;
if (matched != null)
{
matchedTracks.Add(matched);
if (matchType == "isrc") isrcMatches++;
else if (matchType == "fuzzy") fuzzyMatches++;
if (match.MatchType == "isrc") isrcMatches++;
else if (match.MatchType == "fuzzy") fuzzyMatches++;
_logger.LogDebug(" #{Position} {Title} - {Artist} → {MatchType} match (score: {Score:F1}): {MatchedTitle}",
spotifyTrack.Position, spotifyTrack.Title, spotifyTrack.PrimaryArtist,
match.MatchType, match.Score, match.Song.Title);
}
else
{
noMatch++;
}
}
// Rate limiting between batches (not between individual tracks)
if (i + BatchSize < orderedTracks.Count)
{
await Task.Delay(DelayBetweenSearchesMs, cancellationToken);
_logger.LogDebug(" #{Position} {Title} - {Artist} → no match",
spotifyTrack.Position, spotifyTrack.Title, spotifyTrack.PrimaryArtist);
}
}
@@ -483,7 +509,7 @@ public class SpotifyTrackMatchingService : BackgroundService
await _cache.SetAsync(legacyKey, legacySongs, TimeSpan.FromHours(1));
_logger.LogInformation(
"✓ Cached {Matched}/{Total} tracks for {Playlist} via search (ISRC: {Isrc}, Fuzzy: {Fuzzy}, No match: {NoMatch}) - manual mappings will be applied next",
"✓ Cached {Matched}/{Total} tracks for {Playlist} via GREEDY ASSIGNMENT (ISRC: {Isrc}, Fuzzy: {Fuzzy}, No match: {NoMatch}) - manual mappings will be applied next",
matchedTracks.Count, tracksToMatch.Count, playlistName, isrcMatches, fuzzyMatches, noMatch);
// Pre-build playlist items cache for instant serving
@@ -495,6 +521,64 @@ public class SpotifyTrackMatchingService : BackgroundService
}
}
/// <summary>
/// Returns multiple candidate matches with scores for greedy assignment.
/// FOLLOWS OPTIMAL ORDER:
/// 1. Strip decorators (done in FuzzyMatcher)
/// 2. Substring matching (done in FuzzyMatcher)
/// 3. Levenshtein distance (done in FuzzyMatcher)
/// This method just collects candidates; greedy assignment happens later.
/// </summary>
/// <param name="title">Spotify track title as received (decorators included).</param>
/// <param name="artists">Artist names for the track; the first entry is used to build the search query.</param>
/// <param name="metadataService">Service used to run the song search.</param>
/// <returns>
/// Candidates that pass the acceptance filter, ordered by descending total score.
/// An empty list is returned when the search yields nothing or throws.
/// </returns>
private async Task<List<(Song Song, double Score)>> TryMatchByFuzzyMultipleAsync(
    string title,
    List<string> artists,
    IMusicMetadataService metadataService)
{
    try
    {
        var primaryArtist = artists.FirstOrDefault() ?? "";

        // STEP 1: Strip decorators FIRST (before searching).
        // If the title consists only of decorators, stripping leaves nothing;
        // fall back to the raw title so the search is not reduced to the artist alone.
        var titleStripped = FuzzyMatcher.StripDecorators(title);
        if (string.IsNullOrWhiteSpace(titleStripped))
        {
            titleStripped = title;
        }

        // Trim so a missing artist does not leave a trailing space in the query.
        var query = $"{titleStripped} {primaryArtist}".Trim();
        var results = await metadataService.SearchSongsAsync(query, limit: 10);
        if (results.Count == 0)
        {
            return new List<(Song, double)>();
        }

        // STEP 2-3: Score all results (substring + Levenshtein happen inside
        // CalculateSimilarityAggressive, which also retries with the raw title).
        var scoredResults = results
            .Select(song => new
            {
                Song = song,
                TitleScore = FuzzyMatcher.CalculateSimilarityAggressive(title, song.Title),
                ArtistScore = CalculateArtistMatchScore(artists, song.Artist, song.Contributors)
            })
            .Select(x => new
            {
                x.Song,
                x.TitleScore,
                x.ArtistScore,
                // Weight: 70% title, 30% artist (prioritize title matching)
                TotalScore = (x.TitleScore * 0.7) + (x.ArtistScore * 0.3)
            })
            // NOTE(review): with 0.7/0.3 weights, artist >= 70 && title >= 30 already
            // implies total >= 42, and title >= 85 implies total >= 59.5 provided the
            // artist score is never negative, so the last two clauses look redundant.
            // Kept as-is so the filter stays correct if weights or thresholds change.
            .Where(x =>
                x.TotalScore >= 40 ||
                (x.ArtistScore >= 70 && x.TitleScore >= 30) ||
                x.TitleScore >= 85)
            .OrderByDescending(x => x.TotalScore)
            .Select(x => (x.Song, x.TotalScore))
            .ToList();

        return scoredResults;
    }
    catch (Exception ex)
    {
        // Best-effort: a failed search must not abort matching for the playlist,
        // but record why there are no candidates instead of swallowing it silently.
        _logger.LogDebug(ex, "Fuzzy candidate search failed for: {Title}", title);
        return new List<(Song, double)>();
    }
}
/// <summary>
/// Attempts to match a track by ISRC using provider search.
/// </summary>
@@ -524,7 +608,12 @@ public class SpotifyTrackMatchingService : BackgroundService
}
/// <summary>
/// Attempts to match a track by title and artist using fuzzy matching.
/// Attempts to match a track by title and artist using AGGRESSIVE fuzzy matching.
/// FOLLOWS OPTIMAL ORDER:
/// 1. Strip decorators FIRST (before searching)
/// 2. Substring matching (in FuzzyMatcher)
/// 3. Levenshtein distance (in FuzzyMatcher)
/// PRIORITY: Match as many tracks as possible, even with lower confidence.
/// </summary>
private async Task<Song?> TryMatchByFuzzyAsync(
string title,
@@ -534,17 +623,22 @@ public class SpotifyTrackMatchingService : BackgroundService
try
{
var primaryArtist = artists.FirstOrDefault() ?? "";
var query = $"{title} {primaryArtist}";
var results = await metadataService.SearchSongsAsync(query, limit: 5);
// STEP 1: Strip decorators FIRST (before searching)
var titleStripped = FuzzyMatcher.StripDecorators(title);
var query = $"{titleStripped} {primaryArtist}";
var results = await metadataService.SearchSongsAsync(query, limit: 10);
if (results.Count == 0) return null;
// Score all results
// STEP 2-3: Score all results (substring + Levenshtein in CalculateSimilarityAggressive)
var scoredResults = results
.Select(song => new
{
Song = song,
TitleScore = FuzzyMatcher.CalculateSimilarity(title, song.Title),
// Use aggressive matching which follows optimal order internally
TitleScore = FuzzyMatcher.CalculateSimilarityAggressive(title, song.Title),
ArtistScore = CalculateArtistMatchScore(artists, song.Artist, song.Contributors)
})
.Select(x => new
@@ -552,27 +646,39 @@ public class SpotifyTrackMatchingService : BackgroundService
x.Song,
x.TitleScore,
x.ArtistScore,
TotalScore = (x.TitleScore * 0.6) + (x.ArtistScore * 0.4)
// Weight: 70% title, 30% artist (prioritize title matching)
TotalScore = (x.TitleScore * 0.7) + (x.ArtistScore * 0.3)
})
.OrderByDescending(x => x.TotalScore)
.ToList();
var bestMatch = scoredResults.FirstOrDefault();
// If we have a good match (50+), use it
if (bestMatch != null && bestMatch.TotalScore >= 50)
if (bestMatch == null) return null;
// AGGRESSIVE: Accept matches with score >= 40 (was 50)
if (bestMatch.TotalScore >= 40)
{
_logger.LogDebug("✓ Matched (score: {Score:F1}, title: {TitleScore}, artist: {ArtistScore}): {SpotifyTitle} → {MatchedTitle}",
bestMatch.TotalScore, bestMatch.TitleScore, bestMatch.ArtistScore, title, bestMatch.Song.Title);
return bestMatch.Song;
}
// Fallback: If the provider returned results and the top result has decent artist match,
// trust the provider's search algorithm (it already did fuzzy matching)
// This helps with tracks that have features/remixes in parentheses/brackets
// where the provider might format them differently
if (bestMatch != null && bestMatch.ArtistScore >= 70)
// SUPER AGGRESSIVE: If artist matches well (70+), accept even lower title scores
// This handles cases like "a" → "a-blah" where artist is the same
if (bestMatch.ArtistScore >= 70 && bestMatch.TitleScore >= 30)
{
_logger.LogDebug("Using provider's top result despite low title score (Artist: {ArtistScore}, Title: {TitleScore}): {Title}",
bestMatch.ArtistScore, bestMatch.TitleScore, bestMatch.Song.Title);
_logger.LogDebug("✓ Matched via artist priority (artist: {ArtistScore}, title: {TitleScore}): {SpotifyTitle} → {MatchedTitle}",
bestMatch.ArtistScore, bestMatch.TitleScore, title, bestMatch.Song.Title);
return bestMatch.Song;
}
// ULTRA AGGRESSIVE: If title has high substring match (85+), accept it
// This handles "luther" → "luther (feat. sza)"
if (bestMatch.TitleScore >= 85)
{
_logger.LogDebug("✓ Matched via substring (title: {TitleScore}): {SpotifyTitle} → {MatchedTitle}",
bestMatch.TitleScore, title, bestMatch.Song.Title);
return bestMatch.Song;
}
@@ -993,7 +1099,7 @@ public class SpotifyTrackMatchingService : BackgroundService
}
}
// If no manual external mapping, try fuzzy matching with local Jellyfin tracks
// If no manual external mapping, try AGGRESSIVE fuzzy matching with local Jellyfin tracks
double bestScore = 0;
foreach (var kvp in jellyfinItemsByName)
@@ -1008,11 +1114,18 @@ public class SpotifyTrackMatchingService : BackgroundService
artist = artistsEl[0].GetString() ?? "";
}
var titleScore = FuzzyMatcher.CalculateSimilarity(spotifyTrack.Title, title);
// Use AGGRESSIVE matching with decorator stripping
var titleScore = FuzzyMatcher.CalculateSimilarityAggressive(spotifyTrack.Title, title);
var artistScore = FuzzyMatcher.CalculateSimilarity(spotifyTrack.PrimaryArtist, artist);
// Weight: 70% title, 30% artist (prioritize title matching)
var totalScore = (titleScore * 0.7) + (artistScore * 0.3);
if (totalScore > bestScore && totalScore >= 70)
// AGGRESSIVE: Accept score >= 40 (was 70)
// Also accept if artist matches well (70+) and title is decent (30+)
var isGoodMatch = totalScore >= 40 || (artistScore >= 70 && titleScore >= 30);
if (totalScore > bestScore && isGoodMatch)
{
bestScore = totalScore;
matchedJellyfinItem = item;