diff --git a/allstarr/Services/Common/RoundRobinFallbackHelper.cs b/allstarr/Services/Common/RoundRobinFallbackHelper.cs
index a290262..e8fd9eb 100644
--- a/allstarr/Services/Common/RoundRobinFallbackHelper.cs
+++ b/allstarr/Services/Common/RoundRobinFallbackHelper.cs
@@ -11,6 +11,12 @@ public class RoundRobinFallbackHelper
     private readonly object _urlIndexLock = new object();
     private readonly ILogger _logger;
     private readonly string _serviceName;
+    private readonly HttpClient _healthCheckClient;
+
+    // Cache health check results for 30 seconds to avoid excessive checks
+    private readonly Dictionary<string, (bool isHealthy, DateTime checkedAt)> _healthCache = new();
+    private readonly object _healthCacheLock = new object();
+    private readonly TimeSpan _healthCacheExpiry = TimeSpan.FromSeconds(30);
 
     public int EndpointCount => _apiUrls.Count;
 
@@ -24,6 +30,91 @@ public class RoundRobinFallbackHelper
         {
             throw new ArgumentException("API URLs list cannot be empty", nameof(apiUrls));
         }
+
+        // Create a dedicated HttpClient for health checks with short timeout
+        _healthCheckClient = new HttpClient
+        {
+            Timeout = TimeSpan.FromSeconds(3) // Quick health check timeout
+        };
+    }
+
+    /// <summary>
+    /// Quickly checks if an endpoint is healthy (responds within 3 seconds).
+    /// Results are cached for 30 seconds to avoid excessive health checks.
+    /// </summary>
+    private async Task<bool> IsEndpointHealthyAsync(string baseUrl)
+    {
+        // Check cache first
+        lock (_healthCacheLock)
+        {
+            if (_healthCache.TryGetValue(baseUrl, out var cached))
+            {
+                if (DateTime.UtcNow - cached.checkedAt < _healthCacheExpiry)
+                {
+                    return cached.isHealthy;
+                }
+            }
+        }
+
+        // Perform health check; the headers-only response is disposed so its connection is released
+        try
+        {
+            using var response = await _healthCheckClient.GetAsync(baseUrl, HttpCompletionOption.ResponseHeadersRead);
+            var isHealthy = response.IsSuccessStatusCode;
+
+            // Cache result
+            lock (_healthCacheLock)
+            {
+                _healthCache[baseUrl] = (isHealthy, DateTime.UtcNow);
+            }
+
+            if (!isHealthy)
+            {
+                _logger.LogDebug("{Service} endpoint {Endpoint} health check failed: {StatusCode}",
+                    _serviceName, baseUrl, response.StatusCode);
+            }
+
+            return isHealthy;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogDebug(ex, "{Service} endpoint {Endpoint} health check failed", _serviceName, baseUrl);
+
+            // Cache as unhealthy
+            lock (_healthCacheLock)
+            {
+                _healthCache[baseUrl] = (false, DateTime.UtcNow);
+            }
+
+            return false;
+        }
+    }
+
+    /// <summary>
+    /// Gets a list of healthy endpoints, checking them in parallel.
+    /// Falls back to all endpoints if none are healthy.
+    /// </summary>
+    private async Task<List<string>> GetHealthyEndpointsAsync()
+    {
+        var healthCheckTasks = _apiUrls.Select(async url => new
+        {
+            Url = url,
+            IsHealthy = await IsEndpointHealthyAsync(url)
+        }).ToList();
+
+        var results = await Task.WhenAll(healthCheckTasks);
+        var healthyEndpoints = results.Where(r => r.IsHealthy).Select(r => r.Url).ToList();
+
+        if (healthyEndpoints.Count == 0)
+        {
+            _logger.LogWarning("{Service} health check: no healthy endpoints found, will try all", _serviceName);
+            return _apiUrls;
+        }
+
+        _logger.LogDebug("{Service} health check: {Healthy}/{Total} endpoints healthy",
+            _serviceName, healthyEndpoints.Count, _apiUrls.Count);
+
+        return healthyEndpoints;
     }
 
     /// <summary>
@@ -54,10 +145,14 @@ public class RoundRobinFallbackHelper
     /// <summary>
     /// Tries the request with the next provider in round-robin, then falls back to others on failure.
     /// This distributes load evenly across all providers while maintaining reliability.
+    /// Performs quick health checks first to avoid wasting time on dead endpoints.
     /// Throws exception if all endpoints fail.
     /// </summary>
     public async Task<T> TryWithFallbackAsync<T>(Func<string, Task<T>> action)
     {
+        // Get healthy endpoints first (with caching to avoid excessive checks)
+        var healthyEndpoints = await GetHealthyEndpointsAsync();
+
         // Start with the next URL in round-robin to distribute load
         var startIndex = 0;
         lock (_urlIndexLock)
@@ -66,16 +161,21 @@ public class RoundRobinFallbackHelper
             _currentUrlIndex = (_currentUrlIndex + 1) % _apiUrls.Count;
         }
 
+        // Rotate by the round-robin offset, then stably float healthy endpoints to the front
+        var healthySet = new HashSet<string>(healthyEndpoints);
+        var rotated = _apiUrls.Skip(startIndex).Concat(_apiUrls.Take(startIndex));
+        var endpointsToTry = rotated.OrderByDescending(url => healthySet.Contains(url)).ToList();
+
         // Try all URLs starting from the round-robin selected one
-        for (int attempt = 0; attempt < _apiUrls.Count; attempt++)
+        for (int attempt = 0; attempt < endpointsToTry.Count; attempt++)
         {
-            var urlIndex = (startIndex + attempt) % _apiUrls.Count;
-            var baseUrl = _apiUrls[urlIndex];
+            // endpointsToTry is already rotated and health-ordered, so walk it in sequence
+            var baseUrl = endpointsToTry[attempt];
 
             try
             {
                 _logger.LogDebug("Trying {Service} endpoint {Endpoint} (attempt {Attempt}/{Total})",
-                    _serviceName, baseUrl, attempt + 1, _apiUrls.Count);
+                    _serviceName, baseUrl, attempt + 1, endpointsToTry.Count);
                 return await action(baseUrl);
             }
             catch (Exception ex)
@@ -83,9 +183,15 @@ public class RoundRobinFallbackHelper
                 _logger.LogWarning(ex, "{Service} request failed with endpoint {Endpoint}, trying next...",
                     _serviceName, baseUrl);
 
-                if (attempt == _apiUrls.Count - 1)
+                // Mark as unhealthy in cache
+                lock (_healthCacheLock)
                 {
-                    _logger.LogError("All {Count} {Service} endpoints failed", _apiUrls.Count, _serviceName);
+                    _healthCache[baseUrl] = (false, DateTime.UtcNow);
+                }
+
+                if (attempt == endpointsToTry.Count - 1)
+                {
+                    _logger.LogError("All {Count} {Service} endpoints failed", endpointsToTry.Count, _serviceName);
                     throw;
                 }
             }
@@ -150,10 +256,14 @@ public class RoundRobinFallbackHelper
 
     /// <summary>
     /// Tries the request with the next provider in round-robin, then falls back to others on failure.
+    /// Performs quick health checks first to avoid wasting time on dead endpoints.
     /// Returns default value if all endpoints fail (does not throw).
     /// </summary>
     public async Task<T> TryWithFallbackAsync<T>(Func<string, Task<T>> action, T defaultValue)
     {
+        // Get healthy endpoints first (with caching to avoid excessive checks)
+        var healthyEndpoints = await GetHealthyEndpointsAsync();
+
         // Start with the next URL in round-robin to distribute load
         var startIndex = 0;
         lock (_urlIndexLock)
@@ -162,16 +272,21 @@ public class RoundRobinFallbackHelper
             _currentUrlIndex = (_currentUrlIndex + 1) % _apiUrls.Count;
         }
 
+        // Rotate by the round-robin offset, then stably float healthy endpoints to the front
+        var healthySet = new HashSet<string>(healthyEndpoints);
+        var rotated = _apiUrls.Skip(startIndex).Concat(_apiUrls.Take(startIndex));
+        var endpointsToTry = rotated.OrderByDescending(url => healthySet.Contains(url)).ToList();
+
         // Try all URLs starting from the round-robin selected one
-        for (int attempt = 0; attempt < _apiUrls.Count; attempt++)
+        for (int attempt = 0; attempt < endpointsToTry.Count; attempt++)
         {
-            var urlIndex = (startIndex + attempt) % _apiUrls.Count;
-            var baseUrl = _apiUrls[urlIndex];
+            // endpointsToTry is already rotated and health-ordered, so walk it in sequence
+            var baseUrl = endpointsToTry[attempt];
 
             try
             {
                 _logger.LogDebug("Trying {Service} endpoint {Endpoint} (attempt {Attempt}/{Total})",
-                    _serviceName, baseUrl, attempt + 1, _apiUrls.Count);
+                    _serviceName, baseUrl, attempt + 1, endpointsToTry.Count);
                 return await action(baseUrl);
             }
             catch (Exception ex)
@@ -179,10 +294,16 @@ public class RoundRobinFallbackHelper
                 _logger.LogWarning(ex, "{Service} request failed with endpoint {Endpoint}, trying next...",
                     _serviceName, baseUrl);
 
-                if (attempt == _apiUrls.Count - 1)
+                // Mark as unhealthy in cache
+                lock (_healthCacheLock)
+                {
+                    _healthCache[baseUrl] = (false, DateTime.UtcNow);
+                }
+
+                if (attempt == endpointsToTry.Count - 1)
                 {
                     _logger.LogError("All {Count} {Service} endpoints failed, returning default value",
-                        _apiUrls.Count, _serviceName);
+                        endpointsToTry.Count, _serviceName);
                     return defaultValue;
                 }
             }