allstarr/allstarr/Services/Common/RoundRobinFallbackHelper.cs
Josh Patra 972756159d feat: add quick health checks before trying endpoints
- Health checks run in parallel with a 3-second timeout
- Results cached for 30 seconds to avoid excessive checks
- Healthy endpoints tried first, unhealthy ones as fallback
- Prevents wasting time on dead endpoints (no more 5-minute waits)
- Failed requests mark endpoint as unhealthy in cache
- Significantly improves response time when some endpoints are down
2026-02-08 00:05:27 -05:00

namespace allstarr.Services.Common;

/// <summary>
/// Helper for round-robin load balancing with fallback across multiple API endpoints.
/// Distributes load evenly while maintaining reliability through automatic failover.
/// </summary>
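/// <example>
/// A minimal usage sketch; the endpoint URLs, <c>logger</c>, and <c>httpClient</c> below are illustrative, not part of this class:
/// <code>
/// var urls = new[] { "https://api-1.example.com", "https://api-2.example.com" }.ToList();
/// var helper = new RoundRobinFallbackHelper(urls, logger, "ExampleService");
/// var json = await helper.TryWithFallbackAsync(baseUrl => httpClient.GetStringAsync($"{baseUrl}/v1/items"));
/// </code>
/// </example>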
public class RoundRobinFallbackHelper
{
    private readonly List<string> _apiUrls;
    private int _currentUrlIndex = 0;
    private readonly object _urlIndexLock = new object();
    private readonly ILogger _logger;
    private readonly string _serviceName;
    private readonly HttpClient _healthCheckClient;

    // Cache health check results for 30 seconds to avoid excessive checks
    private readonly Dictionary<string, (bool isHealthy, DateTime checkedAt)> _healthCache = new();
    private readonly object _healthCacheLock = new object();
    private readonly TimeSpan _healthCacheExpiry = TimeSpan.FromSeconds(30);

    public int EndpointCount => _apiUrls.Count;

    public RoundRobinFallbackHelper(List<string> apiUrls, ILogger logger, string serviceName)
    {
        _apiUrls = apiUrls ?? throw new ArgumentNullException(nameof(apiUrls));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _serviceName = serviceName ?? "Service";

        if (_apiUrls.Count == 0)
        {
            throw new ArgumentException("API URLs list cannot be empty", nameof(apiUrls));
        }

        // Create a dedicated HttpClient for health checks with short timeout
        _healthCheckClient = new HttpClient
        {
            Timeout = TimeSpan.FromSeconds(3) // Quick health check timeout
        };
    }

    /// <summary>
    /// Quickly checks if an endpoint is healthy (responds within 3 seconds).
    /// Results are cached for 30 seconds to avoid excessive health checks.
    /// </summary>
    private async Task<bool> IsEndpointHealthyAsync(string baseUrl)
    {
        // Check cache first
        lock (_healthCacheLock)
        {
            if (_healthCache.TryGetValue(baseUrl, out var cached))
            {
                if (DateTime.UtcNow - cached.checkedAt < _healthCacheExpiry)
                {
                    return cached.isHealthy;
                }
            }
        }

        // Perform health check
        try
        {
            // Dispose the response so the connection is returned to the pool
            using var response = await _healthCheckClient.GetAsync(baseUrl, HttpCompletionOption.ResponseHeadersRead);
            var isHealthy = response.IsSuccessStatusCode;

            // Cache result
            lock (_healthCacheLock)
            {
                _healthCache[baseUrl] = (isHealthy, DateTime.UtcNow);
            }

            if (!isHealthy)
            {
                _logger.LogDebug("{Service} endpoint {Endpoint} health check failed: {StatusCode}",
                    _serviceName, baseUrl, response.StatusCode);
            }

            return isHealthy;
        }
        catch (Exception ex)
        {
            _logger.LogDebug(ex, "{Service} endpoint {Endpoint} health check failed", _serviceName, baseUrl);

            // Cache as unhealthy
            lock (_healthCacheLock)
            {
                _healthCache[baseUrl] = (false, DateTime.UtcNow);
            }

            return false;
        }
    }

    /// <summary>
    /// Gets a list of healthy endpoints, checking them in parallel.
    /// Falls back to all endpoints if none are healthy.
    /// </summary>
    private async Task<List<string>> GetHealthyEndpointsAsync()
    {
        var healthCheckTasks = _apiUrls.Select(async url => new
        {
            Url = url,
            IsHealthy = await IsEndpointHealthyAsync(url)
        }).ToList();

        var results = await Task.WhenAll(healthCheckTasks);
        var healthyEndpoints = results.Where(r => r.IsHealthy).Select(r => r.Url).ToList();

        if (healthyEndpoints.Count == 0)
        {
            _logger.LogWarning("{Service} health check: no healthy endpoints found, will try all", _serviceName);
            // Return a snapshot so callers are not affected if the list is reordered concurrently
            return _apiUrls.ToList();
        }

        _logger.LogDebug("{Service} health check: {Healthy}/{Total} endpoints healthy",
            _serviceName, healthyEndpoints.Count, _apiUrls.Count);
        return healthyEndpoints;
    }

    /// <summary>
    /// Updates the endpoint order based on benchmark results (fastest first).
    /// </summary>
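    /// <example>
    /// Sketch; <c>benchmarked</c> is an illustrative list of the same URLs ordered fastest-first:
    /// <code>
    /// helper.SetEndpointOrder(benchmarked);
    /// </code>
    /// </example>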
    public void SetEndpointOrder(List<string> orderedEndpoints)
    {
        lock (_urlIndexLock)
        {
            // Reorder _apiUrls to match the benchmarked order
            var reordered = orderedEndpoints.Where(e => _apiUrls.Contains(e)).ToList();

            // Add any endpoints that weren't benchmarked (shouldn't happen, but be safe)
            foreach (var url in _apiUrls.Where(u => !reordered.Contains(u)))
            {
                reordered.Add(url);
            }

            _apiUrls.Clear();
            _apiUrls.AddRange(reordered);
            _currentUrlIndex = 0;

            _logger.LogInformation("📊 {Service} endpoints reordered by benchmark: {Endpoints}",
                _serviceName, string.Join(", ", _apiUrls.Take(3)));
        }
    }

    /// <summary>
    /// Tries the request with the next provider in round-robin, then falls back to others on failure.
    /// This distributes load evenly across all providers while maintaining reliability.
    /// Performs quick health checks first to avoid wasting time on dead endpoints.
    /// Throws an exception if all endpoints fail.
    /// </summary>
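    /// <example>
    /// A minimal usage sketch; <c>httpClient</c> and the <c>/v1/status</c> path are illustrative, not part of this class:
    /// <code>
    /// var status = await helper.TryWithFallbackAsync(baseUrl => httpClient.GetStringAsync($"{baseUrl}/v1/status"));
    /// </code>
    /// </example>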
    public async Task<T> TryWithFallbackAsync<T>(Func<string, Task<T>> action)
    {
        // Get healthy endpoints first (with caching to avoid excessive checks)
        var healthyEndpoints = await GetHealthyEndpointsAsync();

        // Start with the next URL in round-robin to distribute load
        var startIndex = 0;
        lock (_urlIndexLock)
        {
            startIndex = _currentUrlIndex;
            _currentUrlIndex = (_currentUrlIndex + 1) % _apiUrls.Count;
        }

        // Try healthy endpoints first, then fall back to all if needed
        var endpointsToTry = healthyEndpoints.Count < _apiUrls.Count
            ? healthyEndpoints.Concat(_apiUrls.Except(healthyEndpoints)).ToList()
            : healthyEndpoints;

        // Try all URLs starting from the round-robin selected one
        for (int attempt = 0; attempt < endpointsToTry.Count; attempt++)
        {
            var urlIndex = (startIndex + attempt) % endpointsToTry.Count;
            var baseUrl = endpointsToTry[urlIndex];

            try
            {
                _logger.LogDebug("Trying {Service} endpoint {Endpoint} (attempt {Attempt}/{Total})",
                    _serviceName, baseUrl, attempt + 1, endpointsToTry.Count);
                return await action(baseUrl);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "{Service} request failed with endpoint {Endpoint}, trying next...",
                    _serviceName, baseUrl);

                // Mark as unhealthy in cache
                lock (_healthCacheLock)
                {
                    _healthCache[baseUrl] = (false, DateTime.UtcNow);
                }

                if (attempt == endpointsToTry.Count - 1)
                {
                    _logger.LogError("All {Count} {Service} endpoints failed", endpointsToTry.Count, _serviceName);
                    throw;
                }
            }
        }

        throw new Exception($"All {_serviceName} endpoints failed");
    }

    /// <summary>
    /// Races all endpoints in parallel and returns the first successful result.
    /// Cancels remaining requests once one succeeds. Great for latency-sensitive operations.
    /// </summary>
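    /// <example>
    /// A minimal usage sketch; <c>httpClient</c>, <c>cancellationToken</c>, and the <c>/v1/search</c> path are illustrative, not part of this class:
    /// <code>
    /// var result = await helper.RaceAllEndpointsAsync(
    ///     (baseUrl, ct) => httpClient.GetStringAsync($"{baseUrl}/v1/search?q=test", ct),
    ///     cancellationToken);
    /// </code>
    /// </example>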
    public async Task<T> RaceAllEndpointsAsync<T>(Func<string, CancellationToken, Task<T>> action, CancellationToken cancellationToken = default)
    {
        if (_apiUrls.Count == 1)
        {
            // No point racing with one endpoint
            return await action(_apiUrls[0], cancellationToken);
        }

        using var raceCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        var tasks = new List<Task<(T result, string endpoint, bool success)>>();

        // Start all requests in parallel
        foreach (var baseUrl in _apiUrls)
        {
            var task = Task.Run(async () =>
            {
                try
                {
                    _logger.LogDebug("Racing {Service} endpoint {Endpoint}", _serviceName, baseUrl);
                    var result = await action(baseUrl, raceCts.Token);
                    return (result, baseUrl, true);
                }
                catch (Exception ex)
                {
                    _logger.LogDebug(ex, "{Service} race failed for endpoint {Endpoint}", _serviceName, baseUrl);
                    return (default(T)!, baseUrl, false);
                }
            }, raceCts.Token);
            tasks.Add(task);
        }

        // Wait for first successful completion
        while (tasks.Count > 0)
        {
            var completedTask = await Task.WhenAny(tasks);
            var (result, endpoint, success) = await completedTask;

            if (success)
            {
                _logger.LogInformation("🏁 {Service} race won by {Endpoint}, canceling others", _serviceName, endpoint);
                raceCts.Cancel(); // Cancel all other requests
                return result;
            }

            tasks.Remove(completedTask);
        }

        throw new Exception($"All {_serviceName} endpoints failed in race");
    }

    /// <summary>
    /// Tries the request with the next provider in round-robin, then falls back to others on failure.
    /// Performs quick health checks first to avoid wasting time on dead endpoints.
    /// Returns the provided default value if all endpoints fail (does not throw).
    /// </summary>
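    /// <example>
    /// A minimal usage sketch; <c>httpClient</c> and the <c>/v1/ping</c> path are illustrative. An empty string is returned instead of an exception when every endpoint fails:
    /// <code>
    /// var body = await helper.TryWithFallbackAsync(baseUrl => httpClient.GetStringAsync($"{baseUrl}/v1/ping"), string.Empty);
    /// </code>
    /// </example>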
    public async Task<T> TryWithFallbackAsync<T>(Func<string, Task<T>> action, T defaultValue)
    {
        // Get healthy endpoints first (with caching to avoid excessive checks)
        var healthyEndpoints = await GetHealthyEndpointsAsync();

        // Start with the next URL in round-robin to distribute load
        var startIndex = 0;
        lock (_urlIndexLock)
        {
            startIndex = _currentUrlIndex;
            _currentUrlIndex = (_currentUrlIndex + 1) % _apiUrls.Count;
        }

        // Try healthy endpoints first, then fall back to all if needed
        var endpointsToTry = healthyEndpoints.Count < _apiUrls.Count
            ? healthyEndpoints.Concat(_apiUrls.Except(healthyEndpoints)).ToList()
            : healthyEndpoints;

        // Try all URLs starting from the round-robin selected one
        for (int attempt = 0; attempt < endpointsToTry.Count; attempt++)
        {
            var urlIndex = (startIndex + attempt) % endpointsToTry.Count;
            var baseUrl = endpointsToTry[urlIndex];

            try
            {
                _logger.LogDebug("Trying {Service} endpoint {Endpoint} (attempt {Attempt}/{Total})",
                    _serviceName, baseUrl, attempt + 1, endpointsToTry.Count);
                return await action(baseUrl);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "{Service} request failed with endpoint {Endpoint}, trying next...",
                    _serviceName, baseUrl);

                // Mark as unhealthy in cache
                lock (_healthCacheLock)
                {
                    _healthCache[baseUrl] = (false, DateTime.UtcNow);
                }

                if (attempt == endpointsToTry.Count - 1)
                {
                    _logger.LogError("All {Count} {Service} endpoints failed, returning default value",
                        endpointsToTry.Count, _serviceName);
                    return defaultValue;
                }
            }
        }

        return defaultValue;
    }
}