feat: add quick health checks before trying endpoints

- Health checks run in parallel with 3 second timeout
- Results cached for 30 seconds to avoid excessive checks
- Healthy endpoints tried first, unhealthy ones as fallback
- Prevents wasting time on dead endpoints (no more 5 min waits)
- Failed requests mark endpoint as unhealthy in cache
- Significantly improves response time when some endpoints are down
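
For context, a minimal sketch of how a caller might use this helper after the change. The constructor shape, endpoint URLs, and request path below are assumptions inferred from the diff, not part of this commit:

using System.Collections.Generic;
using System.Net.Http;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;

public static class FallbackUsageExample
{
    private static readonly HttpClient Client = new HttpClient();

    // Hypothetical caller; RoundRobinFallbackHelper's constructor is assumed to take
    // the URL list, a logger, and a service name, based on the fields seen in the diff.
    public static async Task<string> FetchStatusAsync(ILogger logger)
    {
        var helper = new RoundRobinFallbackHelper(
            new List<string> { "https://api-1.example.com", "https://api-2.example.com" },
            logger,
            "ExampleService");

        // Healthy endpoints (probed in parallel with a 3-second timeout, results cached
        // for 30 seconds) are tried first; unhealthy ones are kept as a last-resort fallback.
        return await helper.TryWithFallbackAsync(async baseUrl =>
        {
            using var response = await Client.GetAsync($"{baseUrl}/status");
            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        });
    }
}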
Date:   2026-02-08 00:05:27 -05:00
Parent: f59f265ad4
Commit: 972756159d


@@ -11,6 +11,12 @@ public class RoundRobinFallbackHelper
     private readonly object _urlIndexLock = new object();
     private readonly ILogger _logger;
     private readonly string _serviceName;
+    private readonly HttpClient _healthCheckClient;
+
+    // Cache health check results for 30 seconds to avoid excessive checks
+    private readonly Dictionary<string, (bool isHealthy, DateTime checkedAt)> _healthCache = new();
+    private readonly object _healthCacheLock = new object();
+    private readonly TimeSpan _healthCacheExpiry = TimeSpan.FromSeconds(30);
 
     public int EndpointCount => _apiUrls.Count;
@@ -24,6 +30,91 @@ public class RoundRobinFallbackHelper
         {
             throw new ArgumentException("API URLs list cannot be empty", nameof(apiUrls));
         }
+
+        // Create a dedicated HttpClient for health checks with short timeout
+        _healthCheckClient = new HttpClient
+        {
+            Timeout = TimeSpan.FromSeconds(3) // Quick health check timeout
+        };
+    }
+
+    /// <summary>
+    /// Quickly checks if an endpoint is healthy (responds within 3 seconds).
+    /// Results are cached for 30 seconds to avoid excessive health checks.
+    /// </summary>
+    private async Task<bool> IsEndpointHealthyAsync(string baseUrl)
+    {
+        // Check cache first
+        lock (_healthCacheLock)
+        {
+            if (_healthCache.TryGetValue(baseUrl, out var cached))
+            {
+                if (DateTime.UtcNow - cached.checkedAt < _healthCacheExpiry)
+                {
+                    return cached.isHealthy;
+                }
+            }
+        }
+
+        // Perform health check
+        try
+        {
+            var response = await _healthCheckClient.GetAsync(baseUrl, HttpCompletionOption.ResponseHeadersRead);
+            var isHealthy = response.IsSuccessStatusCode;
+
+            // Cache result
+            lock (_healthCacheLock)
+            {
+                _healthCache[baseUrl] = (isHealthy, DateTime.UtcNow);
+            }
+
+            if (!isHealthy)
+            {
+                _logger.LogDebug("{Service} endpoint {Endpoint} health check failed: {StatusCode}",
+                    _serviceName, baseUrl, response.StatusCode);
+            }
+
+            return isHealthy;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogDebug(ex, "{Service} endpoint {Endpoint} health check failed", _serviceName, baseUrl);
+
+            // Cache as unhealthy
+            lock (_healthCacheLock)
+            {
+                _healthCache[baseUrl] = (false, DateTime.UtcNow);
+            }
+
+            return false;
+        }
+    }
+
+    /// <summary>
+    /// Gets a list of healthy endpoints, checking them in parallel.
+    /// Falls back to all endpoints if none are healthy.
+    /// </summary>
+    private async Task<List<string>> GetHealthyEndpointsAsync()
+    {
+        var healthCheckTasks = _apiUrls.Select(async url => new
+        {
+            Url = url,
+            IsHealthy = await IsEndpointHealthyAsync(url)
+        }).ToList();
+
+        var results = await Task.WhenAll(healthCheckTasks);
+        var healthyEndpoints = results.Where(r => r.IsHealthy).Select(r => r.Url).ToList();
+
+        if (healthyEndpoints.Count == 0)
+        {
+            _logger.LogWarning("{Service} health check: no healthy endpoints found, will try all", _serviceName);
+            return _apiUrls;
+        }
+
+        _logger.LogDebug("{Service} health check: {Healthy}/{Total} endpoints healthy",
+            _serviceName, healthyEndpoints.Count, _apiUrls.Count);
+
+        return healthyEndpoints;
     }
 
     /// <summary>
@@ -54,10 +145,14 @@ public class RoundRobinFallbackHelper
     /// <summary>
     /// Tries the request with the next provider in round-robin, then falls back to others on failure.
     /// This distributes load evenly across all providers while maintaining reliability.
+    /// Performs quick health checks first to avoid wasting time on dead endpoints.
     /// Throws exception if all endpoints fail.
     /// </summary>
     public async Task<T> TryWithFallbackAsync<T>(Func<string, Task<T>> action)
     {
+        // Get healthy endpoints first (with caching to avoid excessive checks)
+        var healthyEndpoints = await GetHealthyEndpointsAsync();
+
         // Start with the next URL in round-robin to distribute load
         var startIndex = 0;
         lock (_urlIndexLock)
@@ -66,16 +161,21 @@ public class RoundRobinFallbackHelper
             _currentUrlIndex = (_currentUrlIndex + 1) % _apiUrls.Count;
         }
 
+        // Try healthy endpoints first, then fall back to all if needed
+        var endpointsToTry = healthyEndpoints.Count < _apiUrls.Count
+            ? healthyEndpoints.Concat(_apiUrls.Except(healthyEndpoints)).ToList()
+            : healthyEndpoints;
+
         // Try all URLs starting from the round-robin selected one
-        for (int attempt = 0; attempt < _apiUrls.Count; attempt++)
+        for (int attempt = 0; attempt < endpointsToTry.Count; attempt++)
         {
-            var urlIndex = (startIndex + attempt) % _apiUrls.Count;
-            var baseUrl = _apiUrls[urlIndex];
+            var urlIndex = (startIndex + attempt) % endpointsToTry.Count;
+            var baseUrl = endpointsToTry[urlIndex];
 
             try
             {
                 _logger.LogDebug("Trying {Service} endpoint {Endpoint} (attempt {Attempt}/{Total})",
-                    _serviceName, baseUrl, attempt + 1, _apiUrls.Count);
+                    _serviceName, baseUrl, attempt + 1, endpointsToTry.Count);
                 return await action(baseUrl);
             }
             catch (Exception ex)
@@ -83,9 +183,15 @@ public class RoundRobinFallbackHelper
                 _logger.LogWarning(ex, "{Service} request failed with endpoint {Endpoint}, trying next...",
                     _serviceName, baseUrl);
 
-                if (attempt == _apiUrls.Count - 1)
+                // Mark as unhealthy in cache
+                lock (_healthCacheLock)
+                {
+                    _healthCache[baseUrl] = (false, DateTime.UtcNow);
+                }
+
+                if (attempt == endpointsToTry.Count - 1)
                 {
-                    _logger.LogError("All {Count} {Service} endpoints failed", _apiUrls.Count, _serviceName);
+                    _logger.LogError("All {Count} {Service} endpoints failed", endpointsToTry.Count, _serviceName);
                     throw;
                 }
             }
@@ -150,10 +256,14 @@ public class RoundRobinFallbackHelper
     /// <summary>
     /// Tries the request with the next provider in round-robin, then falls back to others on failure.
+    /// Performs quick health checks first to avoid wasting time on dead endpoints.
     /// Returns default value if all endpoints fail (does not throw).
     /// </summary>
     public async Task<T> TryWithFallbackAsync<T>(Func<string, Task<T>> action, T defaultValue)
     {
+        // Get healthy endpoints first (with caching to avoid excessive checks)
+        var healthyEndpoints = await GetHealthyEndpointsAsync();
+
         // Start with the next URL in round-robin to distribute load
         var startIndex = 0;
         lock (_urlIndexLock)
@@ -162,16 +272,21 @@ public class RoundRobinFallbackHelper
             _currentUrlIndex = (_currentUrlIndex + 1) % _apiUrls.Count;
         }
 
+        // Try healthy endpoints first, then fall back to all if needed
+        var endpointsToTry = healthyEndpoints.Count < _apiUrls.Count
+            ? healthyEndpoints.Concat(_apiUrls.Except(healthyEndpoints)).ToList()
+            : healthyEndpoints;
+
         // Try all URLs starting from the round-robin selected one
-        for (int attempt = 0; attempt < _apiUrls.Count; attempt++)
+        for (int attempt = 0; attempt < endpointsToTry.Count; attempt++)
         {
-            var urlIndex = (startIndex + attempt) % _apiUrls.Count;
-            var baseUrl = _apiUrls[urlIndex];
+            var urlIndex = (startIndex + attempt) % endpointsToTry.Count;
+            var baseUrl = endpointsToTry[urlIndex];
 
             try
             {
                 _logger.LogDebug("Trying {Service} endpoint {Endpoint} (attempt {Attempt}/{Total})",
-                    _serviceName, baseUrl, attempt + 1, _apiUrls.Count);
+                    _serviceName, baseUrl, attempt + 1, endpointsToTry.Count);
                 return await action(baseUrl);
             }
             catch (Exception ex)
@@ -179,10 +294,16 @@ public class RoundRobinFallbackHelper
                 _logger.LogWarning(ex, "{Service} request failed with endpoint {Endpoint}, trying next...",
                     _serviceName, baseUrl);
 
-                if (attempt == _apiUrls.Count - 1)
+                // Mark as unhealthy in cache
+                lock (_healthCacheLock)
+                {
+                    _healthCache[baseUrl] = (false, DateTime.UtcNow);
+                }
+
+                if (attempt == endpointsToTry.Count - 1)
                 {
                     _logger.LogError("All {Count} {Service} endpoints failed, returning default value",
-                        _apiUrls.Count, _serviceName);
+                        endpointsToTry.Count, _serviceName);
                     return defaultValue;
                 }
             }
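
Continuing the hypothetical caller sketched under the commit message, the second overload swallows failures instead of throwing: each failed endpoint is marked unhealthy in the 30-second cache, and once every endpoint has failed the caller-supplied default is returned. The request path and default below are illustrative only:

// Non-throwing overload: if every endpoint fails, the error is logged and
// the caller-supplied default comes back instead of an exception.
var version = await helper.TryWithFallbackAsync(
    async baseUrl =>
    {
        using var response = await Client.GetAsync($"{baseUrl}/version");
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    },
    defaultValue: "unknown");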