468 lines
12 KiB
C#
Executable File
468 lines
12 KiB
C#
Executable File
using Microsoft.Extensions.Logging;
|
|
using MarketAlly.AIPlugin.Context.Configuration;
|
|
using System.Diagnostics;
|
|
|
|
namespace MarketAlly.AIPlugin.Context.Monitoring
|
|
{
|
|
/// <summary>
|
|
/// Provides health checks for context storage and operations
|
|
/// </summary>
|
|
public class HealthCheckService
|
|
{
|
|
private readonly ContextConfiguration _configuration;
private readonly ILogger<HealthCheckService> _logger;

// Stays null when periodic health checks are disabled in configuration,
// so the field must be declared nullable.
private readonly Timer? _healthCheckTimer;

// Result of the most recent completed check; read and written under _healthLock.
private HealthStatus _lastHealthStatus;
private readonly object _healthLock = new();

/// <summary>
/// Raised when the overall healthy/unhealthy flag of a completed check differs
/// from the previously cached flag.
/// </summary>
public event EventHandler<HealthStatusChangedEventArgs>? HealthStatusChanged;

/// <summary>
/// Creates the service and, when <c>Monitoring.EnableHealthChecks</c> is set,
/// starts a timer that runs the health check every
/// <c>Monitoring.HealthCheckIntervalSeconds</c> seconds.
/// </summary>
/// <param name="configuration">Context configuration (storage path, monitoring, retention, search settings).</param>
/// <param name="logger">Logger used for check results and failures.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="configuration"/> or <paramref name="logger"/> is null.
/// </exception>
public HealthCheckService(ContextConfiguration configuration, ILogger<HealthCheckService> logger)
{
    _configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    // Start from an optimistic "healthy" state until the first real check completes.
    _lastHealthStatus = new HealthStatus { IsHealthy = true, CheckTime = DateTime.UtcNow };

    if (_configuration.Monitoring.EnableHealthChecks)
    {
        var interval = TimeSpan.FromSeconds(_configuration.Monitoring.HealthCheckIntervalSeconds);

        // A due time of zero schedules the first check immediately, so handlers
        // attached to HealthStatusChanged after construction may miss its event.
        _healthCheckTimer = new Timer(PerformHealthCheck, null, TimeSpan.Zero, interval);
    }
}
|
|
|
|
/// <summary>
/// Runs every component check (storage, memory, disk space, file system
/// permissions, configuration, operations) in that order, caches the result and
/// raises <see cref="HealthStatusChanged"/> when the overall flag changes.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the asynchronous component checks.</param>
/// <returns>
/// A status whose <c>IsHealthy</c> is true only when every component detail is
/// healthy. If the check itself throws, an unhealthy status carrying the
/// exception message is returned instead of propagating the exception.
/// </returns>
public async Task<HealthStatus> CheckHealthAsync(CancellationToken cancellationToken = default)
{
    var stopwatch = Stopwatch.StartNew();
    var healthStatus = new HealthStatus
    {
        CheckTime = DateTime.UtcNow,
        IsHealthy = true,
        Details = new List<HealthCheckDetail>()
    };

    try
    {
        // Check storage accessibility
        await CheckStorageHealthAsync(healthStatus, cancellationToken);

        // Check memory usage
        CheckMemoryHealth(healthStatus);

        // Check disk space
        await CheckDiskSpaceAsync(healthStatus, cancellationToken);

        // Check file system permissions
        await CheckFileSystemPermissionsAsync(healthStatus, cancellationToken);

        // Check configuration validity
        CheckConfigurationHealth(healthStatus);

        // Check for stuck operations (if we had a way to track them)
        CheckOperationalHealth(healthStatus);

        stopwatch.Stop();
        healthStatus.CheckDurationMs = stopwatch.ElapsedMilliseconds;

        // Overall health is the conjunction of all component results.
        healthStatus.IsHealthy = healthStatus.Details.All(d => d.IsHealthy);

        PublishHealthStatus(healthStatus);

        _logger.LogInformation("Health check completed in {Duration}ms - Status: {Status}",
            stopwatch.ElapsedMilliseconds, healthStatus.IsHealthy ? "Healthy" : "Unhealthy");

        return healthStatus;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Health check failed with exception");

        stopwatch.Stop();
        var failedStatus = new HealthStatus
        {
            CheckTime = DateTime.UtcNow,
            CheckDurationMs = stopwatch.ElapsedMilliseconds,
            IsHealthy = false,
            Error = ex.Message,
            Details = new List<HealthCheckDetail>
            {
                new HealthCheckDetail
                {
                    Component = "HealthCheck",
                    IsHealthy = false,
                    Message = $"Health check failed: {ex.Message}",
                    CheckTime = DateTime.UtcNow
                }
            }
        };

        // A failed check is also the latest known state: cache it so that
        // GetLastHealthStatus does not keep reporting a stale healthy result.
        PublishHealthStatus(failedStatus);

        return failedStatus;
    }
}

/// <summary>
/// Stores <paramref name="current"/> as the cached status and raises
/// <see cref="HealthStatusChanged"/> if the overall flag flipped.
/// </summary>
private void PublishHealthStatus(HealthStatus current)
{
    bool wasHealthy;
    lock (_healthLock)
    {
        wasHealthy = _lastHealthStatus.IsHealthy;
        _lastHealthStatus = current;
    }

    if (wasHealthy == current.IsHealthy)
    {
        return;
    }

    // Handlers run outside the lock so a slow or re-entrant subscriber cannot
    // block other threads that read or update the cached status.
    try
    {
        HealthStatusChanged?.Invoke(this, new HealthStatusChangedEventArgs
        {
            PreviousStatus = wasHealthy,
            CurrentStatus = current.IsHealthy,
            Details = current
        });
    }
    catch (Exception handlerEx)
    {
        // A faulty subscriber must not turn a completed check into a failed one.
        _logger.LogError(handlerEx, "HealthStatusChanged handler threw an exception");
    }
}
|
|
|
|
/// <summary>
/// Returns the cached result of the most recent health check without running
/// any checks.
/// </summary>
/// <returns>The status object currently stored as the last known result.</returns>
public HealthStatus GetLastHealthStatus()
{
    HealthStatus snapshot;

    // Read the reference under the same lock that guards its updates.
    lock (_healthLock)
    {
        snapshot = _lastHealthStatus;
    }

    return snapshot;
}
|
|
|
|
/// <summary>
/// Probes the storage location by writing, reading back and deleting a
/// uniquely named temporary file under <c>.health-check</c> inside the
/// configured storage path, then appends the outcome to
/// <paramref name="healthStatus"/>.
/// </summary>
/// <param name="healthStatus">Status object that receives the "Storage" detail.</param>
/// <param name="cancellationToken">Token passed to the file write and read operations.</param>
private async Task CheckStorageHealthAsync(HealthStatus healthStatus, CancellationToken cancellationToken)
{
    var detail = new HealthCheckDetail
    {
        Component = "Storage",
        CheckTime = DateTime.UtcNow
    };

    // Non-null while a probe file may still exist on disk.
    string? testFilePath = null;

    try
    {
        var testStoragePath = Path.Combine(_configuration.StoragePath, ".health-check");

        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(testStoragePath);

        // Test write operation
        testFilePath = Path.Combine(testStoragePath, $"health-{Guid.NewGuid():N}.tmp");
        var testContent = $"Health check at {DateTime.UtcNow:O}";

        await File.WriteAllTextAsync(testFilePath, testContent, cancellationToken);

        // Test read operation
        var readContent = await File.ReadAllTextAsync(testFilePath, cancellationToken);

        if (!string.Equals(readContent, testContent, StringComparison.Ordinal))
        {
            throw new InvalidDataException("Read content doesn't match written content");
        }

        // Deleting is part of the probe: a failure here still marks storage unhealthy.
        File.Delete(testFilePath);
        testFilePath = null;

        detail.IsHealthy = true;
        detail.Message = "Storage is accessible and functional";
    }
    catch (Exception ex)
    {
        detail.IsHealthy = false;
        detail.Message = $"Storage check failed: {ex.Message}";
        detail.Error = ex.Message;
    }
    finally
    {
        // Best-effort cleanup so failed or cancelled probes do not accumulate
        // temp files; the check result has already been decided above.
        if (testFilePath is not null)
        {
            try
            {
                File.Delete(testFilePath);
            }
            catch (Exception cleanupEx)
            {
                _logger.LogDebug(cleanupEx, "Could not remove health check probe file {Path}", testFilePath);
            }
        }
    }

    healthStatus.Details.Add(detail);
}
|
|
|
|
/// <summary>
/// Compares the managed heap size reported by <c>GC.GetTotalMemory(false)</c>
/// against a fixed 500 MB limit and appends a "Memory" detail to
/// <paramref name="healthStatus"/>.
/// </summary>
/// <param name="healthStatus">Status object that receives the detail.</param>
private void CheckMemoryHealth(HealthStatus healthStatus)
{
    // Limit in megabytes; hard-coded here rather than read from configuration.
    const int limitMegabytes = 500;
    const double bytesPerMegabyte = 1024.0 * 1024.0;

    var detail = new HealthCheckDetail { Component = "Memory", CheckTime = DateTime.UtcNow };

    try
    {
        long managedBytes = GC.GetTotalMemory(false);
        double managedMegabytes = managedBytes / bytesPerMegabyte;
        bool withinLimit = managedMegabytes < limitMegabytes;

        var usage = $"Memory usage: {managedMegabytes:F1} MB";

        detail.IsHealthy = withinLimit;
        detail.Message = withinLimit
            ? usage
            : usage + $" (exceeds threshold of {limitMegabytes} MB)";

        var metadata = new Dictionary<string, object>();
        metadata["memory_bytes"] = managedBytes;
        metadata["memory_mb"] = managedMegabytes;
        metadata["threshold_mb"] = limitMegabytes;
        detail.Metadata = metadata;
    }
    catch (Exception ex)
    {
        detail.IsHealthy = false;
        detail.Message = $"Memory check failed: {ex.Message}";
        detail.Error = ex.Message;
    }

    healthStatus.Details.Add(detail);
}
|
|
|
|
/// <summary>
/// Reports how full the drive hosting the configured storage path is and
/// appends a "DiskSpace" detail to <paramref name="healthStatus"/>. Usage of
/// 90% or more is reported as unhealthy.
/// </summary>
/// <param name="healthStatus">Status object that receives the detail.</param>
/// <param name="cancellationToken">
/// Not used: the drive query is synchronous. Kept so the signature matches the
/// other asynchronous checks.
/// </param>
/// <returns>An already-completed task.</returns>
private Task CheckDiskSpaceAsync(HealthStatus healthStatus, CancellationToken cancellationToken)
{
    const double bytesPerGigabyte = 1024.0 * 1024.0 * 1024.0;

    var detail = new HealthCheckDetail
    {
        Component = "DiskSpace",
        CheckTime = DateTime.UtcNow
    };

    try
    {
        var storagePath = Path.GetFullPath(_configuration.StoragePath);
        var root = Path.GetPathRoot(storagePath);
        if (string.IsNullOrEmpty(root))
        {
            throw new InvalidOperationException($"Cannot determine the drive for storage path '{storagePath}'");
        }

        var driveInfo = new DriveInfo(root);

        var totalBytes = driveInfo.TotalSize;
        if (totalBytes <= 0)
        {
            // Avoids a 0/0 division that would otherwise surface as "NaN%".
            throw new InvalidOperationException($"Drive '{driveInfo.Name}' reports no capacity");
        }

        var availableSpaceGB = driveInfo.AvailableFreeSpace / bytesPerGigabyte;
        var totalSpaceGB = totalBytes / bytesPerGigabyte;
        var usedPercentage = ((totalSpaceGB - availableSpaceGB) / totalSpaceGB) * 100;

        // Warn if disk usage is at or above 90%
        var diskUsageThreshold = 90.0;

        detail.IsHealthy = usedPercentage < diskUsageThreshold;
        detail.Message = $"Disk usage: {usedPercentage:F1}% ({availableSpaceGB:F1} GB available)";

        if (!detail.IsHealthy)
        {
            detail.Message += $" (exceeds threshold of {diskUsageThreshold}%)";
        }

        detail.Metadata = new Dictionary<string, object>
        {
            ["available_space_gb"] = availableSpaceGB,
            ["total_space_gb"] = totalSpaceGB,
            ["used_percentage"] = usedPercentage,
            ["threshold_percentage"] = diskUsageThreshold
        };
    }
    catch (Exception ex)
    {
        detail.IsHealthy = false;
        detail.Message = $"Disk space check failed: {ex.Message}";
        detail.Error = ex.Message;
    }

    healthStatus.Details.Add(detail);

    // Nothing here is awaited, so return a completed task instead of paying for
    // an async state machine.
    return Task.CompletedTask;
}
|
|
|
|
/// <summary>
/// Verifies that the process can create a directory, write a file, read it
/// back and delete both under the configured storage path, then appends a
/// "Permissions" detail to <paramref name="healthStatus"/>.
/// </summary>
/// <param name="healthStatus">Status object that receives the detail.</param>
/// <param name="cancellationToken">Token passed to the file write and read operations.</param>
private async Task CheckFileSystemPermissionsAsync(HealthStatus healthStatus, CancellationToken cancellationToken)
{
    var detail = new HealthCheckDetail
    {
        Component = "Permissions",
        CheckTime = DateTime.UtcNow
    };

    // Non-null while the probe directory may still exist on disk.
    string? testDir = null;

    try
    {
        var storagePath = _configuration.StoragePath;

        // Check if we can create directories
        testDir = Path.Combine(storagePath, $".perm-test-{Guid.NewGuid():N}");
        Directory.CreateDirectory(testDir);

        // Check if we can create and write files
        var testFile = Path.Combine(testDir, "test.txt");
        await File.WriteAllTextAsync(testFile, "permission test", cancellationToken);

        // Check if we can read files (the content itself is irrelevant)
        _ = await File.ReadAllTextAsync(testFile, cancellationToken);

        // Check if we can delete files and directories
        File.Delete(testFile);
        Directory.Delete(testDir);
        testDir = null;

        detail.IsHealthy = true;
        detail.Message = "File system permissions are correct";
    }
    catch (Exception ex)
    {
        detail.IsHealthy = false;
        detail.Message = $"Permission check failed: {ex.Message}";
        detail.Error = ex.Message;
    }
    finally
    {
        // Best-effort cleanup so a failed or cancelled probe does not leave
        // ".perm-test-*" directories behind in the storage path.
        if (testDir is not null)
        {
            try
            {
                if (Directory.Exists(testDir))
                {
                    Directory.Delete(testDir, recursive: true);
                }
            }
            catch (Exception cleanupEx)
            {
                _logger.LogDebug(cleanupEx, "Could not remove permission probe directory {Path}", testDir);
            }
        }
    }

    healthStatus.Details.Add(detail);
}
|
|
|
|
/// <summary>
/// Validates the storage path, retention, performance and search settings and
/// appends a "Configuration" detail listing every problem found to
/// <paramref name="healthStatus"/>.
/// </summary>
/// <param name="healthStatus">Status object that receives the detail.</param>
private void CheckConfigurationHealth(HealthStatus healthStatus)
{
    var detail = new HealthCheckDetail { Component = "Configuration", CheckTime = DateTime.UtcNow };

    try
    {
        var problems = new List<string>();

        // Records a problem description when its rule is violated.
        void ReportIf(bool violated, string description)
        {
            if (violated)
            {
                problems.Add(description);
            }
        }

        // Storage
        ReportIf(string.IsNullOrEmpty(_configuration.StoragePath),
            "Storage path is not configured");

        // Retention
        ReportIf(_configuration.Retention.RetentionDays <= 0,
            "Invalid retention days setting");
        ReportIf(_configuration.Retention.MaxEntriesPerFile <= 0,
            "Invalid max entries per file setting");

        // Performance
        ReportIf(_configuration.Performance.MaxConcurrentOperations <= 0,
            "Invalid max concurrent operations setting");

        // Search: semantic search needs an API key
        ReportIf(_configuration.Search.EnableSemanticSearch && string.IsNullOrEmpty(_configuration.Search.OpenAIApiKey),
            "Semantic search enabled but API key not configured");

        if (problems.Count == 0)
        {
            detail.IsHealthy = true;
            detail.Message = "Configuration is valid";
        }
        else
        {
            detail.IsHealthy = false;
            detail.Message = $"Configuration issues: {string.Join(", ", problems)}";
            detail.Metadata = new Dictionary<string, object> { ["issues"] = problems };
        }
    }
    catch (Exception ex)
    {
        detail.IsHealthy = false;
        detail.Message = $"Configuration check failed: {ex.Message}";
        detail.Error = ex.Message;
    }

    healthStatus.Details.Add(detail);
}
|
|
|
|
/// <summary>
/// Placeholder operational check: always appends a healthy "Operations"
/// detail to <paramref name="healthStatus"/>.
/// </summary>
/// <param name="healthStatus">Status object that receives the detail.</param>
private void CheckOperationalHealth(HealthStatus healthStatus)
{
    // In a full implementation, you might check for:
    // - Long-running operations
    // - Failed operations count
    // - Queue sizes
    // - Cache hit ratios
    // etc.
    //
    // Nothing below can throw today; wrap real probes in try/catch (as the
    // other checks do) once they are added.
    healthStatus.Details.Add(new HealthCheckDetail
    {
        Component = "Operations",
        CheckTime = DateTime.UtcNow,
        IsHealthy = true,
        Message = "No operational issues detected"
    });
}
|
|
|
|
// 0 = idle, 1 = a timer-driven check is currently in flight.
private int _periodicCheckRunning;

/// <summary>
/// Timer callback for periodic health checks. Starts a check unless the
/// previous timer-driven check is still running.
/// </summary>
/// <param name="state">Unused timer state object.</param>
private void PerformHealthCheck(object? state)
{
    // System.Threading.Timer does not wait for a callback to finish before
    // firing the next tick, so a slow check (e.g. a hung disk) would otherwise
    // stack up overlapping runs.
    if (Interlocked.CompareExchange(ref _periodicCheckRunning, 1, 0) != 0)
    {
        _logger.LogDebug("Skipping periodic health check because the previous one is still running");
        return;
    }

    // Fire and forget: the helper observes and logs its own exceptions.
    _ = RunPeriodicHealthCheckAsync();
}

/// <summary>
/// Runs one health check for the timer, logs any failure and releases the
/// in-flight flag when done.
/// </summary>
private async Task RunPeriodicHealthCheckAsync()
{
    try
    {
        await CheckHealthAsync();
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Periodic health check failed");
    }
    finally
    {
        Interlocked.Exchange(ref _periodicCheckRunning, 0);
    }
}
|
|
|
|
/// <summary>
/// Stops periodic health checks by disposing the timer, if one was created.
/// </summary>
// NOTE(review): the visible class declaration does not list IDisposable, so
// `using` statements and DI containers will not call this automatically - confirm.
public void Dispose()
{
    if (_healthCheckTimer is { } timer)
    {
        timer.Dispose();
    }
}
|
|
}
|
|
|
|
/// <summary>
/// Aggregated result of a single health-check run.
/// </summary>
public class HealthStatus
{
    /// <summary>Time at which the check was started or the status was created.</summary>
    public DateTime CheckTime { get; set; }

    /// <summary>Elapsed time of the check in milliseconds.</summary>
    public long CheckDurationMs { get; set; }

    /// <summary>True when every component detail is healthy.</summary>
    public bool IsHealthy { get; set; }

    /// <summary>Exception message when the check itself failed; otherwise null.</summary>
    public string? Error { get; set; }

    /// <summary>Per-component results; starts as an empty list.</summary>
    public List<HealthCheckDetail> Details { get; set; } = new List<HealthCheckDetail>();
}
|
|
|
|
/// <summary>
/// Result of checking one component (for example storage or memory).
/// </summary>
public class HealthCheckDetail
{
    /// <summary>Name of the component that was checked; defaults to an empty string.</summary>
    public string Component { get; set; } = string.Empty;

    /// <summary>Time at which this component check was started.</summary>
    public DateTime CheckTime { get; set; }

    /// <summary>True when the component passed its check.</summary>
    public bool IsHealthy { get; set; }

    /// <summary>Human-readable outcome; defaults to an empty string.</summary>
    public string Message { get; set; } = string.Empty;

    /// <summary>Exception message when the check threw; otherwise null.</summary>
    public string? Error { get; set; }

    /// <summary>Optional measurements or findings keyed by name; null when none were recorded.</summary>
    public Dictionary<string, object>? Metadata { get; set; }
}
|
|
|
|
/// <summary>
/// Payload of the event raised when the overall health flag changes.
/// </summary>
public class HealthStatusChangedEventArgs : EventArgs
{
    /// <summary>Overall healthy flag before the change.</summary>
    public bool PreviousStatus { get; set; }

    /// <summary>Overall healthy flag after the change.</summary>
    public bool CurrentStatus { get; set; }

    /// <summary>Full status that triggered the change; starts as an empty status.</summary>
    public HealthStatus Details { get; set; } = new HealthStatus();
}
|
|
} |