Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
595 lines
16 KiB
Go
595 lines
16 KiB
Go
/*
|
|
Health Daemon — Continuous health monitoring for Volt workloads.
|
|
|
|
Unlike deploy-time health checks (which verify a single instance during
|
|
deployment), the health daemon runs continuously, monitoring all
|
|
configured workloads and taking action when they become unhealthy.
|
|
|
|
Features:
|
|
- HTTP, TCP, and exec health checks
|
|
- Configurable intervals and thresholds
|
|
- Auto-restart on sustained unhealthy state
|
|
- Health status API for monitoring integrations
|
|
- Event emission for webhook/notification systems
|
|
|
|
Configuration is stored in /etc/volt/health/ as YAML files, one per
|
|
workload.
|
|
|
|
Copyright (c) Armored Gates LLC. All rights reserved.
|
|
*/
|
|
package healthd
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"gopkg.in/yaml.v3"
)
|
|
|
|
// ── Constants ────────────────────────────────────────────────────────────────

const (
	// DefaultHealthDir stores health check configurations, one YAML file
	// per workload (read by LoadConfigs/ListConfigs, written by ConfigureCheck).
	DefaultHealthDir = "/etc/volt/health"

	// DefaultStatusDir stores runtime health status; the daemon persists a
	// statuses.json snapshot here on shutdown (see saveStatuses/LoadStatuses).
	DefaultStatusDir = "/var/lib/volt/health"
)
|
|
|
|
// ── Health Check Config ──────────────────────────────────────────────────────

// CheckType defines the type of health check probe.
type CheckType string

const (
	// CheckHTTP performs an HTTP GET against the workload's resolved IP.
	CheckHTTP CheckType = "http"
	// CheckTCP attempts a TCP connection to the workload's port.
	CheckTCP CheckType = "tcp"
	// CheckExec runs a shell command; a non-zero exit means unhealthy.
	CheckExec CheckType = "exec"
)
|
|
|
|
// Config defines a health check configuration for a workload.
//
// NOTE(review): the time.Duration fields carry yaml tags, but yaml.v3 does
// not decode human-readable duration strings like "30s" into time.Duration
// out of the box (it expects integer nanoseconds) — confirm how these
// fields are populated from the YAML files.
type Config struct {
	Workload string    `yaml:"workload" json:"workload"`
	Type     CheckType `yaml:"type" json:"type"`
	Target   string    `yaml:"target" json:"target"` // URL path for HTTP, port for TCP, command for exec
	Port     int       `yaml:"port,omitempty" json:"port,omitempty"`
	Interval time.Duration `yaml:"interval" json:"interval"` // time between checks
	Timeout  time.Duration `yaml:"timeout" json:"timeout"`   // per-check deadline
	Retries  int           `yaml:"retries" json:"retries"`   // Failures before unhealthy
	AutoRestart  bool          `yaml:"auto_restart" json:"auto_restart"`
	MaxRestarts  int           `yaml:"max_restarts" json:"max_restarts"` // 0 = unlimited
	RestartDelay time.Duration `yaml:"restart_delay" json:"restart_delay"` // minimum gap between restarts
	Enabled      bool          `yaml:"enabled" json:"enabled"` // disabled configs are loaded but not monitored
}
|
|
|
|
// Validate checks that a health config is valid and fills defaults.
|
|
func (c *Config) Validate() error {
|
|
if c.Workload == "" {
|
|
return fmt.Errorf("healthd: workload name required")
|
|
}
|
|
switch c.Type {
|
|
case CheckHTTP:
|
|
if c.Target == "" {
|
|
c.Target = "/healthz"
|
|
}
|
|
if c.Port == 0 {
|
|
c.Port = 8080
|
|
}
|
|
case CheckTCP:
|
|
if c.Port == 0 {
|
|
return fmt.Errorf("healthd: TCP check requires port")
|
|
}
|
|
case CheckExec:
|
|
if c.Target == "" {
|
|
return fmt.Errorf("healthd: exec check requires command")
|
|
}
|
|
default:
|
|
return fmt.Errorf("healthd: unknown check type %q", c.Type)
|
|
}
|
|
|
|
if c.Interval <= 0 {
|
|
c.Interval = 30 * time.Second
|
|
}
|
|
if c.Timeout <= 0 {
|
|
c.Timeout = 5 * time.Second
|
|
}
|
|
if c.Retries <= 0 {
|
|
c.Retries = 3
|
|
}
|
|
if c.RestartDelay <= 0 {
|
|
c.RestartDelay = 10 * time.Second
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ── Health Status ────────────────────────────────────────────────────────────

// Status represents the current health state of a workload.
// A copy of it is returned by GetStatus/GetAllStatuses and persisted to
// statuses.json on shutdown.
type Status struct {
	Workload         string    `json:"workload" yaml:"workload"`
	Healthy          bool      `json:"healthy" yaml:"healthy"`
	LastCheck        time.Time `json:"last_check" yaml:"last_check"`
	LastHealthy      time.Time `json:"last_healthy,omitempty" yaml:"last_healthy,omitempty"`
	ConsecutiveFails int       `json:"consecutive_fails" yaml:"consecutive_fails"` // reset to 0 on success or restart
	TotalChecks      int64     `json:"total_checks" yaml:"total_checks"`
	TotalFails       int64     `json:"total_fails" yaml:"total_fails"`
	RestartCount     int       `json:"restart_count" yaml:"restart_count"` // compared against Config.MaxRestarts
	LastError        string    `json:"last_error,omitempty" yaml:"last_error,omitempty"` // cleared on the next successful check
	LastRestart      time.Time `json:"last_restart,omitempty" yaml:"last_restart,omitempty"`
}
|
|
|
|
// ── IP Resolver ──────────────────────────────────────────────────────────────

// IPResolver maps a workload name to its IP address. It is used by the
// HTTP and TCP checks to locate the workload.
type IPResolver func(workload string) (string, error)
|
|
|
|
// DefaultIPResolver resolves a workload's address via
// "machinectl show <workload> -p Addresses".
//
// It falls back to "127.0.0.1" (with a nil error) whenever machinectl is
// unavailable, the workload is unknown, or no address is listed, so a
// resolution failure degrades to checking localhost rather than failing
// the health check outright.
func DefaultIPResolver(workload string) (string, error) {
	// Use Output (not CombinedOutput) so stderr diagnostics can never be
	// parsed as if they were part of the Addresses line.
	out, err := exec.Command("machinectl", "show", workload, "-p", "Addresses").Output()
	if err != nil {
		return "127.0.0.1", nil // Fallback to localhost
	}
	// Scan all output lines for the Addresses= property rather than
	// assuming it is the only line printed.
	for _, line := range strings.Split(string(out), "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "Addresses=") {
			continue
		}
		// Addresses= lists whitespace-separated IPs; take the first.
		if fields := strings.Fields(strings.TrimPrefix(line, "Addresses=")); len(fields) > 0 {
			return fields[0], nil
		}
	}
	return "127.0.0.1", nil
}
|
|
|
|
// ── Restart Handler ──────────────────────────────────────────────────────────

// RestartFunc defines how to restart a workload. It is invoked by the
// daemon when auto-restart is configured and the workload is unhealthy.
type RestartFunc func(workload string) error
|
|
|
|
// DefaultRestartFunc restarts a workload by restarting its
// volt-container@<name>.service systemd unit.
func DefaultRestartFunc(workload string) error {
	cmd := exec.Command("systemctl", "restart",
		fmt.Sprintf("volt-container@%s.service", workload))
	return cmd.Run()
}
|
|
|
|
// ── Event Handler ────────────────────────────────────────────────────────────

// EventType describes health daemon events.
type EventType string

const (
	// EventHealthy fires when a workload recovers after being unhealthy.
	EventHealthy EventType = "healthy"
	// EventUnhealthy fires when consecutive failures reach the retry threshold.
	EventUnhealthy EventType = "unhealthy"
	// EventRestart fires when an auto-restart is attempted.
	EventRestart EventType = "restart"
	// EventCheckFail fires on every individual failed check.
	EventCheckFail EventType = "check_fail"
)
|
|
|
|
// Event is emitted when health state changes; it is delivered to the
// daemon's EventHandler, if one is set.
type Event struct {
	Type      EventType `json:"type"`
	Workload  string    `json:"workload"`
	Timestamp time.Time `json:"timestamp"`
	Message   string    `json:"message"` // human-readable detail (e.g. the check error)
}
|
|
|
|
// EventHandler is called when health events occur.
//
// NOTE(review): handlers are invoked synchronously from the monitor
// goroutines, so a slow handler delays that workload's checks — confirm
// handlers are expected to be fast or dispatch asynchronously themselves.
type EventHandler func(event Event)
|
|
|
|
// ── Health Daemon ────────────────────────────────────────────────────────────

// Daemon manages continuous health monitoring for multiple workloads.
// Start launches one goroutine per enabled workload (monitorLoop); Stop
// cancels them and persists final statuses to statusDir.
type Daemon struct {
	configDir    string       // directory of per-workload YAML configs
	statusDir    string       // directory for the statuses.json snapshot
	ipResolver   IPResolver   // maps workload name -> IP for HTTP/TCP checks
	restartFunc  RestartFunc  // invoked for auto-restart
	eventHandler EventHandler // optional event callback; may be nil

	// mu guards configs and statuses. Note the *Status values are mutated
	// by the monitor goroutines while readers copy them out.
	configs  map[string]*Config
	statuses map[string]*Status
	mu       sync.RWMutex
	cancel   context.CancelFunc // cancels all monitor loops; set by Start
	wg       sync.WaitGroup     // tracks running monitor loops
}
|
|
|
|
// NewDaemon creates a health monitoring daemon.
|
|
func NewDaemon(configDir, statusDir string) *Daemon {
|
|
if configDir == "" {
|
|
configDir = DefaultHealthDir
|
|
}
|
|
if statusDir == "" {
|
|
statusDir = DefaultStatusDir
|
|
}
|
|
return &Daemon{
|
|
configDir: configDir,
|
|
statusDir: statusDir,
|
|
ipResolver: DefaultIPResolver,
|
|
restartFunc: DefaultRestartFunc,
|
|
configs: make(map[string]*Config),
|
|
statuses: make(map[string]*Status),
|
|
}
|
|
}
|
|
|
|
// SetIPResolver sets a custom IP resolver.
//
// NOTE(review): the field is written without holding d.mu, so this is
// only safe before Start — confirm callers configure the daemon first.
func (d *Daemon) SetIPResolver(resolver IPResolver) {
	d.ipResolver = resolver
}
|
|
|
|
// SetRestartFunc sets a custom restart function.
//
// NOTE(review): unsynchronized write — call before Start (see SetIPResolver).
func (d *Daemon) SetRestartFunc(fn RestartFunc) {
	d.restartFunc = fn
}
|
|
|
|
// SetEventHandler sets the event callback invoked for health state changes.
//
// NOTE(review): unsynchronized write — call before Start (see SetIPResolver).
func (d *Daemon) SetEventHandler(handler EventHandler) {
	d.eventHandler = handler
}
|
|
|
|
// LoadConfigs reads all health check configurations from disk.
|
|
func (d *Daemon) LoadConfigs() error {
|
|
d.mu.Lock()
|
|
defer d.mu.Unlock()
|
|
|
|
files, err := filepath.Glob(filepath.Join(d.configDir, "*.yaml"))
|
|
if err != nil {
|
|
return fmt.Errorf("healthd: glob configs: %w", err)
|
|
}
|
|
|
|
for _, f := range files {
|
|
data, err := os.ReadFile(f)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
var cfg Config
|
|
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
|
continue
|
|
}
|
|
|
|
if err := cfg.Validate(); err != nil {
|
|
fmt.Fprintf(os.Stderr, "healthd: invalid config %s: %v\n", f, err)
|
|
continue
|
|
}
|
|
|
|
if cfg.Enabled {
|
|
d.configs[cfg.Workload] = &cfg
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Start begins monitoring all configured workloads.
|
|
func (d *Daemon) Start(ctx context.Context) error {
|
|
if err := d.LoadConfigs(); err != nil {
|
|
return err
|
|
}
|
|
|
|
ctx, d.cancel = context.WithCancel(ctx)
|
|
|
|
d.mu.RLock()
|
|
configs := make([]*Config, 0, len(d.configs))
|
|
for _, cfg := range d.configs {
|
|
configs = append(configs, cfg)
|
|
}
|
|
d.mu.RUnlock()
|
|
|
|
for _, cfg := range configs {
|
|
d.wg.Add(1)
|
|
go d.monitorLoop(ctx, cfg)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Stop gracefully stops the health daemon.
|
|
func (d *Daemon) Stop() {
|
|
if d.cancel != nil {
|
|
d.cancel()
|
|
}
|
|
d.wg.Wait()
|
|
d.saveStatuses()
|
|
}
|
|
|
|
// GetStatus returns the health status of a workload.
|
|
func (d *Daemon) GetStatus(workload string) *Status {
|
|
d.mu.RLock()
|
|
defer d.mu.RUnlock()
|
|
if s, ok := d.statuses[workload]; ok {
|
|
cp := *s
|
|
return &cp
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetAllStatuses returns health status of all monitored workloads.
|
|
func (d *Daemon) GetAllStatuses() []Status {
|
|
d.mu.RLock()
|
|
defer d.mu.RUnlock()
|
|
result := make([]Status, 0, len(d.statuses))
|
|
for _, s := range d.statuses {
|
|
result = append(result, *s)
|
|
}
|
|
return result
|
|
}
|
|
|
|
// ── Configuration Management (CLI) ──────────────────────────────────────────
|
|
|
|
// ConfigureCheck writes or updates a health check configuration.
|
|
func ConfigureCheck(configDir string, cfg Config) error {
|
|
if configDir == "" {
|
|
configDir = DefaultHealthDir
|
|
}
|
|
if err := cfg.Validate(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.MkdirAll(configDir, 0755); err != nil {
|
|
return fmt.Errorf("healthd: create config dir: %w", err)
|
|
}
|
|
|
|
data, err := yaml.Marshal(cfg)
|
|
if err != nil {
|
|
return fmt.Errorf("healthd: marshal config: %w", err)
|
|
}
|
|
|
|
path := filepath.Join(configDir, cfg.Workload+".yaml")
|
|
return os.WriteFile(path, data, 0644)
|
|
}
|
|
|
|
// RemoveCheck removes a health check configuration.
|
|
func RemoveCheck(configDir string, workload string) error {
|
|
if configDir == "" {
|
|
configDir = DefaultHealthDir
|
|
}
|
|
path := filepath.Join(configDir, workload+".yaml")
|
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
|
return fmt.Errorf("healthd: remove config: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ListConfigs returns all configured health checks.
|
|
func ListConfigs(configDir string) ([]Config, error) {
|
|
if configDir == "" {
|
|
configDir = DefaultHealthDir
|
|
}
|
|
|
|
files, err := filepath.Glob(filepath.Join(configDir, "*.yaml"))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var configs []Config
|
|
for _, f := range files {
|
|
data, err := os.ReadFile(f)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
var cfg Config
|
|
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
|
continue
|
|
}
|
|
configs = append(configs, cfg)
|
|
}
|
|
return configs, nil
|
|
}
|
|
|
|
// LoadStatuses reads saved health statuses from disk.
|
|
func LoadStatuses(statusDir string) ([]Status, error) {
|
|
if statusDir == "" {
|
|
statusDir = DefaultStatusDir
|
|
}
|
|
|
|
path := filepath.Join(statusDir, "statuses.json")
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
var statuses []Status
|
|
if err := json.Unmarshal(data, &statuses); err != nil {
|
|
return nil, err
|
|
}
|
|
return statuses, nil
|
|
}
|
|
|
|
// ── Monitor Loop ─────────────────────────────────────────────────────────────
|
|
|
|
func (d *Daemon) monitorLoop(ctx context.Context, cfg *Config) {
|
|
defer d.wg.Done()
|
|
|
|
// Initialize status
|
|
d.mu.Lock()
|
|
d.statuses[cfg.Workload] = &Status{
|
|
Workload: cfg.Workload,
|
|
Healthy: true, // Assume healthy until proven otherwise
|
|
}
|
|
d.mu.Unlock()
|
|
|
|
ticker := time.NewTicker(cfg.Interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
d.runCheck(cfg)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (d *Daemon) runCheck(cfg *Config) {
|
|
d.mu.Lock()
|
|
status := d.statuses[cfg.Workload]
|
|
d.mu.Unlock()
|
|
|
|
status.TotalChecks++
|
|
status.LastCheck = time.Now()
|
|
|
|
var err error
|
|
switch cfg.Type {
|
|
case CheckHTTP:
|
|
err = d.checkHTTP(cfg)
|
|
case CheckTCP:
|
|
err = d.checkTCP(cfg)
|
|
case CheckExec:
|
|
err = d.checkExec(cfg)
|
|
}
|
|
|
|
if err != nil {
|
|
status.TotalFails++
|
|
status.ConsecutiveFails++
|
|
status.LastError = err.Error()
|
|
|
|
d.emitEvent(Event{
|
|
Type: EventCheckFail,
|
|
Workload: cfg.Workload,
|
|
Timestamp: time.Now(),
|
|
Message: err.Error(),
|
|
})
|
|
|
|
// Check if we've exceeded the failure threshold
|
|
if status.ConsecutiveFails >= cfg.Retries {
|
|
wasHealthy := status.Healthy
|
|
status.Healthy = false
|
|
|
|
if wasHealthy {
|
|
d.emitEvent(Event{
|
|
Type: EventUnhealthy,
|
|
Workload: cfg.Workload,
|
|
Timestamp: time.Now(),
|
|
Message: fmt.Sprintf("health check failed %d times: %s", status.ConsecutiveFails, err.Error()),
|
|
})
|
|
}
|
|
|
|
// Auto-restart if configured
|
|
if cfg.AutoRestart {
|
|
if cfg.MaxRestarts == 0 || status.RestartCount < cfg.MaxRestarts {
|
|
d.handleRestart(cfg, status)
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
wasUnhealthy := !status.Healthy
|
|
status.Healthy = true
|
|
status.ConsecutiveFails = 0
|
|
status.LastHealthy = time.Now()
|
|
status.LastError = ""
|
|
|
|
if wasUnhealthy {
|
|
d.emitEvent(Event{
|
|
Type: EventHealthy,
|
|
Workload: cfg.Workload,
|
|
Timestamp: time.Now(),
|
|
Message: "health check recovered",
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
func (d *Daemon) checkHTTP(cfg *Config) error {
|
|
ip, err := d.ipResolver(cfg.Workload)
|
|
if err != nil {
|
|
return fmt.Errorf("resolve IP: %w", err)
|
|
}
|
|
|
|
url := fmt.Sprintf("http://%s:%d%s", ip, cfg.Port, cfg.Target)
|
|
client := &http.Client{Timeout: cfg.Timeout}
|
|
|
|
resp, err := client.Get(url)
|
|
if err != nil {
|
|
return fmt.Errorf("HTTP check failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 400 {
|
|
return fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (d *Daemon) checkTCP(cfg *Config) error {
|
|
ip, err := d.ipResolver(cfg.Workload)
|
|
if err != nil {
|
|
return fmt.Errorf("resolve IP: %w", err)
|
|
}
|
|
|
|
addr := fmt.Sprintf("%s:%d", ip, cfg.Port)
|
|
conn, err := net.DialTimeout("tcp", addr, cfg.Timeout)
|
|
if err != nil {
|
|
return fmt.Errorf("TCP check failed: %w", err)
|
|
}
|
|
conn.Close()
|
|
return nil
|
|
}
|
|
|
|
func (d *Daemon) checkExec(cfg *Config) error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), cfg.Timeout)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "sh", "-c", cfg.Target)
|
|
if err := cmd.Run(); err != nil {
|
|
return fmt.Errorf("exec check failed: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (d *Daemon) handleRestart(cfg *Config, status *Status) {
|
|
// Respect restart delay
|
|
if !status.LastRestart.IsZero() && time.Since(status.LastRestart) < cfg.RestartDelay {
|
|
return
|
|
}
|
|
|
|
d.emitEvent(Event{
|
|
Type: EventRestart,
|
|
Workload: cfg.Workload,
|
|
Timestamp: time.Now(),
|
|
Message: fmt.Sprintf("auto-restarting (attempt %d)", status.RestartCount+1),
|
|
})
|
|
|
|
if err := d.restartFunc(cfg.Workload); err != nil {
|
|
fmt.Fprintf(os.Stderr, "healthd: restart %s failed: %v\n", cfg.Workload, err)
|
|
return
|
|
}
|
|
|
|
status.RestartCount++
|
|
status.LastRestart = time.Now()
|
|
status.ConsecutiveFails = 0 // Reset after restart, let it prove healthy
|
|
}
|
|
|
|
func (d *Daemon) emitEvent(event Event) {
|
|
if d.eventHandler != nil {
|
|
d.eventHandler(event)
|
|
}
|
|
}
|
|
|
|
func (d *Daemon) saveStatuses() {
|
|
d.mu.RLock()
|
|
statuses := make([]Status, 0, len(d.statuses))
|
|
for _, s := range d.statuses {
|
|
statuses = append(statuses, *s)
|
|
}
|
|
d.mu.RUnlock()
|
|
|
|
os.MkdirAll(d.statusDir, 0755)
|
|
data, err := json.MarshalIndent(statuses, "", " ")
|
|
if err != nil {
|
|
return
|
|
}
|
|
os.WriteFile(filepath.Join(d.statusDir, "statuses.json"), data, 0644)
|
|
}
|