Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
594
pkg/healthd/healthd.go
Normal file
594
pkg/healthd/healthd.go
Normal file
@@ -0,0 +1,594 @@
|
||||
/*
|
||||
Health Daemon — Continuous health monitoring for Volt workloads.
|
||||
|
||||
Unlike deploy-time health checks (which verify a single instance during
|
||||
deployment), the health daemon runs continuously, monitoring all
|
||||
configured workloads and taking action when they become unhealthy.
|
||||
|
||||
Features:
|
||||
- HTTP, TCP, and exec health checks
|
||||
- Configurable intervals and thresholds
|
||||
- Auto-restart on sustained unhealthy state
|
||||
- Health status API for monitoring integrations
|
||||
- Event emission for webhook/notification systems
|
||||
|
||||
Configuration is stored in /etc/volt/health/ as YAML files, one per
|
||||
workload.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package healthd
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"gopkg.in/yaml.v3"
)
|
||||
|
||||
// ── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
const (
	// DefaultHealthDir stores health check configurations,
	// one YAML file per workload.
	DefaultHealthDir = "/etc/volt/health"

	// DefaultStatusDir stores runtime health status
	// (persisted as statuses.json on shutdown).
	DefaultStatusDir = "/var/lib/volt/health"
)
|
||||
|
||||
// ── Health Check Config ──────────────────────────────────────────────────────
|
||||
|
||||
// CheckType defines the type of health check.
type CheckType string

// Supported check types.
const (
	CheckHTTP CheckType = "http" // GET http://<ip>:<port><target>
	CheckTCP  CheckType = "tcp"  // TCP dial to <ip>:<port>
	CheckExec CheckType = "exec" // run <target> via `sh -c`
)
|
||||
|
||||
// Config defines a health check configuration for a workload.
//
// NOTE(review): Interval/Timeout/RestartDelay are time.Duration fields;
// yaml.v3 decodes those from integer nanoseconds, not strings like "30s" —
// confirm the on-disk format written by callers matches.
type Config struct {
	Workload     string        `yaml:"workload" json:"workload"`
	Type         CheckType     `yaml:"type" json:"type"`
	Target       string        `yaml:"target" json:"target"` // URL path for HTTP, port for TCP, command for exec
	Port         int           `yaml:"port,omitempty" json:"port,omitempty"`
	Interval     time.Duration `yaml:"interval" json:"interval"`
	Timeout      time.Duration `yaml:"timeout" json:"timeout"`
	Retries      int           `yaml:"retries" json:"retries"` // Failures before unhealthy
	AutoRestart  bool          `yaml:"auto_restart" json:"auto_restart"`
	MaxRestarts  int           `yaml:"max_restarts" json:"max_restarts"` // 0 = unlimited
	RestartDelay time.Duration `yaml:"restart_delay" json:"restart_delay"`
	Enabled      bool          `yaml:"enabled" json:"enabled"` // only enabled configs are monitored
}
|
||||
|
||||
// Validate checks that a health config is valid and fills defaults.
|
||||
func (c *Config) Validate() error {
|
||||
if c.Workload == "" {
|
||||
return fmt.Errorf("healthd: workload name required")
|
||||
}
|
||||
switch c.Type {
|
||||
case CheckHTTP:
|
||||
if c.Target == "" {
|
||||
c.Target = "/healthz"
|
||||
}
|
||||
if c.Port == 0 {
|
||||
c.Port = 8080
|
||||
}
|
||||
case CheckTCP:
|
||||
if c.Port == 0 {
|
||||
return fmt.Errorf("healthd: TCP check requires port")
|
||||
}
|
||||
case CheckExec:
|
||||
if c.Target == "" {
|
||||
return fmt.Errorf("healthd: exec check requires command")
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("healthd: unknown check type %q", c.Type)
|
||||
}
|
||||
|
||||
if c.Interval <= 0 {
|
||||
c.Interval = 30 * time.Second
|
||||
}
|
||||
if c.Timeout <= 0 {
|
||||
c.Timeout = 5 * time.Second
|
||||
}
|
||||
if c.Retries <= 0 {
|
||||
c.Retries = 3
|
||||
}
|
||||
if c.RestartDelay <= 0 {
|
||||
c.RestartDelay = 10 * time.Second
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Health Status ────────────────────────────────────────────────────────────
|
||||
|
||||
// Status represents the current health state of a workload.
//
// One Status exists per monitored workload; it is updated by that
// workload's monitor goroutine and read (as a copy) by the status API.
type Status struct {
	Workload         string    `json:"workload" yaml:"workload"`
	Healthy          bool      `json:"healthy" yaml:"healthy"`
	LastCheck        time.Time `json:"last_check" yaml:"last_check"`
	LastHealthy      time.Time `json:"last_healthy,omitempty" yaml:"last_healthy,omitempty"`
	ConsecutiveFails int       `json:"consecutive_fails" yaml:"consecutive_fails"` // resets on success or restart
	TotalChecks      int64     `json:"total_checks" yaml:"total_checks"`
	TotalFails       int64     `json:"total_fails" yaml:"total_fails"`
	RestartCount     int       `json:"restart_count" yaml:"restart_count"` // compared against Config.MaxRestarts
	LastError        string    `json:"last_error,omitempty" yaml:"last_error,omitempty"`
	LastRestart      time.Time `json:"last_restart,omitempty" yaml:"last_restart,omitempty"`
}
|
||||
|
||||
// ── IP Resolver ──────────────────────────────────────────────────────────────
|
||||
|
||||
// IPResolver maps a workload name to its IP address. Used by HTTP and TCP
// checks to locate the workload's endpoint.
type IPResolver func(workload string) (string, error)
|
||||
|
||||
// DefaultIPResolver resolves a workload's IP via `machinectl show`.
//
// It never returns an error: when machinectl is unavailable, fails, or
// reports no addresses, it falls back to 127.0.0.1.
func DefaultIPResolver(workload string) (string, error) {
	const fallback = "127.0.0.1"

	raw, err := exec.Command("machinectl", "show", workload, "-p", "Addresses").CombinedOutput()
	if err != nil {
		return fallback, nil
	}

	line := strings.TrimSpace(string(raw))
	if !strings.HasPrefix(line, "Addresses=") {
		return fallback, nil
	}

	// machinectl prints space-separated addresses; use the first one.
	fields := strings.Fields(strings.TrimPrefix(line, "Addresses="))
	if len(fields) == 0 {
		return fallback, nil
	}
	return fields[0], nil
}
|
||||
|
||||
// ── Restart Handler ──────────────────────────────────────────────────────────
|
||||
|
||||
// RestartFunc defines how to restart a workload. Invoked by the daemon
// when auto-restart is configured and the workload turns unhealthy.
type RestartFunc func(workload string) error
|
||||
|
||||
// DefaultRestartFunc restarts a workload via its volt-container systemd
// template unit (volt-container@<workload>.service).
func DefaultRestartFunc(workload string) error {
	unit := "volt-container@" + workload + ".service"
	return exec.Command("systemctl", "restart", unit).Run()
}
|
||||
|
||||
// ── Event Handler ────────────────────────────────────────────────────────────
|
||||
|
||||
// EventType describes health daemon events.
type EventType string

// Event types emitted by the daemon.
const (
	EventHealthy   EventType = "healthy"    // workload recovered after being unhealthy
	EventUnhealthy EventType = "unhealthy"  // failure threshold crossed
	EventRestart   EventType = "restart"    // auto-restart attempted
	EventCheckFail EventType = "check_fail" // a single check failed (below threshold)
)
|
||||
|
||||
// Event is emitted when health state changes or a check fails.
type Event struct {
	Type      EventType `json:"type"`
	Workload  string    `json:"workload"`
	Timestamp time.Time `json:"timestamp"`
	Message   string    `json:"message"` // human-readable detail (often the check error)
}
|
||||
|
||||
// EventHandler is called when health events occur. It runs synchronously
// on the monitor goroutine, so it should return quickly.
type EventHandler func(event Event)
|
||||
|
||||
// ── Health Daemon ────────────────────────────────────────────────────────────
|
||||
|
||||
// Daemon manages continuous health monitoring for multiple workloads.
//
// configs and statuses are guarded by mu. The Set* methods write their
// fields without synchronization, so configure the daemon before Start.
type Daemon struct {
	configDir    string       // directory of per-workload YAML check configs
	statusDir    string       // directory for persisted status snapshots
	ipResolver   IPResolver   // maps workload name -> IP for HTTP/TCP checks
	restartFunc  RestartFunc  // invoked when auto-restart triggers
	eventHandler EventHandler // optional callback for health events

	configs  map[string]*Config // enabled check configs, keyed by workload
	statuses map[string]*Status // runtime status, keyed by workload
	mu       sync.RWMutex       // guards configs and statuses
	cancel   context.CancelFunc // cancels all monitor goroutines
	wg       sync.WaitGroup     // tracks monitor goroutines for Stop
}
|
||||
|
||||
// NewDaemon creates a health monitoring daemon.
|
||||
func NewDaemon(configDir, statusDir string) *Daemon {
|
||||
if configDir == "" {
|
||||
configDir = DefaultHealthDir
|
||||
}
|
||||
if statusDir == "" {
|
||||
statusDir = DefaultStatusDir
|
||||
}
|
||||
return &Daemon{
|
||||
configDir: configDir,
|
||||
statusDir: statusDir,
|
||||
ipResolver: DefaultIPResolver,
|
||||
restartFunc: DefaultRestartFunc,
|
||||
configs: make(map[string]*Config),
|
||||
statuses: make(map[string]*Status),
|
||||
}
|
||||
}
|
||||
|
||||
// SetIPResolver sets a custom IP resolver. The write is not synchronized
// with running monitors, so call this before Start.
func (d *Daemon) SetIPResolver(resolver IPResolver) {
	d.ipResolver = resolver
}
|
||||
|
||||
// SetRestartFunc sets a custom restart function. The write is not
// synchronized with running monitors, so call this before Start.
func (d *Daemon) SetRestartFunc(fn RestartFunc) {
	d.restartFunc = fn
}
|
||||
|
||||
// SetEventHandler sets the event callback. The write is not synchronized
// with running monitors, so call this before Start.
func (d *Daemon) SetEventHandler(handler EventHandler) {
	d.eventHandler = handler
}
|
||||
|
||||
// LoadConfigs reads all health check configurations from disk.
|
||||
func (d *Daemon) LoadConfigs() error {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(d.configDir, "*.yaml"))
|
||||
if err != nil {
|
||||
return fmt.Errorf("healthd: glob configs: %w", err)
|
||||
}
|
||||
|
||||
for _, f := range files {
|
||||
data, err := os.ReadFile(f)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if err := cfg.Validate(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "healthd: invalid config %s: %v\n", f, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if cfg.Enabled {
|
||||
d.configs[cfg.Workload] = &cfg
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start begins monitoring all configured workloads.
|
||||
func (d *Daemon) Start(ctx context.Context) error {
|
||||
if err := d.LoadConfigs(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ctx, d.cancel = context.WithCancel(ctx)
|
||||
|
||||
d.mu.RLock()
|
||||
configs := make([]*Config, 0, len(d.configs))
|
||||
for _, cfg := range d.configs {
|
||||
configs = append(configs, cfg)
|
||||
}
|
||||
d.mu.RUnlock()
|
||||
|
||||
for _, cfg := range configs {
|
||||
d.wg.Add(1)
|
||||
go d.monitorLoop(ctx, cfg)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop gracefully stops the health daemon.
|
||||
func (d *Daemon) Stop() {
|
||||
if d.cancel != nil {
|
||||
d.cancel()
|
||||
}
|
||||
d.wg.Wait()
|
||||
d.saveStatuses()
|
||||
}
|
||||
|
||||
// GetStatus returns the health status of a workload.
|
||||
func (d *Daemon) GetStatus(workload string) *Status {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
if s, ok := d.statuses[workload]; ok {
|
||||
cp := *s
|
||||
return &cp
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetAllStatuses returns health status of all monitored workloads.
|
||||
func (d *Daemon) GetAllStatuses() []Status {
|
||||
d.mu.RLock()
|
||||
defer d.mu.RUnlock()
|
||||
result := make([]Status, 0, len(d.statuses))
|
||||
for _, s := range d.statuses {
|
||||
result = append(result, *s)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ── Configuration Management (CLI) ──────────────────────────────────────────
|
||||
|
||||
// ConfigureCheck writes or updates a health check configuration.
|
||||
func ConfigureCheck(configDir string, cfg Config) error {
|
||||
if configDir == "" {
|
||||
configDir = DefaultHealthDir
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(configDir, 0755); err != nil {
|
||||
return fmt.Errorf("healthd: create config dir: %w", err)
|
||||
}
|
||||
|
||||
data, err := yaml.Marshal(cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("healthd: marshal config: %w", err)
|
||||
}
|
||||
|
||||
path := filepath.Join(configDir, cfg.Workload+".yaml")
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
|
||||
// RemoveCheck removes a health check configuration.
|
||||
func RemoveCheck(configDir string, workload string) error {
|
||||
if configDir == "" {
|
||||
configDir = DefaultHealthDir
|
||||
}
|
||||
path := filepath.Join(configDir, workload+".yaml")
|
||||
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("healthd: remove config: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListConfigs returns all configured health checks.
|
||||
func ListConfigs(configDir string) ([]Config, error) {
|
||||
if configDir == "" {
|
||||
configDir = DefaultHealthDir
|
||||
}
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(configDir, "*.yaml"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var configs []Config
|
||||
for _, f := range files {
|
||||
data, err := os.ReadFile(f)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var cfg Config
|
||||
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||
continue
|
||||
}
|
||||
configs = append(configs, cfg)
|
||||
}
|
||||
return configs, nil
|
||||
}
|
||||
|
||||
// LoadStatuses reads saved health statuses from disk.
|
||||
func LoadStatuses(statusDir string) ([]Status, error) {
|
||||
if statusDir == "" {
|
||||
statusDir = DefaultStatusDir
|
||||
}
|
||||
|
||||
path := filepath.Join(statusDir, "statuses.json")
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var statuses []Status
|
||||
if err := json.Unmarshal(data, &statuses); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return statuses, nil
|
||||
}
|
||||
|
||||
// ── Monitor Loop ─────────────────────────────────────────────────────────────
|
||||
|
||||
func (d *Daemon) monitorLoop(ctx context.Context, cfg *Config) {
|
||||
defer d.wg.Done()
|
||||
|
||||
// Initialize status
|
||||
d.mu.Lock()
|
||||
d.statuses[cfg.Workload] = &Status{
|
||||
Workload: cfg.Workload,
|
||||
Healthy: true, // Assume healthy until proven otherwise
|
||||
}
|
||||
d.mu.Unlock()
|
||||
|
||||
ticker := time.NewTicker(cfg.Interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
d.runCheck(cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Daemon) runCheck(cfg *Config) {
|
||||
d.mu.Lock()
|
||||
status := d.statuses[cfg.Workload]
|
||||
d.mu.Unlock()
|
||||
|
||||
status.TotalChecks++
|
||||
status.LastCheck = time.Now()
|
||||
|
||||
var err error
|
||||
switch cfg.Type {
|
||||
case CheckHTTP:
|
||||
err = d.checkHTTP(cfg)
|
||||
case CheckTCP:
|
||||
err = d.checkTCP(cfg)
|
||||
case CheckExec:
|
||||
err = d.checkExec(cfg)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
status.TotalFails++
|
||||
status.ConsecutiveFails++
|
||||
status.LastError = err.Error()
|
||||
|
||||
d.emitEvent(Event{
|
||||
Type: EventCheckFail,
|
||||
Workload: cfg.Workload,
|
||||
Timestamp: time.Now(),
|
||||
Message: err.Error(),
|
||||
})
|
||||
|
||||
// Check if we've exceeded the failure threshold
|
||||
if status.ConsecutiveFails >= cfg.Retries {
|
||||
wasHealthy := status.Healthy
|
||||
status.Healthy = false
|
||||
|
||||
if wasHealthy {
|
||||
d.emitEvent(Event{
|
||||
Type: EventUnhealthy,
|
||||
Workload: cfg.Workload,
|
||||
Timestamp: time.Now(),
|
||||
Message: fmt.Sprintf("health check failed %d times: %s", status.ConsecutiveFails, err.Error()),
|
||||
})
|
||||
}
|
||||
|
||||
// Auto-restart if configured
|
||||
if cfg.AutoRestart {
|
||||
if cfg.MaxRestarts == 0 || status.RestartCount < cfg.MaxRestarts {
|
||||
d.handleRestart(cfg, status)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
wasUnhealthy := !status.Healthy
|
||||
status.Healthy = true
|
||||
status.ConsecutiveFails = 0
|
||||
status.LastHealthy = time.Now()
|
||||
status.LastError = ""
|
||||
|
||||
if wasUnhealthy {
|
||||
d.emitEvent(Event{
|
||||
Type: EventHealthy,
|
||||
Workload: cfg.Workload,
|
||||
Timestamp: time.Now(),
|
||||
Message: "health check recovered",
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Daemon) checkHTTP(cfg *Config) error {
|
||||
ip, err := d.ipResolver(cfg.Workload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve IP: %w", err)
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("http://%s:%d%s", ip, cfg.Port, cfg.Target)
|
||||
client := &http.Client{Timeout: cfg.Timeout}
|
||||
|
||||
resp, err := client.Get(url)
|
||||
if err != nil {
|
||||
return fmt.Errorf("HTTP check failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 400 {
|
||||
return fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *Daemon) checkTCP(cfg *Config) error {
|
||||
ip, err := d.ipResolver(cfg.Workload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve IP: %w", err)
|
||||
}
|
||||
|
||||
addr := fmt.Sprintf("%s:%d", ip, cfg.Port)
|
||||
conn, err := net.DialTimeout("tcp", addr, cfg.Timeout)
|
||||
if err != nil {
|
||||
return fmt.Errorf("TCP check failed: %w", err)
|
||||
}
|
||||
conn.Close()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *Daemon) checkExec(cfg *Config) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), cfg.Timeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "sh", "-c", cfg.Target)
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("exec check failed: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *Daemon) handleRestart(cfg *Config, status *Status) {
|
||||
// Respect restart delay
|
||||
if !status.LastRestart.IsZero() && time.Since(status.LastRestart) < cfg.RestartDelay {
|
||||
return
|
||||
}
|
||||
|
||||
d.emitEvent(Event{
|
||||
Type: EventRestart,
|
||||
Workload: cfg.Workload,
|
||||
Timestamp: time.Now(),
|
||||
Message: fmt.Sprintf("auto-restarting (attempt %d)", status.RestartCount+1),
|
||||
})
|
||||
|
||||
if err := d.restartFunc(cfg.Workload); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "healthd: restart %s failed: %v\n", cfg.Workload, err)
|
||||
return
|
||||
}
|
||||
|
||||
status.RestartCount++
|
||||
status.LastRestart = time.Now()
|
||||
status.ConsecutiveFails = 0 // Reset after restart, let it prove healthy
|
||||
}
|
||||
|
||||
// emitEvent forwards event to the configured handler, if any. The handler
// runs synchronously on the calling (monitor) goroutine.
func (d *Daemon) emitEvent(event Event) {
	if d.eventHandler != nil {
		d.eventHandler(event)
	}
}
|
||||
|
||||
func (d *Daemon) saveStatuses() {
|
||||
d.mu.RLock()
|
||||
statuses := make([]Status, 0, len(d.statuses))
|
||||
for _, s := range d.statuses {
|
||||
statuses = append(statuses, *s)
|
||||
}
|
||||
d.mu.RUnlock()
|
||||
|
||||
os.MkdirAll(d.statusDir, 0755)
|
||||
data, err := json.MarshalIndent(statuses, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
os.WriteFile(filepath.Join(d.statusDir, "statuses.json"), data, 0644)
|
||||
}
|
||||
Reference in New Issue
Block a user