Files
volt/pkg/deploy/health.go
Karl Clinger 81ad0b597c Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI:
- Container runtime (systemd-nspawn)
- VoltVisor VMs (Neutron Stardust / QEMU)
- Stellarium CAS (content-addressed storage)
- ORAS Registry
- GitOps integration
- Landlock LSM security
- Compose orchestration
- Mesh networking

Copyright (c) Armored Gates LLC. All rights reserved.
Licensed under AGPSL v5.0
2026-03-21 00:31:12 -05:00

144 lines
4.7 KiB
Go

/*
Health — Health check implementations for deployment verification.
Supports HTTP, TCP, exec, and no-op health checks. Each check type
retries according to the configured interval and retry count.
Copyright (c) Armored Gates LLC. All rights reserved.
*/
package deploy
import (
"fmt"
"net"
"net/http"
"os/exec"
"time"
)
// ── Health Check Config ──────────────────────────────────────────────────────
// HealthCheck describes one post-deploy probe: which kind of check to run,
// where to aim it, and how persistently to retry before giving up.
//
// NOTE(review): Interval is a time.Duration, which encoding/json reads and
// writes as an integer nanosecond count rather than a string such as "5s";
// confirm that config authors expect that.
type HealthCheck struct {
	// Type selects the probe: "http", "tcp", "exec", or "none".
	Type string `json:"type" yaml:"type"`
	// Path is the HTTP request path (e.g., "/healthz").
	Path string `json:"path" yaml:"path"`
	// Port is the port to check.
	Port int `json:"port" yaml:"port"`
	// Command is the command line used by exec checks.
	Command string `json:"command" yaml:"command"`
	// Interval is the time to wait between retries.
	Interval time.Duration `json:"interval" yaml:"interval"`
	// Retries is the maximum retry count.
	Retries int `json:"retries" yaml:"retries"`
}
// ── Health Checker Interface ─────────────────────────────────────────────────
// HealthChecker is the contract deployments use to confirm that an instance
// came up correctly.
type HealthChecker interface {
	// WaitHealthy blocks until the named instance passes the given check, in
	// which case it returns nil, or until every retry has been used up, in
	// which case it returns a non-nil error.
	WaitHealthy(instanceName string, check HealthCheck) error
}
// ── Default Health Checker ───────────────────────────────────────────────────
// DefaultHealthChecker is the HealthChecker that talks to real instances: it
// issues HTTP requests, opens TCP connections, and runs commands.
type DefaultHealthChecker struct {
	// InstanceIPResolver maps an instance name to the IP address that probes
	// should target. When it is nil, or when it returns an error, probes are
	// aimed at "127.0.0.1" instead.
	InstanceIPResolver func(name string) (string, error)
}
// WaitHealthy runs the probe selected by check.Type against instanceName.
//
// A Type of "none" or the empty string means health checking is off, so the
// call returns nil without probing. "http", "tcp", and "exec" hand off to the
// matching probe loop and return its result. Any other Type is rejected with
// an error naming the unrecognised value.
func (d *DefaultHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
	// Disabled checks succeed without touching the instance.
	if check.Type == "" || check.Type == "none" {
		return nil
	}

	var probe func(string, HealthCheck) error
	switch check.Type {
	case "exec":
		probe = d.waitExec
	case "tcp":
		probe = d.waitTCP
	case "http":
		probe = d.waitHTTP
	}
	if probe == nil {
		return fmt.Errorf("unknown health check type: %q", check.Type)
	}
	return probe(instanceName, check)
}
// resolveIP returns the address probes should target for instanceName.
//
// The configured InstanceIPResolver is consulted when present. If there is no
// resolver, or the resolver reports an error, the loopback address
// "127.0.0.1" is returned instead; the resolver's error is not surfaced.
func (d *DefaultHealthChecker) resolveIP(instanceName string) string {
	const loopback = "127.0.0.1"

	lookup := d.InstanceIPResolver
	if lookup == nil {
		return loopback
	}
	addr, err := lookup(instanceName)
	if err != nil {
		return loopback
	}
	return addr
}
// waitHTTP polls the instance's HTTP endpoint until it reports healthy.
//
// The target is http://<resolved address>:<check.Port><check.Path>. A response
// whose status is in [200, 400) counts as healthy; any other status, or a
// transport error, counts as a failed attempt. check.Interval is both the
// pause between attempts and the per-request timeout.
//
// Edge cases handled here:
//   - check.Retries below 1 still results in one probe, so a check with no
//     retries configured is evaluated instead of failing without ever
//     contacting the instance.
//   - A non-positive check.Interval falls back to a 5s request timeout,
//     because an http.Client with Timeout 0 would wait indefinitely.
//   - A non-empty check.Path without a leading "/" gets one, so it cannot
//     fuse with the port number.
//   - IPv6 literals are bracketed so the URL stays valid.
//
// It returns nil on the first healthy response, otherwise an error wrapping
// the last failure seen.
func (d *DefaultHealthChecker) waitHTTP(instanceName string, check HealthCheck) error {
	const fallbackTimeout = 5 * time.Second

	attempts := check.Retries
	if attempts < 1 {
		attempts = 1
	}
	timeout := check.Interval
	if timeout <= 0 {
		timeout = fallbackTimeout
	}

	path := check.Path
	if path != "" && path[0] != '/' {
		path = "/" + path
	}

	ip := d.resolveIP(instanceName)
	if n := len(ip); n >= 2 && ip[0] == '[' && ip[n-1] == ']' {
		// Already bracketed by the resolver; JoinHostPort adds its own.
		ip = ip[1 : n-1]
	}
	url := "http://" + net.JoinHostPort(ip, fmt.Sprintf("%d", check.Port)) + path

	client := &http.Client{Timeout: timeout}
	var lastErr error
	for i := 0; i < attempts; i++ {
		// Pause only between attempts, never before the first or after the last.
		if i > 0 {
			time.Sleep(check.Interval)
		}
		resp, err := client.Get(url)
		if err != nil {
			lastErr = err
			continue
		}
		resp.Body.Close()
		if resp.StatusCode >= 200 && resp.StatusCode < 400 {
			return nil
		}
		lastErr = fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
	}
	return fmt.Errorf("health check failed after %d retries: %w", attempts, lastErr)
}
// waitTCP polls the instance's TCP port until a connection can be opened.
//
// Each attempt dials <resolved address>:<check.Port>; a successful connect is
// closed immediately and counts as healthy. check.Interval is both the pause
// between attempts and the per-dial timeout.
//
// Edge cases handled here:
//   - check.Retries below 1 still results in one dial, so a check with no
//     retries configured is evaluated instead of failing without ever
//     contacting the instance.
//   - A non-positive check.Interval falls back to a 5s dial timeout, because
//     net.DialTimeout treats a zero timeout as "no timeout".
//   - IPv6 literals are joined to the port with net.JoinHostPort so the
//     address parses.
//
// It returns nil on the first successful connect, otherwise an error wrapping
// the last dial failure.
func (d *DefaultHealthChecker) waitTCP(instanceName string, check HealthCheck) error {
	const fallbackTimeout = 5 * time.Second

	attempts := check.Retries
	if attempts < 1 {
		attempts = 1
	}
	timeout := check.Interval
	if timeout <= 0 {
		timeout = fallbackTimeout
	}

	ip := d.resolveIP(instanceName)
	if n := len(ip); n >= 2 && ip[0] == '[' && ip[n-1] == ']' {
		// Already bracketed by the resolver; JoinHostPort adds its own.
		ip = ip[1 : n-1]
	}
	addr := net.JoinHostPort(ip, fmt.Sprintf("%d", check.Port))

	var lastErr error
	for i := 0; i < attempts; i++ {
		// Pause only between attempts, never before the first or after the last.
		if i > 0 {
			time.Sleep(check.Interval)
		}
		conn, err := net.DialTimeout("tcp", addr, timeout)
		if err != nil {
			lastErr = err
			continue
		}
		conn.Close()
		return nil
	}
	return fmt.Errorf("TCP health check failed after %d retries: %w", attempts, lastErr)
}
// waitExec runs check.Command through "sh -c" until it exits successfully.
//
// An attempt is healthy when the command runs and exits with status 0; any
// error from running it counts as a failed attempt. The call sleeps for
// check.Interval between attempts. check.Retries below 1 still results in one
// run, so a check with no retries configured is evaluated instead of failing
// without ever executing the command.
//
// It returns nil on the first successful run, otherwise an error wrapping the
// last failure (for a non-zero exit that is an *exec.ExitError).
//
// NOTE(review): instanceName is not used, so the command is started by this
// process rather than being scoped to the instance — confirm that is intended.
// NOTE(review): the command has no timeout; one that never exits blocks the
// caller indefinitely.
func (d *DefaultHealthChecker) waitExec(instanceName string, check HealthCheck) error {
	attempts := check.Retries
	if attempts < 1 {
		attempts = 1
	}

	var lastErr error
	for i := 0; i < attempts; i++ {
		// Pause only between attempts, never before the first or after the last.
		if i > 0 {
			time.Sleep(check.Interval)
		}
		lastErr = exec.Command("sh", "-c", check.Command).Run()
		if lastErr == nil {
			return nil
		}
	}
	return fmt.Errorf("exec health check failed after %d retries: %w", attempts, lastErr)
}
// ── Noop Health Checker ──────────────────────────────────────────────────────
// NoopHealthChecker reports every instance as healthy without probing
// anything. It is used for rollbacks and when health checking is disabled.
type NoopHealthChecker struct{}

// WaitHealthy ignores both arguments and returns nil right away.
func (*NoopHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
	return nil
}