Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
144 lines
4.7 KiB
Go
144 lines
4.7 KiB
Go
/*
Health — Health check implementations for deployment verification.

Supports HTTP, TCP, exec, and no-op health checks. The HTTP, TCP, and exec
checks retry according to the configured interval and retry count; the no-op
check always succeeds immediately.

Copyright (c) Armored Gates LLC. All rights reserved.
*/
|
|
package deploy
|
|
|
|
import (
|
|
"fmt"
|
|
"net"
|
|
"net/http"
|
|
"os/exec"
|
|
"time"
|
|
)
|
|
|
|
// ── Health Check Config ──────────────────────────────────────────────────────
|
|
|
|
// HealthCheck describes the probe used to verify that an instance is healthy
// after a deploy.
type HealthCheck struct {
	// Type selects the probe: "http", "tcp", "exec", or "none".
	Type string `json:"type" yaml:"type"`

	// Path is the HTTP request path (e.g., "/healthz").
	Path string `json:"path" yaml:"path"`

	// Port is the port to check.
	Port int `json:"port" yaml:"port"`

	// Command is the command run by exec checks.
	Command string `json:"command" yaml:"command"`

	// Interval is the time between retries.
	Interval time.Duration `json:"interval" yaml:"interval"`

	// Retries is the maximum retry count.
	Retries int `json:"retries" yaml:"retries"`
}
|
|
|
|
// ── Health Checker Interface ─────────────────────────────────────────────────
|
|
|
|
// HealthChecker verifies instance health during deployments.
|
|
type HealthChecker interface {
|
|
// WaitHealthy blocks until the instance is healthy or all retries are exhausted.
|
|
WaitHealthy(instanceName string, check HealthCheck) error
|
|
}
|
|
|
|
// ── Default Health Checker ───────────────────────────────────────────────────
|
|
|
|
// DefaultHealthChecker is the HealthChecker implementation that performs real
// HTTP, TCP and exec calls.
type DefaultHealthChecker struct {
	// InstanceIPResolver maps an instance name to the IP address to probe.
	// When it is nil, or when it returns an error, "127.0.0.1" is used.
	InstanceIPResolver func(name string) (string, error)
}
|
|
|
|
// WaitHealthy performs health checks with retries.
|
|
func (d *DefaultHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
|
switch check.Type {
|
|
case "none", "":
|
|
return nil
|
|
case "http":
|
|
return d.waitHTTP(instanceName, check)
|
|
case "tcp":
|
|
return d.waitTCP(instanceName, check)
|
|
case "exec":
|
|
return d.waitExec(instanceName, check)
|
|
default:
|
|
return fmt.Errorf("unknown health check type: %q", check.Type)
|
|
}
|
|
}
|
|
|
|
func (d *DefaultHealthChecker) resolveIP(instanceName string) string {
|
|
if d.InstanceIPResolver != nil {
|
|
ip, err := d.InstanceIPResolver(instanceName)
|
|
if err == nil {
|
|
return ip
|
|
}
|
|
}
|
|
return "127.0.0.1"
|
|
}
|
|
|
|
func (d *DefaultHealthChecker) waitHTTP(instanceName string, check HealthCheck) error {
|
|
ip := d.resolveIP(instanceName)
|
|
url := fmt.Sprintf("http://%s:%d%s", ip, check.Port, check.Path)
|
|
|
|
client := &http.Client{Timeout: check.Interval}
|
|
|
|
var lastErr error
|
|
for i := 0; i < check.Retries; i++ {
|
|
resp, err := client.Get(url)
|
|
if err == nil {
|
|
resp.Body.Close()
|
|
if resp.StatusCode >= 200 && resp.StatusCode < 400 {
|
|
return nil
|
|
}
|
|
lastErr = fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
|
|
} else {
|
|
lastErr = err
|
|
}
|
|
if i < check.Retries-1 {
|
|
time.Sleep(check.Interval)
|
|
}
|
|
}
|
|
return fmt.Errorf("health check failed after %d retries: %w", check.Retries, lastErr)
|
|
}
|
|
|
|
func (d *DefaultHealthChecker) waitTCP(instanceName string, check HealthCheck) error {
|
|
ip := d.resolveIP(instanceName)
|
|
addr := fmt.Sprintf("%s:%d", ip, check.Port)
|
|
|
|
var lastErr error
|
|
for i := 0; i < check.Retries; i++ {
|
|
conn, err := net.DialTimeout("tcp", addr, check.Interval)
|
|
if err == nil {
|
|
conn.Close()
|
|
return nil
|
|
}
|
|
lastErr = err
|
|
if i < check.Retries-1 {
|
|
time.Sleep(check.Interval)
|
|
}
|
|
}
|
|
return fmt.Errorf("TCP health check failed after %d retries: %w", check.Retries, lastErr)
|
|
}
|
|
|
|
func (d *DefaultHealthChecker) waitExec(instanceName string, check HealthCheck) error {
|
|
var lastErr error
|
|
for i := 0; i < check.Retries; i++ {
|
|
cmd := exec.Command("sh", "-c", check.Command)
|
|
if err := cmd.Run(); err == nil {
|
|
return nil
|
|
} else {
|
|
lastErr = err
|
|
}
|
|
if i < check.Retries-1 {
|
|
time.Sleep(check.Interval)
|
|
}
|
|
}
|
|
return fmt.Errorf("exec health check failed after %d retries: %w", check.Retries, lastErr)
|
|
}
|
|
|
|
// ── Noop Health Checker ──────────────────────────────────────────────────────
|
|
|
|
// NoopHealthChecker always returns healthy. Used for rollbacks and when
|
|
// health checking is disabled.
|
|
type NoopHealthChecker struct{}
|
|
|
|
// WaitHealthy always succeeds immediately.
|
|
func (n *NoopHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
|
return nil
|
|
}
|