Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
733
pkg/deploy/deploy.go
Normal file
733
pkg/deploy/deploy.go
Normal file
@@ -0,0 +1,733 @@
|
||||
/*
|
||||
Deploy — Rolling and canary deployment strategies for Volt workloads.
|
||||
|
||||
Coordinates zero-downtime updates for containers and workloads by
|
||||
orchestrating instance creation, health verification, traffic shifting,
|
||||
and automatic rollback on failure.
|
||||
|
||||
Since Volt uses CAS (content-addressed storage) for rootfs assembly,
|
||||
"updating" a workload means pointing it to a new CAS ref and having
|
||||
TinyVol reassemble the directory tree from the new blob manifest.
|
||||
|
||||
Strategies:
|
||||
rolling — Update instances one-by-one (respecting MaxSurge/MaxUnavail)
|
||||
canary — Route a percentage of traffic to a new instance before full rollout
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Strategy ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Strategy defines the deployment approach.
type Strategy string

const (
	// StrategyRolling updates instances one-by-one with health verification.
	StrategyRolling Strategy = "rolling"
	// StrategyCanary routes a percentage of traffic to a new instance first.
	StrategyCanary Strategy = "canary"
)
|
||||
|
||||
// ── Configuration ────────────────────────────────────────────────────────────
|
||||
|
||||
// DeployConfig holds all parameters for a deployment operation.
// Zero values are filled in by Validate, so callers may populate only
// the fields they care about before passing it to RollingDeploy/CanaryDeploy.
type DeployConfig struct {
	Strategy     Strategy      // Deployment strategy ("rolling" or "canary")
	Target       string        // Container/workload name or pattern
	NewImage     string        // New CAS ref or image path to deploy
	MaxSurge     int           // Max extra instances during rolling (default: 1)
	MaxUnavail   int           // Max unavailable during rolling (default: 0)
	CanaryWeight int           // Canary traffic percentage (1-99); required for canary
	HealthCheck  HealthCheck   // How to verify new instance is healthy (defaulted by Validate)
	Timeout      time.Duration // Max time for the entire deployment (default: 10m)
	AutoRollback bool          // Rollback on failure
}
|
||||
|
||||
// Validate checks that the config is usable and fills in defaults.
|
||||
func (c *DeployConfig) Validate() error {
|
||||
if c.Target == "" {
|
||||
return fmt.Errorf("deploy: target is required")
|
||||
}
|
||||
if c.NewImage == "" {
|
||||
return fmt.Errorf("deploy: new image (CAS ref) is required")
|
||||
}
|
||||
|
||||
switch c.Strategy {
|
||||
case StrategyRolling:
|
||||
if c.MaxSurge <= 0 {
|
||||
c.MaxSurge = 1
|
||||
}
|
||||
if c.MaxUnavail < 0 {
|
||||
c.MaxUnavail = 0
|
||||
}
|
||||
case StrategyCanary:
|
||||
if c.CanaryWeight <= 0 || c.CanaryWeight >= 100 {
|
||||
return fmt.Errorf("deploy: canary weight must be between 1 and 99, got %d", c.CanaryWeight)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("deploy: unknown strategy %q (use 'rolling' or 'canary')", c.Strategy)
|
||||
}
|
||||
|
||||
if c.Timeout <= 0 {
|
||||
c.Timeout = 10 * time.Minute
|
||||
}
|
||||
if c.HealthCheck.Type == "" {
|
||||
c.HealthCheck.Type = "none"
|
||||
}
|
||||
if c.HealthCheck.Interval <= 0 {
|
||||
c.HealthCheck.Interval = 5 * time.Second
|
||||
}
|
||||
if c.HealthCheck.Retries <= 0 {
|
||||
c.HealthCheck.Retries = 3
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Deploy Status ────────────────────────────────────────────────────────────
|
||||
|
||||
// Phase represents the current phase of a deployment.
type Phase string

const (
	PhasePreparing   Phase = "preparing"    // validating config / discovering instances
	PhaseDeploying   Phase = "deploying"    // instances are being updated or created
	PhaseVerifying   Phase = "verifying"    // waiting for health checks to pass
	PhaseComplete    Phase = "complete"     // deployment finished successfully
	PhaseRollingBack Phase = "rolling-back" // reverting updated instances after a failure
	PhaseFailed      Phase = "failed"       // terminal failure state
	PhasePaused      Phase = "paused"       // not set by this file's code paths
)
|
||||
|
||||
// DeployStatus tracks the progress of an active deployment.
// Copies of this struct are handed to ProgressFunc callbacks and to the
// active-deployments registry; it is serializable for CLI/API output.
type DeployStatus struct {
	ID          string    `json:"id" yaml:"id"`
	Phase       Phase     `json:"phase" yaml:"phase"`
	Progress    string    `json:"progress" yaml:"progress"`       // e.g. "2/5 instances updated"
	OldVersion  string    `json:"old_version" yaml:"old_version"` // previous CAS ref
	NewVersion  string    `json:"new_version" yaml:"new_version"` // target CAS ref
	Target      string    `json:"target" yaml:"target"`
	Strategy    Strategy  `json:"strategy" yaml:"strategy"`
	StartedAt   time.Time `json:"started_at" yaml:"started_at"`
	CompletedAt time.Time `json:"completed_at,omitempty" yaml:"completed_at,omitempty"` // zero while in flight
	Message     string    `json:"message,omitempty" yaml:"message,omitempty"`           // human-readable detail, set on failure/rollback
}
|
||||
|
||||
// ── Instance abstraction ─────────────────────────────────────────────────────
|
||||
|
||||
// Instance represents a single running workload instance that can be deployed to.
type Instance struct {
	Name    string // Instance name (e.g., "web-app-1")
	Image   string // Current CAS ref or image
	Status  string // "running", "stopped", etc.
	Healthy bool   // Last known health state
}
|
||||
|
||||
// ── Executor interface ───────────────────────────────────────────────────────
|
||||
|
||||
// Executor abstracts the system operations needed for deployments.
// This allows testing without real systemd/nspawn/nftables calls.
// SystemExecutor (below) is the production implementation.
type Executor interface {
	// ListInstances returns all instances matching the target pattern.
	ListInstances(target string) ([]Instance, error)

	// CreateInstance creates a new instance with the given image.
	CreateInstance(name, image string) error

	// StartInstance starts a stopped instance.
	StartInstance(name string) error

	// StopInstance stops a running instance.
	StopInstance(name string) error

	// DeleteInstance removes an instance entirely.
	DeleteInstance(name string) error

	// GetInstanceImage returns the current image/CAS ref for an instance.
	GetInstanceImage(name string) (string, error)

	// UpdateInstanceImage updates an instance to use a new image (CAS ref).
	// This reassembles the rootfs via TinyVol and restarts the instance.
	UpdateInstanceImage(name, newImage string) error

	// UpdateTrafficWeight adjusts traffic routing for canary deployments.
	// weight is 0-100 representing percentage to the canary instance.
	UpdateTrafficWeight(target string, canaryName string, weight int) error
}
|
||||
|
||||
// ── Active deployments tracking ──────────────────────────────────────────────
|
||||
|
||||
var (
	// activeDeployments tracks in-flight deployments keyed by target name,
	// guarded by activeDeploymentsMu. One deployment per target at a time.
	activeDeployments   = make(map[string]*DeployStatus)
	activeDeploymentsMu sync.RWMutex
)
|
||||
|
||||
// GetActiveDeployments returns a snapshot of all active deployments.
|
||||
func GetActiveDeployments() []DeployStatus {
|
||||
activeDeploymentsMu.RLock()
|
||||
defer activeDeploymentsMu.RUnlock()
|
||||
|
||||
result := make([]DeployStatus, 0, len(activeDeployments))
|
||||
for _, ds := range activeDeployments {
|
||||
result = append(result, *ds)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetActiveDeployment returns the active deployment for a target, if any.
|
||||
func GetActiveDeployment(target string) *DeployStatus {
|
||||
activeDeploymentsMu.RLock()
|
||||
defer activeDeploymentsMu.RUnlock()
|
||||
|
||||
if ds, ok := activeDeployments[target]; ok {
|
||||
cp := *ds
|
||||
return &cp
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setActiveDeployment(ds *DeployStatus) {
|
||||
activeDeploymentsMu.Lock()
|
||||
defer activeDeploymentsMu.Unlock()
|
||||
activeDeployments[ds.Target] = ds
|
||||
}
|
||||
|
||||
func removeActiveDeployment(target string) {
|
||||
activeDeploymentsMu.Lock()
|
||||
defer activeDeploymentsMu.Unlock()
|
||||
delete(activeDeployments, target)
|
||||
}
|
||||
|
||||
// ── Progress callback ────────────────────────────────────────────────────────
|
||||
|
||||
// ProgressFunc is called with status updates during deployment.
// It receives a copy of the current DeployStatus; implementations must not
// block for long, as they are invoked synchronously from the deploy loop.
type ProgressFunc func(status DeployStatus)
|
||||
|
||||
// ── Rolling Deploy ───────────────────────────────────────────────────────────
|
||||
|
||||
// RollingDeploy performs a rolling update of instances matching cfg.Target.
//
// Algorithm:
//  1. List all instances matching the target pattern
//  2. For each instance (respecting MaxSurge / MaxUnavail):
//     a. Update instance image to new CAS ref (reassemble rootfs via TinyVol)
//     b. Start/restart the instance
//     c. Wait for health check to pass
//     d. If health check fails and AutoRollback: revert to old image
//  3. Record deployment in history
//
// hist and progress may be nil (see recordHistory / notifyProgress).
// On any failure the deployment is removed from the active registry,
// recorded in history, and a non-nil error is returned.
func RollingDeploy(cfg DeployConfig, exec Executor, hc HealthChecker, hist *HistoryStore, progress ProgressFunc) error {
	if err := cfg.Validate(); err != nil {
		return err
	}

	// Generate deployment ID.
	deployID := generateDeployID()

	status := &DeployStatus{
		ID:         deployID,
		Phase:      PhasePreparing,
		Target:     cfg.Target,
		Strategy:   StrategyRolling,
		NewVersion: cfg.NewImage,
		StartedAt:  time.Now().UTC(),
	}
	setActiveDeployment(status)
	notifyProgress(progress, *status)

	// 1. Discover instances.
	instances, err := exec.ListInstances(cfg.Target)
	if err != nil {
		// NOTE(review): the early failure paths below do not stamp
		// status.CompletedAt, unlike the in-loop failure paths — confirm
		// whether history consumers rely on CompletedAt being set.
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to list instances: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}
	if len(instances) == 0 {
		status.Phase = PhaseFailed
		status.Message = "no instances found matching target"
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// Record old version from first instance. A lookup error leaves
	// OldVersion empty; rollback then reverts to an empty ref.
	if len(instances) > 0 {
		oldImg, _ := exec.GetInstanceImage(instances[0].Name)
		status.OldVersion = oldImg
	}

	total := len(instances)
	updated := 0
	var rollbackTargets []string // instances that were updated (for rollback)

	status.Phase = PhaseDeploying
	status.Progress = fmt.Sprintf("0/%d instances updated", total)
	notifyProgress(progress, *status)

	// Timeout enforcement. The deadline is checked only between instances,
	// so a single slow update/health-check can overrun cfg.Timeout.
	deadline := time.Now().Add(cfg.Timeout)

	// 2. Rolling update loop.
	for i, inst := range instances {
		if time.Now().After(deadline) {
			err := fmt.Errorf("deployment timed out after %s", cfg.Timeout)
			if cfg.AutoRollback && len(rollbackTargets) > 0 {
				status.Phase = PhaseRollingBack
				status.Message = err.Error()
				notifyProgress(progress, *status)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
			}
			status.Phase = PhaseFailed
			status.Message = err.Error()
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return err
		}

		// Respect MaxSurge: we update in-place, so surge is about allowing
		// brief overlap. With MaxUnavail=0 and MaxSurge=1, we update one at a time.
		_ = cfg.MaxSurge // In single-node mode, surge is handled by updating in-place.

		status.Progress = fmt.Sprintf("%d/%d instances updated (updating %s)", i, total, inst.Name)
		notifyProgress(progress, *status)

		// a. Update the instance image. The failed instance itself is NOT
		// added to rollbackTargets here: the update never took effect.
		if err := exec.UpdateInstanceImage(inst.Name, cfg.NewImage); err != nil {
			errMsg := fmt.Sprintf("failed to update instance %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				status.Phase = PhaseRollingBack
				status.Message = errMsg
				notifyProgress(progress, *status)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
				status.Phase = PhaseFailed
			} else {
				status.Phase = PhaseFailed
			}
			status.Message = errMsg
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return fmt.Errorf("deploy: %s", errMsg)
		}

		// b. Start the instance.
		if err := exec.StartInstance(inst.Name); err != nil {
			errMsg := fmt.Sprintf("failed to start instance %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				status.Phase = PhaseRollingBack
				status.Message = errMsg
				notifyProgress(progress, *status)
				// Rollback this instance too.
				rollbackTargets = append(rollbackTargets, inst.Name)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
				status.Phase = PhaseFailed
			} else {
				status.Phase = PhaseFailed
			}
			status.Message = errMsg
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return fmt.Errorf("deploy: %s", errMsg)
		}

		// c. Health check.
		status.Phase = PhaseVerifying
		notifyProgress(progress, *status)

		if err := hc.WaitHealthy(inst.Name, cfg.HealthCheck); err != nil {
			errMsg := fmt.Sprintf("health check failed for %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				status.Phase = PhaseRollingBack
				status.Message = errMsg
				notifyProgress(progress, *status)
				// The unhealthy instance runs the new image, so it must be
				// reverted along with everything updated so far.
				rollbackTargets = append(rollbackTargets, inst.Name)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
				status.Phase = PhaseFailed
			} else {
				status.Phase = PhaseFailed
			}
			status.Message = errMsg
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return fmt.Errorf("deploy: %s", errMsg)
		}

		// Instance is live on the new image; remember it for potential rollback.
		rollbackTargets = append(rollbackTargets, inst.Name)
		updated++
		status.Phase = PhaseDeploying
		status.Progress = fmt.Sprintf("%d/%d instances updated", updated, total)
		notifyProgress(progress, *status)
	}

	// 3. Complete.
	status.Phase = PhaseComplete
	status.Progress = fmt.Sprintf("%d/%d instances updated", updated, total)
	status.CompletedAt = time.Now().UTC()
	notifyProgress(progress, *status)
	removeActiveDeployment(cfg.Target)
	recordHistory(hist, status, updated)

	return nil
}
|
||||
|
||||
// ── Canary Deploy ────────────────────────────────────────────────────────────
|
||||
|
||||
// CanaryDeploy creates a canary instance alongside existing instances and
// routes cfg.CanaryWeight percent of traffic to it.
//
// Algorithm:
//  1. List existing instances
//  2. Create a new canary instance with the new image
//  3. Start the canary and verify health
//  4. Update traffic routing to send CanaryWeight% to canary
//  5. If health fails and AutoRollback: remove canary, restore routing
//
// NOTE(review): step 5's "restore routing" is implemented only as canary
// removal (cleanupCanary); no explicit UpdateTrafficWeight reset happens,
// presumably because the weight is only applied after a healthy canary —
// confirm this matches the routing backend's behavior.
func CanaryDeploy(cfg DeployConfig, exec Executor, hc HealthChecker, hist *HistoryStore, progress ProgressFunc) error {
	if err := cfg.Validate(); err != nil {
		return err
	}

	deployID := generateDeployID()

	status := &DeployStatus{
		ID:         deployID,
		Phase:      PhasePreparing,
		Target:     cfg.Target,
		Strategy:   StrategyCanary,
		NewVersion: cfg.NewImage,
		StartedAt:  time.Now().UTC(),
	}
	setActiveDeployment(status)
	notifyProgress(progress, *status)

	// 1. Discover existing instances.
	instances, err := exec.ListInstances(cfg.Target)
	if err != nil {
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to list instances: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}
	if len(instances) == 0 {
		status.Phase = PhaseFailed
		status.Message = "no instances found matching target"
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// Record old version. A lookup failure simply leaves OldVersion empty.
	if oldImg, err := exec.GetInstanceImage(instances[0].Name); err == nil {
		status.OldVersion = oldImg
	}

	// 2. Create canary instance.
	canaryName := canaryInstanceName(cfg.Target)

	status.Phase = PhaseDeploying
	status.Progress = fmt.Sprintf("creating canary instance %s", canaryName)
	notifyProgress(progress, *status)

	if err := exec.CreateInstance(canaryName, cfg.NewImage); err != nil {
		// NOTE(review): pre-verify failure paths do not stamp CompletedAt,
		// unlike the later ones — confirm whether that is intentional.
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to create canary: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// 3. Start canary and verify health. A start failure always cleans up
	// the canary (even without AutoRollback) since it was just created.
	if err := exec.StartInstance(canaryName); err != nil {
		cleanupCanary(exec, canaryName)
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to start canary: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	status.Phase = PhaseVerifying
	status.Progress = "verifying canary health"
	notifyProgress(progress, *status)

	if err := hc.WaitHealthy(canaryName, cfg.HealthCheck); err != nil {
		if cfg.AutoRollback {
			status.Phase = PhaseRollingBack
			status.Message = fmt.Sprintf("canary health check failed: %v", err)
			notifyProgress(progress, *status)
			cleanupCanary(exec, canaryName)
		}
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("canary health check failed: %v", err)
		status.CompletedAt = time.Now().UTC()
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// 4. Update traffic routing.
	status.Progress = fmt.Sprintf("routing %d%% traffic to canary", cfg.CanaryWeight)
	notifyProgress(progress, *status)

	if err := exec.UpdateTrafficWeight(cfg.Target, canaryName, cfg.CanaryWeight); err != nil {
		if cfg.AutoRollback {
			cleanupCanary(exec, canaryName)
		}
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to update traffic routing: %v", err)
		status.CompletedAt = time.Now().UTC()
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// 5. Canary is live. The canary instance intentionally stays running;
	// full promotion/teardown is a separate operation.
	status.Phase = PhaseComplete
	status.Progress = fmt.Sprintf("canary live with %d%% traffic", cfg.CanaryWeight)
	status.CompletedAt = time.Now().UTC()
	notifyProgress(progress, *status)
	removeActiveDeployment(cfg.Target)
	recordHistory(hist, status, 1)

	return nil
}
|
||||
|
||||
// ── Rollback ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Rollback reverts a target to its previous version using deployment history.
|
||||
func Rollback(target string, exec Executor, hist *HistoryStore, progress ProgressFunc) error {
|
||||
if hist == nil {
|
||||
return fmt.Errorf("deploy rollback: no history store available")
|
||||
}
|
||||
|
||||
entries, err := hist.ListByTarget(target)
|
||||
if err != nil {
|
||||
return fmt.Errorf("deploy rollback: failed to read history: %w", err)
|
||||
}
|
||||
|
||||
// Find the last successful deployment that has a different version.
|
||||
var previousRef string
|
||||
for _, entry := range entries {
|
||||
if entry.Status == string(PhaseComplete) && entry.OldRef != "" {
|
||||
previousRef = entry.OldRef
|
||||
break
|
||||
}
|
||||
}
|
||||
if previousRef == "" {
|
||||
return fmt.Errorf("deploy rollback: no previous version found in history for %q", target)
|
||||
}
|
||||
|
||||
status := &DeployStatus{
|
||||
ID: generateDeployID(),
|
||||
Phase: PhaseRollingBack,
|
||||
Target: target,
|
||||
Strategy: StrategyRolling,
|
||||
NewVersion: previousRef,
|
||||
StartedAt: time.Now().UTC(),
|
||||
Message: "rollback to previous version",
|
||||
}
|
||||
notifyProgress(progress, *status)
|
||||
|
||||
// Perform a rolling deploy with the previous ref.
|
||||
rollbackCfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: target,
|
||||
NewImage: previousRef,
|
||||
MaxSurge: 1,
|
||||
MaxUnavail: 0,
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 5 * time.Minute,
|
||||
AutoRollback: false, // Don't auto-rollback a rollback
|
||||
}
|
||||
|
||||
return RollingDeploy(rollbackCfg, exec, &NoopHealthChecker{}, hist, progress)
|
||||
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// rollbackInstances reverts a list of instances to the old image.
|
||||
func rollbackInstances(exec Executor, names []string, oldImage string) {
|
||||
for _, name := range names {
|
||||
_ = exec.UpdateInstanceImage(name, oldImage)
|
||||
_ = exec.StartInstance(name)
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupCanary stops and removes a canary instance.
|
||||
func cleanupCanary(exec Executor, canaryName string) {
|
||||
_ = exec.StopInstance(canaryName)
|
||||
_ = exec.DeleteInstance(canaryName)
|
||||
}
|
||||
|
||||
// canaryInstanceName generates a canary instance name from the target.
// Trailing digits and hyphens are stripped ("web-app-1" → "web-app"),
// then "-canary" is appended. If stripping consumes the whole string
// (an all-numeric target), the original target is used as the base.
func canaryInstanceName(target string) string {
	trimmed := strings.TrimRight(target, "-0123456789")
	if trimmed == "" {
		trimmed = target
	}
	return trimmed + "-canary"
}
|
||||
|
||||
// generateDeployID creates a unique deployment ID.
|
||||
func generateDeployID() string {
|
||||
return fmt.Sprintf("deploy-%d", time.Now().UnixNano()/int64(time.Millisecond))
|
||||
}
|
||||
|
||||
// notifyProgress safely calls the progress callback if non-nil.
|
||||
func notifyProgress(fn ProgressFunc, status DeployStatus) {
|
||||
if fn != nil {
|
||||
fn(status)
|
||||
}
|
||||
}
|
||||
|
||||
// recordHistory saves a deployment to the history store if available.
|
||||
func recordHistory(hist *HistoryStore, status *DeployStatus, instancesUpdated int) {
|
||||
if hist == nil {
|
||||
return
|
||||
}
|
||||
entry := HistoryEntry{
|
||||
ID: status.ID,
|
||||
Target: status.Target,
|
||||
Strategy: string(status.Strategy),
|
||||
OldRef: status.OldVersion,
|
||||
NewRef: status.NewVersion,
|
||||
Status: string(status.Phase),
|
||||
StartedAt: status.StartedAt,
|
||||
CompletedAt: status.CompletedAt,
|
||||
InstancesUpdated: instancesUpdated,
|
||||
Message: status.Message,
|
||||
}
|
||||
_ = hist.Append(entry)
|
||||
}
|
||||
|
||||
// ── Default executor (real system calls) ─────────────────────────────────────
|
||||
|
||||
// DefaultCASDir is the default directory for CAS storage.
const DefaultCASDir = "/var/lib/volt/cas"

// SystemExecutor implements Executor using real system commands.
// It resolves instances to directories under ContainerBaseDir and shells
// out to systemctl for lifecycle operations.
type SystemExecutor struct {
	ContainerBaseDir string // root directory holding one subdirectory per container
	CASBaseDir       string // root directory of the content-addressed store
}

// NewSystemExecutor creates an executor for real system operations,
// pointed at the standard Volt on-disk layout.
func NewSystemExecutor() *SystemExecutor {
	return &SystemExecutor{
		ContainerBaseDir: "/var/lib/volt/containers",
		CASBaseDir:       DefaultCASDir,
	}
}
|
||||
|
||||
func (e *SystemExecutor) ListInstances(target string) ([]Instance, error) {
|
||||
// Match instances by prefix or exact name.
|
||||
// Scan /var/lib/volt/containers for directories matching the pattern.
|
||||
var instances []Instance
|
||||
|
||||
entries, err := filepath.Glob(filepath.Join(e.ContainerBaseDir, target+"*"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list instances: %w", err)
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
name := filepath.Base(entry)
|
||||
instances = append(instances, Instance{
|
||||
Name: name,
|
||||
Status: "unknown",
|
||||
})
|
||||
}
|
||||
|
||||
// If no glob matches, try exact match.
|
||||
if len(instances) == 0 {
|
||||
exact := filepath.Join(e.ContainerBaseDir, target)
|
||||
if info, err := fileInfo(exact); err == nil && info.IsDir() {
|
||||
instances = append(instances, Instance{
|
||||
Name: target,
|
||||
Status: "unknown",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return instances, nil
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) CreateInstance(name, image string) error {
|
||||
// Create container directory and write unit file.
|
||||
// In a real implementation this would use the backend.Create flow.
|
||||
return fmt.Errorf("SystemExecutor.CreateInstance not yet wired to backend")
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) StartInstance(name string) error {
|
||||
return runSystemctl("start", voltContainerUnit(name))
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) StopInstance(name string) error {
|
||||
return runSystemctl("stop", voltContainerUnit(name))
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) DeleteInstance(name string) error {
|
||||
return fmt.Errorf("SystemExecutor.DeleteInstance not yet wired to backend")
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) GetInstanceImage(name string) (string, error) {
|
||||
// Read the CAS ref from the instance's metadata.
|
||||
// Stored in /var/lib/volt/containers/<name>/.volt-cas-ref
|
||||
refPath := filepath.Join(e.ContainerBaseDir, name, ".volt-cas-ref")
|
||||
data, err := readFile(refPath)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("no CAS ref found for instance %s", name)
|
||||
}
|
||||
return strings.TrimSpace(string(data)), nil
|
||||
}
|
||||
|
||||
// UpdateInstanceImage stops the instance and records the new CAS ref in
// the instance's .volt-cas-ref metadata file.
//
// NOTE(review): this implementation does NOT restart the instance,
// although the Executor interface doc says the update "restarts the
// instance". RollingDeploy compensates by calling StartInstance right
// after — confirm whether the interface doc or this implementation
// should change. The rootfs reassembly via TinyVol mentioned in the
// interface doc is presumably triggered at start time; not visible here.
func (e *SystemExecutor) UpdateInstanceImage(name, newImage string) error {
	// 1. Stop the instance. The error is ignored: the instance may
	// already be stopped, and the ref write below is the real work.
	_ = runSystemctl("stop", voltContainerUnit(name))

	// 2. Write new CAS ref.
	refPath := filepath.Join(e.ContainerBaseDir, name, ".volt-cas-ref")
	if err := writeFile(refPath, []byte(newImage)); err != nil {
		return fmt.Errorf("failed to write CAS ref: %w", err)
	}

	return nil
}
|
||||
|
||||
func (e *SystemExecutor) UpdateTrafficWeight(target, canaryName string, weight int) error {
|
||||
// In a full implementation this would update nftables rules for load balancing.
|
||||
// For now, record the weight in a metadata file.
|
||||
weightPath := filepath.Join(e.ContainerBaseDir, ".traffic-weights")
|
||||
data := fmt.Sprintf("%s:%s:%d\n", target, canaryName, weight)
|
||||
return appendFile(weightPath, []byte(data))
|
||||
}
|
||||
|
||||
// voltContainerUnit returns the systemd template-unit name for a
// container, e.g. "volt-container@web.service" for instance "web".
func voltContainerUnit(name string) string {
	return "volt-container@" + name + ".service"
}
|
||||
899
pkg/deploy/deploy_test.go
Normal file
899
pkg/deploy/deploy_test.go
Normal file
@@ -0,0 +1,899 @@
|
||||
/*
|
||||
Deploy Tests — Verifies rolling, canary, rollback, health check, and history logic.
|
||||
|
||||
Uses a mock executor and health checker so no real system calls are made.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Mock Executor ────────────────────────────────────────────────────────────
|
||||
|
||||
// mockExecutor records all operations for verification.
// All mutable state is guarded by mu; the error-injection maps are
// expected to be populated before the deployment under test starts.
type mockExecutor struct {
	mu sync.Mutex

	instances map[string]*Instance // name → instance
	images    map[string]string    // name → current image

	// Recorded operation log (one formatted string per Executor call).
	ops []string

	// Error injection.
	updateImageErr map[string]error // instance name → error to return
	startErr       map[string]error // instance name → error on StartInstance
	createErr      map[string]error // instance name → error on CreateInstance
	trafficWeights map[string]int   // canaryName → weight
}
|
||||
|
||||
func newMockExecutor(instances ...Instance) *mockExecutor {
|
||||
m := &mockExecutor{
|
||||
instances: make(map[string]*Instance),
|
||||
images: make(map[string]string),
|
||||
updateImageErr: make(map[string]error),
|
||||
startErr: make(map[string]error),
|
||||
createErr: make(map[string]error),
|
||||
trafficWeights: make(map[string]int),
|
||||
}
|
||||
for _, inst := range instances {
|
||||
cpy := inst
|
||||
m.instances[inst.Name] = &cpy
|
||||
m.images[inst.Name] = inst.Image
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func (m *mockExecutor) record(op string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.ops = append(m.ops, op)
|
||||
}
|
||||
|
||||
func (m *mockExecutor) getOps() []string {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
result := make([]string, len(m.ops))
|
||||
copy(result, m.ops)
|
||||
return result
|
||||
}
|
||||
|
||||
func (m *mockExecutor) ListInstances(target string) ([]Instance, error) {
|
||||
m.record(fmt.Sprintf("list:%s", target))
|
||||
var result []Instance
|
||||
for _, inst := range m.instances {
|
||||
if strings.HasPrefix(inst.Name, target) || inst.Name == target {
|
||||
result = append(result, *inst)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) CreateInstance(name, image string) error {
|
||||
m.record(fmt.Sprintf("create:%s:%s", name, image))
|
||||
if err, ok := m.createErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
m.instances[name] = &Instance{Name: name, Image: image, Status: "stopped"}
|
||||
m.images[name] = image
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) StartInstance(name string) error {
|
||||
m.record(fmt.Sprintf("start:%s", name))
|
||||
if err, ok := m.startErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
if inst, ok := m.instances[name]; ok {
|
||||
inst.Status = "running"
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) StopInstance(name string) error {
|
||||
m.record(fmt.Sprintf("stop:%s", name))
|
||||
m.mu.Lock()
|
||||
if inst, ok := m.instances[name]; ok {
|
||||
inst.Status = "stopped"
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) DeleteInstance(name string) error {
|
||||
m.record(fmt.Sprintf("delete:%s", name))
|
||||
m.mu.Lock()
|
||||
delete(m.instances, name)
|
||||
delete(m.images, name)
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) GetInstanceImage(name string) (string, error) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if img, ok := m.images[name]; ok {
|
||||
return img, nil
|
||||
}
|
||||
return "", fmt.Errorf("instance %s not found", name)
|
||||
}
|
||||
|
||||
func (m *mockExecutor) UpdateInstanceImage(name, newImage string) error {
|
||||
m.record(fmt.Sprintf("update-image:%s:%s", name, newImage))
|
||||
if err, ok := m.updateImageErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
m.images[name] = newImage
|
||||
if inst, ok := m.instances[name]; ok {
|
||||
inst.Image = newImage
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) UpdateTrafficWeight(target, canaryName string, weight int) error {
|
||||
m.record(fmt.Sprintf("traffic:%s:%s:%d", target, canaryName, weight))
|
||||
m.mu.Lock()
|
||||
m.trafficWeights[canaryName] = weight
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Mock Health Checker ──────────────────────────────────────────────────────
|
||||
|
||||
// mockHealthChecker returns configurable results per instance.
|
||||
// mockHealthChecker returns configurable results per instance and records
// every WaitHealthy call for later inspection.
type mockHealthChecker struct {
	mu      sync.Mutex       // guards results and calls
	results map[string]error // instance name → error (nil = healthy)
	calls   []string         // instance names passed to WaitHealthy, in call order
}
|
||||
|
||||
func newMockHealthChecker() *mockHealthChecker {
|
||||
return &mockHealthChecker{
|
||||
results: make(map[string]error),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *mockHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
||||
h.mu.Lock()
|
||||
h.calls = append(h.calls, instanceName)
|
||||
err := h.results[instanceName]
|
||||
h.mu.Unlock()
|
||||
return err
|
||||
}
|
||||
|
||||
func (h *mockHealthChecker) getCalls() []string {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
result := make([]string, len(h.calls))
|
||||
copy(result, h.calls)
|
||||
return result
|
||||
}
|
||||
|
||||
// ── Progress Collector ───────────────────────────────────────────────────────
|
||||
|
||||
// progressCollector captures every DeployStatus emitted through the
// ProgressFunc callback so tests can assert on phases and messages.
type progressCollector struct {
	mu      sync.Mutex     // guards updates
	updates []DeployStatus // statuses in arrival order
}
|
||||
|
||||
func newProgressCollector() *progressCollector {
|
||||
return &progressCollector{}
|
||||
}
|
||||
|
||||
func (p *progressCollector) callback() ProgressFunc {
|
||||
return func(status DeployStatus) {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
p.updates = append(p.updates, status)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *progressCollector) getUpdates() []DeployStatus {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
result := make([]DeployStatus, len(p.updates))
|
||||
copy(result, p.updates)
|
||||
return result
|
||||
}
|
||||
|
||||
func (p *progressCollector) phases() []Phase {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
var phases []Phase
|
||||
for _, u := range p.updates {
|
||||
phases = append(phases, u.Phase)
|
||||
}
|
||||
return phases
|
||||
}
|
||||
|
||||
// ── Test: Rolling Deploy Order ───────────────────────────────────────────────
|
||||
|
||||
func TestRollingDeployOrder(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "web-1", Image: "sha256:old1", Status: "running"},
|
||||
Instance{Name: "web-2", Image: "sha256:old1", Status: "running"},
|
||||
Instance{Name: "web-3", Image: "sha256:old1", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
pc := newProgressCollector()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "web",
|
||||
NewImage: "sha256:new1",
|
||||
MaxSurge: 1,
|
||||
MaxUnavail: 0,
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 1 * time.Minute,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
if err != nil {
|
||||
t.Fatalf("RollingDeploy returned error: %v", err)
|
||||
}
|
||||
|
||||
// Verify all instances were updated.
|
||||
ops := exec.getOps()
|
||||
|
||||
// Count update-image operations.
|
||||
updateCount := 0
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "update-image:") {
|
||||
updateCount++
|
||||
// Verify new image is correct.
|
||||
if !strings.HasSuffix(op, ":sha256:new1") {
|
||||
t.Errorf("expected new image sha256:new1, got op: %s", op)
|
||||
}
|
||||
}
|
||||
}
|
||||
if updateCount != 3 {
|
||||
t.Errorf("expected 3 update-image ops, got %d", updateCount)
|
||||
}
|
||||
|
||||
// Verify instances are updated one at a time (each update is followed by start before next update).
|
||||
var updateOrder []string
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "update-image:web-") {
|
||||
name := strings.Split(op, ":")[1]
|
||||
updateOrder = append(updateOrder, name)
|
||||
}
|
||||
}
|
||||
if len(updateOrder) != 3 {
|
||||
t.Errorf("expected 3 instances updated in order, got %d", len(updateOrder))
|
||||
}
|
||||
|
||||
// Verify progress callback was called.
|
||||
phases := pc.phases()
|
||||
if len(phases) == 0 {
|
||||
t.Error("expected progress callbacks, got none")
|
||||
}
|
||||
|
||||
// First should be preparing, last should be complete.
|
||||
if phases[0] != PhasePreparing {
|
||||
t.Errorf("expected first phase to be preparing, got %s", phases[0])
|
||||
}
|
||||
lastPhase := phases[len(phases)-1]
|
||||
if lastPhase != PhaseComplete {
|
||||
t.Errorf("expected last phase to be complete, got %s", lastPhase)
|
||||
}
|
||||
|
||||
// Verify all images are now the new version.
|
||||
for _, name := range []string{"web-1", "web-2", "web-3"} {
|
||||
img, err := exec.GetInstanceImage(name)
|
||||
if err != nil {
|
||||
t.Errorf("GetInstanceImage(%s) error: %v", name, err)
|
||||
continue
|
||||
}
|
||||
if img != "sha256:new1" {
|
||||
t.Errorf("instance %s image = %s, want sha256:new1", name, img)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Canary Weight ──────────────────────────────────────────────────────
|
||||
|
||||
func TestCanaryWeight(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "api-1", Image: "sha256:v1", Status: "running"},
|
||||
Instance{Name: "api-2", Image: "sha256:v1", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
pc := newProgressCollector()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyCanary,
|
||||
Target: "api",
|
||||
NewImage: "sha256:v2",
|
||||
CanaryWeight: 20,
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 1 * time.Minute,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := CanaryDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
if err != nil {
|
||||
t.Fatalf("CanaryDeploy returned error: %v", err)
|
||||
}
|
||||
|
||||
// Verify canary instance was created.
|
||||
ops := exec.getOps()
|
||||
var createOps []string
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "create:") {
|
||||
createOps = append(createOps, op)
|
||||
}
|
||||
}
|
||||
if len(createOps) != 1 {
|
||||
t.Fatalf("expected 1 create op for canary, got %d: %v", len(createOps), createOps)
|
||||
}
|
||||
|
||||
// Verify the canary instance name and image.
|
||||
canaryName := canaryInstanceName("api")
|
||||
expectedCreate := fmt.Sprintf("create:%s:sha256:v2", canaryName)
|
||||
if createOps[0] != expectedCreate {
|
||||
t.Errorf("create op = %q, want %q", createOps[0], expectedCreate)
|
||||
}
|
||||
|
||||
// Verify traffic was routed with the correct weight.
|
||||
var trafficOps []string
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "traffic:") {
|
||||
trafficOps = append(trafficOps, op)
|
||||
}
|
||||
}
|
||||
if len(trafficOps) != 1 {
|
||||
t.Fatalf("expected 1 traffic op, got %d: %v", len(trafficOps), trafficOps)
|
||||
}
|
||||
expectedTraffic := fmt.Sprintf("traffic:api:%s:20", canaryName)
|
||||
if trafficOps[0] != expectedTraffic {
|
||||
t.Errorf("traffic op = %q, want %q", trafficOps[0], expectedTraffic)
|
||||
}
|
||||
|
||||
// Verify the canary weight was recorded.
|
||||
exec.mu.Lock()
|
||||
weight := exec.trafficWeights[canaryName]
|
||||
exec.mu.Unlock()
|
||||
if weight != 20 {
|
||||
t.Errorf("canary traffic weight = %d, want 20", weight)
|
||||
}
|
||||
|
||||
// Verify original instances were not modified.
|
||||
for _, name := range []string{"api-1", "api-2"} {
|
||||
img, _ := exec.GetInstanceImage(name)
|
||||
if img != "sha256:v1" {
|
||||
t.Errorf("original instance %s image changed to %s, should still be sha256:v1", name, img)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify progress shows canary-specific messages.
|
||||
updates := pc.getUpdates()
|
||||
foundCanaryProgress := false
|
||||
for _, u := range updates {
|
||||
if strings.Contains(u.Progress, "canary") || strings.Contains(u.Progress, "traffic") {
|
||||
foundCanaryProgress = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundCanaryProgress {
|
||||
t.Error("expected canary-related progress messages")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Rollback Restores Previous ─────────────────────────────────────────
|
||||
|
||||
func TestRollbackRestoresPrevious(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "app-1", Image: "sha256:v2", Status: "running"},
|
||||
)
|
||||
_ = newMockHealthChecker()
|
||||
pc := newProgressCollector()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
// Seed history with a previous successful deployment.
|
||||
_ = hist.Append(HistoryEntry{
|
||||
ID: "deploy-prev",
|
||||
Target: "app",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:v1",
|
||||
NewRef: "sha256:v2",
|
||||
Status: string(PhaseComplete),
|
||||
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||
CompletedAt: time.Now().Add(-50 * time.Minute),
|
||||
InstancesUpdated: 1,
|
||||
})
|
||||
|
||||
err := Rollback("app", exec, hist, pc.callback())
|
||||
if err != nil {
|
||||
t.Fatalf("Rollback returned error: %v", err)
|
||||
}
|
||||
|
||||
// Verify the instance was updated back to v1.
|
||||
img, err := exec.GetInstanceImage("app-1")
|
||||
if err != nil {
|
||||
t.Fatalf("GetInstanceImage error: %v", err)
|
||||
}
|
||||
if img != "sha256:v1" {
|
||||
t.Errorf("after rollback, instance image = %s, want sha256:v1", img)
|
||||
}
|
||||
|
||||
// Verify rollback was recorded in history.
|
||||
entries, err := hist.ListByTarget("app")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
// Should have the original entry + the rollback entry.
|
||||
if len(entries) < 2 {
|
||||
t.Errorf("expected at least 2 history entries, got %d", len(entries))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Health Check Fail Triggers Rollback ────────────────────────────────
|
||||
|
||||
func TestHealthCheckFailTriggersRollback(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "svc-1", Image: "sha256:old", Status: "running"},
|
||||
Instance{Name: "svc-2", Image: "sha256:old", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
// Make svc-2 fail health check after being updated.
|
||||
// Since instances are iterated from the map, we set both to fail
|
||||
// but we only need to verify that when any fails, rollback happens.
|
||||
hc.results["svc-1"] = nil // svc-1 is healthy
|
||||
hc.results["svc-2"] = fmt.Errorf("connection refused")
|
||||
|
||||
pc := newProgressCollector()
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "svc",
|
||||
NewImage: "sha256:bad",
|
||||
MaxSurge: 1,
|
||||
MaxUnavail: 0,
|
||||
HealthCheck: HealthCheck{Type: "tcp", Port: 8080, Interval: 100 * time.Millisecond, Retries: 1},
|
||||
Timeout: 30 * time.Second,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
|
||||
// Deployment should fail.
|
||||
if err == nil {
|
||||
t.Fatal("expected RollingDeploy to fail due to health check, but got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "health check failed") {
|
||||
t.Errorf("error should mention health check failure, got: %v", err)
|
||||
}
|
||||
|
||||
// Verify rollback phase appeared in progress.
|
||||
phases := pc.phases()
|
||||
foundRollback := false
|
||||
for _, p := range phases {
|
||||
if p == PhaseRollingBack {
|
||||
foundRollback = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundRollback {
|
||||
t.Error("expected rolling-back phase in progress updates")
|
||||
}
|
||||
|
||||
// Verify rollback operations were attempted (update-image back to old).
|
||||
ops := exec.getOps()
|
||||
rollbackOps := 0
|
||||
for _, op := range ops {
|
||||
if strings.Contains(op, "update-image:") && strings.Contains(op, ":sha256:old") {
|
||||
rollbackOps++
|
||||
}
|
||||
}
|
||||
if rollbackOps == 0 {
|
||||
t.Error("expected rollback operations (update-image back to sha256:old), found none")
|
||||
}
|
||||
|
||||
// Verify history records the failure.
|
||||
entries, _ := hist.ListByTarget("svc")
|
||||
if len(entries) == 0 {
|
||||
t.Fatal("expected history entry for failed deployment")
|
||||
}
|
||||
if entries[0].Status != string(PhaseFailed) {
|
||||
t.Errorf("history status = %s, want failed", entries[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Deploy History ─────────────────────────────────────────────────────
|
||||
|
||||
func TestDeployHistory(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
// Write several entries.
|
||||
entries := []HistoryEntry{
|
||||
{
|
||||
ID: "deploy-001",
|
||||
Target: "web-app",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:abc123",
|
||||
NewRef: "sha256:def456",
|
||||
Status: "complete",
|
||||
StartedAt: time.Date(2026, 3, 20, 15, 0, 0, 0, time.UTC),
|
||||
CompletedAt: time.Date(2026, 3, 20, 15, 5, 0, 0, time.UTC),
|
||||
InstancesUpdated: 3,
|
||||
},
|
||||
{
|
||||
ID: "deploy-002",
|
||||
Target: "web-app",
|
||||
Strategy: "canary",
|
||||
OldRef: "sha256:def456",
|
||||
NewRef: "sha256:ghi789",
|
||||
Status: "complete",
|
||||
StartedAt: time.Date(2026, 3, 21, 10, 0, 0, 0, time.UTC),
|
||||
CompletedAt: time.Date(2026, 3, 21, 10, 2, 0, 0, time.UTC),
|
||||
InstancesUpdated: 1,
|
||||
},
|
||||
{
|
||||
ID: "deploy-003",
|
||||
Target: "api-svc",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:111",
|
||||
NewRef: "sha256:222",
|
||||
Status: "failed",
|
||||
StartedAt: time.Date(2026, 3, 22, 8, 0, 0, 0, time.UTC),
|
||||
CompletedAt: time.Date(2026, 3, 22, 8, 1, 0, 0, time.UTC),
|
||||
InstancesUpdated: 0,
|
||||
Message: "health check timeout",
|
||||
},
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
if err := hist.Append(e); err != nil {
|
||||
t.Fatalf("Append error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify target-specific listing.
|
||||
webEntries, err := hist.ListByTarget("web-app")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
if len(webEntries) != 2 {
|
||||
t.Errorf("expected 2 web-app entries, got %d", len(webEntries))
|
||||
}
|
||||
// Most recent first.
|
||||
if len(webEntries) >= 2 && webEntries[0].ID != "deploy-002" {
|
||||
t.Errorf("expected most recent entry first, got %s", webEntries[0].ID)
|
||||
}
|
||||
|
||||
apiEntries, err := hist.ListByTarget("api-svc")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
if len(apiEntries) != 1 {
|
||||
t.Errorf("expected 1 api-svc entry, got %d", len(apiEntries))
|
||||
}
|
||||
if len(apiEntries) == 1 && apiEntries[0].Message != "health check timeout" {
|
||||
t.Errorf("expected message 'health check timeout', got %q", apiEntries[0].Message)
|
||||
}
|
||||
|
||||
// Verify ListAll.
|
||||
all, err := hist.ListAll()
|
||||
if err != nil {
|
||||
t.Fatalf("ListAll error: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Errorf("expected 3 total entries, got %d", len(all))
|
||||
}
|
||||
|
||||
// Verify files were created.
|
||||
files, _ := filepath.Glob(filepath.Join(tmpDir, "*.yaml"))
|
||||
if len(files) != 2 { // web-app.yaml and api-svc.yaml
|
||||
t.Errorf("expected 2 history files, got %d", len(files))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Config Validation ──────────────────────────────────────────────────
|
||||
|
||||
func TestConfigValidation(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
cfg DeployConfig
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "empty target",
|
||||
cfg: DeployConfig{Strategy: StrategyRolling, NewImage: "sha256:abc"},
|
||||
wantErr: "target is required",
|
||||
},
|
||||
{
|
||||
name: "empty image",
|
||||
cfg: DeployConfig{Strategy: StrategyRolling, Target: "web"},
|
||||
wantErr: "new image",
|
||||
},
|
||||
{
|
||||
name: "invalid strategy",
|
||||
cfg: DeployConfig{Strategy: "blue-green", Target: "web", NewImage: "sha256:abc"},
|
||||
wantErr: "unknown strategy",
|
||||
},
|
||||
{
|
||||
name: "canary weight zero",
|
||||
cfg: DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 0},
|
||||
wantErr: "canary weight must be between 1 and 99",
|
||||
},
|
||||
{
|
||||
name: "canary weight 100",
|
||||
cfg: DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 100},
|
||||
wantErr: "canary weight must be between 1 and 99",
|
||||
},
|
||||
{
|
||||
name: "valid rolling",
|
||||
cfg: DeployConfig{Strategy: StrategyRolling, Target: "web", NewImage: "sha256:abc"},
|
||||
},
|
||||
{
|
||||
name: "valid canary",
|
||||
cfg: DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 25},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := tt.cfg.Validate()
|
||||
if tt.wantErr != "" {
|
||||
if err == nil {
|
||||
t.Errorf("expected error containing %q, got nil", tt.wantErr)
|
||||
} else if !strings.Contains(err.Error(), tt.wantErr) {
|
||||
t.Errorf("error %q should contain %q", err.Error(), tt.wantErr)
|
||||
}
|
||||
} else {
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Canary Instance Name ───────────────────────────────────────────────
|
||||
|
||||
func TestCanaryInstanceName(t *testing.T) {
|
||||
tests := []struct {
|
||||
target string
|
||||
want string
|
||||
}{
|
||||
{"web-app", "web-app-canary"},
|
||||
{"api-1", "api-canary"},
|
||||
{"simple", "simple-canary"},
|
||||
{"my-service-", "my-service-canary"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := canaryInstanceName(tt.target)
|
||||
if got != tt.want {
|
||||
t.Errorf("canaryInstanceName(%q) = %q, want %q", tt.target, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: No Instances Found ─────────────────────────────────────────────────
|
||||
|
||||
func TestRollingDeployNoInstances(t *testing.T) {
|
||||
exec := newMockExecutor() // empty
|
||||
hc := newMockHealthChecker()
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "nonexistent",
|
||||
NewImage: "sha256:abc",
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, nil, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for no instances, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "no instances found") {
|
||||
t.Errorf("error should mention no instances, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Active Deployments Tracking ────────────────────────────────────────
|
||||
|
||||
func TestActiveDeployments(t *testing.T) {
|
||||
// Clear any leftover state.
|
||||
activeDeploymentsMu.Lock()
|
||||
activeDeployments = make(map[string]*DeployStatus)
|
||||
activeDeploymentsMu.Unlock()
|
||||
|
||||
// Initially empty.
|
||||
active := GetActiveDeployments()
|
||||
if len(active) != 0 {
|
||||
t.Errorf("expected 0 active deployments, got %d", len(active))
|
||||
}
|
||||
|
||||
// Run a deployment and check it appears during execution.
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "track-1", Image: "sha256:old", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
|
||||
var seenActive bool
|
||||
progressFn := func(status DeployStatus) {
|
||||
if status.Phase == PhaseDeploying || status.Phase == PhaseVerifying {
|
||||
ad := GetActiveDeployment("track")
|
||||
if ad != nil {
|
||||
seenActive = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "track",
|
||||
NewImage: "sha256:new",
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, nil, progressFn)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !seenActive {
|
||||
t.Error("expected to see active deployment during execution")
|
||||
}
|
||||
|
||||
// After completion, should be empty again.
|
||||
active = GetActiveDeployments()
|
||||
if len(active) != 0 {
|
||||
t.Errorf("expected 0 active deployments after completion, got %d", len(active))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: History File Persistence ───────────────────────────────────────────
|
||||
|
||||
func TestHistoryFilePersistence(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
entry := HistoryEntry{
|
||||
ID: "persist-001",
|
||||
Target: "my-app",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:aaa",
|
||||
NewRef: "sha256:bbb",
|
||||
Status: "complete",
|
||||
StartedAt: time.Now().UTC(),
|
||||
CompletedAt: time.Now().UTC(),
|
||||
InstancesUpdated: 2,
|
||||
}
|
||||
if err := hist.Append(entry); err != nil {
|
||||
t.Fatalf("Append error: %v", err)
|
||||
}
|
||||
|
||||
// Verify the file exists on disk.
|
||||
filePath := filepath.Join(tmpDir, "my-app.yaml")
|
||||
if _, err := os.Stat(filePath); err != nil {
|
||||
t.Fatalf("history file not found: %v", err)
|
||||
}
|
||||
|
||||
// Create a new store instance (simulating restart) and verify data.
|
||||
hist2 := NewHistoryStore(tmpDir)
|
||||
entries, err := hist2.ListByTarget("my-app")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
if entries[0].ID != "persist-001" {
|
||||
t.Errorf("entry ID = %s, want persist-001", entries[0].ID)
|
||||
}
|
||||
if entries[0].InstancesUpdated != 2 {
|
||||
t.Errorf("instances_updated = %d, want 2", entries[0].InstancesUpdated)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Noop Health Checker ────────────────────────────────────────────────
|
||||
|
||||
func TestNoopHealthChecker(t *testing.T) {
|
||||
noop := &NoopHealthChecker{}
|
||||
err := noop.WaitHealthy("anything", HealthCheck{Type: "http", Port: 9999})
|
||||
if err != nil {
|
||||
t.Errorf("NoopHealthChecker should always return nil, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Rollback Without History ───────────────────────────────────────────
|
||||
|
||||
func TestRollbackWithoutHistory(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "no-hist-1", Image: "sha256:v2", Status: "running"},
|
||||
)
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
err := Rollback("no-hist", exec, hist, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for rollback without history, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "no previous version") {
|
||||
t.Errorf("error should mention no previous version, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Canary Cleanup on Health Failure ────────────────────────────────────
|
||||
|
||||
func TestCanaryCleanupOnHealthFailure(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "svc-1", Image: "sha256:v1", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
canaryName := canaryInstanceName("svc")
|
||||
hc.results[canaryName] = fmt.Errorf("unhealthy canary")
|
||||
|
||||
pc := newProgressCollector()
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyCanary,
|
||||
Target: "svc",
|
||||
NewImage: "sha256:v2",
|
||||
CanaryWeight: 10,
|
||||
HealthCheck: HealthCheck{Type: "tcp", Port: 8080, Interval: 100 * time.Millisecond, Retries: 1},
|
||||
Timeout: 10 * time.Second,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := CanaryDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
if err == nil {
|
||||
t.Fatal("expected canary to fail, got nil")
|
||||
}
|
||||
|
||||
// Verify canary was cleaned up (stop + delete).
|
||||
ops := exec.getOps()
|
||||
foundStop := false
|
||||
foundDelete := false
|
||||
for _, op := range ops {
|
||||
if op == fmt.Sprintf("stop:%s", canaryName) {
|
||||
foundStop = true
|
||||
}
|
||||
if op == fmt.Sprintf("delete:%s", canaryName) {
|
||||
foundDelete = true
|
||||
}
|
||||
}
|
||||
if !foundStop {
|
||||
t.Error("expected canary stop operation during cleanup")
|
||||
}
|
||||
if !foundDelete {
|
||||
t.Error("expected canary delete operation during cleanup")
|
||||
}
|
||||
|
||||
// Verify original instance was not modified.
|
||||
img, _ := exec.GetInstanceImage("svc-1")
|
||||
if img != "sha256:v1" {
|
||||
t.Errorf("original instance image changed to %s during failed canary", img)
|
||||
}
|
||||
}
|
||||
143
pkg/deploy/health.go
Normal file
143
pkg/deploy/health.go
Normal file
@@ -0,0 +1,143 @@
|
||||
/*
|
||||
Health — Health check implementations for deployment verification.
|
||||
|
||||
Supports HTTP, TCP, exec, and no-op health checks. Each check type
|
||||
retries according to the configured interval and retry count.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Health Check Config ──────────────────────────────────────────────────────
|
||||
|
||||
// HealthCheck defines how to verify that an instance is healthy after deploy.
|
||||
// HealthCheck defines how to verify that an instance is healthy after deploy.
// Interval doubles as the per-attempt timeout for HTTP/TCP checks and as the
// pause between retries; Retries bounds the number of attempts.
type HealthCheck struct {
	Type     string        `json:"type" yaml:"type"`         // "http", "tcp", "exec", "none"
	Path     string        `json:"path" yaml:"path"`         // HTTP path (e.g., "/healthz"); http type only
	Port     int           `json:"port" yaml:"port"`         // Port to check; http and tcp types
	Command  string        `json:"command" yaml:"command"`   // Shell command; exec type only
	Interval time.Duration `json:"interval" yaml:"interval"` // Time between retries (also per-attempt timeout)
	Retries  int           `json:"retries" yaml:"retries"`   // Max retry count
}
|
||||
|
||||
// ── Health Checker Interface ─────────────────────────────────────────────────
|
||||
|
||||
// HealthChecker verifies instance health during deployments.
|
||||
// HealthChecker verifies instance health during deployments. Implementations
// in this package: DefaultHealthChecker (real HTTP/TCP/exec probes) and
// NoopHealthChecker (always healthy).
type HealthChecker interface {
	// WaitHealthy blocks until the instance is healthy or all retries are
	// exhausted, returning a non-nil error in the latter case.
	WaitHealthy(instanceName string, check HealthCheck) error
}
|
||||
|
||||
// ── Default Health Checker ───────────────────────────────────────────────────
|
||||
|
||||
// DefaultHealthChecker implements HealthChecker using real HTTP/TCP/exec calls.
|
||||
// DefaultHealthChecker implements HealthChecker using real HTTP/TCP/exec calls.
// The zero value is usable: without a resolver, checks probe 127.0.0.1.
type DefaultHealthChecker struct {
	// InstanceIPResolver resolves an instance name to an IP address.
	// If nil (or if it returns an error), "127.0.0.1" is used.
	InstanceIPResolver func(name string) (string, error)
}
|
||||
|
||||
// WaitHealthy performs health checks with retries.
|
||||
func (d *DefaultHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
||||
switch check.Type {
|
||||
case "none", "":
|
||||
return nil
|
||||
case "http":
|
||||
return d.waitHTTP(instanceName, check)
|
||||
case "tcp":
|
||||
return d.waitTCP(instanceName, check)
|
||||
case "exec":
|
||||
return d.waitExec(instanceName, check)
|
||||
default:
|
||||
return fmt.Errorf("unknown health check type: %q", check.Type)
|
||||
}
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) resolveIP(instanceName string) string {
|
||||
if d.InstanceIPResolver != nil {
|
||||
ip, err := d.InstanceIPResolver(instanceName)
|
||||
if err == nil {
|
||||
return ip
|
||||
}
|
||||
}
|
||||
return "127.0.0.1"
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) waitHTTP(instanceName string, check HealthCheck) error {
|
||||
ip := d.resolveIP(instanceName)
|
||||
url := fmt.Sprintf("http://%s:%d%s", ip, check.Port, check.Path)
|
||||
|
||||
client := &http.Client{Timeout: check.Interval}
|
||||
|
||||
var lastErr error
|
||||
for i := 0; i < check.Retries; i++ {
|
||||
resp, err := client.Get(url)
|
||||
if err == nil {
|
||||
resp.Body.Close()
|
||||
if resp.StatusCode >= 200 && resp.StatusCode < 400 {
|
||||
return nil
|
||||
}
|
||||
lastErr = fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
|
||||
} else {
|
||||
lastErr = err
|
||||
}
|
||||
if i < check.Retries-1 {
|
||||
time.Sleep(check.Interval)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("health check failed after %d retries: %w", check.Retries, lastErr)
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) waitTCP(instanceName string, check HealthCheck) error {
|
||||
ip := d.resolveIP(instanceName)
|
||||
addr := fmt.Sprintf("%s:%d", ip, check.Port)
|
||||
|
||||
var lastErr error
|
||||
for i := 0; i < check.Retries; i++ {
|
||||
conn, err := net.DialTimeout("tcp", addr, check.Interval)
|
||||
if err == nil {
|
||||
conn.Close()
|
||||
return nil
|
||||
}
|
||||
lastErr = err
|
||||
if i < check.Retries-1 {
|
||||
time.Sleep(check.Interval)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("TCP health check failed after %d retries: %w", check.Retries, lastErr)
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) waitExec(instanceName string, check HealthCheck) error {
|
||||
var lastErr error
|
||||
for i := 0; i < check.Retries; i++ {
|
||||
cmd := exec.Command("sh", "-c", check.Command)
|
||||
if err := cmd.Run(); err == nil {
|
||||
return nil
|
||||
} else {
|
||||
lastErr = err
|
||||
}
|
||||
if i < check.Retries-1 {
|
||||
time.Sleep(check.Interval)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("exec health check failed after %d retries: %w", check.Retries, lastErr)
|
||||
}
|
||||
|
||||
// ── Noop Health Checker ──────────────────────────────────────────────────────
|
||||
|
||||
// NoopHealthChecker always returns healthy. Used for rollbacks and when
// health checking is disabled; the zero value is ready to use.
type NoopHealthChecker struct{}
|
||||
|
||||
// WaitHealthy always succeeds immediately.
|
||||
func (n *NoopHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
||||
return nil
|
||||
}
|
||||
186
pkg/deploy/history.go
Normal file
186
pkg/deploy/history.go
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
History — Persistent deployment history for Volt.
|
||||
|
||||
Stores deployment records as YAML in /var/lib/volt/deployments/.
|
||||
Each target gets its own history file to keep lookups fast.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// ── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
const (
	// DefaultHistoryDir is where deployment history files are stored when
	// NewHistoryStore is given an empty directory.
	DefaultHistoryDir = "/var/lib/volt/deployments"
)
|
||||
|
||||
// ── History Entry ────────────────────────────────────────────────────────────
|
||||
|
||||
// HistoryEntry records a single deployment operation.
|
||||
// HistoryEntry records a single deployment operation as persisted in the
// per-target YAML history file.
type HistoryEntry struct {
	ID               string    `yaml:"id" json:"id"`                             // unique deployment identifier
	Target           string    `yaml:"target" json:"target"`                     // workload the deployment applied to
	Strategy         string    `yaml:"strategy" json:"strategy"`                 // "rolling" or "canary"
	OldRef           string    `yaml:"old_ref" json:"old_ref"`                   // CAS ref before the deployment
	NewRef           string    `yaml:"new_ref" json:"new_ref"`                   // CAS ref after the deployment
	Status           string    `yaml:"status" json:"status"`                     // "complete", "failed", "rolling-back"
	StartedAt        time.Time `yaml:"started_at" json:"started_at"`             // when the deployment began
	CompletedAt      time.Time `yaml:"completed_at" json:"completed_at"`         // when it finished (success or failure)
	InstancesUpdated int       `yaml:"instances_updated" json:"instances_updated"` // number of instances repointed
	Message          string    `yaml:"message,omitempty" json:"message,omitempty"` // optional failure/diagnostic message
}
|
||||
|
||||
// ── History Store ────────────────────────────────────────────────────────────
|
||||
|
||||
// HistoryStore manages deployment history on disk: one YAML file per target
// under dir. All reads and writes are serialized through mu, so a single
// HistoryStore is safe for concurrent use. Must not be copied after first
// use (contains a sync.Mutex); always pass *HistoryStore.
type HistoryStore struct {
	dir string     // directory holding the per-target history files
	mu  sync.Mutex // guards every file read/write performed by the methods below
}
|
||||
|
||||
// NewHistoryStore creates a history store at the given directory.
|
||||
func NewHistoryStore(dir string) *HistoryStore {
|
||||
if dir == "" {
|
||||
dir = DefaultHistoryDir
|
||||
}
|
||||
return &HistoryStore{dir: dir}
|
||||
}
|
||||
|
||||
// Dir returns the history directory path.
|
||||
func (h *HistoryStore) Dir() string {
|
||||
return h.dir
|
||||
}
|
||||
|
||||
// historyFile returns the path to the history file for a target.
|
||||
func (h *HistoryStore) historyFile(target string) string {
|
||||
// Sanitize the target name for use as a filename.
|
||||
safe := strings.Map(func(r rune) rune {
|
||||
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') ||
|
||||
(r >= '0' && r <= '9') || r == '-' || r == '_' {
|
||||
return r
|
||||
}
|
||||
return '_'
|
||||
}, target)
|
||||
return filepath.Join(h.dir, safe+".yaml")
|
||||
}
|
||||
|
||||
// Append adds a deployment entry to the target's history file.
|
||||
func (h *HistoryStore) Append(entry HistoryEntry) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
if err := os.MkdirAll(h.dir, 0755); err != nil {
|
||||
return fmt.Errorf("history: create dir: %w", err)
|
||||
}
|
||||
|
||||
// Load existing entries.
|
||||
entries, _ := h.readEntries(entry.Target) // ignore error on first write
|
||||
|
||||
// Append and write.
|
||||
entries = append(entries, entry)
|
||||
|
||||
return h.writeEntries(entry.Target, entries)
|
||||
}
|
||||
|
||||
// ListByTarget returns all deployment history for a target, most recent first.
|
||||
func (h *HistoryStore) ListByTarget(target string) ([]HistoryEntry, error) {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
entries, err := h.readEntries(target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Sort by StartedAt descending (most recent first).
|
||||
sort.Slice(entries, func(i, j int) bool {
|
||||
return entries[i].StartedAt.After(entries[j].StartedAt)
|
||||
})
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// ListAll returns all deployment history across all targets, most recent first.
|
||||
func (h *HistoryStore) ListAll() ([]HistoryEntry, error) {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(h.dir, "*.yaml"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("history: glob: %w", err)
|
||||
}
|
||||
|
||||
var all []HistoryEntry
|
||||
for _, f := range files {
|
||||
data, err := os.ReadFile(f)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var entries []HistoryEntry
|
||||
if err := yaml.Unmarshal(data, &entries); err != nil {
|
||||
continue
|
||||
}
|
||||
all = append(all, entries...)
|
||||
}
|
||||
|
||||
sort.Slice(all, func(i, j int) bool {
|
||||
return all[i].StartedAt.After(all[j].StartedAt)
|
||||
})
|
||||
|
||||
return all, nil
|
||||
}
|
||||
|
||||
// readEntries loads entries from the history file for a target.
|
||||
// Returns empty slice (not error) if file doesn't exist.
|
||||
func (h *HistoryStore) readEntries(target string) ([]HistoryEntry, error) {
|
||||
filePath := h.historyFile(target)
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("history: read %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
var entries []HistoryEntry
|
||||
if err := yaml.Unmarshal(data, &entries); err != nil {
|
||||
return nil, fmt.Errorf("history: parse %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// writeEntries writes entries to the history file for a target.
|
||||
func (h *HistoryStore) writeEntries(target string, entries []HistoryEntry) error {
|
||||
filePath := h.historyFile(target)
|
||||
|
||||
data, err := yaml.Marshal(entries)
|
||||
if err != nil {
|
||||
return fmt.Errorf("history: marshal: %w", err)
|
||||
}
|
||||
|
||||
// Atomic write: tmp + rename.
|
||||
tmpPath := filePath + ".tmp"
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return fmt.Errorf("history: write %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, filePath); err != nil {
|
||||
os.Remove(tmpPath)
|
||||
return fmt.Errorf("history: rename %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
46
pkg/deploy/io.go
Normal file
46
pkg/deploy/io.go
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
IO helpers — Thin wrappers for filesystem and system operations.
|
||||
|
||||
Isolated here so tests can verify logic without needing OS-level mocks.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
	"fmt"
	"os"
	"os/exec"
	"strings"
)
|
||||
|
||||
// readFile returns the contents of the file at path. It is a thin wrapper
// over os.ReadFile, kept as a seam so deployment logic can be exercised in
// tests without OS-level mocks.
func readFile(path string) ([]byte, error) {
	data, err := os.ReadFile(path)
	return data, err
}
|
||||
|
||||
// writeFile atomically replaces the file at path with data: the bytes are
// first written to a temporary sibling file (mode 0644) which is then
// renamed into place, so readers never observe a partially written file and
// a crash mid-write leaves any previous file intact.
//
// NOTE(review): the previous implementation wrapped os.WriteFile directly
// while its comment claimed atomicity — os.WriteFile is NOT atomic. The
// tmp+rename pattern here makes the code match its documented contract.
func writeFile(path string, data []byte) error {
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, data, 0644); err != nil {
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		os.Remove(tmp) // best effort: don't leave a stray temp file behind
		return err
	}
	return nil
}
|
||||
|
||||
// appendFile appends data to the file at path, creating it with mode 0644 if
// it does not exist yet. Close errors are intentionally discarded (the write
// error is what callers care about).
func appendFile(path string, data []byte) error {
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer f.Close()

	if _, err := f.Write(data); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
// fileInfo stats path and returns the resulting os.FileInfo. Thin wrapper
// over os.Stat (follows symlinks), isolated here as a test seam.
func fileInfo(path string) (os.FileInfo, error) {
	info, err := os.Stat(path)
	return info, err
}
|
||||
|
||||
// runSystemctl runs `systemctl <action> <unit>` and returns an error if the
// command fails. The command's combined stdout/stderr is folded into the
// returned error so callers see *why* systemctl failed, not just the bare
// exit status (the previous version captured the output and discarded it).
func runSystemctl(action, unit string) error {
	out, err := exec.Command("systemctl", action, unit).CombinedOutput()
	if err != nil {
		if msg := strings.TrimSpace(string(out)); msg != "" {
			return fmt.Errorf("systemctl %s %s: %w: %s", action, unit, err, msg)
		}
		return fmt.Errorf("systemctl %s %s: %w", action, unit, err)
	}
	return nil
}
|
||||
Reference in New Issue
Block a user