Volt CLI: source-available under AGPSL v5.0

Complete infrastructure platform CLI:
- Container runtime (systemd-nspawn)
- VoltVisor VMs (Neutron Stardust / QEMU)
- Stellarium CAS (content-addressed storage)
- ORAS Registry
- GitOps integration
- Landlock LSM security
- Compose orchestration
- Mesh networking

Copyright (c) Armored Gates LLC. All rights reserved.
Licensed under AGPSL v5.0
This commit is contained in:
Karl Clinger
2026-03-21 00:30:23 -05:00
commit 81ad0b597c
106 changed files with 35984 additions and 0 deletions

733
pkg/deploy/deploy.go Normal file
View File

@@ -0,0 +1,733 @@
/*
Deploy — Rolling and canary deployment strategies for Volt workloads.
Coordinates zero-downtime updates for containers and workloads by
orchestrating instance creation, health verification, traffic shifting,
and automatic rollback on failure.
Since Volt uses CAS (content-addressed storage) for rootfs assembly,
"updating" a workload means pointing it to a new CAS ref and having
TinyVol reassemble the directory tree from the new blob manifest.
Strategies:
rolling — Update instances one-by-one (respecting MaxSurge/MaxUnavail)
canary — Route a percentage of traffic to a new instance before full rollout
Copyright (c) Armored Gates LLC. All rights reserved.
*/
package deploy
import (
"fmt"
"path/filepath"
"strings"
"sync"
"time"
)
// ── Strategy ─────────────────────────────────────────────────────────────────
// Strategy defines the deployment approach.
type Strategy string
const (
	// StrategyRolling updates instances one-by-one with health verification.
	StrategyRolling Strategy = "rolling"
	// StrategyCanary routes a percentage of traffic to a new instance first.
	// The canary remains live after a successful deploy; full promotion is a
	// separate operation.
	StrategyCanary Strategy = "canary"
)
// ── Configuration ────────────────────────────────────────────────────────────
// DeployConfig holds all parameters for a deployment operation.
// Call Validate before use: it rejects unusable configs and fills in
// defaults for MaxSurge, Timeout, and the HealthCheck fields.
type DeployConfig struct {
	Strategy     Strategy      // Deployment strategy ("rolling" or "canary")
	Target       string        // Container/workload name or pattern (required)
	NewImage     string        // New CAS ref or image path to deploy (required)
	MaxSurge     int           // Max extra instances during rolling (default: 1)
	MaxUnavail   int           // Max unavailable during rolling (default: 0)
	CanaryWeight int           // Canary traffic percentage (1-99, canary only)
	HealthCheck  HealthCheck   // How to verify new instance is healthy
	Timeout      time.Duration // Max time for the entire deployment (default: 10m)
	AutoRollback bool          // Roll back already-updated instances on failure
}
// Validate verifies the configuration and applies defaults in place.
//
// It returns an error when Target or NewImage is missing, when the
// strategy is unrecognized, or when a canary weight falls outside 1-99.
// On success it guarantees MaxSurge >= 1, MaxUnavail >= 0, a positive
// Timeout, and a fully-defaulted HealthCheck.
func (c *DeployConfig) Validate() error {
	switch {
	case c.Target == "":
		return fmt.Errorf("deploy: target is required")
	case c.NewImage == "":
		return fmt.Errorf("deploy: new image (CAS ref) is required")
	}
	switch c.Strategy {
	case StrategyRolling:
		// Rolling defaults: at least one surge instance, never negative
		// unavailability.
		if c.MaxSurge <= 0 {
			c.MaxSurge = 1
		}
		if c.MaxUnavail < 0 {
			c.MaxUnavail = 0
		}
	case StrategyCanary:
		if c.CanaryWeight <= 0 || c.CanaryWeight >= 100 {
			return fmt.Errorf("deploy: canary weight must be between 1 and 99, got %d", c.CanaryWeight)
		}
	default:
		return fmt.Errorf("deploy: unknown strategy %q (use 'rolling' or 'canary')", c.Strategy)
	}
	// Strategy-independent defaults.
	if c.Timeout <= 0 {
		c.Timeout = 10 * time.Minute
	}
	if c.HealthCheck.Type == "" {
		c.HealthCheck.Type = "none"
	}
	if c.HealthCheck.Interval <= 0 {
		c.HealthCheck.Interval = 5 * time.Second
	}
	if c.HealthCheck.Retries <= 0 {
		c.HealthCheck.Retries = 3
	}
	return nil
}
// ── Deploy Status ────────────────────────────────────────────────────────────
// Phase represents the current phase of a deployment.
type Phase string
const (
	PhasePreparing   Phase = "preparing"    // validating config, discovering instances
	PhaseDeploying   Phase = "deploying"    // updating/creating instances
	PhaseVerifying   Phase = "verifying"    // waiting on health checks
	PhaseComplete    Phase = "complete"     // terminal: success
	PhaseRollingBack Phase = "rolling-back" // reverting already-updated instances
	PhaseFailed      Phase = "failed"       // terminal: failure
	PhasePaused      Phase = "paused"       // NOTE(review): never set in this file — confirm a caller uses it
)
// DeployStatus tracks the progress of an active deployment.
// Values are copied when handed out (progress callbacks, snapshot getters),
// so consumers never share the mutable instance the deploy loop updates.
type DeployStatus struct {
	ID          string    `json:"id" yaml:"id"`
	Phase       Phase     `json:"phase" yaml:"phase"`
	Progress    string    `json:"progress" yaml:"progress"`       // e.g. "2/5 instances updated"
	OldVersion  string    `json:"old_version" yaml:"old_version"` // previous CAS ref
	NewVersion  string    `json:"new_version" yaml:"new_version"` // target CAS ref
	Target      string    `json:"target" yaml:"target"`
	Strategy    Strategy  `json:"strategy" yaml:"strategy"`
	StartedAt   time.Time `json:"started_at" yaml:"started_at"`
	CompletedAt time.Time `json:"completed_at,omitempty" yaml:"completed_at,omitempty"`
	Message     string    `json:"message,omitempty" yaml:"message,omitempty"`
}
// ── Instance abstraction ─────────────────────────────────────────────────────
// Instance represents a single running workload instance that can be deployed to.
type Instance struct {
	Name    string // Instance name (e.g., "web-app-1")
	Image   string // Current CAS ref or image
	Status  string // "running", "stopped", etc.
	Healthy bool   // Last known health state
}
// ── Executor interface ───────────────────────────────────────────────────────
// Executor abstracts the system operations needed for deployments.
// This allows testing without real systemd/nspawn/nftables calls.
type Executor interface {
	// ListInstances returns all instances matching the target pattern.
	ListInstances(target string) ([]Instance, error)
	// CreateInstance creates a new instance with the given image.
	CreateInstance(name, image string) error
	// StartInstance starts a stopped instance.
	StartInstance(name string) error
	// StopInstance stops a running instance.
	StopInstance(name string) error
	// DeleteInstance removes an instance entirely.
	DeleteInstance(name string) error
	// GetInstanceImage returns the current image/CAS ref for an instance.
	GetInstanceImage(name string) (string, error)
	// UpdateInstanceImage updates an instance to use a new image (CAS ref).
	// NOTE(review): the original doc claimed this also restarts the instance,
	// but both in-file implementations (SystemExecutor and the test mock)
	// leave the instance stopped, and RollingDeploy calls StartInstance
	// explicitly afterwards — confirm the intended contract before relying
	// on a restart here.
	UpdateInstanceImage(name, newImage string) error
	// UpdateTrafficWeight adjusts traffic routing for canary deployments.
	// weight is 0-100 representing percentage to the canary instance.
	UpdateTrafficWeight(target string, canaryName string, weight int) error
}
// ── Active deployments tracking ──────────────────────────────────────────────
// activeDeployments maps target name → in-flight deployment status.
// Guarded by activeDeploymentsMu; only one deployment per target is tracked.
var (
	activeDeployments   = make(map[string]*DeployStatus)
	activeDeploymentsMu sync.RWMutex
)
// GetActiveDeployments returns a snapshot of all active deployments.
// The returned statuses are copies and safe to use without further locking.
func GetActiveDeployments() []DeployStatus {
	activeDeploymentsMu.RLock()
	defer activeDeploymentsMu.RUnlock()
	snapshot := make([]DeployStatus, 0, len(activeDeployments))
	for _, deployment := range activeDeployments {
		snapshot = append(snapshot, *deployment)
	}
	return snapshot
}
// GetActiveDeployment returns a copy of the active deployment for a target,
// or nil when no deployment is in flight for that target.
func GetActiveDeployment(target string) *DeployStatus {
	activeDeploymentsMu.RLock()
	defer activeDeploymentsMu.RUnlock()
	ds, ok := activeDeployments[target]
	if !ok {
		return nil
	}
	snapshot := *ds
	return &snapshot
}
// setActiveDeployment registers (or replaces) the in-flight deployment for
// ds.Target in the package-level tracking table.
func setActiveDeployment(ds *DeployStatus) {
	activeDeploymentsMu.Lock()
	defer activeDeploymentsMu.Unlock()
	activeDeployments[ds.Target] = ds
}
// removeActiveDeployment drops the tracking entry for target; a no-op when
// no deployment is registered.
func removeActiveDeployment(target string) {
	activeDeploymentsMu.Lock()
	defer activeDeploymentsMu.Unlock()
	delete(activeDeployments, target)
}
// ── Progress callback ────────────────────────────────────────────────────────
// ProgressFunc is called with status updates during deployment.
// It receives a copy of the status, so callbacks may retain it freely.
type ProgressFunc func(status DeployStatus)
// ── Rolling Deploy ───────────────────────────────────────────────────────────
// RollingDeploy performs a rolling update of instances matching cfg.Target.
//
// Algorithm:
//  1. List all instances matching the target pattern.
//  2. For each instance, one at a time:
//     a. Update the instance image to the new CAS ref (reassemble rootfs via TinyVol)
//     b. Start/restart the instance
//     c. Wait for the health check to pass
//     d. On failure with AutoRollback: revert already-updated instances
//  3. Record the deployment in history.
//
// The deployment is registered in the active-deployments table for the
// duration of the call and reported through the optional progress callback.
// All failure paths now stamp CompletedAt and carry the "deploy:" error
// prefix (the original omitted both on some paths, leaving history entries
// with zero completion times).
func RollingDeploy(cfg DeployConfig, exec Executor, hc HealthChecker, hist *HistoryStore, progress ProgressFunc) error {
	if err := cfg.Validate(); err != nil {
		return err
	}
	status := &DeployStatus{
		ID:         generateDeployID(),
		Phase:      PhasePreparing,
		Target:     cfg.Target,
		Strategy:   StrategyRolling,
		NewVersion: cfg.NewImage,
		StartedAt:  time.Now().UTC(),
	}
	setActiveDeployment(status)
	notifyProgress(progress, *status)

	updated := 0

	// fail finalizes the status, records history, and returns the error.
	// Every failure path funnels through here so each history entry gets a
	// completion timestamp and a consistently prefixed error.
	fail := func(msg string) error {
		status.Phase = PhaseFailed
		status.Message = msg
		status.CompletedAt = time.Now().UTC()
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, updated)
		return fmt.Errorf("deploy: %s", msg)
	}
	// rollback announces the rolling-back phase and reverts the given
	// instances (best effort) to the previously recorded version.
	rollback := func(msg string, targets []string) {
		status.Phase = PhaseRollingBack
		status.Message = msg
		notifyProgress(progress, *status)
		rollbackInstances(exec, targets, status.OldVersion)
	}

	// 1. Discover instances.
	instances, err := exec.ListInstances(cfg.Target)
	if err != nil {
		return fail(fmt.Sprintf("failed to list instances: %v", err))
	}
	if len(instances) == 0 {
		return fail("no instances found matching target")
	}
	// Record the old version from the first instance (best effort; the
	// original also ignored this error). Note: the original re-checked
	// len(instances) > 0 here, which is unreachable after the early return.
	oldImg, _ := exec.GetInstanceImage(instances[0].Name)
	status.OldVersion = oldImg

	total := len(instances)
	var rollbackTargets []string // instances already carrying the new image

	status.Phase = PhaseDeploying
	status.Progress = fmt.Sprintf("0/%d instances updated", total)
	notifyProgress(progress, *status)

	// Timeout enforcement: checked once per instance before its update.
	deadline := time.Now().Add(cfg.Timeout)

	// 2. Rolling update loop — one instance at a time.
	for i, inst := range instances {
		if time.Now().After(deadline) {
			msg := fmt.Sprintf("deployment timed out after %s", cfg.Timeout)
			if cfg.AutoRollback && len(rollbackTargets) > 0 {
				rollback(msg, rollbackTargets)
			}
			return fail(msg)
		}
		// In single-node mode, surge is handled by updating in-place, so
		// MaxSurge has no direct effect here.
		_ = cfg.MaxSurge
		status.Progress = fmt.Sprintf("%d/%d instances updated (updating %s)", i, total, inst.Name)
		notifyProgress(progress, *status)
		// a. Point the instance at the new CAS ref.
		if err := exec.UpdateInstanceImage(inst.Name, cfg.NewImage); err != nil {
			msg := fmt.Sprintf("failed to update instance %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				// This instance was not modified; revert only the earlier ones.
				rollback(msg, rollbackTargets)
			}
			return fail(msg)
		}
		// b. Start the instance.
		if err := exec.StartInstance(inst.Name); err != nil {
			msg := fmt.Sprintf("failed to start instance %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				// This instance already carries the new image — include it.
				rollbackTargets = append(rollbackTargets, inst.Name)
				rollback(msg, rollbackTargets)
			}
			return fail(msg)
		}
		// c. Verify health before moving to the next instance.
		status.Phase = PhaseVerifying
		notifyProgress(progress, *status)
		if err := hc.WaitHealthy(inst.Name, cfg.HealthCheck); err != nil {
			msg := fmt.Sprintf("health check failed for %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				rollbackTargets = append(rollbackTargets, inst.Name)
				rollback(msg, rollbackTargets)
			}
			return fail(msg)
		}
		rollbackTargets = append(rollbackTargets, inst.Name)
		updated++
		status.Phase = PhaseDeploying
		status.Progress = fmt.Sprintf("%d/%d instances updated", updated, total)
		notifyProgress(progress, *status)
	}

	// 3. Complete.
	status.Phase = PhaseComplete
	status.Progress = fmt.Sprintf("%d/%d instances updated", updated, total)
	status.CompletedAt = time.Now().UTC()
	notifyProgress(progress, *status)
	removeActiveDeployment(cfg.Target)
	recordHistory(hist, status, updated)
	return nil
}
// ── Canary Deploy ────────────────────────────────────────────────────────────
// CanaryDeploy creates a canary instance alongside existing instances and
// routes cfg.CanaryWeight percent of traffic to it.
//
// Algorithm:
//  1. List existing instances
//  2. Create a new canary instance with the new image
//  3. Start the canary and verify health
//  4. Update traffic routing to send CanaryWeight% to canary
//  5. If health fails and AutoRollback: remove canary, restore routing
//
// All failure paths now stamp CompletedAt before recording history (the
// original omitted it on the list/create/start failures, producing history
// entries with zero completion times).
func CanaryDeploy(cfg DeployConfig, exec Executor, hc HealthChecker, hist *HistoryStore, progress ProgressFunc) error {
	if err := cfg.Validate(); err != nil {
		return err
	}
	status := &DeployStatus{
		ID:         generateDeployID(),
		Phase:      PhasePreparing,
		Target:     cfg.Target,
		Strategy:   StrategyCanary,
		NewVersion: cfg.NewImage,
		StartedAt:  time.Now().UTC(),
	}
	setActiveDeployment(status)
	notifyProgress(progress, *status)

	// fail finalizes the status, records a zero-instance history entry, and
	// returns the prefixed error. All failure paths funnel through here.
	fail := func(msg string) error {
		status.Phase = PhaseFailed
		status.Message = msg
		status.CompletedAt = time.Now().UTC()
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", msg)
	}

	// 1. Discover existing instances.
	instances, err := exec.ListInstances(cfg.Target)
	if err != nil {
		return fail(fmt.Sprintf("failed to list instances: %v", err))
	}
	if len(instances) == 0 {
		return fail("no instances found matching target")
	}
	// Record old version (best effort).
	if oldImg, err := exec.GetInstanceImage(instances[0].Name); err == nil {
		status.OldVersion = oldImg
	}
	// 2. Create canary instance.
	canaryName := canaryInstanceName(cfg.Target)
	status.Phase = PhaseDeploying
	status.Progress = fmt.Sprintf("creating canary instance %s", canaryName)
	notifyProgress(progress, *status)
	if err := exec.CreateInstance(canaryName, cfg.NewImage); err != nil {
		return fail(fmt.Sprintf("failed to create canary: %v", err))
	}
	// 3. Start canary and verify health.
	if err := exec.StartInstance(canaryName); err != nil {
		// The canary exists but never started; remove it unconditionally.
		cleanupCanary(exec, canaryName)
		return fail(fmt.Sprintf("failed to start canary: %v", err))
	}
	status.Phase = PhaseVerifying
	status.Progress = "verifying canary health"
	notifyProgress(progress, *status)
	if err := hc.WaitHealthy(canaryName, cfg.HealthCheck); err != nil {
		msg := fmt.Sprintf("canary health check failed: %v", err)
		if cfg.AutoRollback {
			status.Phase = PhaseRollingBack
			status.Message = msg
			notifyProgress(progress, *status)
			cleanupCanary(exec, canaryName)
		}
		// Without AutoRollback the unhealthy canary is left in place for
		// inspection (matching the original behavior).
		return fail(msg)
	}
	// 4. Update traffic routing.
	status.Progress = fmt.Sprintf("routing %d%% traffic to canary", cfg.CanaryWeight)
	notifyProgress(progress, *status)
	if err := exec.UpdateTrafficWeight(cfg.Target, canaryName, cfg.CanaryWeight); err != nil {
		if cfg.AutoRollback {
			cleanupCanary(exec, canaryName)
		}
		return fail(fmt.Sprintf("failed to update traffic routing: %v", err))
	}
	// 5. Canary is live.
	status.Phase = PhaseComplete
	status.Progress = fmt.Sprintf("canary live with %d%% traffic", cfg.CanaryWeight)
	status.CompletedAt = time.Now().UTC()
	notifyProgress(progress, *status)
	removeActiveDeployment(cfg.Target)
	recordHistory(hist, status, 1)
	return nil
}
// ── Rollback ─────────────────────────────────────────────────────────────────
// Rollback reverts a target to its previous version using deployment history.
//
// It scans history for the most recent completed deployment that recorded a
// previous ref, then replays a rolling deploy of that ref with health checks
// disabled and AutoRollback off (a rollback must never roll itself back).
func Rollback(target string, exec Executor, hist *HistoryStore, progress ProgressFunc) error {
	if hist == nil {
		return fmt.Errorf("deploy rollback: no history store available")
	}
	entries, err := hist.ListByTarget(target)
	if err != nil {
		return fmt.Errorf("deploy rollback: failed to read history: %w", err)
	}
	// Locate the last successful deployment carrying a previous ref.
	previousRef := ""
	for _, entry := range entries {
		if entry.Status != string(PhaseComplete) || entry.OldRef == "" {
			continue
		}
		previousRef = entry.OldRef
		break
	}
	if previousRef == "" {
		return fmt.Errorf("deploy rollback: no previous version found in history for %q", target)
	}
	notifyProgress(progress, DeployStatus{
		ID:         generateDeployID(),
		Phase:      PhaseRollingBack,
		Target:     target,
		Strategy:   StrategyRolling,
		NewVersion: previousRef,
		StartedAt:  time.Now().UTC(),
		Message:    "rollback to previous version",
	})
	// Replay a rolling deploy that installs the previous ref.
	return RollingDeploy(DeployConfig{
		Strategy:     StrategyRolling,
		Target:       target,
		NewImage:     previousRef,
		MaxSurge:     1,
		MaxUnavail:   0,
		HealthCheck:  HealthCheck{Type: "none"},
		Timeout:      5 * time.Minute,
		AutoRollback: false, // Don't auto-rollback a rollback
	}, exec, &NoopHealthChecker{}, hist, progress)
}
// ── Helpers ──────────────────────────────────────────────────────────────────
// rollbackInstances reverts a list of instances to the old image.
// Best effort: errors from individual reverts/starts are deliberately
// ignored so that every instance in the list is attempted.
func rollbackInstances(exec Executor, names []string, oldImage string) {
	for _, name := range names {
		_ = exec.UpdateInstanceImage(name, oldImage)
		_ = exec.StartInstance(name)
	}
}
// cleanupCanary stops and removes a canary instance.
// Best effort: both errors are deliberately ignored — cleanup runs on
// failure paths where the canary may already be stopped or half-created.
func cleanupCanary(exec Executor, canaryName string) {
	_ = exec.StopInstance(canaryName)
	_ = exec.DeleteInstance(canaryName)
}
// canaryInstanceName generates a canary instance name from the target by
// dropping any trailing instance numbering (digits and hyphens) and
// appending a -canary suffix, e.g. "web-3" → "web-canary".
func canaryInstanceName(target string) string {
	trimmed := strings.TrimRight(target, "-0123456789")
	if trimmed == "" {
		// Target was all digits/hyphens; fall back to the raw name.
		trimmed = target
	}
	return trimmed + "-canary"
}
// generateDeployID creates a deployment ID from the current wall clock with
// millisecond resolution. Uses time.Time.UnixMilli (Go 1.17+) instead of
// the manual UnixNano()/Millisecond division — identical value, clearer.
//
// NOTE(review): two deployments started in the same millisecond would
// collide; confirm that concurrent automated deploys cannot occur.
func generateDeployID() string {
	return fmt.Sprintf("deploy-%d", time.Now().UnixMilli())
}
// notifyProgress safely calls the progress callback if non-nil.
// The status is passed by value, so the callback cannot mutate the
// deployment's live state.
func notifyProgress(fn ProgressFunc, status DeployStatus) {
	if fn != nil {
		fn(status)
	}
}
// recordHistory saves a deployment to the history store if available.
// The Append error is deliberately ignored: history is best-effort and must
// never fail a deployment.
func recordHistory(hist *HistoryStore, status *DeployStatus, instancesUpdated int) {
	if hist == nil {
		return
	}
	_ = hist.Append(HistoryEntry{
		ID:               status.ID,
		Target:           status.Target,
		Strategy:         string(status.Strategy),
		OldRef:           status.OldVersion,
		NewRef:           status.NewVersion,
		Status:           string(status.Phase),
		StartedAt:        status.StartedAt,
		CompletedAt:      status.CompletedAt,
		InstancesUpdated: instancesUpdated,
		Message:          status.Message,
	})
}
// ── Default executor (real system calls) ─────────────────────────────────────
// DefaultCASDir is the default directory for CAS storage.
const DefaultCASDir = "/var/lib/volt/cas"
// SystemExecutor implements Executor using real system commands
// (systemctl units and files under the container base directory).
type SystemExecutor struct {
	ContainerBaseDir string // root directory holding one subdirectory per container
	CASBaseDir       string // CAS blob storage root (see DefaultCASDir)
}
// NewSystemExecutor creates an executor for real system operations, with
// the standard Volt container and CAS directories.
func NewSystemExecutor() *SystemExecutor {
	exec := new(SystemExecutor)
	exec.ContainerBaseDir = "/var/lib/volt/containers"
	exec.CASBaseDir = DefaultCASDir
	return exec
}
// ListInstances scans the container base directory for names starting with
// target; when the glob yields nothing, it falls back to an exact-name
// directory match. Image and health fields are left unset ("unknown").
//
// NOTE(review): target is interpolated into a glob pattern, so glob
// metacharacters ([, ?, *) in target change matching — confirm callers only
// pass plain names.
func (e *SystemExecutor) ListInstances(target string) ([]Instance, error) {
	pattern := filepath.Join(e.ContainerBaseDir, target+"*")
	matches, err := filepath.Glob(pattern)
	if err != nil {
		return nil, fmt.Errorf("list instances: %w", err)
	}
	var instances []Instance
	for _, match := range matches {
		instances = append(instances, Instance{
			Name:   filepath.Base(match),
			Status: "unknown",
		})
	}
	if len(instances) == 0 {
		// Exact-match fallback for names the prefix glob missed.
		exact := filepath.Join(e.ContainerBaseDir, target)
		if info, err := fileInfo(exact); err == nil && info.IsDir() {
			instances = append(instances, Instance{
				Name:   target,
				Status: "unknown",
			})
		}
	}
	return instances, nil
}
// CreateInstance creates a new container instance.
// Stub: always returns an error until wired to the backend.Create flow,
// which would create the container directory and write the unit file.
func (e *SystemExecutor) CreateInstance(name, image string) error {
	return fmt.Errorf("SystemExecutor.CreateInstance not yet wired to backend")
}
// StartInstance starts the container's templated systemd unit via systemctl.
func (e *SystemExecutor) StartInstance(name string) error {
	return runSystemctl("start", voltContainerUnit(name))
}
// StopInstance stops the container's templated systemd unit via systemctl.
func (e *SystemExecutor) StopInstance(name string) error {
	return runSystemctl("stop", voltContainerUnit(name))
}
// DeleteInstance removes a container instance.
// Stub: always returns an error until wired to the backend removal flow.
func (e *SystemExecutor) DeleteInstance(name string) error {
	return fmt.Errorf("SystemExecutor.DeleteInstance not yet wired to backend")
}
// GetInstanceImage returns the instance's current CAS ref, read from
// /var/lib/volt/containers/<name>/.volt-cas-ref (trailing whitespace trimmed).
func (e *SystemExecutor) GetInstanceImage(name string) (string, error) {
	refPath := filepath.Join(e.ContainerBaseDir, name, ".volt-cas-ref")
	data, err := readFile(refPath)
	if err != nil {
		// Wrap the underlying cause instead of discarding it (the original
		// dropped err, hiding permission vs. not-found distinctions).
		return "", fmt.Errorf("no CAS ref found for instance %s: %w", name, err)
	}
	return strings.TrimSpace(string(data)), nil
}
// UpdateInstanceImage stops the instance and records the new CAS ref in its
// metadata file. The stop error is deliberately ignored (best effort — the
// instance may already be stopped).
//
// NOTE(review): this does NOT restart the instance or trigger rootfs
// reassembly itself, despite the Executor interface doc; RollingDeploy calls
// StartInstance afterwards. Confirm whether reassembly happens on start.
func (e *SystemExecutor) UpdateInstanceImage(name, newImage string) error {
	// 1. Stop the instance (best effort).
	_ = runSystemctl("stop", voltContainerUnit(name))
	// 2. Write new CAS ref.
	refPath := filepath.Join(e.ContainerBaseDir, name, ".volt-cas-ref")
	if err := writeFile(refPath, []byte(newImage)); err != nil {
		return fmt.Errorf("failed to write CAS ref: %w", err)
	}
	return nil
}
// UpdateTrafficWeight records a canary traffic weight.
// Placeholder: a full implementation would update nftables load-balancing
// rules; for now the weight is appended to a shared metadata file as a
// "target:canary:weight" line.
func (e *SystemExecutor) UpdateTrafficWeight(target, canaryName string, weight int) error {
	line := fmt.Sprintf("%s:%s:%d\n", target, canaryName, weight)
	weightPath := filepath.Join(e.ContainerBaseDir, ".traffic-weights")
	return appendFile(weightPath, []byte(line))
}
// voltContainerUnit returns the name of the templated systemd unit that
// runs the named container (volt-container@<name>.service).
func voltContainerUnit(name string) string {
	return "volt-container@" + name + ".service"
}

899
pkg/deploy/deploy_test.go Normal file
View File

@@ -0,0 +1,899 @@
/*
Deploy Tests — Verifies rolling, canary, rollback, health check, and history logic.
Uses a mock executor and health checker so no real system calls are made.
*/
package deploy
import (
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"testing"
"time"
)
// ── Mock Executor ────────────────────────────────────────────────────────────
// mockExecutor records all operations for verification.
// All maps and the ops log are guarded by mu; the error-injection maps are
// expected to be populated during test setup, before the deploy runs.
type mockExecutor struct {
	mu        sync.Mutex
	instances map[string]*Instance // name → instance
	images    map[string]string    // name → current image
	// Recorded operation log (append-only, e.g. "start:web-1").
	ops []string
	// Error injection.
	updateImageErr map[string]error // instance name → error to return
	startErr       map[string]error
	createErr      map[string]error
	trafficWeights map[string]int // canaryName → weight
}
// newMockExecutor builds a mock seeded with copies of the given instances.
func newMockExecutor(instances ...Instance) *mockExecutor {
	mock := &mockExecutor{
		instances:      map[string]*Instance{},
		images:         map[string]string{},
		updateImageErr: map[string]error{},
		startErr:       map[string]error{},
		createErr:      map[string]error{},
		trafficWeights: map[string]int{},
	}
	for i := range instances {
		// Copy each seed so the mock owns its own mutable instance.
		inst := instances[i]
		mock.instances[inst.Name] = &inst
		mock.images[inst.Name] = inst.Image
	}
	return mock
}
// record appends one operation string to the log under the lock.
func (m *mockExecutor) record(op string) {
	m.mu.Lock()
	m.ops = append(m.ops, op)
	m.mu.Unlock()
}
// getOps returns a snapshot copy of the recorded operation log.
func (m *mockExecutor) getOps() []string {
	m.mu.Lock()
	defer m.mu.Unlock()
	return append([]string(nil), m.ops...)
}
// ListInstances returns copies of all instances whose name matches the
// target exactly or by prefix.
func (m *mockExecutor) ListInstances(target string) ([]Instance, error) {
	m.record(fmt.Sprintf("list:%s", target))
	// Hold the lock while iterating: every other accessor locks, and the
	// original read m.instances unsynchronized (a race under concurrent use).
	m.mu.Lock()
	defer m.mu.Unlock()
	var result []Instance
	for _, inst := range m.instances {
		if strings.HasPrefix(inst.Name, target) || inst.Name == target {
			result = append(result, *inst)
		}
	}
	return result, nil
}
// CreateInstance registers a new stopped instance, honoring injected errors.
func (m *mockExecutor) CreateInstance(name, image string) error {
	m.record(fmt.Sprintf("create:%s:%s", name, image))
	m.mu.Lock()
	defer m.mu.Unlock()
	// Error-injection lookup now happens under the lock (the original read
	// createErr without synchronization).
	if err, ok := m.createErr[name]; ok {
		return err
	}
	m.instances[name] = &Instance{Name: name, Image: image, Status: "stopped"}
	m.images[name] = image
	return nil
}
// StartInstance marks the instance running, honoring injected errors.
func (m *mockExecutor) StartInstance(name string) error {
	m.record(fmt.Sprintf("start:%s", name))
	m.mu.Lock()
	defer m.mu.Unlock()
	// Error-injection lookup now happens under the lock (the original read
	// startErr without synchronization).
	if err, ok := m.startErr[name]; ok {
		return err
	}
	if inst, ok := m.instances[name]; ok {
		inst.Status = "running"
	}
	return nil
}
// StopInstance marks the instance stopped; unknown names are a no-op.
func (m *mockExecutor) StopInstance(name string) error {
	m.record(fmt.Sprintf("stop:%s", name))
	m.mu.Lock()
	defer m.mu.Unlock()
	if inst, ok := m.instances[name]; ok {
		inst.Status = "stopped"
	}
	return nil
}
// DeleteInstance drops the instance and its image record entirely.
func (m *mockExecutor) DeleteInstance(name string) error {
	m.record(fmt.Sprintf("delete:%s", name))
	m.mu.Lock()
	defer m.mu.Unlock()
	delete(m.instances, name)
	delete(m.images, name)
	return nil
}
// GetInstanceImage returns the recorded image for name, or an error when
// the instance is unknown.
func (m *mockExecutor) GetInstanceImage(name string) (string, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	img, ok := m.images[name]
	if !ok {
		return "", fmt.Errorf("instance %s not found", name)
	}
	return img, nil
}
// UpdateInstanceImage swaps the instance's image, honoring injected errors.
func (m *mockExecutor) UpdateInstanceImage(name, newImage string) error {
	m.record(fmt.Sprintf("update-image:%s:%s", name, newImage))
	m.mu.Lock()
	defer m.mu.Unlock()
	// Error-injection lookup now happens under the lock (the original read
	// updateImageErr without synchronization).
	if err, ok := m.updateImageErr[name]; ok {
		return err
	}
	m.images[name] = newImage
	if inst, ok := m.instances[name]; ok {
		inst.Image = newImage
	}
	return nil
}
// UpdateTrafficWeight records the weight assigned to the canary instance.
func (m *mockExecutor) UpdateTrafficWeight(target, canaryName string, weight int) error {
	m.record(fmt.Sprintf("traffic:%s:%s:%d", target, canaryName, weight))
	m.mu.Lock()
	defer m.mu.Unlock()
	m.trafficWeights[canaryName] = weight
	return nil
}
// ── Mock Health Checker ──────────────────────────────────────────────────────
// mockHealthChecker returns configurable results per instance.
// results is populated during test setup; calls records every instance
// name passed to WaitHealthy, in order. Both guarded by mu.
type mockHealthChecker struct {
	mu      sync.Mutex
	results map[string]error // instance name → error (nil = healthy)
	calls   []string
}
// newMockHealthChecker builds a checker that reports healthy by default.
func newMockHealthChecker() *mockHealthChecker {
	hc := new(mockHealthChecker)
	hc.results = map[string]error{}
	return hc
}
// WaitHealthy records the call and returns the configured result for the
// instance (nil — healthy — when none was configured).
func (h *mockHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.calls = append(h.calls, instanceName)
	return h.results[instanceName]
}
// getCalls returns a snapshot copy of the recorded health-check calls.
func (h *mockHealthChecker) getCalls() []string {
	h.mu.Lock()
	defer h.mu.Unlock()
	return append([]string(nil), h.calls...)
}
// ── Progress Collector ───────────────────────────────────────────────────────
// progressCollector accumulates every DeployStatus emitted through the
// progress callback, guarded by mu for concurrent callbacks.
type progressCollector struct {
	mu      sync.Mutex
	updates []DeployStatus
}
// newProgressCollector builds an empty collector; the zero value is usable.
func newProgressCollector() *progressCollector {
	return new(progressCollector)
}
// callback returns a ProgressFunc that appends each update to the collector.
func (p *progressCollector) callback() ProgressFunc {
	return func(status DeployStatus) {
		p.mu.Lock()
		p.updates = append(p.updates, status)
		p.mu.Unlock()
	}
}
// getUpdates returns a snapshot copy of all collected status updates.
func (p *progressCollector) getUpdates() []DeployStatus {
	p.mu.Lock()
	defer p.mu.Unlock()
	return append([]DeployStatus(nil), p.updates...)
}
// phases returns the sequence of phases observed so far, in order.
func (p *progressCollector) phases() []Phase {
	p.mu.Lock()
	defer p.mu.Unlock()
	// Pre-size the result: the length is known up front (prealloc idiom).
	phases := make([]Phase, 0, len(p.updates))
	for _, u := range p.updates {
		phases = append(phases, u.Phase)
	}
	return phases
}
// ── Test: Rolling Deploy Order ───────────────────────────────────────────────
// TestRollingDeployOrder verifies that a rolling deploy over three matching
// instances updates each one to the new CAS ref, emits progress callbacks
// from "preparing" through "complete", and leaves every instance on the
// new image.
func TestRollingDeployOrder(t *testing.T) {
	exec := newMockExecutor(
		Instance{Name: "web-1", Image: "sha256:old1", Status: "running"},
		Instance{Name: "web-2", Image: "sha256:old1", Status: "running"},
		Instance{Name: "web-3", Image: "sha256:old1", Status: "running"},
	)
	hc := newMockHealthChecker()
	pc := newProgressCollector()
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	cfg := DeployConfig{
		Strategy:     StrategyRolling,
		Target:       "web",
		NewImage:     "sha256:new1",
		MaxSurge:     1,
		MaxUnavail:   0,
		HealthCheck:  HealthCheck{Type: "none"},
		Timeout:      1 * time.Minute,
		AutoRollback: true,
	}
	err := RollingDeploy(cfg, exec, hc, hist, pc.callback())
	if err != nil {
		t.Fatalf("RollingDeploy returned error: %v", err)
	}
	// Verify all instances were updated.
	ops := exec.getOps()
	// Count update-image operations.
	updateCount := 0
	for _, op := range ops {
		if strings.HasPrefix(op, "update-image:") {
			updateCount++
			// Verify new image is correct.
			if !strings.HasSuffix(op, ":sha256:new1") {
				t.Errorf("expected new image sha256:new1, got op: %s", op)
			}
		}
	}
	if updateCount != 3 {
		t.Errorf("expected 3 update-image ops, got %d", updateCount)
	}
	// Verify instances are updated one at a time (each update is followed by start before next update).
	var updateOrder []string
	for _, op := range ops {
		if strings.HasPrefix(op, "update-image:web-") {
			// op format is "update-image:<name>:<image>"; index 1 is the name.
			name := strings.Split(op, ":")[1]
			updateOrder = append(updateOrder, name)
		}
	}
	if len(updateOrder) != 3 {
		t.Errorf("expected 3 instances updated in order, got %d", len(updateOrder))
	}
	// Verify progress callback was called.
	phases := pc.phases()
	if len(phases) == 0 {
		t.Error("expected progress callbacks, got none")
	}
	// First should be preparing, last should be complete.
	if phases[0] != PhasePreparing {
		t.Errorf("expected first phase to be preparing, got %s", phases[0])
	}
	lastPhase := phases[len(phases)-1]
	if lastPhase != PhaseComplete {
		t.Errorf("expected last phase to be complete, got %s", lastPhase)
	}
	// Verify all images are now the new version.
	for _, name := range []string{"web-1", "web-2", "web-3"} {
		img, err := exec.GetInstanceImage(name)
		if err != nil {
			t.Errorf("GetInstanceImage(%s) error: %v", name, err)
			continue
		}
		if img != "sha256:new1" {
			t.Errorf("instance %s image = %s, want sha256:new1", name, img)
		}
	}
}
// ── Test: Canary Weight ──────────────────────────────────────────────────────
// TestCanaryWeight verifies that a canary deploy creates exactly one canary
// instance with the new image, routes the configured traffic percentage to
// it, leaves the original instances untouched, and reports canary-specific
// progress messages.
func TestCanaryWeight(t *testing.T) {
	exec := newMockExecutor(
		Instance{Name: "api-1", Image: "sha256:v1", Status: "running"},
		Instance{Name: "api-2", Image: "sha256:v1", Status: "running"},
	)
	hc := newMockHealthChecker()
	pc := newProgressCollector()
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	cfg := DeployConfig{
		Strategy:     StrategyCanary,
		Target:       "api",
		NewImage:     "sha256:v2",
		CanaryWeight: 20,
		HealthCheck:  HealthCheck{Type: "none"},
		Timeout:      1 * time.Minute,
		AutoRollback: true,
	}
	err := CanaryDeploy(cfg, exec, hc, hist, pc.callback())
	if err != nil {
		t.Fatalf("CanaryDeploy returned error: %v", err)
	}
	// Verify canary instance was created.
	ops := exec.getOps()
	var createOps []string
	for _, op := range ops {
		if strings.HasPrefix(op, "create:") {
			createOps = append(createOps, op)
		}
	}
	if len(createOps) != 1 {
		t.Fatalf("expected 1 create op for canary, got %d: %v", len(createOps), createOps)
	}
	// Verify the canary instance name and image.
	canaryName := canaryInstanceName("api")
	expectedCreate := fmt.Sprintf("create:%s:sha256:v2", canaryName)
	if createOps[0] != expectedCreate {
		t.Errorf("create op = %q, want %q", createOps[0], expectedCreate)
	}
	// Verify traffic was routed with the correct weight.
	var trafficOps []string
	for _, op := range ops {
		if strings.HasPrefix(op, "traffic:") {
			trafficOps = append(trafficOps, op)
		}
	}
	if len(trafficOps) != 1 {
		t.Fatalf("expected 1 traffic op, got %d: %v", len(trafficOps), trafficOps)
	}
	expectedTraffic := fmt.Sprintf("traffic:api:%s:20", canaryName)
	if trafficOps[0] != expectedTraffic {
		t.Errorf("traffic op = %q, want %q", trafficOps[0], expectedTraffic)
	}
	// Verify the canary weight was recorded (reach into the mock under its lock).
	exec.mu.Lock()
	weight := exec.trafficWeights[canaryName]
	exec.mu.Unlock()
	if weight != 20 {
		t.Errorf("canary traffic weight = %d, want 20", weight)
	}
	// Verify original instances were not modified.
	for _, name := range []string{"api-1", "api-2"} {
		img, _ := exec.GetInstanceImage(name)
		if img != "sha256:v1" {
			t.Errorf("original instance %s image changed to %s, should still be sha256:v1", name, img)
		}
	}
	// Verify progress shows canary-specific messages.
	updates := pc.getUpdates()
	foundCanaryProgress := false
	for _, u := range updates {
		if strings.Contains(u.Progress, "canary") || strings.Contains(u.Progress, "traffic") {
			foundCanaryProgress = true
			break
		}
	}
	if !foundCanaryProgress {
		t.Error("expected canary-related progress messages")
	}
}
// ── Test: Rollback Restores Previous ─────────────────────────────────────────
// TestRollbackRestoresPrevious verifies that Rollback reverts instances to
// the OldRef of the most recent completed deployment found in history, and
// that the rollback itself is appended as a new history entry.
func TestRollbackRestoresPrevious(t *testing.T) {
	exec := newMockExecutor(
		Instance{Name: "app-1", Image: "sha256:v2", Status: "running"},
	)
	_ = newMockHealthChecker()
	pc := newProgressCollector()
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	// Seed history with a previous successful deployment (v1 -> v2), so the
	// rollback target is sha256:v1.
	_ = hist.Append(HistoryEntry{
		ID:               "deploy-prev",
		Target:           "app",
		Strategy:         "rolling",
		OldRef:           "sha256:v1",
		NewRef:           "sha256:v2",
		Status:           string(PhaseComplete),
		StartedAt:        time.Now().Add(-1 * time.Hour),
		CompletedAt:      time.Now().Add(-50 * time.Minute),
		InstancesUpdated: 1,
	})
	err := Rollback("app", exec, hist, pc.callback())
	if err != nil {
		t.Fatalf("Rollback returned error: %v", err)
	}
	// Verify the instance was updated back to v1.
	img, err := exec.GetInstanceImage("app-1")
	if err != nil {
		t.Fatalf("GetInstanceImage error: %v", err)
	}
	if img != "sha256:v1" {
		t.Errorf("after rollback, instance image = %s, want sha256:v1", img)
	}
	// Verify rollback was recorded in history.
	entries, err := hist.ListByTarget("app")
	if err != nil {
		t.Fatalf("ListByTarget error: %v", err)
	}
	// Should have the original entry + the rollback entry.
	if len(entries) < 2 {
		t.Errorf("expected at least 2 history entries, got %d", len(entries))
	}
}
// ── Test: Health Check Fail Triggers Rollback ────────────────────────────────
// TestHealthCheckFailTriggersRollback verifies that when AutoRollback is set
// and an updated instance fails its health check, the rolling deploy (a)
// returns an error mentioning the health check, (b) enters the rolling-back
// phase, (c) restores the old image on updated instances, and (d) records
// the deployment as failed in history.
func TestHealthCheckFailTriggersRollback(t *testing.T) {
	exec := newMockExecutor(
		Instance{Name: "svc-1", Image: "sha256:old", Status: "running"},
		Instance{Name: "svc-2", Image: "sha256:old", Status: "running"},
	)
	hc := newMockHealthChecker()
	// Make svc-2 fail health check after being updated.
	// Since instances are iterated from the map, we set both to fail
	// but we only need to verify that when any fails, rollback happens.
	hc.results["svc-1"] = nil // svc-1 is healthy
	hc.results["svc-2"] = fmt.Errorf("connection refused")
	pc := newProgressCollector()
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	cfg := DeployConfig{
		Strategy:     StrategyRolling,
		Target:       "svc",
		NewImage:     "sha256:bad",
		MaxSurge:     1,
		MaxUnavail:   0,
		HealthCheck:  HealthCheck{Type: "tcp", Port: 8080, Interval: 100 * time.Millisecond, Retries: 1},
		Timeout:      30 * time.Second,
		AutoRollback: true,
	}
	err := RollingDeploy(cfg, exec, hc, hist, pc.callback())
	// Deployment should fail.
	if err == nil {
		t.Fatal("expected RollingDeploy to fail due to health check, but got nil")
	}
	if !strings.Contains(err.Error(), "health check failed") {
		t.Errorf("error should mention health check failure, got: %v", err)
	}
	// Verify rollback phase appeared in progress.
	phases := pc.phases()
	foundRollback := false
	for _, p := range phases {
		if p == PhaseRollingBack {
			foundRollback = true
			break
		}
	}
	if !foundRollback {
		t.Error("expected rolling-back phase in progress updates")
	}
	// Verify rollback operations were attempted (update-image back to old).
	ops := exec.getOps()
	rollbackOps := 0
	for _, op := range ops {
		if strings.Contains(op, "update-image:") && strings.Contains(op, ":sha256:old") {
			rollbackOps++
		}
	}
	if rollbackOps == 0 {
		t.Error("expected rollback operations (update-image back to sha256:old), found none")
	}
	// Verify history records the failure. ListByTarget returns most recent
	// first, so entries[0] is this deployment.
	entries, _ := hist.ListByTarget("svc")
	if len(entries) == 0 {
		t.Fatal("expected history entry for failed deployment")
	}
	if entries[0].Status != string(PhaseFailed) {
		t.Errorf("history status = %s, want failed", entries[0].Status)
	}
}
// ── Test: Deploy History ─────────────────────────────────────────────────────
// TestDeployHistory exercises the HistoryStore: per-target listing (most
// recent first), cross-target ListAll, preservation of the Message field,
// and the one-file-per-target on-disk layout.
func TestDeployHistory(t *testing.T) {
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	// Write several entries: two for "web-app", one failed one for "api-svc".
	entries := []HistoryEntry{
		{
			ID:               "deploy-001",
			Target:           "web-app",
			Strategy:         "rolling",
			OldRef:           "sha256:abc123",
			NewRef:           "sha256:def456",
			Status:           "complete",
			StartedAt:        time.Date(2026, 3, 20, 15, 0, 0, 0, time.UTC),
			CompletedAt:      time.Date(2026, 3, 20, 15, 5, 0, 0, time.UTC),
			InstancesUpdated: 3,
		},
		{
			ID:               "deploy-002",
			Target:           "web-app",
			Strategy:         "canary",
			OldRef:           "sha256:def456",
			NewRef:           "sha256:ghi789",
			Status:           "complete",
			StartedAt:        time.Date(2026, 3, 21, 10, 0, 0, 0, time.UTC),
			CompletedAt:      time.Date(2026, 3, 21, 10, 2, 0, 0, time.UTC),
			InstancesUpdated: 1,
		},
		{
			ID:               "deploy-003",
			Target:           "api-svc",
			Strategy:         "rolling",
			OldRef:           "sha256:111",
			NewRef:           "sha256:222",
			Status:           "failed",
			StartedAt:        time.Date(2026, 3, 22, 8, 0, 0, 0, time.UTC),
			CompletedAt:      time.Date(2026, 3, 22, 8, 1, 0, 0, time.UTC),
			InstancesUpdated: 0,
			Message:          "health check timeout",
		},
	}
	for _, e := range entries {
		if err := hist.Append(e); err != nil {
			t.Fatalf("Append error: %v", err)
		}
	}
	// Verify target-specific listing.
	webEntries, err := hist.ListByTarget("web-app")
	if err != nil {
		t.Fatalf("ListByTarget error: %v", err)
	}
	if len(webEntries) != 2 {
		t.Errorf("expected 2 web-app entries, got %d", len(webEntries))
	}
	// Most recent first (deploy-002 started on 3/21, after deploy-001).
	if len(webEntries) >= 2 && webEntries[0].ID != "deploy-002" {
		t.Errorf("expected most recent entry first, got %s", webEntries[0].ID)
	}
	apiEntries, err := hist.ListByTarget("api-svc")
	if err != nil {
		t.Fatalf("ListByTarget error: %v", err)
	}
	if len(apiEntries) != 1 {
		t.Errorf("expected 1 api-svc entry, got %d", len(apiEntries))
	}
	if len(apiEntries) == 1 && apiEntries[0].Message != "health check timeout" {
		t.Errorf("expected message 'health check timeout', got %q", apiEntries[0].Message)
	}
	// Verify ListAll.
	all, err := hist.ListAll()
	if err != nil {
		t.Fatalf("ListAll error: %v", err)
	}
	if len(all) != 3 {
		t.Errorf("expected 3 total entries, got %d", len(all))
	}
	// Verify files were created (one YAML file per target).
	files, _ := filepath.Glob(filepath.Join(tmpDir, "*.yaml"))
	if len(files) != 2 { // web-app.yaml and api-svc.yaml
		t.Errorf("expected 2 history files, got %d", len(files))
	}
}
// ── Test: Config Validation ──────────────────────────────────────────────────
// TestConfigValidation is a table-driven test of DeployConfig.Validate:
// required fields, strategy whitelist, and the 1-99 canary weight bounds.
// An empty wantErr means the config must validate cleanly.
func TestConfigValidation(t *testing.T) {
	tests := []struct {
		name    string
		cfg     DeployConfig
		wantErr string
	}{
		{
			name:    "empty target",
			cfg:     DeployConfig{Strategy: StrategyRolling, NewImage: "sha256:abc"},
			wantErr: "target is required",
		},
		{
			name:    "empty image",
			cfg:     DeployConfig{Strategy: StrategyRolling, Target: "web"},
			wantErr: "new image",
		},
		{
			name:    "invalid strategy",
			cfg:     DeployConfig{Strategy: "blue-green", Target: "web", NewImage: "sha256:abc"},
			wantErr: "unknown strategy",
		},
		{
			name:    "canary weight zero",
			cfg:     DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 0},
			wantErr: "canary weight must be between 1 and 99",
		},
		{
			name:    "canary weight 100",
			cfg:     DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 100},
			wantErr: "canary weight must be between 1 and 99",
		},
		{
			name: "valid rolling",
			cfg:  DeployConfig{Strategy: StrategyRolling, Target: "web", NewImage: "sha256:abc"},
		},
		{
			name: "valid canary",
			cfg:  DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 25},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := tt.cfg.Validate()
			if tt.wantErr != "" {
				if err == nil {
					t.Errorf("expected error containing %q, got nil", tt.wantErr)
				} else if !strings.Contains(err.Error(), tt.wantErr) {
					t.Errorf("error %q should contain %q", err.Error(), tt.wantErr)
				}
			} else {
				if err != nil {
					t.Errorf("unexpected error: %v", err)
				}
			}
		})
	}
}
// ── Test: Canary Instance Name ───────────────────────────────────────────────
// TestCanaryInstanceName checks the canary name derived from several shapes
// of target name: plain names, names with a numeric suffix, and names with a
// trailing dash.
func TestCanaryInstanceName(t *testing.T) {
	cases := map[string]string{
		"web-app":     "web-app-canary",
		"api-1":       "api-canary",
		"simple":      "simple-canary",
		"my-service-": "my-service-canary",
	}
	for target, want := range cases {
		if got := canaryInstanceName(target); got != want {
			t.Errorf("canaryInstanceName(%q) = %q, want %q", target, got, want)
		}
	}
}
// ── Test: No Instances Found ─────────────────────────────────────────────────
// TestRollingDeployNoInstances ensures a rolling deploy against a target
// with no matching instances fails with a descriptive error.
func TestRollingDeployNoInstances(t *testing.T) {
	cfg := DeployConfig{
		Strategy: StrategyRolling,
		Target:   "nonexistent",
		NewImage: "sha256:abc",
		Timeout:  10 * time.Second,
	}
	// Empty executor: no instances exist for any target.
	err := RollingDeploy(cfg, newMockExecutor(), newMockHealthChecker(), nil, nil)
	if err == nil {
		t.Fatal("expected error for no instances, got nil")
	}
	if !strings.Contains(err.Error(), "no instances found") {
		t.Errorf("error should mention no instances, got: %v", err)
	}
}
// ── Test: Active Deployments Tracking ────────────────────────────────────────
// TestActiveDeployments verifies that a deployment is visible via
// GetActiveDeployment while it is running and disappears from
// GetActiveDeployments once it completes.
func TestActiveDeployments(t *testing.T) {
	// Clear any leftover state from other tests sharing the package-level map.
	activeDeploymentsMu.Lock()
	activeDeployments = make(map[string]*DeployStatus)
	activeDeploymentsMu.Unlock()
	// Initially empty.
	active := GetActiveDeployments()
	if len(active) != 0 {
		t.Errorf("expected 0 active deployments, got %d", len(active))
	}
	// Run a deployment and check it appears during execution.
	exec := newMockExecutor(
		Instance{Name: "track-1", Image: "sha256:old", Status: "running"},
	)
	hc := newMockHealthChecker()
	// NOTE(review): seenActive is written from the progress callback; this
	// assumes RollingDeploy invokes the callback synchronously on the calling
	// goroutine — confirm, otherwise this is a data race under -race.
	var seenActive bool
	progressFn := func(status DeployStatus) {
		if status.Phase == PhaseDeploying || status.Phase == PhaseVerifying {
			ad := GetActiveDeployment("track")
			if ad != nil {
				seenActive = true
			}
		}
	}
	cfg := DeployConfig{
		Strategy:    StrategyRolling,
		Target:      "track",
		NewImage:    "sha256:new",
		HealthCheck: HealthCheck{Type: "none"},
		Timeout:     10 * time.Second,
	}
	err := RollingDeploy(cfg, exec, hc, nil, progressFn)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !seenActive {
		t.Error("expected to see active deployment during execution")
	}
	// After completion, should be empty again.
	active = GetActiveDeployments()
	if len(active) != 0 {
		t.Errorf("expected 0 active deployments after completion, got %d", len(active))
	}
}
// ── Test: History File Persistence ───────────────────────────────────────────
// TestHistoryFilePersistence verifies that an appended entry is written to
// the expected per-target YAML file and survives a simulated restart (a
// fresh HistoryStore over the same directory).
func TestHistoryFilePersistence(t *testing.T) {
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	entry := HistoryEntry{
		ID:               "persist-001",
		Target:           "my-app",
		Strategy:         "rolling",
		OldRef:           "sha256:aaa",
		NewRef:           "sha256:bbb",
		Status:           "complete",
		StartedAt:        time.Now().UTC(),
		CompletedAt:      time.Now().UTC(),
		InstancesUpdated: 2,
	}
	if err := hist.Append(entry); err != nil {
		t.Fatalf("Append error: %v", err)
	}
	// Verify the file exists on disk (target name maps to <target>.yaml).
	filePath := filepath.Join(tmpDir, "my-app.yaml")
	if _, err := os.Stat(filePath); err != nil {
		t.Fatalf("history file not found: %v", err)
	}
	// Create a new store instance (simulating restart) and verify data.
	hist2 := NewHistoryStore(tmpDir)
	entries, err := hist2.ListByTarget("my-app")
	if err != nil {
		t.Fatalf("ListByTarget error: %v", err)
	}
	if len(entries) != 1 {
		t.Fatalf("expected 1 entry, got %d", len(entries))
	}
	if entries[0].ID != "persist-001" {
		t.Errorf("entry ID = %s, want persist-001", entries[0].ID)
	}
	if entries[0].InstancesUpdated != 2 {
		t.Errorf("instances_updated = %d, want 2", entries[0].InstancesUpdated)
	}
}
// ── Test: Noop Health Checker ────────────────────────────────────────────────
// TestNoopHealthChecker verifies the no-op checker reports healthy no matter
// what check configuration it is handed.
func TestNoopHealthChecker(t *testing.T) {
	var noop NoopHealthChecker
	check := HealthCheck{Type: "http", Port: 9999}
	if err := noop.WaitHealthy("anything", check); err != nil {
		t.Errorf("NoopHealthChecker should always return nil, got: %v", err)
	}
}
// ── Test: Rollback Without History ───────────────────────────────────────────
// TestRollbackWithoutHistory ensures Rollback fails cleanly when the target
// has no recorded deployment to roll back to.
func TestRollbackWithoutHistory(t *testing.T) {
	hist := NewHistoryStore(t.TempDir())
	exec := newMockExecutor(
		Instance{Name: "no-hist-1", Image: "sha256:v2", Status: "running"},
	)
	err := Rollback("no-hist", exec, hist, nil)
	switch {
	case err == nil:
		t.Fatal("expected error for rollback without history, got nil")
	case !strings.Contains(err.Error(), "no previous version"):
		t.Errorf("error should mention no previous version, got: %v", err)
	}
}
// ── Test: Canary Cleanup on Health Failure ────────────────────────────────────
// TestCanaryCleanupOnHealthFailure verifies that when the canary instance
// fails its health check, the deployment fails, the canary is stopped and
// deleted, and the original instance is left untouched.
func TestCanaryCleanupOnHealthFailure(t *testing.T) {
	exec := newMockExecutor(
		Instance{Name: "svc-1", Image: "sha256:v1", Status: "running"},
	)
	hc := newMockHealthChecker()
	// Force the (not-yet-created) canary instance to fail its health check.
	canaryName := canaryInstanceName("svc")
	hc.results[canaryName] = fmt.Errorf("unhealthy canary")
	pc := newProgressCollector()
	tmpDir := t.TempDir()
	hist := NewHistoryStore(tmpDir)
	cfg := DeployConfig{
		Strategy:     StrategyCanary,
		Target:       "svc",
		NewImage:     "sha256:v2",
		CanaryWeight: 10,
		HealthCheck:  HealthCheck{Type: "tcp", Port: 8080, Interval: 100 * time.Millisecond, Retries: 1},
		Timeout:      10 * time.Second,
		AutoRollback: true,
	}
	err := CanaryDeploy(cfg, exec, hc, hist, pc.callback())
	if err == nil {
		t.Fatal("expected canary to fail, got nil")
	}
	// Verify canary was cleaned up (stop + delete).
	ops := exec.getOps()
	foundStop := false
	foundDelete := false
	for _, op := range ops {
		if op == fmt.Sprintf("stop:%s", canaryName) {
			foundStop = true
		}
		if op == fmt.Sprintf("delete:%s", canaryName) {
			foundDelete = true
		}
	}
	if !foundStop {
		t.Error("expected canary stop operation during cleanup")
	}
	if !foundDelete {
		t.Error("expected canary delete operation during cleanup")
	}
	// Verify original instance was not modified.
	img, _ := exec.GetInstanceImage("svc-1")
	if img != "sha256:v1" {
		t.Errorf("original instance image changed to %s during failed canary", img)
	}
}

143
pkg/deploy/health.go Normal file
View File

@@ -0,0 +1,143 @@
/*
Health — Health check implementations for deployment verification.
Supports HTTP, TCP, exec, and no-op health checks. Each check type
retries according to the configured interval and retry count.
Copyright (c) Armored Gates LLC. All rights reserved.
*/
package deploy
import (
"fmt"
"net"
"net/http"
"os/exec"
"time"
)
// ── Health Check Config ──────────────────────────────────────────────────────
// HealthCheck defines how to verify that an instance is healthy after deploy.
//
// Interval serves double duty: it is both the pause between retries and the
// per-attempt timeout for "http" (client timeout) and "tcp" (dial timeout)
// checks.
type HealthCheck struct {
	Type     string        `json:"type" yaml:"type"`         // "http", "tcp", "exec", "none"
	Path     string        `json:"path" yaml:"path"`         // HTTP path (e.g., "/healthz")
	Port     int           `json:"port" yaml:"port"`         // Port to check
	Command  string        `json:"command" yaml:"command"`   // Exec command, run via "sh -c"
	Interval time.Duration `json:"interval" yaml:"interval"` // Time between retries (and per-attempt timeout)
	Retries  int           `json:"retries" yaml:"retries"`   // Max retry count
}
// ── Health Checker Interface ─────────────────────────────────────────────────
// HealthChecker verifies instance health during deployments.
//
// Implementations in this package: DefaultHealthChecker (real HTTP/TCP/exec
// probes) and NoopHealthChecker (always healthy).
type HealthChecker interface {
	// WaitHealthy blocks until the instance is healthy or all retries are exhausted.
	// A nil return means healthy; non-nil means the instance never became
	// healthy within the configured retry budget.
	WaitHealthy(instanceName string, check HealthCheck) error
}
// ── Default Health Checker ───────────────────────────────────────────────────
// DefaultHealthChecker implements HealthChecker using real HTTP/TCP/exec calls.
type DefaultHealthChecker struct {
	// InstanceIPResolver resolves an instance name to an IP address.
	// If nil, or if resolution fails, "127.0.0.1" is used.
	InstanceIPResolver func(name string) (string, error)
}
// WaitHealthy performs health checks with retries, dispatching on the
// configured check type. An empty type is treated like "none" (always
// healthy); unrecognized types are rejected with an error.
func (d *DefaultHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
	// "none" and the zero value disable health checking entirely.
	if check.Type == "none" || check.Type == "" {
		return nil
	}
	switch check.Type {
	case "http":
		return d.waitHTTP(instanceName, check)
	case "tcp":
		return d.waitTCP(instanceName, check)
	case "exec":
		return d.waitExec(instanceName, check)
	}
	return fmt.Errorf("unknown health check type: %q", check.Type)
}
// resolveIP maps an instance name to an IP address via the configured
// resolver. When no resolver is set, or resolution fails, it falls back to
// loopback.
func (d *DefaultHealthChecker) resolveIP(instanceName string) string {
	const fallback = "127.0.0.1"
	if d.InstanceIPResolver == nil {
		return fallback
	}
	ip, err := d.InstanceIPResolver(instanceName)
	if err != nil {
		return fallback
	}
	return ip
}
// waitHTTP polls an HTTP endpoint on the instance until it answers with a
// 2xx/3xx status or the retry budget is exhausted.
//
// check.Interval is used both as the HTTP client timeout and as the pause
// between attempts. A Retries value below 1 is treated as a single attempt,
// so a zero-value HealthCheck still probes once instead of failing
// immediately while wrapping a nil error. The host:port is built with
// net.JoinHostPort so IPv6 resolver results are bracketed correctly.
func (d *DefaultHealthChecker) waitHTTP(instanceName string, check HealthCheck) error {
	ip := d.resolveIP(instanceName)
	url := "http://" + net.JoinHostPort(ip, fmt.Sprint(check.Port)) + check.Path
	client := &http.Client{Timeout: check.Interval}
	retries := check.Retries
	if retries < 1 {
		retries = 1
	}
	var lastErr error
	for i := 0; i < retries; i++ {
		resp, err := client.Get(url)
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode >= 200 && resp.StatusCode < 400 {
				return nil
			}
			lastErr = fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
		} else {
			lastErr = err
		}
		if i < retries-1 {
			time.Sleep(check.Interval)
		}
	}
	return fmt.Errorf("health check failed after %d retries: %w", retries, lastErr)
}
// waitTCP dials the instance's TCP port until a connection succeeds or the
// retry budget is exhausted. check.Interval is both the dial timeout and the
// pause between attempts.
//
// A Retries value below 1 is treated as a single attempt, so a zero-value
// HealthCheck still probes once instead of failing immediately while
// wrapping a nil error. net.JoinHostPort brackets IPv6 addresses correctly.
func (d *DefaultHealthChecker) waitTCP(instanceName string, check HealthCheck) error {
	ip := d.resolveIP(instanceName)
	addr := net.JoinHostPort(ip, fmt.Sprint(check.Port))
	retries := check.Retries
	if retries < 1 {
		retries = 1
	}
	var lastErr error
	for i := 0; i < retries; i++ {
		conn, err := net.DialTimeout("tcp", addr, check.Interval)
		if err == nil {
			conn.Close()
			return nil
		}
		lastErr = err
		if i < retries-1 {
			time.Sleep(check.Interval)
		}
	}
	return fmt.Errorf("TCP health check failed after %d retries: %w", retries, lastErr)
}
// waitExec runs the configured command via "sh -c" until it exits zero or
// the retry budget is exhausted.
//
// A Retries value below 1 is treated as a single attempt, consistent with
// the HTTP and TCP checks.
//
// NOTE(review): the command runs on the host, not inside the instance —
// instanceName is unused here. Confirm callers embed the instance context in
// check.Command (e.g. via machinectl/nsenter) if in-instance checks are
// intended.
func (d *DefaultHealthChecker) waitExec(instanceName string, check HealthCheck) error {
	retries := check.Retries
	if retries < 1 {
		retries = 1
	}
	var lastErr error
	for i := 0; i < retries; i++ {
		err := exec.Command("sh", "-c", check.Command).Run()
		if err == nil {
			return nil
		}
		lastErr = err
		if i < retries-1 {
			time.Sleep(check.Interval)
		}
	}
	return fmt.Errorf("exec health check failed after %d retries: %w", retries, lastErr)
}
// ── Noop Health Checker ──────────────────────────────────────────────────────
// NoopHealthChecker always returns healthy. Used for rollbacks and when
// health checking is disabled.
type NoopHealthChecker struct{}

// WaitHealthy always succeeds immediately, regardless of the instance name
// or check configuration.
func (n *NoopHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
	return nil
}

186
pkg/deploy/history.go Normal file
View File

@@ -0,0 +1,186 @@
/*
History — Persistent deployment history for Volt.
Stores deployment records as YAML in /var/lib/volt/deployments/.
Each target gets its own history file to keep lookups fast.
Copyright (c) Armored Gates LLC. All rights reserved.
*/
package deploy
import (
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"gopkg.in/yaml.v3"
)
// ── Constants ────────────────────────────────────────────────────────────────
const (
	// DefaultHistoryDir is where deployment history files are stored when
	// NewHistoryStore is given an empty directory.
	DefaultHistoryDir = "/var/lib/volt/deployments"
)
// ── History Entry ────────────────────────────────────────────────────────────
// HistoryEntry records a single deployment operation. Entries are persisted
// as YAML (one list per target file) and also JSON-taggable for API output.
type HistoryEntry struct {
	ID               string    `yaml:"id" json:"id"`
	Target           string    `yaml:"target" json:"target"`
	Strategy         string    `yaml:"strategy" json:"strategy"` // "rolling" or "canary"
	OldRef           string    `yaml:"old_ref" json:"old_ref"`   // previous CAS ref/image; rollback target
	NewRef           string    `yaml:"new_ref" json:"new_ref"`
	Status           string    `yaml:"status" json:"status"` // "complete", "failed", "rolling-back"
	StartedAt        time.Time `yaml:"started_at" json:"started_at"`
	CompletedAt      time.Time `yaml:"completed_at" json:"completed_at"`
	InstancesUpdated int       `yaml:"instances_updated" json:"instances_updated"`
	Message          string    `yaml:"message,omitempty" json:"message,omitempty"` // failure detail, e.g. "health check timeout"
}
// ── History Store ────────────────────────────────────────────────────────────
// HistoryStore manages deployment history on disk.
//
// One YAML file per target (see historyFile). mu serializes every read and
// write so concurrent deployments cannot interleave a read-modify-write.
type HistoryStore struct {
	dir string
	mu  sync.Mutex
}
// NewHistoryStore creates a history store rooted at dir. An empty dir
// selects DefaultHistoryDir. The directory itself is created lazily on the
// first Append.
func NewHistoryStore(dir string) *HistoryStore {
	if dir == "" {
		return &HistoryStore{dir: DefaultHistoryDir}
	}
	return &HistoryStore{dir: dir}
}
// Dir returns the history directory path the store was constructed with.
func (h *HistoryStore) Dir() string {
	return h.dir
}
// historyFile returns the path to the history file for a target, replacing
// every character outside [A-Za-z0-9_-] with '_' so the target name is safe
// to use as a filename.
func (h *HistoryStore) historyFile(target string) string {
	sanitize := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z',
			r >= 'A' && r <= 'Z',
			r >= '0' && r <= '9',
			r == '-', r == '_':
			return r
		}
		return '_'
	}
	return filepath.Join(h.dir, strings.Map(sanitize, target)+".yaml")
}
// Append adds a deployment entry to the target's history file.
//
// The whole read-modify-write runs under h.mu and the write itself is atomic
// (tmp + rename in writeEntries), so concurrent appends cannot lose entries.
func (h *HistoryStore) Append(entry HistoryEntry) error {
	h.mu.Lock()
	defer h.mu.Unlock()
	if err := os.MkdirAll(h.dir, 0755); err != nil {
		return fmt.Errorf("history: create dir: %w", err)
	}
	// Load existing entries.
	// NOTE(review): a corrupt (unparseable) history file is silently
	// truncated here, not just a missing one — confirm that's intended.
	entries, _ := h.readEntries(entry.Target) // ignore error on first write
	// Append and write.
	entries = append(entries, entry)
	return h.writeEntries(entry.Target, entries)
}
// ListByTarget returns all deployment history for a target, most recent
// first. A target with no history yields an empty list, not an error.
func (h *HistoryStore) ListByTarget(target string) ([]HistoryEntry, error) {
	h.mu.Lock()
	defer h.mu.Unlock()
	entries, err := h.readEntries(target)
	if err != nil {
		return nil, err
	}
	// Newest first: entry i precedes entry j when j started earlier.
	sort.Slice(entries, func(i, j int) bool {
		return entries[j].StartedAt.Before(entries[i].StartedAt)
	})
	return entries, nil
}
// ListAll returns all deployment history across all targets, most recent first.
//
// Listing is best-effort: files that cannot be read or parsed are skipped
// rather than failing the whole operation. Temp files from in-flight atomic
// writes end in ".yaml.tmp" and thus never match the "*.yaml" glob.
func (h *HistoryStore) ListAll() ([]HistoryEntry, error) {
	h.mu.Lock()
	defer h.mu.Unlock()
	files, err := filepath.Glob(filepath.Join(h.dir, "*.yaml"))
	if err != nil {
		return nil, fmt.Errorf("history: glob: %w", err)
	}
	var all []HistoryEntry
	for _, f := range files {
		data, err := os.ReadFile(f)
		if err != nil {
			continue // unreadable file: skip
		}
		var entries []HistoryEntry
		if err := yaml.Unmarshal(data, &entries); err != nil {
			continue // corrupt file: skip
		}
		all = append(all, entries...)
	}
	// Sort by StartedAt descending (most recent first).
	sort.Slice(all, func(i, j int) bool {
		return all[i].StartedAt.After(all[j].StartedAt)
	})
	return all, nil
}
// readEntries loads entries from the history file for a target. A missing
// file yields (nil, nil) rather than an error. Caller must hold h.mu.
func (h *HistoryStore) readEntries(target string) ([]HistoryEntry, error) {
	filePath := h.historyFile(target)
	raw, err := os.ReadFile(filePath)
	if os.IsNotExist(err) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("history: read %s: %w", filePath, err)
	}
	var entries []HistoryEntry
	if err := yaml.Unmarshal(raw, &entries); err != nil {
		return nil, fmt.Errorf("history: parse %s: %w", filePath, err)
	}
	return entries, nil
}
// writeEntries writes entries to the history file for a target.
// Caller must hold h.mu.
func (h *HistoryStore) writeEntries(target string, entries []HistoryEntry) error {
	filePath := h.historyFile(target)
	data, err := yaml.Marshal(entries)
	if err != nil {
		return fmt.Errorf("history: marshal: %w", err)
	}
	// Atomic write: tmp + rename. Rename within a directory is atomic on
	// POSIX filesystems, so readers never observe a partially written file.
	// NOTE(review): the temp file is not fsync'd before rename; a crash could
	// leave stale or empty data — confirm that's acceptable for history.
	tmpPath := filePath + ".tmp"
	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
		return fmt.Errorf("history: write %s: %w", tmpPath, err)
	}
	if err := os.Rename(tmpPath, filePath); err != nil {
		os.Remove(tmpPath)
		return fmt.Errorf("history: rename %s: %w", filePath, err)
	}
	return nil
}

46
pkg/deploy/io.go Normal file
View File

@@ -0,0 +1,46 @@
/*
IO helpers — Thin wrappers for filesystem and system operations.
Isolated here so tests can verify logic without needing OS-level mocks.
Copyright (c) Armored Gates LLC. All rights reserved.
*/
package deploy
import (
	"fmt"
	"os"
	"os/exec"
)
// readFile reads a file's entire contents. Wraps os.ReadFile so callers can
// be exercised in tests without OS-level mocks.
func readFile(path string) ([]byte, error) {
	return os.ReadFile(path)
}
// writeFile writes data to a file atomically. Wraps os.WriteFile.
func writeFile(path string, data []byte) error {
return os.WriteFile(path, data, 0644)
}
// appendFile appends data to a file, creating it if necessary.
func appendFile(path string, data []byte) error {
f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
return err
}
defer f.Close()
_, err = f.Write(data)
return err
}
// fileInfo returns os.FileInfo for the given path. Wraps os.Stat so callers
// can be exercised in tests without OS-level mocks.
func fileInfo(path string) (os.FileInfo, error) {
	return os.Stat(path)
}
// runSystemctl runs a systemctl subcommand against the given unit.
//
// On failure the returned error includes the command's combined output, so
// callers see the systemd diagnostic (e.g. "Unit foo.service not found")
// instead of a bare exit status — the previous implementation captured the
// output and then discarded it.
func runSystemctl(action, unit string) error {
	cmd := exec.Command("systemctl", action, unit)
	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("systemctl %s %s: %w: %s", action, unit, err, out)
	}
	return nil
}