Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
733
pkg/deploy/deploy.go
Normal file
733
pkg/deploy/deploy.go
Normal file
@@ -0,0 +1,733 @@
|
||||
/*
|
||||
Deploy — Rolling and canary deployment strategies for Volt workloads.
|
||||
|
||||
Coordinates zero-downtime updates for containers and workloads by
|
||||
orchestrating instance creation, health verification, traffic shifting,
|
||||
and automatic rollback on failure.
|
||||
|
||||
Since Volt uses CAS (content-addressed storage) for rootfs assembly,
|
||||
"updating" a workload means pointing it to a new CAS ref and having
|
||||
TinyVol reassemble the directory tree from the new blob manifest.
|
||||
|
||||
Strategies:
|
||||
rolling — Update instances one-by-one (respecting MaxSurge/MaxUnavail)
|
||||
canary — Route a percentage of traffic to a new instance before full rollout
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Strategy ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Strategy defines the deployment approach.
type Strategy string

const (
	// StrategyRolling updates instances one-by-one with health verification.
	StrategyRolling Strategy = "rolling"
	// StrategyCanary routes a percentage of traffic to a new instance first.
	StrategyCanary Strategy = "canary"
)
|
||||
|
||||
// ── Configuration ────────────────────────────────────────────────────────────
|
||||
|
||||
// DeployConfig holds all parameters for a deployment operation.
// Zero values are filled in by Validate, so callers may populate only
// the fields they care about before passing it to RollingDeploy/CanaryDeploy.
type DeployConfig struct {
	Strategy     Strategy      // Deployment strategy ("rolling" or "canary")
	Target       string        // Container/workload name or pattern
	NewImage     string        // New CAS ref or image path to deploy
	MaxSurge     int           // Max extra instances during rolling (default: 1)
	MaxUnavail   int           // Max unavailable during rolling (default: 0)
	CanaryWeight int           // Canary traffic percentage (1-99); required for canary
	HealthCheck  HealthCheck   // How to verify new instance is healthy (defaulted by Validate)
	Timeout      time.Duration // Max time for the entire deployment (default: 10m)
	AutoRollback bool          // Rollback on failure
}
|
||||
|
||||
// Validate checks that the config is usable and fills in defaults.
|
||||
func (c *DeployConfig) Validate() error {
|
||||
if c.Target == "" {
|
||||
return fmt.Errorf("deploy: target is required")
|
||||
}
|
||||
if c.NewImage == "" {
|
||||
return fmt.Errorf("deploy: new image (CAS ref) is required")
|
||||
}
|
||||
|
||||
switch c.Strategy {
|
||||
case StrategyRolling:
|
||||
if c.MaxSurge <= 0 {
|
||||
c.MaxSurge = 1
|
||||
}
|
||||
if c.MaxUnavail < 0 {
|
||||
c.MaxUnavail = 0
|
||||
}
|
||||
case StrategyCanary:
|
||||
if c.CanaryWeight <= 0 || c.CanaryWeight >= 100 {
|
||||
return fmt.Errorf("deploy: canary weight must be between 1 and 99, got %d", c.CanaryWeight)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("deploy: unknown strategy %q (use 'rolling' or 'canary')", c.Strategy)
|
||||
}
|
||||
|
||||
if c.Timeout <= 0 {
|
||||
c.Timeout = 10 * time.Minute
|
||||
}
|
||||
if c.HealthCheck.Type == "" {
|
||||
c.HealthCheck.Type = "none"
|
||||
}
|
||||
if c.HealthCheck.Interval <= 0 {
|
||||
c.HealthCheck.Interval = 5 * time.Second
|
||||
}
|
||||
if c.HealthCheck.Retries <= 0 {
|
||||
c.HealthCheck.Retries = 3
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Deploy Status ────────────────────────────────────────────────────────────
|
||||
|
||||
// Phase represents the current phase of a deployment.
type Phase string

const (
	PhasePreparing   Phase = "preparing"    // validating config / discovering instances
	PhaseDeploying   Phase = "deploying"    // instances are being updated or created
	PhaseVerifying   Phase = "verifying"    // waiting for health checks to pass
	PhaseComplete    Phase = "complete"     // deployment finished successfully
	PhaseRollingBack Phase = "rolling-back" // reverting updated instances after a failure
	PhaseFailed      Phase = "failed"       // terminal failure state
	PhasePaused      Phase = "paused"       // not set by this file's code paths
)
|
||||
|
||||
// DeployStatus tracks the progress of an active deployment.
// Copies of this struct are handed to ProgressFunc callbacks and to the
// active-deployments registry; it is serializable for CLI/API output.
type DeployStatus struct {
	ID          string    `json:"id" yaml:"id"`
	Phase       Phase     `json:"phase" yaml:"phase"`
	Progress    string    `json:"progress" yaml:"progress"`       // e.g. "2/5 instances updated"
	OldVersion  string    `json:"old_version" yaml:"old_version"` // previous CAS ref
	NewVersion  string    `json:"new_version" yaml:"new_version"` // target CAS ref
	Target      string    `json:"target" yaml:"target"`
	Strategy    Strategy  `json:"strategy" yaml:"strategy"`
	StartedAt   time.Time `json:"started_at" yaml:"started_at"`
	CompletedAt time.Time `json:"completed_at,omitempty" yaml:"completed_at,omitempty"` // zero while in flight
	Message     string    `json:"message,omitempty" yaml:"message,omitempty"`           // human-readable detail, set on failure/rollback
}
|
||||
|
||||
// ── Instance abstraction ─────────────────────────────────────────────────────
|
||||
|
||||
// Instance represents a single running workload instance that can be deployed to.
type Instance struct {
	Name    string // Instance name (e.g., "web-app-1")
	Image   string // Current CAS ref or image
	Status  string // "running", "stopped", etc.
	Healthy bool   // Last known health state
}
|
||||
|
||||
// ── Executor interface ───────────────────────────────────────────────────────
|
||||
|
||||
// Executor abstracts the system operations needed for deployments.
// This allows testing without real systemd/nspawn/nftables calls.
// SystemExecutor (below) is the production implementation.
type Executor interface {
	// ListInstances returns all instances matching the target pattern.
	ListInstances(target string) ([]Instance, error)

	// CreateInstance creates a new instance with the given image.
	CreateInstance(name, image string) error

	// StartInstance starts a stopped instance.
	StartInstance(name string) error

	// StopInstance stops a running instance.
	StopInstance(name string) error

	// DeleteInstance removes an instance entirely.
	DeleteInstance(name string) error

	// GetInstanceImage returns the current image/CAS ref for an instance.
	GetInstanceImage(name string) (string, error)

	// UpdateInstanceImage updates an instance to use a new image (CAS ref).
	// This reassembles the rootfs via TinyVol and restarts the instance.
	UpdateInstanceImage(name, newImage string) error

	// UpdateTrafficWeight adjusts traffic routing for canary deployments.
	// weight is 0-100 representing percentage to the canary instance.
	UpdateTrafficWeight(target string, canaryName string, weight int) error
}
|
||||
|
||||
// ── Active deployments tracking ──────────────────────────────────────────────
|
||||
|
||||
var (
	// activeDeployments tracks in-flight deployments keyed by target name,
	// guarded by activeDeploymentsMu. One deployment per target at a time.
	activeDeployments   = make(map[string]*DeployStatus)
	activeDeploymentsMu sync.RWMutex
)
|
||||
|
||||
// GetActiveDeployments returns a snapshot of all active deployments.
|
||||
func GetActiveDeployments() []DeployStatus {
|
||||
activeDeploymentsMu.RLock()
|
||||
defer activeDeploymentsMu.RUnlock()
|
||||
|
||||
result := make([]DeployStatus, 0, len(activeDeployments))
|
||||
for _, ds := range activeDeployments {
|
||||
result = append(result, *ds)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetActiveDeployment returns the active deployment for a target, if any.
|
||||
func GetActiveDeployment(target string) *DeployStatus {
|
||||
activeDeploymentsMu.RLock()
|
||||
defer activeDeploymentsMu.RUnlock()
|
||||
|
||||
if ds, ok := activeDeployments[target]; ok {
|
||||
cp := *ds
|
||||
return &cp
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setActiveDeployment(ds *DeployStatus) {
|
||||
activeDeploymentsMu.Lock()
|
||||
defer activeDeploymentsMu.Unlock()
|
||||
activeDeployments[ds.Target] = ds
|
||||
}
|
||||
|
||||
func removeActiveDeployment(target string) {
|
||||
activeDeploymentsMu.Lock()
|
||||
defer activeDeploymentsMu.Unlock()
|
||||
delete(activeDeployments, target)
|
||||
}
|
||||
|
||||
// ── Progress callback ────────────────────────────────────────────────────────
|
||||
|
||||
// ProgressFunc is called with status updates during deployment.
// It receives a copy of the current DeployStatus; implementations must not
// block for long, as they are invoked synchronously from the deploy loop.
type ProgressFunc func(status DeployStatus)
|
||||
|
||||
// ── Rolling Deploy ───────────────────────────────────────────────────────────
|
||||
|
||||
// RollingDeploy performs a rolling update of instances matching cfg.Target.
//
// Algorithm:
//  1. List all instances matching the target pattern
//  2. For each instance (respecting MaxSurge / MaxUnavail):
//     a. Update instance image to new CAS ref (reassemble rootfs via TinyVol)
//     b. Start/restart the instance
//     c. Wait for health check to pass
//     d. If health check fails and AutoRollback: revert to old image
//  3. Record deployment in history
//
// hist and progress may be nil (see recordHistory / notifyProgress).
// On any failure the deployment is removed from the active registry,
// recorded in history, and a non-nil error is returned.
func RollingDeploy(cfg DeployConfig, exec Executor, hc HealthChecker, hist *HistoryStore, progress ProgressFunc) error {
	if err := cfg.Validate(); err != nil {
		return err
	}

	// Generate deployment ID.
	deployID := generateDeployID()

	status := &DeployStatus{
		ID:         deployID,
		Phase:      PhasePreparing,
		Target:     cfg.Target,
		Strategy:   StrategyRolling,
		NewVersion: cfg.NewImage,
		StartedAt:  time.Now().UTC(),
	}
	setActiveDeployment(status)
	notifyProgress(progress, *status)

	// 1. Discover instances.
	instances, err := exec.ListInstances(cfg.Target)
	if err != nil {
		// NOTE(review): the early failure paths below do not stamp
		// status.CompletedAt, unlike the in-loop failure paths — confirm
		// whether history consumers rely on CompletedAt being set.
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to list instances: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}
	if len(instances) == 0 {
		status.Phase = PhaseFailed
		status.Message = "no instances found matching target"
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// Record old version from first instance. A lookup error leaves
	// OldVersion empty; rollback then reverts to an empty ref.
	if len(instances) > 0 {
		oldImg, _ := exec.GetInstanceImage(instances[0].Name)
		status.OldVersion = oldImg
	}

	total := len(instances)
	updated := 0
	var rollbackTargets []string // instances that were updated (for rollback)

	status.Phase = PhaseDeploying
	status.Progress = fmt.Sprintf("0/%d instances updated", total)
	notifyProgress(progress, *status)

	// Timeout enforcement. The deadline is checked only between instances,
	// so a single slow update/health-check can overrun cfg.Timeout.
	deadline := time.Now().Add(cfg.Timeout)

	// 2. Rolling update loop.
	for i, inst := range instances {
		if time.Now().After(deadline) {
			err := fmt.Errorf("deployment timed out after %s", cfg.Timeout)
			if cfg.AutoRollback && len(rollbackTargets) > 0 {
				status.Phase = PhaseRollingBack
				status.Message = err.Error()
				notifyProgress(progress, *status)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
			}
			status.Phase = PhaseFailed
			status.Message = err.Error()
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return err
		}

		// Respect MaxSurge: we update in-place, so surge is about allowing
		// brief overlap. With MaxUnavail=0 and MaxSurge=1, we update one at a time.
		_ = cfg.MaxSurge // In single-node mode, surge is handled by updating in-place.

		status.Progress = fmt.Sprintf("%d/%d instances updated (updating %s)", i, total, inst.Name)
		notifyProgress(progress, *status)

		// a. Update the instance image. The failed instance itself is NOT
		// added to rollbackTargets here: the update never took effect.
		if err := exec.UpdateInstanceImage(inst.Name, cfg.NewImage); err != nil {
			errMsg := fmt.Sprintf("failed to update instance %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				status.Phase = PhaseRollingBack
				status.Message = errMsg
				notifyProgress(progress, *status)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
				status.Phase = PhaseFailed
			} else {
				status.Phase = PhaseFailed
			}
			status.Message = errMsg
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return fmt.Errorf("deploy: %s", errMsg)
		}

		// b. Start the instance.
		if err := exec.StartInstance(inst.Name); err != nil {
			errMsg := fmt.Sprintf("failed to start instance %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				status.Phase = PhaseRollingBack
				status.Message = errMsg
				notifyProgress(progress, *status)
				// Rollback this instance too.
				rollbackTargets = append(rollbackTargets, inst.Name)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
				status.Phase = PhaseFailed
			} else {
				status.Phase = PhaseFailed
			}
			status.Message = errMsg
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return fmt.Errorf("deploy: %s", errMsg)
		}

		// c. Health check.
		status.Phase = PhaseVerifying
		notifyProgress(progress, *status)

		if err := hc.WaitHealthy(inst.Name, cfg.HealthCheck); err != nil {
			errMsg := fmt.Sprintf("health check failed for %s: %v", inst.Name, err)
			if cfg.AutoRollback {
				status.Phase = PhaseRollingBack
				status.Message = errMsg
				notifyProgress(progress, *status)
				// The unhealthy instance runs the new image, so it must be
				// reverted along with everything updated so far.
				rollbackTargets = append(rollbackTargets, inst.Name)
				rollbackInstances(exec, rollbackTargets, status.OldVersion)
				status.Phase = PhaseFailed
			} else {
				status.Phase = PhaseFailed
			}
			status.Message = errMsg
			status.CompletedAt = time.Now().UTC()
			notifyProgress(progress, *status)
			removeActiveDeployment(cfg.Target)
			recordHistory(hist, status, updated)
			return fmt.Errorf("deploy: %s", errMsg)
		}

		// Instance is live on the new image; remember it for potential rollback.
		rollbackTargets = append(rollbackTargets, inst.Name)
		updated++
		status.Phase = PhaseDeploying
		status.Progress = fmt.Sprintf("%d/%d instances updated", updated, total)
		notifyProgress(progress, *status)
	}

	// 3. Complete.
	status.Phase = PhaseComplete
	status.Progress = fmt.Sprintf("%d/%d instances updated", updated, total)
	status.CompletedAt = time.Now().UTC()
	notifyProgress(progress, *status)
	removeActiveDeployment(cfg.Target)
	recordHistory(hist, status, updated)

	return nil
}
|
||||
|
||||
// ── Canary Deploy ────────────────────────────────────────────────────────────
|
||||
|
||||
// CanaryDeploy creates a canary instance alongside existing instances and
// routes cfg.CanaryWeight percent of traffic to it.
//
// Algorithm:
//  1. List existing instances
//  2. Create a new canary instance with the new image
//  3. Start the canary and verify health
//  4. Update traffic routing to send CanaryWeight% to canary
//  5. If health fails and AutoRollback: remove canary, restore routing
//
// NOTE(review): step 5's "restore routing" is implemented only as canary
// removal (cleanupCanary); no explicit UpdateTrafficWeight reset happens,
// presumably because the weight is only applied after a healthy canary —
// confirm this matches the routing backend's behavior.
func CanaryDeploy(cfg DeployConfig, exec Executor, hc HealthChecker, hist *HistoryStore, progress ProgressFunc) error {
	if err := cfg.Validate(); err != nil {
		return err
	}

	deployID := generateDeployID()

	status := &DeployStatus{
		ID:         deployID,
		Phase:      PhasePreparing,
		Target:     cfg.Target,
		Strategy:   StrategyCanary,
		NewVersion: cfg.NewImage,
		StartedAt:  time.Now().UTC(),
	}
	setActiveDeployment(status)
	notifyProgress(progress, *status)

	// 1. Discover existing instances.
	instances, err := exec.ListInstances(cfg.Target)
	if err != nil {
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to list instances: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}
	if len(instances) == 0 {
		status.Phase = PhaseFailed
		status.Message = "no instances found matching target"
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// Record old version. A lookup failure simply leaves OldVersion empty.
	if oldImg, err := exec.GetInstanceImage(instances[0].Name); err == nil {
		status.OldVersion = oldImg
	}

	// 2. Create canary instance.
	canaryName := canaryInstanceName(cfg.Target)

	status.Phase = PhaseDeploying
	status.Progress = fmt.Sprintf("creating canary instance %s", canaryName)
	notifyProgress(progress, *status)

	if err := exec.CreateInstance(canaryName, cfg.NewImage); err != nil {
		// NOTE(review): pre-verify failure paths do not stamp CompletedAt,
		// unlike the later ones — confirm whether that is intentional.
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to create canary: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// 3. Start canary and verify health. A start failure always cleans up
	// the canary (even without AutoRollback) since it was just created.
	if err := exec.StartInstance(canaryName); err != nil {
		cleanupCanary(exec, canaryName)
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to start canary: %v", err)
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	status.Phase = PhaseVerifying
	status.Progress = "verifying canary health"
	notifyProgress(progress, *status)

	if err := hc.WaitHealthy(canaryName, cfg.HealthCheck); err != nil {
		if cfg.AutoRollback {
			status.Phase = PhaseRollingBack
			status.Message = fmt.Sprintf("canary health check failed: %v", err)
			notifyProgress(progress, *status)
			cleanupCanary(exec, canaryName)
		}
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("canary health check failed: %v", err)
		status.CompletedAt = time.Now().UTC()
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// 4. Update traffic routing.
	status.Progress = fmt.Sprintf("routing %d%% traffic to canary", cfg.CanaryWeight)
	notifyProgress(progress, *status)

	if err := exec.UpdateTrafficWeight(cfg.Target, canaryName, cfg.CanaryWeight); err != nil {
		if cfg.AutoRollback {
			cleanupCanary(exec, canaryName)
		}
		status.Phase = PhaseFailed
		status.Message = fmt.Sprintf("failed to update traffic routing: %v", err)
		status.CompletedAt = time.Now().UTC()
		notifyProgress(progress, *status)
		removeActiveDeployment(cfg.Target)
		recordHistory(hist, status, 0)
		return fmt.Errorf("deploy: %s", status.Message)
	}

	// 5. Canary is live. The canary instance intentionally stays running;
	// full promotion/teardown is a separate operation.
	status.Phase = PhaseComplete
	status.Progress = fmt.Sprintf("canary live with %d%% traffic", cfg.CanaryWeight)
	status.CompletedAt = time.Now().UTC()
	notifyProgress(progress, *status)
	removeActiveDeployment(cfg.Target)
	recordHistory(hist, status, 1)

	return nil
}
|
||||
|
||||
// ── Rollback ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Rollback reverts a target to its previous version using deployment history.
|
||||
func Rollback(target string, exec Executor, hist *HistoryStore, progress ProgressFunc) error {
|
||||
if hist == nil {
|
||||
return fmt.Errorf("deploy rollback: no history store available")
|
||||
}
|
||||
|
||||
entries, err := hist.ListByTarget(target)
|
||||
if err != nil {
|
||||
return fmt.Errorf("deploy rollback: failed to read history: %w", err)
|
||||
}
|
||||
|
||||
// Find the last successful deployment that has a different version.
|
||||
var previousRef string
|
||||
for _, entry := range entries {
|
||||
if entry.Status == string(PhaseComplete) && entry.OldRef != "" {
|
||||
previousRef = entry.OldRef
|
||||
break
|
||||
}
|
||||
}
|
||||
if previousRef == "" {
|
||||
return fmt.Errorf("deploy rollback: no previous version found in history for %q", target)
|
||||
}
|
||||
|
||||
status := &DeployStatus{
|
||||
ID: generateDeployID(),
|
||||
Phase: PhaseRollingBack,
|
||||
Target: target,
|
||||
Strategy: StrategyRolling,
|
||||
NewVersion: previousRef,
|
||||
StartedAt: time.Now().UTC(),
|
||||
Message: "rollback to previous version",
|
||||
}
|
||||
notifyProgress(progress, *status)
|
||||
|
||||
// Perform a rolling deploy with the previous ref.
|
||||
rollbackCfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: target,
|
||||
NewImage: previousRef,
|
||||
MaxSurge: 1,
|
||||
MaxUnavail: 0,
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 5 * time.Minute,
|
||||
AutoRollback: false, // Don't auto-rollback a rollback
|
||||
}
|
||||
|
||||
return RollingDeploy(rollbackCfg, exec, &NoopHealthChecker{}, hist, progress)
|
||||
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// rollbackInstances reverts a list of instances to the old image.
|
||||
func rollbackInstances(exec Executor, names []string, oldImage string) {
|
||||
for _, name := range names {
|
||||
_ = exec.UpdateInstanceImage(name, oldImage)
|
||||
_ = exec.StartInstance(name)
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupCanary stops and removes a canary instance.
|
||||
func cleanupCanary(exec Executor, canaryName string) {
|
||||
_ = exec.StopInstance(canaryName)
|
||||
_ = exec.DeleteInstance(canaryName)
|
||||
}
|
||||
|
||||
// canaryInstanceName generates a canary instance name from the target.
// Trailing digits and hyphens are stripped ("web-app-1" → "web-app"),
// then "-canary" is appended. If stripping consumes the whole string
// (an all-numeric target), the original target is used as the base.
func canaryInstanceName(target string) string {
	trimmed := strings.TrimRight(target, "-0123456789")
	if trimmed == "" {
		trimmed = target
	}
	return trimmed + "-canary"
}
|
||||
|
||||
// generateDeployID creates a unique deployment ID.
|
||||
func generateDeployID() string {
|
||||
return fmt.Sprintf("deploy-%d", time.Now().UnixNano()/int64(time.Millisecond))
|
||||
}
|
||||
|
||||
// notifyProgress safely calls the progress callback if non-nil.
|
||||
func notifyProgress(fn ProgressFunc, status DeployStatus) {
|
||||
if fn != nil {
|
||||
fn(status)
|
||||
}
|
||||
}
|
||||
|
||||
// recordHistory saves a deployment to the history store if available.
|
||||
func recordHistory(hist *HistoryStore, status *DeployStatus, instancesUpdated int) {
|
||||
if hist == nil {
|
||||
return
|
||||
}
|
||||
entry := HistoryEntry{
|
||||
ID: status.ID,
|
||||
Target: status.Target,
|
||||
Strategy: string(status.Strategy),
|
||||
OldRef: status.OldVersion,
|
||||
NewRef: status.NewVersion,
|
||||
Status: string(status.Phase),
|
||||
StartedAt: status.StartedAt,
|
||||
CompletedAt: status.CompletedAt,
|
||||
InstancesUpdated: instancesUpdated,
|
||||
Message: status.Message,
|
||||
}
|
||||
_ = hist.Append(entry)
|
||||
}
|
||||
|
||||
// ── Default executor (real system calls) ─────────────────────────────────────
|
||||
|
||||
// DefaultCASDir is the default directory for CAS storage.
const DefaultCASDir = "/var/lib/volt/cas"

// SystemExecutor implements Executor using real system commands.
// It resolves instances to directories under ContainerBaseDir and shells
// out to systemctl for lifecycle operations.
type SystemExecutor struct {
	ContainerBaseDir string // root directory holding one subdirectory per container
	CASBaseDir       string // root directory of the content-addressed store
}

// NewSystemExecutor creates an executor for real system operations,
// pointed at the standard Volt on-disk layout.
func NewSystemExecutor() *SystemExecutor {
	return &SystemExecutor{
		ContainerBaseDir: "/var/lib/volt/containers",
		CASBaseDir:       DefaultCASDir,
	}
}
|
||||
|
||||
func (e *SystemExecutor) ListInstances(target string) ([]Instance, error) {
|
||||
// Match instances by prefix or exact name.
|
||||
// Scan /var/lib/volt/containers for directories matching the pattern.
|
||||
var instances []Instance
|
||||
|
||||
entries, err := filepath.Glob(filepath.Join(e.ContainerBaseDir, target+"*"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list instances: %w", err)
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
name := filepath.Base(entry)
|
||||
instances = append(instances, Instance{
|
||||
Name: name,
|
||||
Status: "unknown",
|
||||
})
|
||||
}
|
||||
|
||||
// If no glob matches, try exact match.
|
||||
if len(instances) == 0 {
|
||||
exact := filepath.Join(e.ContainerBaseDir, target)
|
||||
if info, err := fileInfo(exact); err == nil && info.IsDir() {
|
||||
instances = append(instances, Instance{
|
||||
Name: target,
|
||||
Status: "unknown",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return instances, nil
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) CreateInstance(name, image string) error {
|
||||
// Create container directory and write unit file.
|
||||
// In a real implementation this would use the backend.Create flow.
|
||||
return fmt.Errorf("SystemExecutor.CreateInstance not yet wired to backend")
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) StartInstance(name string) error {
|
||||
return runSystemctl("start", voltContainerUnit(name))
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) StopInstance(name string) error {
|
||||
return runSystemctl("stop", voltContainerUnit(name))
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) DeleteInstance(name string) error {
|
||||
return fmt.Errorf("SystemExecutor.DeleteInstance not yet wired to backend")
|
||||
}
|
||||
|
||||
func (e *SystemExecutor) GetInstanceImage(name string) (string, error) {
|
||||
// Read the CAS ref from the instance's metadata.
|
||||
// Stored in /var/lib/volt/containers/<name>/.volt-cas-ref
|
||||
refPath := filepath.Join(e.ContainerBaseDir, name, ".volt-cas-ref")
|
||||
data, err := readFile(refPath)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("no CAS ref found for instance %s", name)
|
||||
}
|
||||
return strings.TrimSpace(string(data)), nil
|
||||
}
|
||||
|
||||
// UpdateInstanceImage stops the instance and records the new CAS ref in
// the instance's .volt-cas-ref metadata file.
//
// NOTE(review): this implementation does NOT restart the instance,
// although the Executor interface doc says the update "restarts the
// instance". RollingDeploy compensates by calling StartInstance right
// after — confirm whether the interface doc or this implementation
// should change. The rootfs reassembly via TinyVol mentioned in the
// interface doc is presumably triggered at start time; not visible here.
func (e *SystemExecutor) UpdateInstanceImage(name, newImage string) error {
	// 1. Stop the instance. The error is ignored: the instance may
	// already be stopped, and the ref write below is the real work.
	_ = runSystemctl("stop", voltContainerUnit(name))

	// 2. Write new CAS ref.
	refPath := filepath.Join(e.ContainerBaseDir, name, ".volt-cas-ref")
	if err := writeFile(refPath, []byte(newImage)); err != nil {
		return fmt.Errorf("failed to write CAS ref: %w", err)
	}

	return nil
}
|
||||
|
||||
func (e *SystemExecutor) UpdateTrafficWeight(target, canaryName string, weight int) error {
|
||||
// In a full implementation this would update nftables rules for load balancing.
|
||||
// For now, record the weight in a metadata file.
|
||||
weightPath := filepath.Join(e.ContainerBaseDir, ".traffic-weights")
|
||||
data := fmt.Sprintf("%s:%s:%d\n", target, canaryName, weight)
|
||||
return appendFile(weightPath, []byte(data))
|
||||
}
|
||||
|
||||
// voltContainerUnit returns the systemd template-unit name for a
// container, e.g. "volt-container@web.service" for instance "web".
func voltContainerUnit(name string) string {
	return "volt-container@" + name + ".service"
}
|
||||
899
pkg/deploy/deploy_test.go
Normal file
899
pkg/deploy/deploy_test.go
Normal file
@@ -0,0 +1,899 @@
|
||||
/*
|
||||
Deploy Tests — Verifies rolling, canary, rollback, health check, and history logic.
|
||||
|
||||
Uses a mock executor and health checker so no real system calls are made.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Mock Executor ────────────────────────────────────────────────────────────
|
||||
|
||||
// mockExecutor records all operations for verification.
// All mutable state is guarded by mu; the error-injection maps are
// expected to be populated before the deployment under test starts.
type mockExecutor struct {
	mu sync.Mutex

	instances map[string]*Instance // name → instance
	images    map[string]string    // name → current image

	// Recorded operation log (one formatted string per Executor call).
	ops []string

	// Error injection.
	updateImageErr map[string]error // instance name → error to return
	startErr       map[string]error // instance name → error on StartInstance
	createErr      map[string]error // instance name → error on CreateInstance
	trafficWeights map[string]int   // canaryName → weight
}
|
||||
|
||||
func newMockExecutor(instances ...Instance) *mockExecutor {
|
||||
m := &mockExecutor{
|
||||
instances: make(map[string]*Instance),
|
||||
images: make(map[string]string),
|
||||
updateImageErr: make(map[string]error),
|
||||
startErr: make(map[string]error),
|
||||
createErr: make(map[string]error),
|
||||
trafficWeights: make(map[string]int),
|
||||
}
|
||||
for _, inst := range instances {
|
||||
cpy := inst
|
||||
m.instances[inst.Name] = &cpy
|
||||
m.images[inst.Name] = inst.Image
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func (m *mockExecutor) record(op string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.ops = append(m.ops, op)
|
||||
}
|
||||
|
||||
func (m *mockExecutor) getOps() []string {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
result := make([]string, len(m.ops))
|
||||
copy(result, m.ops)
|
||||
return result
|
||||
}
|
||||
|
||||
func (m *mockExecutor) ListInstances(target string) ([]Instance, error) {
|
||||
m.record(fmt.Sprintf("list:%s", target))
|
||||
var result []Instance
|
||||
for _, inst := range m.instances {
|
||||
if strings.HasPrefix(inst.Name, target) || inst.Name == target {
|
||||
result = append(result, *inst)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) CreateInstance(name, image string) error {
|
||||
m.record(fmt.Sprintf("create:%s:%s", name, image))
|
||||
if err, ok := m.createErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
m.instances[name] = &Instance{Name: name, Image: image, Status: "stopped"}
|
||||
m.images[name] = image
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) StartInstance(name string) error {
|
||||
m.record(fmt.Sprintf("start:%s", name))
|
||||
if err, ok := m.startErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
if inst, ok := m.instances[name]; ok {
|
||||
inst.Status = "running"
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) StopInstance(name string) error {
|
||||
m.record(fmt.Sprintf("stop:%s", name))
|
||||
m.mu.Lock()
|
||||
if inst, ok := m.instances[name]; ok {
|
||||
inst.Status = "stopped"
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) DeleteInstance(name string) error {
|
||||
m.record(fmt.Sprintf("delete:%s", name))
|
||||
m.mu.Lock()
|
||||
delete(m.instances, name)
|
||||
delete(m.images, name)
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) GetInstanceImage(name string) (string, error) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if img, ok := m.images[name]; ok {
|
||||
return img, nil
|
||||
}
|
||||
return "", fmt.Errorf("instance %s not found", name)
|
||||
}
|
||||
|
||||
func (m *mockExecutor) UpdateInstanceImage(name, newImage string) error {
|
||||
m.record(fmt.Sprintf("update-image:%s:%s", name, newImage))
|
||||
if err, ok := m.updateImageErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
m.images[name] = newImage
|
||||
if inst, ok := m.instances[name]; ok {
|
||||
inst.Image = newImage
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockExecutor) UpdateTrafficWeight(target, canaryName string, weight int) error {
|
||||
m.record(fmt.Sprintf("traffic:%s:%s:%d", target, canaryName, weight))
|
||||
m.mu.Lock()
|
||||
m.trafficWeights[canaryName] = weight
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Mock Health Checker ──────────────────────────────────────────────────────
|
||||
|
||||
// mockHealthChecker returns configurable results per instance.
|
||||
// mockHealthChecker returns configurable results per instance and records
// every WaitHealthy call for later inspection.
type mockHealthChecker struct {
	mu      sync.Mutex       // guards results and calls
	results map[string]error // instance name → error (nil = healthy)
	calls   []string         // instance names passed to WaitHealthy, in call order
}
|
||||
|
||||
func newMockHealthChecker() *mockHealthChecker {
|
||||
return &mockHealthChecker{
|
||||
results: make(map[string]error),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *mockHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
||||
h.mu.Lock()
|
||||
h.calls = append(h.calls, instanceName)
|
||||
err := h.results[instanceName]
|
||||
h.mu.Unlock()
|
||||
return err
|
||||
}
|
||||
|
||||
func (h *mockHealthChecker) getCalls() []string {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
result := make([]string, len(h.calls))
|
||||
copy(result, h.calls)
|
||||
return result
|
||||
}
|
||||
|
||||
// ── Progress Collector ───────────────────────────────────────────────────────
|
||||
|
||||
// progressCollector captures every DeployStatus emitted through the
// ProgressFunc callback so tests can assert on phases and messages.
type progressCollector struct {
	mu      sync.Mutex     // guards updates
	updates []DeployStatus // statuses in arrival order
}
|
||||
|
||||
func newProgressCollector() *progressCollector {
|
||||
return &progressCollector{}
|
||||
}
|
||||
|
||||
func (p *progressCollector) callback() ProgressFunc {
|
||||
return func(status DeployStatus) {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
p.updates = append(p.updates, status)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *progressCollector) getUpdates() []DeployStatus {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
result := make([]DeployStatus, len(p.updates))
|
||||
copy(result, p.updates)
|
||||
return result
|
||||
}
|
||||
|
||||
func (p *progressCollector) phases() []Phase {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
var phases []Phase
|
||||
for _, u := range p.updates {
|
||||
phases = append(phases, u.Phase)
|
||||
}
|
||||
return phases
|
||||
}
|
||||
|
||||
// ── Test: Rolling Deploy Order ───────────────────────────────────────────────
|
||||
|
||||
func TestRollingDeployOrder(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "web-1", Image: "sha256:old1", Status: "running"},
|
||||
Instance{Name: "web-2", Image: "sha256:old1", Status: "running"},
|
||||
Instance{Name: "web-3", Image: "sha256:old1", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
pc := newProgressCollector()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "web",
|
||||
NewImage: "sha256:new1",
|
||||
MaxSurge: 1,
|
||||
MaxUnavail: 0,
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 1 * time.Minute,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
if err != nil {
|
||||
t.Fatalf("RollingDeploy returned error: %v", err)
|
||||
}
|
||||
|
||||
// Verify all instances were updated.
|
||||
ops := exec.getOps()
|
||||
|
||||
// Count update-image operations.
|
||||
updateCount := 0
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "update-image:") {
|
||||
updateCount++
|
||||
// Verify new image is correct.
|
||||
if !strings.HasSuffix(op, ":sha256:new1") {
|
||||
t.Errorf("expected new image sha256:new1, got op: %s", op)
|
||||
}
|
||||
}
|
||||
}
|
||||
if updateCount != 3 {
|
||||
t.Errorf("expected 3 update-image ops, got %d", updateCount)
|
||||
}
|
||||
|
||||
// Verify instances are updated one at a time (each update is followed by start before next update).
|
||||
var updateOrder []string
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "update-image:web-") {
|
||||
name := strings.Split(op, ":")[1]
|
||||
updateOrder = append(updateOrder, name)
|
||||
}
|
||||
}
|
||||
if len(updateOrder) != 3 {
|
||||
t.Errorf("expected 3 instances updated in order, got %d", len(updateOrder))
|
||||
}
|
||||
|
||||
// Verify progress callback was called.
|
||||
phases := pc.phases()
|
||||
if len(phases) == 0 {
|
||||
t.Error("expected progress callbacks, got none")
|
||||
}
|
||||
|
||||
// First should be preparing, last should be complete.
|
||||
if phases[0] != PhasePreparing {
|
||||
t.Errorf("expected first phase to be preparing, got %s", phases[0])
|
||||
}
|
||||
lastPhase := phases[len(phases)-1]
|
||||
if lastPhase != PhaseComplete {
|
||||
t.Errorf("expected last phase to be complete, got %s", lastPhase)
|
||||
}
|
||||
|
||||
// Verify all images are now the new version.
|
||||
for _, name := range []string{"web-1", "web-2", "web-3"} {
|
||||
img, err := exec.GetInstanceImage(name)
|
||||
if err != nil {
|
||||
t.Errorf("GetInstanceImage(%s) error: %v", name, err)
|
||||
continue
|
||||
}
|
||||
if img != "sha256:new1" {
|
||||
t.Errorf("instance %s image = %s, want sha256:new1", name, img)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Canary Weight ──────────────────────────────────────────────────────
|
||||
|
||||
func TestCanaryWeight(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "api-1", Image: "sha256:v1", Status: "running"},
|
||||
Instance{Name: "api-2", Image: "sha256:v1", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
pc := newProgressCollector()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyCanary,
|
||||
Target: "api",
|
||||
NewImage: "sha256:v2",
|
||||
CanaryWeight: 20,
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 1 * time.Minute,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := CanaryDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
if err != nil {
|
||||
t.Fatalf("CanaryDeploy returned error: %v", err)
|
||||
}
|
||||
|
||||
// Verify canary instance was created.
|
||||
ops := exec.getOps()
|
||||
var createOps []string
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "create:") {
|
||||
createOps = append(createOps, op)
|
||||
}
|
||||
}
|
||||
if len(createOps) != 1 {
|
||||
t.Fatalf("expected 1 create op for canary, got %d: %v", len(createOps), createOps)
|
||||
}
|
||||
|
||||
// Verify the canary instance name and image.
|
||||
canaryName := canaryInstanceName("api")
|
||||
expectedCreate := fmt.Sprintf("create:%s:sha256:v2", canaryName)
|
||||
if createOps[0] != expectedCreate {
|
||||
t.Errorf("create op = %q, want %q", createOps[0], expectedCreate)
|
||||
}
|
||||
|
||||
// Verify traffic was routed with the correct weight.
|
||||
var trafficOps []string
|
||||
for _, op := range ops {
|
||||
if strings.HasPrefix(op, "traffic:") {
|
||||
trafficOps = append(trafficOps, op)
|
||||
}
|
||||
}
|
||||
if len(trafficOps) != 1 {
|
||||
t.Fatalf("expected 1 traffic op, got %d: %v", len(trafficOps), trafficOps)
|
||||
}
|
||||
expectedTraffic := fmt.Sprintf("traffic:api:%s:20", canaryName)
|
||||
if trafficOps[0] != expectedTraffic {
|
||||
t.Errorf("traffic op = %q, want %q", trafficOps[0], expectedTraffic)
|
||||
}
|
||||
|
||||
// Verify the canary weight was recorded.
|
||||
exec.mu.Lock()
|
||||
weight := exec.trafficWeights[canaryName]
|
||||
exec.mu.Unlock()
|
||||
if weight != 20 {
|
||||
t.Errorf("canary traffic weight = %d, want 20", weight)
|
||||
}
|
||||
|
||||
// Verify original instances were not modified.
|
||||
for _, name := range []string{"api-1", "api-2"} {
|
||||
img, _ := exec.GetInstanceImage(name)
|
||||
if img != "sha256:v1" {
|
||||
t.Errorf("original instance %s image changed to %s, should still be sha256:v1", name, img)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify progress shows canary-specific messages.
|
||||
updates := pc.getUpdates()
|
||||
foundCanaryProgress := false
|
||||
for _, u := range updates {
|
||||
if strings.Contains(u.Progress, "canary") || strings.Contains(u.Progress, "traffic") {
|
||||
foundCanaryProgress = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundCanaryProgress {
|
||||
t.Error("expected canary-related progress messages")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Rollback Restores Previous ─────────────────────────────────────────
|
||||
|
||||
func TestRollbackRestoresPrevious(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "app-1", Image: "sha256:v2", Status: "running"},
|
||||
)
|
||||
_ = newMockHealthChecker()
|
||||
pc := newProgressCollector()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
// Seed history with a previous successful deployment.
|
||||
_ = hist.Append(HistoryEntry{
|
||||
ID: "deploy-prev",
|
||||
Target: "app",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:v1",
|
||||
NewRef: "sha256:v2",
|
||||
Status: string(PhaseComplete),
|
||||
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||
CompletedAt: time.Now().Add(-50 * time.Minute),
|
||||
InstancesUpdated: 1,
|
||||
})
|
||||
|
||||
err := Rollback("app", exec, hist, pc.callback())
|
||||
if err != nil {
|
||||
t.Fatalf("Rollback returned error: %v", err)
|
||||
}
|
||||
|
||||
// Verify the instance was updated back to v1.
|
||||
img, err := exec.GetInstanceImage("app-1")
|
||||
if err != nil {
|
||||
t.Fatalf("GetInstanceImage error: %v", err)
|
||||
}
|
||||
if img != "sha256:v1" {
|
||||
t.Errorf("after rollback, instance image = %s, want sha256:v1", img)
|
||||
}
|
||||
|
||||
// Verify rollback was recorded in history.
|
||||
entries, err := hist.ListByTarget("app")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
// Should have the original entry + the rollback entry.
|
||||
if len(entries) < 2 {
|
||||
t.Errorf("expected at least 2 history entries, got %d", len(entries))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Health Check Fail Triggers Rollback ────────────────────────────────
|
||||
|
||||
func TestHealthCheckFailTriggersRollback(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "svc-1", Image: "sha256:old", Status: "running"},
|
||||
Instance{Name: "svc-2", Image: "sha256:old", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
// Make svc-2 fail health check after being updated.
|
||||
// Since instances are iterated from the map, we set both to fail
|
||||
// but we only need to verify that when any fails, rollback happens.
|
||||
hc.results["svc-1"] = nil // svc-1 is healthy
|
||||
hc.results["svc-2"] = fmt.Errorf("connection refused")
|
||||
|
||||
pc := newProgressCollector()
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "svc",
|
||||
NewImage: "sha256:bad",
|
||||
MaxSurge: 1,
|
||||
MaxUnavail: 0,
|
||||
HealthCheck: HealthCheck{Type: "tcp", Port: 8080, Interval: 100 * time.Millisecond, Retries: 1},
|
||||
Timeout: 30 * time.Second,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
|
||||
// Deployment should fail.
|
||||
if err == nil {
|
||||
t.Fatal("expected RollingDeploy to fail due to health check, but got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "health check failed") {
|
||||
t.Errorf("error should mention health check failure, got: %v", err)
|
||||
}
|
||||
|
||||
// Verify rollback phase appeared in progress.
|
||||
phases := pc.phases()
|
||||
foundRollback := false
|
||||
for _, p := range phases {
|
||||
if p == PhaseRollingBack {
|
||||
foundRollback = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundRollback {
|
||||
t.Error("expected rolling-back phase in progress updates")
|
||||
}
|
||||
|
||||
// Verify rollback operations were attempted (update-image back to old).
|
||||
ops := exec.getOps()
|
||||
rollbackOps := 0
|
||||
for _, op := range ops {
|
||||
if strings.Contains(op, "update-image:") && strings.Contains(op, ":sha256:old") {
|
||||
rollbackOps++
|
||||
}
|
||||
}
|
||||
if rollbackOps == 0 {
|
||||
t.Error("expected rollback operations (update-image back to sha256:old), found none")
|
||||
}
|
||||
|
||||
// Verify history records the failure.
|
||||
entries, _ := hist.ListByTarget("svc")
|
||||
if len(entries) == 0 {
|
||||
t.Fatal("expected history entry for failed deployment")
|
||||
}
|
||||
if entries[0].Status != string(PhaseFailed) {
|
||||
t.Errorf("history status = %s, want failed", entries[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Deploy History ─────────────────────────────────────────────────────
|
||||
|
||||
func TestDeployHistory(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
// Write several entries.
|
||||
entries := []HistoryEntry{
|
||||
{
|
||||
ID: "deploy-001",
|
||||
Target: "web-app",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:abc123",
|
||||
NewRef: "sha256:def456",
|
||||
Status: "complete",
|
||||
StartedAt: time.Date(2026, 3, 20, 15, 0, 0, 0, time.UTC),
|
||||
CompletedAt: time.Date(2026, 3, 20, 15, 5, 0, 0, time.UTC),
|
||||
InstancesUpdated: 3,
|
||||
},
|
||||
{
|
||||
ID: "deploy-002",
|
||||
Target: "web-app",
|
||||
Strategy: "canary",
|
||||
OldRef: "sha256:def456",
|
||||
NewRef: "sha256:ghi789",
|
||||
Status: "complete",
|
||||
StartedAt: time.Date(2026, 3, 21, 10, 0, 0, 0, time.UTC),
|
||||
CompletedAt: time.Date(2026, 3, 21, 10, 2, 0, 0, time.UTC),
|
||||
InstancesUpdated: 1,
|
||||
},
|
||||
{
|
||||
ID: "deploy-003",
|
||||
Target: "api-svc",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:111",
|
||||
NewRef: "sha256:222",
|
||||
Status: "failed",
|
||||
StartedAt: time.Date(2026, 3, 22, 8, 0, 0, 0, time.UTC),
|
||||
CompletedAt: time.Date(2026, 3, 22, 8, 1, 0, 0, time.UTC),
|
||||
InstancesUpdated: 0,
|
||||
Message: "health check timeout",
|
||||
},
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
if err := hist.Append(e); err != nil {
|
||||
t.Fatalf("Append error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify target-specific listing.
|
||||
webEntries, err := hist.ListByTarget("web-app")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
if len(webEntries) != 2 {
|
||||
t.Errorf("expected 2 web-app entries, got %d", len(webEntries))
|
||||
}
|
||||
// Most recent first.
|
||||
if len(webEntries) >= 2 && webEntries[0].ID != "deploy-002" {
|
||||
t.Errorf("expected most recent entry first, got %s", webEntries[0].ID)
|
||||
}
|
||||
|
||||
apiEntries, err := hist.ListByTarget("api-svc")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
if len(apiEntries) != 1 {
|
||||
t.Errorf("expected 1 api-svc entry, got %d", len(apiEntries))
|
||||
}
|
||||
if len(apiEntries) == 1 && apiEntries[0].Message != "health check timeout" {
|
||||
t.Errorf("expected message 'health check timeout', got %q", apiEntries[0].Message)
|
||||
}
|
||||
|
||||
// Verify ListAll.
|
||||
all, err := hist.ListAll()
|
||||
if err != nil {
|
||||
t.Fatalf("ListAll error: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Errorf("expected 3 total entries, got %d", len(all))
|
||||
}
|
||||
|
||||
// Verify files were created.
|
||||
files, _ := filepath.Glob(filepath.Join(tmpDir, "*.yaml"))
|
||||
if len(files) != 2 { // web-app.yaml and api-svc.yaml
|
||||
t.Errorf("expected 2 history files, got %d", len(files))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Config Validation ──────────────────────────────────────────────────
|
||||
|
||||
func TestConfigValidation(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
cfg DeployConfig
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "empty target",
|
||||
cfg: DeployConfig{Strategy: StrategyRolling, NewImage: "sha256:abc"},
|
||||
wantErr: "target is required",
|
||||
},
|
||||
{
|
||||
name: "empty image",
|
||||
cfg: DeployConfig{Strategy: StrategyRolling, Target: "web"},
|
||||
wantErr: "new image",
|
||||
},
|
||||
{
|
||||
name: "invalid strategy",
|
||||
cfg: DeployConfig{Strategy: "blue-green", Target: "web", NewImage: "sha256:abc"},
|
||||
wantErr: "unknown strategy",
|
||||
},
|
||||
{
|
||||
name: "canary weight zero",
|
||||
cfg: DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 0},
|
||||
wantErr: "canary weight must be between 1 and 99",
|
||||
},
|
||||
{
|
||||
name: "canary weight 100",
|
||||
cfg: DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 100},
|
||||
wantErr: "canary weight must be between 1 and 99",
|
||||
},
|
||||
{
|
||||
name: "valid rolling",
|
||||
cfg: DeployConfig{Strategy: StrategyRolling, Target: "web", NewImage: "sha256:abc"},
|
||||
},
|
||||
{
|
||||
name: "valid canary",
|
||||
cfg: DeployConfig{Strategy: StrategyCanary, Target: "web", NewImage: "sha256:abc", CanaryWeight: 25},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := tt.cfg.Validate()
|
||||
if tt.wantErr != "" {
|
||||
if err == nil {
|
||||
t.Errorf("expected error containing %q, got nil", tt.wantErr)
|
||||
} else if !strings.Contains(err.Error(), tt.wantErr) {
|
||||
t.Errorf("error %q should contain %q", err.Error(), tt.wantErr)
|
||||
}
|
||||
} else {
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Canary Instance Name ───────────────────────────────────────────────
|
||||
|
||||
func TestCanaryInstanceName(t *testing.T) {
|
||||
tests := []struct {
|
||||
target string
|
||||
want string
|
||||
}{
|
||||
{"web-app", "web-app-canary"},
|
||||
{"api-1", "api-canary"},
|
||||
{"simple", "simple-canary"},
|
||||
{"my-service-", "my-service-canary"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := canaryInstanceName(tt.target)
|
||||
if got != tt.want {
|
||||
t.Errorf("canaryInstanceName(%q) = %q, want %q", tt.target, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: No Instances Found ─────────────────────────────────────────────────
|
||||
|
||||
func TestRollingDeployNoInstances(t *testing.T) {
|
||||
exec := newMockExecutor() // empty
|
||||
hc := newMockHealthChecker()
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "nonexistent",
|
||||
NewImage: "sha256:abc",
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, nil, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for no instances, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "no instances found") {
|
||||
t.Errorf("error should mention no instances, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Active Deployments Tracking ────────────────────────────────────────
|
||||
|
||||
func TestActiveDeployments(t *testing.T) {
|
||||
// Clear any leftover state.
|
||||
activeDeploymentsMu.Lock()
|
||||
activeDeployments = make(map[string]*DeployStatus)
|
||||
activeDeploymentsMu.Unlock()
|
||||
|
||||
// Initially empty.
|
||||
active := GetActiveDeployments()
|
||||
if len(active) != 0 {
|
||||
t.Errorf("expected 0 active deployments, got %d", len(active))
|
||||
}
|
||||
|
||||
// Run a deployment and check it appears during execution.
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "track-1", Image: "sha256:old", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
|
||||
var seenActive bool
|
||||
progressFn := func(status DeployStatus) {
|
||||
if status.Phase == PhaseDeploying || status.Phase == PhaseVerifying {
|
||||
ad := GetActiveDeployment("track")
|
||||
if ad != nil {
|
||||
seenActive = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyRolling,
|
||||
Target: "track",
|
||||
NewImage: "sha256:new",
|
||||
HealthCheck: HealthCheck{Type: "none"},
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
err := RollingDeploy(cfg, exec, hc, nil, progressFn)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !seenActive {
|
||||
t.Error("expected to see active deployment during execution")
|
||||
}
|
||||
|
||||
// After completion, should be empty again.
|
||||
active = GetActiveDeployments()
|
||||
if len(active) != 0 {
|
||||
t.Errorf("expected 0 active deployments after completion, got %d", len(active))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: History File Persistence ───────────────────────────────────────────
|
||||
|
||||
func TestHistoryFilePersistence(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
entry := HistoryEntry{
|
||||
ID: "persist-001",
|
||||
Target: "my-app",
|
||||
Strategy: "rolling",
|
||||
OldRef: "sha256:aaa",
|
||||
NewRef: "sha256:bbb",
|
||||
Status: "complete",
|
||||
StartedAt: time.Now().UTC(),
|
||||
CompletedAt: time.Now().UTC(),
|
||||
InstancesUpdated: 2,
|
||||
}
|
||||
if err := hist.Append(entry); err != nil {
|
||||
t.Fatalf("Append error: %v", err)
|
||||
}
|
||||
|
||||
// Verify the file exists on disk.
|
||||
filePath := filepath.Join(tmpDir, "my-app.yaml")
|
||||
if _, err := os.Stat(filePath); err != nil {
|
||||
t.Fatalf("history file not found: %v", err)
|
||||
}
|
||||
|
||||
// Create a new store instance (simulating restart) and verify data.
|
||||
hist2 := NewHistoryStore(tmpDir)
|
||||
entries, err := hist2.ListByTarget("my-app")
|
||||
if err != nil {
|
||||
t.Fatalf("ListByTarget error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
if entries[0].ID != "persist-001" {
|
||||
t.Errorf("entry ID = %s, want persist-001", entries[0].ID)
|
||||
}
|
||||
if entries[0].InstancesUpdated != 2 {
|
||||
t.Errorf("instances_updated = %d, want 2", entries[0].InstancesUpdated)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Noop Health Checker ────────────────────────────────────────────────
|
||||
|
||||
func TestNoopHealthChecker(t *testing.T) {
|
||||
noop := &NoopHealthChecker{}
|
||||
err := noop.WaitHealthy("anything", HealthCheck{Type: "http", Port: 9999})
|
||||
if err != nil {
|
||||
t.Errorf("NoopHealthChecker should always return nil, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Rollback Without History ───────────────────────────────────────────
|
||||
|
||||
func TestRollbackWithoutHistory(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "no-hist-1", Image: "sha256:v2", Status: "running"},
|
||||
)
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
err := Rollback("no-hist", exec, hist, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for rollback without history, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "no previous version") {
|
||||
t.Errorf("error should mention no previous version, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Test: Canary Cleanup on Health Failure ────────────────────────────────────
|
||||
|
||||
func TestCanaryCleanupOnHealthFailure(t *testing.T) {
|
||||
exec := newMockExecutor(
|
||||
Instance{Name: "svc-1", Image: "sha256:v1", Status: "running"},
|
||||
)
|
||||
hc := newMockHealthChecker()
|
||||
canaryName := canaryInstanceName("svc")
|
||||
hc.results[canaryName] = fmt.Errorf("unhealthy canary")
|
||||
|
||||
pc := newProgressCollector()
|
||||
tmpDir := t.TempDir()
|
||||
hist := NewHistoryStore(tmpDir)
|
||||
|
||||
cfg := DeployConfig{
|
||||
Strategy: StrategyCanary,
|
||||
Target: "svc",
|
||||
NewImage: "sha256:v2",
|
||||
CanaryWeight: 10,
|
||||
HealthCheck: HealthCheck{Type: "tcp", Port: 8080, Interval: 100 * time.Millisecond, Retries: 1},
|
||||
Timeout: 10 * time.Second,
|
||||
AutoRollback: true,
|
||||
}
|
||||
|
||||
err := CanaryDeploy(cfg, exec, hc, hist, pc.callback())
|
||||
if err == nil {
|
||||
t.Fatal("expected canary to fail, got nil")
|
||||
}
|
||||
|
||||
// Verify canary was cleaned up (stop + delete).
|
||||
ops := exec.getOps()
|
||||
foundStop := false
|
||||
foundDelete := false
|
||||
for _, op := range ops {
|
||||
if op == fmt.Sprintf("stop:%s", canaryName) {
|
||||
foundStop = true
|
||||
}
|
||||
if op == fmt.Sprintf("delete:%s", canaryName) {
|
||||
foundDelete = true
|
||||
}
|
||||
}
|
||||
if !foundStop {
|
||||
t.Error("expected canary stop operation during cleanup")
|
||||
}
|
||||
if !foundDelete {
|
||||
t.Error("expected canary delete operation during cleanup")
|
||||
}
|
||||
|
||||
// Verify original instance was not modified.
|
||||
img, _ := exec.GetInstanceImage("svc-1")
|
||||
if img != "sha256:v1" {
|
||||
t.Errorf("original instance image changed to %s during failed canary", img)
|
||||
}
|
||||
}
|
||||
143
pkg/deploy/health.go
Normal file
143
pkg/deploy/health.go
Normal file
@@ -0,0 +1,143 @@
|
||||
/*
|
||||
Health — Health check implementations for deployment verification.
|
||||
|
||||
Supports HTTP, TCP, exec, and no-op health checks. Each check type
|
||||
retries according to the configured interval and retry count.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Health Check Config ──────────────────────────────────────────────────────
|
||||
|
||||
// HealthCheck defines how to verify that an instance is healthy after deploy.
|
||||
// HealthCheck defines how to verify that an instance is healthy after deploy.
// Interval doubles as the per-attempt timeout for HTTP/TCP checks and as the
// pause between retries; Retries bounds the number of attempts.
type HealthCheck struct {
	Type     string        `json:"type" yaml:"type"`         // "http", "tcp", "exec", "none"
	Path     string        `json:"path" yaml:"path"`         // HTTP path (e.g., "/healthz"); http type only
	Port     int           `json:"port" yaml:"port"`         // Port to check; http and tcp types
	Command  string        `json:"command" yaml:"command"`   // Shell command; exec type only
	Interval time.Duration `json:"interval" yaml:"interval"` // Time between retries (also per-attempt timeout)
	Retries  int           `json:"retries" yaml:"retries"`   // Max retry count
}
|
||||
|
||||
// ── Health Checker Interface ─────────────────────────────────────────────────
|
||||
|
||||
// HealthChecker verifies instance health during deployments.
|
||||
// HealthChecker verifies instance health during deployments. Implementations
// in this package: DefaultHealthChecker (real HTTP/TCP/exec probes) and
// NoopHealthChecker (always healthy).
type HealthChecker interface {
	// WaitHealthy blocks until the instance is healthy or all retries are
	// exhausted, returning a non-nil error in the latter case.
	WaitHealthy(instanceName string, check HealthCheck) error
}
|
||||
|
||||
// ── Default Health Checker ───────────────────────────────────────────────────
|
||||
|
||||
// DefaultHealthChecker implements HealthChecker using real HTTP/TCP/exec calls.
|
||||
// DefaultHealthChecker implements HealthChecker using real HTTP/TCP/exec calls.
// The zero value is usable: without a resolver, checks probe 127.0.0.1.
type DefaultHealthChecker struct {
	// InstanceIPResolver resolves an instance name to an IP address.
	// If nil (or if it returns an error), "127.0.0.1" is used.
	InstanceIPResolver func(name string) (string, error)
}
|
||||
|
||||
// WaitHealthy performs health checks with retries.
|
||||
func (d *DefaultHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
||||
switch check.Type {
|
||||
case "none", "":
|
||||
return nil
|
||||
case "http":
|
||||
return d.waitHTTP(instanceName, check)
|
||||
case "tcp":
|
||||
return d.waitTCP(instanceName, check)
|
||||
case "exec":
|
||||
return d.waitExec(instanceName, check)
|
||||
default:
|
||||
return fmt.Errorf("unknown health check type: %q", check.Type)
|
||||
}
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) resolveIP(instanceName string) string {
|
||||
if d.InstanceIPResolver != nil {
|
||||
ip, err := d.InstanceIPResolver(instanceName)
|
||||
if err == nil {
|
||||
return ip
|
||||
}
|
||||
}
|
||||
return "127.0.0.1"
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) waitHTTP(instanceName string, check HealthCheck) error {
|
||||
ip := d.resolveIP(instanceName)
|
||||
url := fmt.Sprintf("http://%s:%d%s", ip, check.Port, check.Path)
|
||||
|
||||
client := &http.Client{Timeout: check.Interval}
|
||||
|
||||
var lastErr error
|
||||
for i := 0; i < check.Retries; i++ {
|
||||
resp, err := client.Get(url)
|
||||
if err == nil {
|
||||
resp.Body.Close()
|
||||
if resp.StatusCode >= 200 && resp.StatusCode < 400 {
|
||||
return nil
|
||||
}
|
||||
lastErr = fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
|
||||
} else {
|
||||
lastErr = err
|
||||
}
|
||||
if i < check.Retries-1 {
|
||||
time.Sleep(check.Interval)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("health check failed after %d retries: %w", check.Retries, lastErr)
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) waitTCP(instanceName string, check HealthCheck) error {
|
||||
ip := d.resolveIP(instanceName)
|
||||
addr := fmt.Sprintf("%s:%d", ip, check.Port)
|
||||
|
||||
var lastErr error
|
||||
for i := 0; i < check.Retries; i++ {
|
||||
conn, err := net.DialTimeout("tcp", addr, check.Interval)
|
||||
if err == nil {
|
||||
conn.Close()
|
||||
return nil
|
||||
}
|
||||
lastErr = err
|
||||
if i < check.Retries-1 {
|
||||
time.Sleep(check.Interval)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("TCP health check failed after %d retries: %w", check.Retries, lastErr)
|
||||
}
|
||||
|
||||
func (d *DefaultHealthChecker) waitExec(instanceName string, check HealthCheck) error {
|
||||
var lastErr error
|
||||
for i := 0; i < check.Retries; i++ {
|
||||
cmd := exec.Command("sh", "-c", check.Command)
|
||||
if err := cmd.Run(); err == nil {
|
||||
return nil
|
||||
} else {
|
||||
lastErr = err
|
||||
}
|
||||
if i < check.Retries-1 {
|
||||
time.Sleep(check.Interval)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("exec health check failed after %d retries: %w", check.Retries, lastErr)
|
||||
}
|
||||
|
||||
// ── Noop Health Checker ──────────────────────────────────────────────────────
|
||||
|
||||
// NoopHealthChecker always returns healthy. Used for rollbacks and when
// health checking is disabled; the zero value is ready to use.
type NoopHealthChecker struct{}
|
||||
|
||||
// WaitHealthy always succeeds immediately.
|
||||
func (n *NoopHealthChecker) WaitHealthy(instanceName string, check HealthCheck) error {
|
||||
return nil
|
||||
}
|
||||
186
pkg/deploy/history.go
Normal file
186
pkg/deploy/history.go
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
History — Persistent deployment history for Volt.
|
||||
|
||||
Stores deployment records as YAML in /var/lib/volt/deployments/.
|
||||
Each target gets its own history file to keep lookups fast.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// ── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
const (
	// DefaultHistoryDir is where deployment history files are stored when
	// NewHistoryStore is given an empty directory.
	DefaultHistoryDir = "/var/lib/volt/deployments"
)
|
||||
|
||||
// ── History Entry ────────────────────────────────────────────────────────────
|
||||
|
||||
// HistoryEntry records a single deployment operation.
|
||||
// HistoryEntry records a single deployment operation as persisted in the
// per-target YAML history file.
type HistoryEntry struct {
	ID               string    `yaml:"id" json:"id"`                             // unique deployment identifier
	Target           string    `yaml:"target" json:"target"`                     // workload the deployment applied to
	Strategy         string    `yaml:"strategy" json:"strategy"`                 // "rolling" or "canary"
	OldRef           string    `yaml:"old_ref" json:"old_ref"`                   // CAS ref before the deployment
	NewRef           string    `yaml:"new_ref" json:"new_ref"`                   // CAS ref after the deployment
	Status           string    `yaml:"status" json:"status"`                     // "complete", "failed", "rolling-back"
	StartedAt        time.Time `yaml:"started_at" json:"started_at"`             // when the deployment began
	CompletedAt      time.Time `yaml:"completed_at" json:"completed_at"`         // when it finished (success or failure)
	InstancesUpdated int       `yaml:"instances_updated" json:"instances_updated"` // number of instances repointed
	Message          string    `yaml:"message,omitempty" json:"message,omitempty"` // optional failure/diagnostic message
}
|
||||
|
||||
// ── History Store ────────────────────────────────────────────────────────────
|
||||
|
||||
// HistoryStore manages deployment history on disk: one YAML file per target
// under dir. All reads and writes are serialized through mu, so a single
// HistoryStore is safe for concurrent use. Must not be copied after first
// use (contains a sync.Mutex); always pass *HistoryStore.
type HistoryStore struct {
	dir string     // directory holding the per-target history files
	mu  sync.Mutex // guards every file read/write performed by the methods below
}
|
||||
|
||||
// NewHistoryStore creates a history store at the given directory.
|
||||
func NewHistoryStore(dir string) *HistoryStore {
|
||||
if dir == "" {
|
||||
dir = DefaultHistoryDir
|
||||
}
|
||||
return &HistoryStore{dir: dir}
|
||||
}
|
||||
|
||||
// Dir returns the history directory path.
|
||||
func (h *HistoryStore) Dir() string {
|
||||
return h.dir
|
||||
}
|
||||
|
||||
// historyFile returns the path to the history file for a target.
|
||||
func (h *HistoryStore) historyFile(target string) string {
|
||||
// Sanitize the target name for use as a filename.
|
||||
safe := strings.Map(func(r rune) rune {
|
||||
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') ||
|
||||
(r >= '0' && r <= '9') || r == '-' || r == '_' {
|
||||
return r
|
||||
}
|
||||
return '_'
|
||||
}, target)
|
||||
return filepath.Join(h.dir, safe+".yaml")
|
||||
}
|
||||
|
||||
// Append adds a deployment entry to the target's history file.
|
||||
func (h *HistoryStore) Append(entry HistoryEntry) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
if err := os.MkdirAll(h.dir, 0755); err != nil {
|
||||
return fmt.Errorf("history: create dir: %w", err)
|
||||
}
|
||||
|
||||
// Load existing entries.
|
||||
entries, _ := h.readEntries(entry.Target) // ignore error on first write
|
||||
|
||||
// Append and write.
|
||||
entries = append(entries, entry)
|
||||
|
||||
return h.writeEntries(entry.Target, entries)
|
||||
}
|
||||
|
||||
// ListByTarget returns all deployment history for a target, most recent first.
|
||||
func (h *HistoryStore) ListByTarget(target string) ([]HistoryEntry, error) {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
entries, err := h.readEntries(target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Sort by StartedAt descending (most recent first).
|
||||
sort.Slice(entries, func(i, j int) bool {
|
||||
return entries[i].StartedAt.After(entries[j].StartedAt)
|
||||
})
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// ListAll returns all deployment history across all targets, most recent first.
|
||||
func (h *HistoryStore) ListAll() ([]HistoryEntry, error) {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(h.dir, "*.yaml"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("history: glob: %w", err)
|
||||
}
|
||||
|
||||
var all []HistoryEntry
|
||||
for _, f := range files {
|
||||
data, err := os.ReadFile(f)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var entries []HistoryEntry
|
||||
if err := yaml.Unmarshal(data, &entries); err != nil {
|
||||
continue
|
||||
}
|
||||
all = append(all, entries...)
|
||||
}
|
||||
|
||||
sort.Slice(all, func(i, j int) bool {
|
||||
return all[i].StartedAt.After(all[j].StartedAt)
|
||||
})
|
||||
|
||||
return all, nil
|
||||
}
|
||||
|
||||
// readEntries loads entries from the history file for a target.
|
||||
// Returns empty slice (not error) if file doesn't exist.
|
||||
func (h *HistoryStore) readEntries(target string) ([]HistoryEntry, error) {
|
||||
filePath := h.historyFile(target)
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("history: read %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
var entries []HistoryEntry
|
||||
if err := yaml.Unmarshal(data, &entries); err != nil {
|
||||
return nil, fmt.Errorf("history: parse %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// writeEntries writes entries to the history file for a target.
|
||||
func (h *HistoryStore) writeEntries(target string, entries []HistoryEntry) error {
|
||||
filePath := h.historyFile(target)
|
||||
|
||||
data, err := yaml.Marshal(entries)
|
||||
if err != nil {
|
||||
return fmt.Errorf("history: marshal: %w", err)
|
||||
}
|
||||
|
||||
// Atomic write: tmp + rename.
|
||||
tmpPath := filePath + ".tmp"
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return fmt.Errorf("history: write %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, filePath); err != nil {
|
||||
os.Remove(tmpPath)
|
||||
return fmt.Errorf("history: rename %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
46
pkg/deploy/io.go
Normal file
46
pkg/deploy/io.go
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
IO helpers — Thin wrappers for filesystem and system operations.
|
||||
|
||||
Isolated here so tests can verify logic without needing OS-level mocks.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
*/
|
||||
package deploy
|
||||
|
||||
import (
	"fmt"
	"os"
	"os/exec"
	"strings"
)
|
||||
|
||||
// readFile returns the contents of the file at path. It is a thin wrapper
// over os.ReadFile, kept as a seam so deployment logic can be exercised in
// tests without OS-level mocks.
func readFile(path string) ([]byte, error) {
	data, err := os.ReadFile(path)
	return data, err
}
|
||||
|
||||
// writeFile atomically replaces the file at path with data: the bytes are
// first written to a temporary sibling file (mode 0644) which is then
// renamed into place, so readers never observe a partially written file and
// a crash mid-write leaves any previous file intact.
//
// NOTE(review): the previous implementation wrapped os.WriteFile directly
// while its comment claimed atomicity — os.WriteFile is NOT atomic. The
// tmp+rename pattern here makes the code match its documented contract.
func writeFile(path string, data []byte) error {
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, data, 0644); err != nil {
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		os.Remove(tmp) // best effort: don't leave a stray temp file behind
		return err
	}
	return nil
}
|
||||
|
||||
// appendFile appends data to the file at path, creating it with mode 0644 if
// it does not exist yet. Close errors are intentionally discarded (the write
// error is what callers care about).
func appendFile(path string, data []byte) error {
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer f.Close()

	if _, err := f.Write(data); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
// fileInfo stats path and returns the resulting os.FileInfo. Thin wrapper
// over os.Stat (follows symlinks), isolated here as a test seam.
func fileInfo(path string) (os.FileInfo, error) {
	info, err := os.Stat(path)
	return info, err
}
|
||||
|
||||
// runSystemctl runs `systemctl <action> <unit>` and returns an error if the
// command fails. The command's combined stdout/stderr is folded into the
// returned error so callers see *why* systemctl failed, not just the bare
// exit status (the previous version captured the output and discarded it).
func runSystemctl(action, unit string) error {
	out, err := exec.Command("systemctl", action, unit).CombinedOutput()
	if err != nil {
		if msg := strings.TrimSpace(string(out)); msg != "" {
			return fmt.Errorf("systemctl %s %s: %w: %s", action, unit, err, msg)
		}
		return fmt.Errorf("systemctl %s %s: %w", action, unit, err)
	}
	return nil
}
|
||||
Reference in New Issue
Block a user