Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
761
pkg/cluster/cluster.go
Normal file
761
pkg/cluster/cluster.go
Normal file
@@ -0,0 +1,761 @@
|
||||
/*
|
||||
Volt Native Clustering — Core cluster management engine.
|
||||
|
||||
Provides node discovery, health monitoring, workload scheduling, and leader
|
||||
election using Raft consensus. This replaces the kubectl wrapper in k8s.go
|
||||
with a real, native clustering implementation.
|
||||
|
||||
Architecture:
|
||||
- Raft consensus for leader election and distributed state
|
||||
- Leader handles all scheduling decisions
|
||||
- Followers execute workloads and report health
|
||||
- State machine (FSM) tracks nodes, workloads, and assignments
|
||||
- Health monitoring via periodic heartbeats (1s interval, 5s timeout)
|
||||
|
||||
Transport: Runs over WireGuard mesh when available, falls back to plaintext.
|
||||
|
||||
License: AGPSL v5 — Pro tier ("cluster" feature)
|
||||
*/
|
||||
package cluster
|
||||
|
||||
import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"time"
)
|
||||
|
||||
// ── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
const (
|
||||
ClusterConfigDir = "/var/lib/volt/cluster"
|
||||
ClusterStateFile = "/var/lib/volt/cluster/state.json"
|
||||
ClusterRaftDir = "/var/lib/volt/cluster/raft"
|
||||
|
||||
DefaultRaftPort = 7946
|
||||
DefaultRPCPort = 7947
|
||||
DefaultGossipPort = 7948
|
||||
|
||||
HeartbeatInterval = 1 * time.Second
|
||||
HeartbeatTimeout = 5 * time.Second
|
||||
NodeDeadThreshold = 30 * time.Second
|
||||
ElectionTimeout = 10 * time.Second
|
||||
)
|
||||
|
||||
// ── Node Types ──────────────────────────────────────────────────────────────
|
||||
|
||||
// NodeRole represents a node's role in the cluster
|
||||
type NodeRole string
|
||||
|
||||
const (
|
||||
RoleLeader NodeRole = "leader"
|
||||
RoleFollower NodeRole = "follower"
|
||||
RoleCandidate NodeRole = "candidate"
|
||||
)
|
||||
|
||||
// NodeStatus represents a node's health status
|
||||
type NodeStatus string
|
||||
|
||||
const (
|
||||
StatusHealthy NodeStatus = "healthy"
|
||||
StatusDegraded NodeStatus = "degraded"
|
||||
StatusUnreachable NodeStatus = "unreachable"
|
||||
StatusDead NodeStatus = "dead"
|
||||
StatusDraining NodeStatus = "draining"
|
||||
StatusLeft NodeStatus = "left"
|
||||
)
|
||||
|
||||
// Node represents a cluster member
|
||||
// Node represents a cluster member: identity and mesh addressing, its
// Raft role and health status, declared capacity (Resources) versus the
// amounts the scheduler has reserved (Allocated), and heartbeat times.
type Node struct {
	ID       string `json:"id"`
	Name     string `json:"name"`
	MeshIP   string `json:"mesh_ip"` // WireGuard mesh address when mesh transport is in use
	Endpoint string `json:"endpoint"`
	Role     NodeRole   `json:"role"`
	Status   NodeStatus `json:"status"`
	// Labels drive scheduling constraints; the "zone" key is consulted by
	// the scheduler's zone filter.
	Labels    map[string]string `json:"labels,omitempty"`
	Resources NodeResources     `json:"resources"` // total capacity reported by the node
	Allocated NodeResources     `json:"allocated"` // capacity currently reserved by assignments
	JoinedAt      time.Time `json:"joined_at"`
	LastHeartbeat time.Time `json:"last_heartbeat"`
	// Version is presumably the node's Volt software version — TODO confirm.
	Version string `json:"version,omitempty"`
}
|
||||
|
||||
// NodeResources tracks a node's resource capacity
|
||||
type NodeResources struct {
|
||||
CPUCores int `json:"cpu_cores"`
|
||||
MemoryMB int64 `json:"memory_mb"`
|
||||
DiskMB int64 `json:"disk_mb"`
|
||||
Containers int `json:"containers"`
|
||||
MaxContainers int `json:"max_containers,omitempty"`
|
||||
}
|
||||
|
||||
// AvailableMemoryMB returns unallocated memory
|
||||
func (n *Node) AvailableMemoryMB() int64 {
|
||||
return n.Resources.MemoryMB - n.Allocated.MemoryMB
|
||||
}
|
||||
|
||||
// AvailableCPU returns unallocated CPU cores
|
||||
func (n *Node) AvailableCPU() int {
|
||||
return n.Resources.CPUCores - n.Allocated.CPUCores
|
||||
}
|
||||
|
||||
// ── Workload Assignment ─────────────────────────────────────────────────────
|
||||
|
||||
// WorkloadAssignment tracks which workload runs on which node
|
||||
type WorkloadAssignment struct {
|
||||
WorkloadID string `json:"workload_id"`
|
||||
WorkloadName string `json:"workload_name"`
|
||||
NodeID string `json:"node_id"`
|
||||
Status string `json:"status"`
|
||||
Resources WorkloadResources `json:"resources"`
|
||||
Constraints ScheduleConstraints `json:"constraints,omitempty"`
|
||||
AssignedAt time.Time `json:"assigned_at"`
|
||||
StartedAt time.Time `json:"started_at,omitempty"`
|
||||
}
|
||||
|
||||
// WorkloadResources specifies the resources a workload requires
|
||||
type WorkloadResources struct {
|
||||
CPUCores int `json:"cpu_cores"`
|
||||
MemoryMB int64 `json:"memory_mb"`
|
||||
DiskMB int64 `json:"disk_mb,omitempty"`
|
||||
}
|
||||
|
||||
// ScheduleConstraints define placement requirements for workloads
|
||||
type ScheduleConstraints struct {
|
||||
// Labels that must match on the target node
|
||||
NodeLabels map[string]string `json:"node_labels,omitempty"`
|
||||
// Preferred labels (soft constraint)
|
||||
PreferLabels map[string]string `json:"prefer_labels,omitempty"`
|
||||
// Anti-affinity: don't schedule on nodes running these workload IDs
|
||||
AntiAffinity []string `json:"anti_affinity,omitempty"`
|
||||
// Require specific node
|
||||
PinToNode string `json:"pin_to_node,omitempty"`
|
||||
// Zone/rack awareness
|
||||
Zone string `json:"zone,omitempty"`
|
||||
}
|
||||
|
||||
// ── Cluster State ───────────────────────────────────────────────────────────
|
||||
|
||||
// ClusterState is the canonical state of the cluster, replicated via Raft
|
||||
// ClusterState is the canonical state of the cluster (replicated via Raft
// per the file header). mu guards every field; since the mutex is
// unexported it is skipped by encoding/json, so SaveState persists only
// the exported fields.
type ClusterState struct {
	mu sync.RWMutex

	ClusterID string    `json:"cluster_id"`
	Name      string    `json:"name"`
	CreatedAt time.Time `json:"created_at"`
	// Nodes is keyed by Node.ID; Assignments by WorkloadAssignment.WorkloadID.
	Nodes       map[string]*Node               `json:"nodes"`
	Assignments map[string]*WorkloadAssignment `json:"assignments"`
	LeaderID    string                         `json:"leader_id"`
	// Term is the Raft term — not mutated anywhere in this file.
	Term uint64 `json:"term"`
	// Version is bumped on each membership/assignment mutation (AddNode,
	// RemoveNode, AssignWorkload, UnassignWorkload).
	Version uint64 `json:"version"`
}
|
||||
|
||||
// NewClusterState creates an empty cluster state
|
||||
func NewClusterState(clusterID, name string) *ClusterState {
|
||||
return &ClusterState{
|
||||
ClusterID: clusterID,
|
||||
Name: name,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Nodes: make(map[string]*Node),
|
||||
Assignments: make(map[string]*WorkloadAssignment),
|
||||
}
|
||||
}
|
||||
|
||||
// AddNode registers a new node in the cluster
|
||||
func (cs *ClusterState) AddNode(node *Node) error {
|
||||
cs.mu.Lock()
|
||||
defer cs.mu.Unlock()
|
||||
|
||||
if _, exists := cs.Nodes[node.ID]; exists {
|
||||
return fmt.Errorf("node %q already exists", node.ID)
|
||||
}
|
||||
|
||||
node.JoinedAt = time.Now().UTC()
|
||||
node.LastHeartbeat = time.Now().UTC()
|
||||
node.Status = StatusHealthy
|
||||
cs.Nodes[node.ID] = node
|
||||
cs.Version++
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemoveNode removes a node from the cluster
|
||||
func (cs *ClusterState) RemoveNode(nodeID string) error {
|
||||
cs.mu.Lock()
|
||||
defer cs.mu.Unlock()
|
||||
|
||||
if _, exists := cs.Nodes[nodeID]; !exists {
|
||||
return fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
delete(cs.Nodes, nodeID)
|
||||
cs.Version++
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateHeartbeat marks a node as alive
|
||||
func (cs *ClusterState) UpdateHeartbeat(nodeID string, resources NodeResources) error {
|
||||
cs.mu.Lock()
|
||||
defer cs.mu.Unlock()
|
||||
|
||||
node, exists := cs.Nodes[nodeID]
|
||||
if !exists {
|
||||
return fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
node.LastHeartbeat = time.Now().UTC()
|
||||
node.Resources = resources
|
||||
node.Status = StatusHealthy
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetNode returns a node by ID
|
||||
func (cs *ClusterState) GetNode(nodeID string) *Node {
|
||||
cs.mu.RLock()
|
||||
defer cs.mu.RUnlock()
|
||||
return cs.Nodes[nodeID]
|
||||
}
|
||||
|
||||
// ListNodes returns all nodes
|
||||
func (cs *ClusterState) ListNodes() []*Node {
|
||||
cs.mu.RLock()
|
||||
defer cs.mu.RUnlock()
|
||||
|
||||
nodes := make([]*Node, 0, len(cs.Nodes))
|
||||
for _, n := range cs.Nodes {
|
||||
nodes = append(nodes, n)
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
// HealthyNodes returns nodes that can accept workloads
|
||||
func (cs *ClusterState) HealthyNodes() []*Node {
|
||||
cs.mu.RLock()
|
||||
defer cs.mu.RUnlock()
|
||||
|
||||
var healthy []*Node
|
||||
for _, n := range cs.Nodes {
|
||||
if n.Status == StatusHealthy {
|
||||
healthy = append(healthy, n)
|
||||
}
|
||||
}
|
||||
return healthy
|
||||
}
|
||||
|
||||
// ── Scheduling ──────────────────────────────────────────────────────────────
|
||||
|
||||
// Scheduler determines which node should run a workload
|
||||
type Scheduler struct {
|
||||
state *ClusterState
|
||||
}
|
||||
|
||||
// NewScheduler creates a new scheduler
|
||||
func NewScheduler(state *ClusterState) *Scheduler {
|
||||
return &Scheduler{state: state}
|
||||
}
|
||||
|
||||
// Schedule selects the best node for a workload using bin-packing
|
||||
// Schedule selects the best node for a workload and returns its node ID.
//
// Resolution order:
//  1. A PinToNode constraint wins outright; the pinned node must exist
//     and be healthy. NOTE(review): the pinned path performs NO free
//     capacity check, unlike the general path — confirm that is intended.
//  2. Otherwise candidates are filtered (health, capacity, labels,
//     anti-affinity, zone) and scored with bin-packing: the most-packed
//     node that still fits wins.
//
// Holds the state read lock for the entire decision.
func (s *Scheduler) Schedule(workload *WorkloadAssignment) (string, error) {
	s.state.mu.RLock()
	defer s.state.mu.RUnlock()

	// If pinned to a specific node, use that
	if workload.Constraints.PinToNode != "" {
		node, exists := s.state.Nodes[workload.Constraints.PinToNode]
		if !exists {
			return "", fmt.Errorf("pinned node %q not found", workload.Constraints.PinToNode)
		}
		if node.Status != StatusHealthy {
			return "", fmt.Errorf("pinned node %q is %s", workload.Constraints.PinToNode, node.Status)
		}
		return node.ID, nil
	}

	// Filter candidates
	candidates := s.filterCandidates(workload)
	if len(candidates) == 0 {
		return "", fmt.Errorf("no eligible nodes found for workload %q (need %dMB RAM, %d CPU)",
			workload.WorkloadID, workload.Resources.MemoryMB, workload.Resources.CPUCores)
	}

	// Score candidates using bin-packing (prefer the most-packed node that still fits)
	var bestNode *Node
	// scoreNode never returns a negative score, so with a non-empty
	// candidate list some node always beats the -1 sentinel.
	bestScore := -1.0

	for _, node := range candidates {
		score := s.scoreNode(node, workload)
		if score > bestScore {
			bestScore = score
			bestNode = node
		}
	}

	// Defensive guard; unreachable given the sentinel above.
	if bestNode == nil {
		return "", fmt.Errorf("no suitable node found")
	}

	return bestNode.ID, nil
}
|
||||
|
||||
// filterCandidates returns nodes that can physically run the workload
|
||||
// filterCandidates returns the healthy nodes that can physically run the
// workload: enough free memory and CPU, all required labels present,
// no anti-affinity conflict, and a compatible zone.
//
// Caller (Schedule) already holds s.state.mu.RLock.
func (s *Scheduler) filterCandidates(workload *WorkloadAssignment) []*Node {
	var candidates []*Node

	for _, node := range s.state.Nodes {
		// Must be healthy
		if node.Status != StatusHealthy {
			continue
		}

		// Must have enough resources
		if node.AvailableMemoryMB() < workload.Resources.MemoryMB {
			continue
		}
		if node.AvailableCPU() < workload.Resources.CPUCores {
			continue
		}

		// Check label constraints
		if !s.matchLabels(node, workload.Constraints.NodeLabels) {
			continue
		}

		// Check anti-affinity
		if s.violatesAntiAffinity(node, workload.Constraints.AntiAffinity) {
			continue
		}

		// Check zone constraint
		// NOTE(review): a node with NO "zone" label passes ANY zone
		// constraint — confirm this is intended rather than requiring the
		// label to be present when a zone is requested.
		if workload.Constraints.Zone != "" {
			if nodeZone, ok := node.Labels["zone"]; ok {
				if nodeZone != workload.Constraints.Zone {
					continue
				}
			}
		}

		candidates = append(candidates, node)
	}

	return candidates
}
|
||||
|
||||
// matchLabels checks if a node has all required labels
|
||||
func (s *Scheduler) matchLabels(node *Node, required map[string]string) bool {
|
||||
for k, v := range required {
|
||||
if nodeVal, ok := node.Labels[k]; !ok || nodeVal != v {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// violatesAntiAffinity checks if scheduling on this node would violate anti-affinity
|
||||
func (s *Scheduler) violatesAntiAffinity(node *Node, antiAffinity []string) bool {
|
||||
if len(antiAffinity) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, assignment := range s.state.Assignments {
|
||||
if assignment.NodeID != node.ID {
|
||||
continue
|
||||
}
|
||||
for _, aa := range antiAffinity {
|
||||
if assignment.WorkloadID == aa {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// scoreNode scores a node for bin-packing (higher = better fit)
|
||||
// Prefers nodes that are already partially filled (pack tight)
|
||||
func (s *Scheduler) scoreNode(node *Node, workload *WorkloadAssignment) float64 {
|
||||
if node.Resources.MemoryMB == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Memory utilization after placing this workload (higher = more packed = preferred)
|
||||
futureAllocMem := float64(node.Allocated.MemoryMB+workload.Resources.MemoryMB) / float64(node.Resources.MemoryMB)
|
||||
|
||||
// CPU utilization
|
||||
futureCPU := 0.0
|
||||
if node.Resources.CPUCores > 0 {
|
||||
futureCPU = float64(node.Allocated.CPUCores+workload.Resources.CPUCores) / float64(node.Resources.CPUCores)
|
||||
}
|
||||
|
||||
// Weighted score: 60% memory, 30% CPU, 10% bonus for preferred labels
|
||||
score := futureAllocMem*0.6 + futureCPU*0.3
|
||||
|
||||
// Bonus for matching preferred labels
|
||||
if len(workload.Constraints.PreferLabels) > 0 {
|
||||
matchCount := 0
|
||||
for k, v := range workload.Constraints.PreferLabels {
|
||||
if nodeVal, ok := node.Labels[k]; ok && nodeVal == v {
|
||||
matchCount++
|
||||
}
|
||||
}
|
||||
if len(workload.Constraints.PreferLabels) > 0 {
|
||||
score += 0.1 * float64(matchCount) / float64(len(workload.Constraints.PreferLabels))
|
||||
}
|
||||
}
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
// AssignWorkload records a workload assignment
|
||||
func (cs *ClusterState) AssignWorkload(assignment *WorkloadAssignment) error {
|
||||
cs.mu.Lock()
|
||||
defer cs.mu.Unlock()
|
||||
|
||||
node, exists := cs.Nodes[assignment.NodeID]
|
||||
if !exists {
|
||||
return fmt.Errorf("node %q not found", assignment.NodeID)
|
||||
}
|
||||
|
||||
// Update allocated resources
|
||||
node.Allocated.CPUCores += assignment.Resources.CPUCores
|
||||
node.Allocated.MemoryMB += assignment.Resources.MemoryMB
|
||||
node.Allocated.Containers++
|
||||
|
||||
assignment.AssignedAt = time.Now().UTC()
|
||||
cs.Assignments[assignment.WorkloadID] = assignment
|
||||
cs.Version++
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnassignWorkload removes a workload assignment and frees resources
|
||||
func (cs *ClusterState) UnassignWorkload(workloadID string) error {
|
||||
cs.mu.Lock()
|
||||
defer cs.mu.Unlock()
|
||||
|
||||
assignment, exists := cs.Assignments[workloadID]
|
||||
if !exists {
|
||||
return fmt.Errorf("workload %q not assigned", workloadID)
|
||||
}
|
||||
|
||||
// Free resources on the node
|
||||
if node, ok := cs.Nodes[assignment.NodeID]; ok {
|
||||
node.Allocated.CPUCores -= assignment.Resources.CPUCores
|
||||
node.Allocated.MemoryMB -= assignment.Resources.MemoryMB
|
||||
node.Allocated.Containers--
|
||||
if node.Allocated.CPUCores < 0 {
|
||||
node.Allocated.CPUCores = 0
|
||||
}
|
||||
if node.Allocated.MemoryMB < 0 {
|
||||
node.Allocated.MemoryMB = 0
|
||||
}
|
||||
if node.Allocated.Containers < 0 {
|
||||
node.Allocated.Containers = 0
|
||||
}
|
||||
}
|
||||
|
||||
delete(cs.Assignments, workloadID)
|
||||
cs.Version++
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Health Monitor ──────────────────────────────────────────────────────────
|
||||
|
||||
// HealthMonitor periodically checks node health and triggers rescheduling
|
||||
// HealthMonitor periodically checks node health (see checkHealth) and
// notifies a registered callback when a node is declared dead.
type HealthMonitor struct {
	state     *ClusterState // shared cluster state; all access guarded by state.mu
	scheduler *Scheduler    // NOTE(review): unused by the visible monitor code — presumably for callers' rescheduling; confirm
	stopCh    chan struct{} // closed by Stop to terminate the Start goroutine
	// onNodeDead, if set, is invoked on its own goroutine with the dead
	// node's ID and the assignments stranded on it.
	onNodeDead func(nodeID string, orphanedWorkloads []*WorkloadAssignment)
}
|
||||
|
||||
// NewHealthMonitor creates a new health monitor
|
||||
func NewHealthMonitor(state *ClusterState, scheduler *Scheduler) *HealthMonitor {
|
||||
return &HealthMonitor{
|
||||
state: state,
|
||||
scheduler: scheduler,
|
||||
stopCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// OnNodeDead registers a callback for when a node is declared dead
|
||||
// OnNodeDead registers a callback invoked asynchronously when a node is
// declared dead, receiving the node ID and its orphaned assignments.
// NOTE(review): this write is unsynchronized — register the callback
// before calling Start to avoid a data race with checkHealth.
func (hm *HealthMonitor) OnNodeDead(fn func(nodeID string, orphaned []*WorkloadAssignment)) {
	hm.onNodeDead = fn
}
|
||||
|
||||
// Start begins the health monitoring loop
|
||||
// Start launches the background health-check loop, which runs
// checkHealth every HeartbeatInterval until Stop is called.
func (hm *HealthMonitor) Start() {
	go func() {
		ticker := time.NewTicker(HeartbeatInterval)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				hm.checkHealth()
			case <-hm.stopCh:
				// Stop closed the channel; exit the goroutine.
				return
			}
		}
	}()
}
|
||||
|
||||
// Stop halts the health monitoring loop
|
||||
// Stop halts the monitoring goroutine by closing stopCh.
// NOTE(review): calling Stop twice panics (close of closed channel);
// consider guarding with sync.Once if double-stop is reachable.
func (hm *HealthMonitor) Stop() {
	close(hm.stopCh)
}
|
||||
|
||||
// checkHealth walks every node and transitions its status based on
// heartbeat age:
//
//	silent > NodeDeadThreshold  → dead (fires onNodeDead with the
//	                              assignments stranded on that node)
//	silent > HeartbeatTimeout   → unreachable
//	otherwise                   → healthy, but only recovering from
//	                              unreachable/degraded; draining is
//	                              deliberately left untouched
//
// Nodes already dead or departed are skipped, so the dead transition
// (and its callback) fires at most once per node.
func (hm *HealthMonitor) checkHealth() {
	hm.state.mu.Lock()
	defer hm.state.mu.Unlock()

	now := time.Now()

	for _, node := range hm.state.Nodes {
		if node.Status == StatusLeft || node.Status == StatusDead {
			continue
		}

		sinceHeartbeat := now.Sub(node.LastHeartbeat)

		switch {
		case sinceHeartbeat > NodeDeadThreshold:
			if node.Status != StatusDead {
				node.Status = StatusDead
				// Collect orphaned workloads
				if hm.onNodeDead != nil {
					var orphaned []*WorkloadAssignment
					for _, a := range hm.state.Assignments {
						if a.NodeID == node.ID {
							orphaned = append(orphaned, a)
						}
					}
					// Run the callback off this lock-holding goroutine;
					// it must acquire state.mu itself before touching state.
					go hm.onNodeDead(node.ID, orphaned)
				}
			}

		case sinceHeartbeat > HeartbeatTimeout:
			node.Status = StatusUnreachable

		default:
			// Node is alive
			if node.Status == StatusUnreachable || node.Status == StatusDegraded {
				node.Status = StatusHealthy
			}
		}
	}
}
|
||||
|
||||
// ── Drain Operation ─────────────────────────────────────────────────────────
|
||||
|
||||
// DrainNode moves all workloads off a node for maintenance
|
||||
func DrainNode(state *ClusterState, scheduler *Scheduler, nodeID string) ([]string, error) {
|
||||
state.mu.Lock()
|
||||
|
||||
node, exists := state.Nodes[nodeID]
|
||||
if !exists {
|
||||
state.mu.Unlock()
|
||||
return nil, fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
node.Status = StatusDraining
|
||||
|
||||
// Collect workloads on this node
|
||||
var toReschedule []*WorkloadAssignment
|
||||
for _, a := range state.Assignments {
|
||||
if a.NodeID == nodeID {
|
||||
toReschedule = append(toReschedule, a)
|
||||
}
|
||||
}
|
||||
|
||||
state.mu.Unlock()
|
||||
|
||||
// Reschedule each workload
|
||||
var rescheduled []string
|
||||
for _, assignment := range toReschedule {
|
||||
// Remove from current node
|
||||
if err := state.UnassignWorkload(assignment.WorkloadID); err != nil {
|
||||
return rescheduled, fmt.Errorf("failed to unassign %s: %w", assignment.WorkloadID, err)
|
||||
}
|
||||
|
||||
// Find new node
|
||||
newNodeID, err := scheduler.Schedule(assignment)
|
||||
if err != nil {
|
||||
return rescheduled, fmt.Errorf("failed to reschedule %s: %w", assignment.WorkloadID, err)
|
||||
}
|
||||
|
||||
assignment.NodeID = newNodeID
|
||||
if err := state.AssignWorkload(assignment); err != nil {
|
||||
return rescheduled, fmt.Errorf("failed to assign %s to %s: %w",
|
||||
assignment.WorkloadID, newNodeID, err)
|
||||
}
|
||||
|
||||
rescheduled = append(rescheduled, fmt.Sprintf("%s → %s", assignment.WorkloadID, newNodeID))
|
||||
}
|
||||
|
||||
return rescheduled, nil
|
||||
}
|
||||
|
||||
// ── Persistence ─────────────────────────────────────────────────────────────
|
||||
|
||||
// SaveState writes cluster state to disk
|
||||
func SaveState(state *ClusterState) error {
|
||||
state.mu.RLock()
|
||||
defer state.mu.RUnlock()
|
||||
|
||||
if err := os.MkdirAll(ClusterConfigDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Atomic write
|
||||
tmpFile := ClusterStateFile + ".tmp"
|
||||
if err := os.WriteFile(tmpFile, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmpFile, ClusterStateFile)
|
||||
}
|
||||
|
||||
// LoadState reads cluster state from disk
|
||||
func LoadState() (*ClusterState, error) {
|
||||
data, err := os.ReadFile(ClusterStateFile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var state ClusterState
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Initialize maps if nil
|
||||
if state.Nodes == nil {
|
||||
state.Nodes = make(map[string]*Node)
|
||||
}
|
||||
if state.Assignments == nil {
|
||||
state.Assignments = make(map[string]*WorkloadAssignment)
|
||||
}
|
||||
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
// ── Node Resource Detection ─────────────────────────────────────────────────
|
||||
|
||||
// DetectResources probes the local system for available resources
|
||||
func DetectResources() NodeResources {
|
||||
res := NodeResources{
|
||||
CPUCores: detectCPUCores(),
|
||||
MemoryMB: detectMemoryMB(),
|
||||
DiskMB: detectDiskMB(),
|
||||
MaxContainers: 500, // Pro default
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// detectCPUCores reports the number of logical CPUs usable by this
// process. runtime.NumCPU replaces the previous hand-rolled
// /proc/cpuinfo "processor" line count: it respects the process CPU
// affinity mask at startup, works off-Linux, and always returns >= 1,
// so the old error/zero fallbacks are unnecessary.
func detectCPUCores() int {
	return runtime.NumCPU()
}
|
||||
|
||||
func detectMemoryMB() int64 {
|
||||
data, err := os.ReadFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 512
|
||||
}
|
||||
for _, line := range splitByNewline(string(data)) {
|
||||
if len(line) > 8 && line[:8] == "MemTotal" {
|
||||
var kb int64
|
||||
fmt.Sscanf(line, "MemTotal: %d kB", &kb)
|
||||
return kb / 1024
|
||||
}
|
||||
}
|
||||
return 512
|
||||
}
|
||||
|
||||
// detectDiskMB reports usable disk space (MB) for /var/lib/volt.
//
// Currently a fixed 10 GB placeholder: the original body declared an
// unused statfs-shaped struct and called os.Stat only to discard the
// result, returning the same constant on both branches — that dead code
// is removed. Real detection should use syscall.Statfs
// (Bavail * Bsize / 1MiB).
func detectDiskMB() int64 {
	const defaultDiskMB = 10240 // 10GB default until Statfs-based detection lands
	return defaultDiskMB
}
|
||||
|
||||
// splitByNewline splits s on '\n'. Unlike strings.Split, a trailing
// newline does not yield a final empty element, and the empty string
// yields a nil slice.
func splitByNewline(s string) []string {
	var lines []string
	for len(s) > 0 {
		nl := -1
		for i := 0; i < len(s); i++ {
			if s[i] == '\n' {
				nl = i
				break
			}
		}
		if nl < 0 {
			// No newline remains: the rest is the final line.
			lines = append(lines, s)
			break
		}
		lines = append(lines, s[:nl])
		s = s[nl+1:]
	}
	return lines
}
|
||||
|
||||
// ── Cluster Config ──────────────────────────────────────────────────────────
|
||||
|
||||
// ClusterConfig holds local cluster configuration
|
||||
type ClusterConfig struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeID string `json:"node_id"`
|
||||
NodeName string `json:"node_name"`
|
||||
RaftPort int `json:"raft_port"`
|
||||
RPCPort int `json:"rpc_port"`
|
||||
LeaderAddr string `json:"leader_addr,omitempty"`
|
||||
MeshEnabled bool `json:"mesh_enabled"`
|
||||
}
|
||||
|
||||
// SaveConfig writes local cluster config
|
||||
func SaveConfig(cfg *ClusterConfig) error {
|
||||
if err := os.MkdirAll(ClusterConfigDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(filepath.Join(ClusterConfigDir, "config.json"), data, 0644)
|
||||
}
|
||||
|
||||
// LoadConfig reads local cluster config
|
||||
func LoadConfig() (*ClusterConfig, error) {
|
||||
data, err := os.ReadFile(filepath.Join(ClusterConfigDir, "config.json"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg ClusterConfig
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
561
pkg/cluster/control.go.bak
Normal file
561
pkg/cluster/control.go.bak
Normal file
@@ -0,0 +1,561 @@
|
||||
/*
|
||||
Volt Cluster — Native control plane for multi-node orchestration.
|
||||
|
||||
Replaces the thin kubectl wrapper with a native clustering system built
|
||||
specifically for Volt's workload model (containers, hybrid-native, VMs).
|
||||
|
||||
Architecture:
|
||||
- Control plane: single leader node running volt-control daemon
|
||||
- Workers: nodes that register via `volt cluster join`
|
||||
- Communication: gRPC-over-mesh (WireGuard) or plain HTTPS
|
||||
- State: JSON-based on-disk store (no etcd dependency)
|
||||
- Health: heartbeat-based with configurable failure detection
|
||||
|
||||
The control plane is responsible for:
|
||||
- Node registration and deregistration
|
||||
- Health monitoring (heartbeat processing)
|
||||
- Workload scheduling (resource-based, label selectors)
|
||||
- Workload state sync across nodes
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
AGPSL v5 — Source-available. Anti-competition clauses apply.
|
||||
*/
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
const (
|
||||
DefaultHeartbeatInterval = 10 * time.Second
|
||||
DefaultFailureThreshold = 3 // missed heartbeats before marking unhealthy
|
||||
DefaultAPIPort = 9443
|
||||
ClusterStateDir = "/var/lib/volt/cluster"
|
||||
ClusterStateFile = "/var/lib/volt/cluster/state.json"
|
||||
NodesStateFile = "/var/lib/volt/cluster/nodes.json"
|
||||
ScheduleStateFile = "/var/lib/volt/cluster/schedule.json"
|
||||
)
|
||||
|
||||
// ── Node ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// NodeStatus represents the health state of a cluster node.
|
||||
type NodeStatus string
|
||||
|
||||
const (
|
||||
NodeStatusReady NodeStatus = "ready"
|
||||
NodeStatusNotReady NodeStatus = "not-ready"
|
||||
NodeStatusJoining NodeStatus = "joining"
|
||||
NodeStatusDraining NodeStatus = "draining"
|
||||
NodeStatusRemoved NodeStatus = "removed"
|
||||
)
|
||||
|
||||
// NodeResources describes the capacity and usage of a node.
|
||||
type NodeResources struct {
|
||||
CPUCores int `json:"cpu_cores"`
|
||||
MemoryTotalMB int64 `json:"memory_total_mb"`
|
||||
MemoryUsedMB int64 `json:"memory_used_mb"`
|
||||
DiskTotalGB int64 `json:"disk_total_gb"`
|
||||
DiskUsedGB int64 `json:"disk_used_gb"`
|
||||
ContainerCount int `json:"container_count"`
|
||||
WorkloadCount int `json:"workload_count"`
|
||||
}
|
||||
|
||||
// NodeInfo represents a registered cluster node.
|
||||
type NodeInfo struct {
|
||||
NodeID string `json:"node_id"`
|
||||
Name string `json:"name"`
|
||||
MeshIP string `json:"mesh_ip"`
|
||||
PublicIP string `json:"public_ip,omitempty"`
|
||||
Status NodeStatus `json:"status"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Resources NodeResources `json:"resources"`
|
||||
LastHeartbeat time.Time `json:"last_heartbeat"`
|
||||
JoinedAt time.Time `json:"joined_at"`
|
||||
MissedBeats int `json:"missed_beats"`
|
||||
VoltVersion string `json:"volt_version,omitempty"`
|
||||
KernelVersion string `json:"kernel_version,omitempty"`
|
||||
OS string `json:"os,omitempty"`
|
||||
Region string `json:"region,omitempty"`
|
||||
}
|
||||
|
||||
// IsHealthy returns true if the node is responding to heartbeats.
|
||||
func (n *NodeInfo) IsHealthy() bool {
|
||||
return n.Status == NodeStatusReady && n.MissedBeats < DefaultFailureThreshold
|
||||
}
|
||||
|
||||
// ── Cluster State ────────────────────────────────────────────────────────────
|
||||
|
||||
// ClusterRole indicates this node's role in the cluster.
|
||||
type ClusterRole string
|
||||
|
||||
const (
|
||||
RoleControl ClusterRole = "control"
|
||||
RoleWorker ClusterRole = "worker"
|
||||
RoleNone ClusterRole = "none"
|
||||
)
|
||||
|
||||
// ClusterState is the persistent on-disk cluster membership state for this node.
|
||||
type ClusterState struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
Role ClusterRole `json:"role"`
|
||||
NodeID string `json:"node_id"`
|
||||
NodeName string `json:"node_name"`
|
||||
ControlURL string `json:"control_url"`
|
||||
APIPort int `json:"api_port"`
|
||||
JoinedAt time.Time `json:"joined_at"`
|
||||
HeartbeatInterval time.Duration `json:"heartbeat_interval"`
|
||||
}
|
||||
|
||||
// ── Scheduled Workload ───────────────────────────────────────────────────────
|
||||
|
||||
// ScheduledWorkload represents a workload assigned to a node by the scheduler.
|
||||
type ScheduledWorkload struct {
|
||||
WorkloadID string `json:"workload_id"`
|
||||
NodeID string `json:"node_id"`
|
||||
NodeName string `json:"node_name"`
|
||||
Mode string `json:"mode"` // container, hybrid-native, etc.
|
||||
ManifestPath string `json:"manifest_path,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Resources WorkloadResources `json:"resources"`
|
||||
Status string `json:"status"` // pending, running, stopped, failed
|
||||
ScheduledAt time.Time `json:"scheduled_at"`
|
||||
}
|
||||
|
||||
// WorkloadResources describes the resource requirements for a workload.
|
||||
type WorkloadResources struct {
|
||||
CPUCores int `json:"cpu_cores"`
|
||||
MemoryMB int64 `json:"memory_mb"`
|
||||
DiskMB int64 `json:"disk_mb,omitempty"`
|
||||
}
|
||||
|
||||
// ── Control Plane ────────────────────────────────────────────────────────────
|
||||
|
||||
// ControlPlane manages cluster state, node registration, and scheduling.
|
||||
type ControlPlane struct {
|
||||
state *ClusterState
|
||||
nodes map[string]*NodeInfo
|
||||
schedule []*ScheduledWorkload
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// NewControlPlane creates or loads a control plane instance.
|
||||
// NewControlPlane creates a control-plane handle and hydrates it from
// the on-disk state, node, and schedule files (loadState/loadNodes/
// loadSchedule are defined elsewhere in this package).
func NewControlPlane() *ControlPlane {
	cp := &ControlPlane{
		nodes: make(map[string]*NodeInfo),
	}
	// Loader errors are not surfaced here — NOTE(review): confirm the
	// loaders tolerate a missing/empty state directory on first run.
	cp.loadState()
	cp.loadNodes()
	cp.loadSchedule()
	return cp
}
|
||||
|
||||
// IsInitialized returns true if the cluster has been initialized.
|
||||
func (cp *ControlPlane) IsInitialized() bool {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
return cp.state != nil && cp.state.ClusterID != ""
|
||||
}
|
||||
|
||||
// State returns a copy of the cluster state.
|
||||
func (cp *ControlPlane) State() *ClusterState {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
if cp.state == nil {
|
||||
return nil
|
||||
}
|
||||
copy := *cp.state
|
||||
return ©
|
||||
}
|
||||
|
||||
// Role returns this node's cluster role.
|
||||
func (cp *ControlPlane) Role() ClusterRole {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
if cp.state == nil {
|
||||
return RoleNone
|
||||
}
|
||||
return cp.state.Role
|
||||
}
|
||||
|
||||
// Nodes returns all registered nodes.
|
||||
func (cp *ControlPlane) Nodes() []*NodeInfo {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
result := make([]*NodeInfo, 0, len(cp.nodes))
|
||||
for _, n := range cp.nodes {
|
||||
copy := *n
|
||||
result = append(result, ©)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetNode returns a node by ID or name.
|
||||
func (cp *ControlPlane) GetNode(idOrName string) *NodeInfo {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
if n, ok := cp.nodes[idOrName]; ok {
|
||||
copy := *n
|
||||
return ©
|
||||
}
|
||||
// Try by name
|
||||
for _, n := range cp.nodes {
|
||||
if n.Name == idOrName {
|
||||
copy := *n
|
||||
return ©
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Schedule returns the current workload schedule.
|
||||
func (cp *ControlPlane) Schedule() []*ScheduledWorkload {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
result := make([]*ScheduledWorkload, len(cp.schedule))
|
||||
for i, sw := range cp.schedule {
|
||||
copy := *sw
|
||||
result[i] = ©
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ── Init ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// InitCluster initializes this node as the cluster control plane.
|
||||
func (cp *ControlPlane) InitCluster(clusterID, nodeName, meshIP string, apiPort int) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state != nil && cp.state.ClusterID != "" {
|
||||
return fmt.Errorf("already part of cluster %q", cp.state.ClusterID)
|
||||
}
|
||||
|
||||
if apiPort == 0 {
|
||||
apiPort = DefaultAPIPort
|
||||
}
|
||||
|
||||
cp.state = &ClusterState{
|
||||
ClusterID: clusterID,
|
||||
Role: RoleControl,
|
||||
NodeID: clusterID + "-control",
|
||||
NodeName: nodeName,
|
||||
ControlURL: fmt.Sprintf("https://%s:%d", meshIP, apiPort),
|
||||
APIPort: apiPort,
|
||||
JoinedAt: time.Now().UTC(),
|
||||
HeartbeatInterval: DefaultHeartbeatInterval,
|
||||
}
|
||||
|
||||
// Register self as a node
|
||||
cp.nodes[cp.state.NodeID] = &NodeInfo{
|
||||
NodeID: cp.state.NodeID,
|
||||
Name: nodeName,
|
||||
MeshIP: meshIP,
|
||||
Status: NodeStatusReady,
|
||||
Labels: map[string]string{"role": "control"},
|
||||
LastHeartbeat: time.Now().UTC(),
|
||||
JoinedAt: time.Now().UTC(),
|
||||
}
|
||||
|
||||
if err := cp.saveState(); err != nil {
|
||||
return err
|
||||
}
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// ── Join ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// JoinCluster registers this node as a worker in an existing cluster.
|
||||
func (cp *ControlPlane) JoinCluster(clusterID, controlURL, nodeID, nodeName, meshIP string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state != nil && cp.state.ClusterID != "" {
|
||||
return fmt.Errorf("already part of cluster %q — run 'volt cluster leave' first", cp.state.ClusterID)
|
||||
}
|
||||
|
||||
cp.state = &ClusterState{
|
||||
ClusterID: clusterID,
|
||||
Role: RoleWorker,
|
||||
NodeID: nodeID,
|
||||
NodeName: nodeName,
|
||||
ControlURL: controlURL,
|
||||
JoinedAt: time.Now().UTC(),
|
||||
HeartbeatInterval: DefaultHeartbeatInterval,
|
||||
}
|
||||
|
||||
return cp.saveState()
|
||||
}
|
||||
|
||||
// ── Node Registration ────────────────────────────────────────────────────────
|
||||
|
||||
// RegisterNode adds a new worker node to the cluster (control plane only).
|
||||
func (cp *ControlPlane) RegisterNode(node *NodeInfo) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state == nil || cp.state.Role != RoleControl {
|
||||
return fmt.Errorf("not the control plane — cannot register nodes")
|
||||
}
|
||||
|
||||
node.Status = NodeStatusReady
|
||||
node.JoinedAt = time.Now().UTC()
|
||||
node.LastHeartbeat = time.Now().UTC()
|
||||
cp.nodes[node.NodeID] = node
|
||||
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// DeregisterNode removes a node from the cluster.
|
||||
func (cp *ControlPlane) DeregisterNode(nodeID string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if _, exists := cp.nodes[nodeID]; !exists {
|
||||
return fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
delete(cp.nodes, nodeID)
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// ── Heartbeat ────────────────────────────────────────────────────────────────
|
||||
|
||||
// ProcessHeartbeat updates a node's health status.
|
||||
func (cp *ControlPlane) ProcessHeartbeat(nodeID string, resources NodeResources) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
node, exists := cp.nodes[nodeID]
|
||||
if !exists {
|
||||
return fmt.Errorf("node %q not registered", nodeID)
|
||||
}
|
||||
|
||||
node.LastHeartbeat = time.Now().UTC()
|
||||
node.MissedBeats = 0
|
||||
node.Resources = resources
|
||||
if node.Status == NodeStatusNotReady {
|
||||
node.Status = NodeStatusReady
|
||||
}
|
||||
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// CheckHealth evaluates all nodes and marks those with missed heartbeats.
// Returns the IDs of nodes that were marked NotReady during this pass.
// Removed and Draining nodes are skipped.
//
// NOTE(review): MissedBeats is incremented once per CheckHealth invocation,
// not once per missed heartbeat interval — the effective time before a node
// is marked NotReady depends on how often the caller runs CheckHealth.
// Confirm the caller's cadence matches the intent.
func (cp *ControlPlane) CheckHealth() []string {
	cp.mu.Lock()
	defer cp.mu.Unlock()

	var unhealthy []string
	// A node is stale once it has been silent for failure-threshold intervals.
	threshold := time.Duration(DefaultFailureThreshold) * DefaultHeartbeatInterval

	for _, node := range cp.nodes {
		if node.Status == NodeStatusRemoved || node.Status == NodeStatusDraining {
			continue
		}
		if time.Since(node.LastHeartbeat) > threshold {
			node.MissedBeats++
			if node.MissedBeats >= DefaultFailureThreshold {
				node.Status = NodeStatusNotReady
				unhealthy = append(unhealthy, node.NodeID)
			}
		}
	}

	// Best-effort persist; the health verdict is returned regardless of
	// whether the write succeeds.
	cp.saveNodes()
	return unhealthy
}
|
||||
|
||||
// ── Drain ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// DrainNode marks a node for draining (no new workloads, existing ones rescheduled).
|
||||
func (cp *ControlPlane) DrainNode(nodeID string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
node, exists := cp.nodes[nodeID]
|
||||
if !exists {
|
||||
return fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
node.Status = NodeStatusDraining
|
||||
|
||||
// Find workloads on this node and mark for rescheduling
|
||||
for _, sw := range cp.schedule {
|
||||
if sw.NodeID == nodeID && sw.Status == "running" {
|
||||
sw.Status = "pending" // will be rescheduled
|
||||
sw.NodeID = ""
|
||||
sw.NodeName = ""
|
||||
}
|
||||
}
|
||||
|
||||
cp.saveNodes()
|
||||
return cp.saveSchedule()
|
||||
}
|
||||
|
||||
// ── Leave ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// LeaveCluster removes this node from the cluster.
|
||||
func (cp *ControlPlane) LeaveCluster() error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state == nil {
|
||||
return fmt.Errorf("not part of any cluster")
|
||||
}
|
||||
|
||||
// If control plane, clean up
|
||||
if cp.state.Role == RoleControl {
|
||||
cp.nodes = make(map[string]*NodeInfo)
|
||||
cp.schedule = nil
|
||||
os.Remove(NodesStateFile)
|
||||
os.Remove(ScheduleStateFile)
|
||||
}
|
||||
|
||||
cp.state = nil
|
||||
os.Remove(ClusterStateFile)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Scheduling ───────────────────────────────────────────────────────────────
|
||||
|
||||
// ScheduleWorkload assigns a workload to a node based on resource availability
|
||||
// and label selectors.
|
||||
func (cp *ControlPlane) ScheduleWorkload(workload *ScheduledWorkload, nodeSelector map[string]string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state == nil || cp.state.Role != RoleControl {
|
||||
return fmt.Errorf("not the control plane — cannot schedule workloads")
|
||||
}
|
||||
|
||||
// Find best node
|
||||
bestNode := cp.findBestNode(workload.Resources, nodeSelector)
|
||||
if bestNode == nil {
|
||||
return fmt.Errorf("no suitable node found for workload %q (required: %dMB RAM, %d CPU cores)",
|
||||
workload.WorkloadID, workload.Resources.MemoryMB, workload.Resources.CPUCores)
|
||||
}
|
||||
|
||||
workload.NodeID = bestNode.NodeID
|
||||
workload.NodeName = bestNode.Name
|
||||
workload.Status = "pending"
|
||||
workload.ScheduledAt = time.Now().UTC()
|
||||
|
||||
cp.schedule = append(cp.schedule, workload)
|
||||
|
||||
return cp.saveSchedule()
|
||||
}
|
||||
|
||||
// findBestNode selects the best available node for a workload based on
|
||||
// resource availability and label matching. Uses a simple "least loaded" strategy.
|
||||
func (cp *ControlPlane) findBestNode(required WorkloadResources, selector map[string]string) *NodeInfo {
|
||||
var best *NodeInfo
|
||||
var bestScore int64 = -1
|
||||
|
||||
for _, node := range cp.nodes {
|
||||
// Skip unhealthy/draining nodes
|
||||
if node.Status != NodeStatusReady {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check label selector
|
||||
if !matchLabels(node.Labels, selector) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check resource availability
|
||||
availMem := node.Resources.MemoryTotalMB - node.Resources.MemoryUsedMB
|
||||
if required.MemoryMB > 0 && availMem < required.MemoryMB {
|
||||
continue
|
||||
}
|
||||
|
||||
// Score: prefer nodes with more available resources (simple bin-packing)
|
||||
score := availMem
|
||||
if best == nil || score > bestScore {
|
||||
best = node
|
||||
bestScore = score
|
||||
}
|
||||
}
|
||||
|
||||
return best
|
||||
}
|
||||
|
||||
// matchLabels reports whether nodeLabels satisfies every key/value pair in
// selector. An empty (or nil) selector matches any node. A key absent from
// nodeLabels yields the zero value "", so a selector entry with an empty
// value matches nodes that lack the key entirely.
func matchLabels(nodeLabels, selector map[string]string) bool {
	for key, want := range selector {
		if nodeLabels[key] != want {
			return false
		}
	}
	return true
}
|
||||
|
||||
// ── Persistence ──────────────────────────────────────────────────────────────

// loadState restores cluster membership from ClusterStateFile. Errors
// (missing file, unreadable JSON) are deliberately swallowed: the node is
// simply treated as not yet clustered.
func (cp *ControlPlane) loadState() {
	data, err := os.ReadFile(ClusterStateFile)
	if err != nil {
		return
	}
	var state ClusterState
	if err := json.Unmarshal(data, &state); err != nil {
		return
	}
	cp.state = &state
}
|
||||
|
||||
func (cp *ControlPlane) saveState() error {
|
||||
os.MkdirAll(ClusterStateDir, 0755)
|
||||
data, err := json.MarshalIndent(cp.state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(ClusterStateFile, data, 0644)
|
||||
}
|
||||
|
||||
// loadNodes restores the node registry from NodesStateFile. Errors are
// deliberately swallowed: a missing or corrupt file leaves the registry
// created by NewControlPlane untouched.
func (cp *ControlPlane) loadNodes() {
	data, err := os.ReadFile(NodesStateFile)
	if err != nil {
		return
	}
	var nodes map[string]*NodeInfo
	if err := json.Unmarshal(data, &nodes); err != nil {
		return
	}
	cp.nodes = nodes
}
|
||||
|
||||
func (cp *ControlPlane) saveNodes() error {
|
||||
os.MkdirAll(ClusterStateDir, 0755)
|
||||
data, err := json.MarshalIndent(cp.nodes, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(NodesStateFile, data, 0644)
|
||||
}
|
||||
|
||||
// loadSchedule restores the workload schedule from ScheduleStateFile.
// Errors are deliberately swallowed: a missing or corrupt file leaves the
// schedule empty.
func (cp *ControlPlane) loadSchedule() {
	data, err := os.ReadFile(ScheduleStateFile)
	if err != nil {
		return
	}
	var schedule []*ScheduledWorkload
	if err := json.Unmarshal(data, &schedule); err != nil {
		return
	}
	cp.schedule = schedule
}
|
||||
|
||||
func (cp *ControlPlane) saveSchedule() error {
|
||||
os.MkdirAll(ClusterStateDir, 0755)
|
||||
data, err := json.MarshalIndent(cp.schedule, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(ScheduleStateFile, data, 0644)
|
||||
}
|
||||
153
pkg/cluster/node.go.bak
Normal file
153
pkg/cluster/node.go.bak
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
Volt Cluster — Node agent for worker nodes.
|
||||
|
||||
The node agent runs on every worker and is responsible for:
|
||||
- Sending heartbeats to the control plane
|
||||
- Reporting resource usage (CPU, memory, disk, workload count)
|
||||
- Accepting workload scheduling commands from the control plane
|
||||
- Executing workload lifecycle operations locally
|
||||
|
||||
Communication with the control plane uses HTTPS over the mesh network.
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
AGPSL v5 — Source-available. Anti-competition clauses apply.
|
||||
*/
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// NodeAgent runs on worker nodes and communicates with the control plane.
// It is built from the persisted ClusterState and identifies itself by
// nodeID/nodeName when heartbeating the control plane.
type NodeAgent struct {
	nodeID     string        // this node's cluster-assigned ID
	nodeName   string        // human-readable node name
	controlURL string        // control plane base URL (HTTPS over the mesh)
	interval   time.Duration // heartbeat period
	stopCh     chan struct{} // closed to stop the agent loop
}
|
||||
|
||||
// NewNodeAgent creates a node agent for the given cluster state.
|
||||
func NewNodeAgent(state *ClusterState) *NodeAgent {
|
||||
interval := state.HeartbeatInterval
|
||||
if interval == 0 {
|
||||
interval = DefaultHeartbeatInterval
|
||||
}
|
||||
return &NodeAgent{
|
||||
nodeID: state.NodeID,
|
||||
nodeName: state.NodeName,
|
||||
controlURL: state.ControlURL,
|
||||
interval: interval,
|
||||
stopCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// CollectResources gathers current node resource information: CPU core
// count, memory from /proc/meminfo, root-filesystem disk usage via df,
// container count via machinectl, and workload count from the local volt
// state file. Every probe is best-effort — on failure the corresponding
// fields are left at zero. Linux-only (procfs, coreutils df, systemd).
func CollectResources() NodeResources {
	res := NodeResources{
		CPUCores: runtime.NumCPU(),
	}

	// Memory from /proc/meminfo (values are reported in kB).
	if data, err := os.ReadFile("/proc/meminfo"); err == nil {
		lines := strings.Split(string(data), "\n")
		for _, line := range lines {
			if strings.HasPrefix(line, "MemTotal:") {
				res.MemoryTotalMB = parseMemInfoKB(line) / 1024
			} else if strings.HasPrefix(line, "MemAvailable:") {
				// Used = total − available. NOTE(review): assumes the
				// MemTotal line precedes MemAvailable (true on Linux).
				availMB := parseMemInfoKB(line) / 1024
				res.MemoryUsedMB = res.MemoryTotalMB - availMB
			}
		}
	}

	// Disk usage of the root filesystem from df, in whole gigabytes.
	if out, err := exec.Command("df", "--output=size,used", "-BG", "/").Output(); err == nil {
		lines := strings.Split(strings.TrimSpace(string(out)), "\n")
		if len(lines) >= 2 {
			// Line 0 is the df header; line 1 holds the values.
			fields := strings.Fields(lines[1])
			if len(fields) >= 2 {
				res.DiskTotalGB = parseGB(fields[0])
				res.DiskUsedGB = parseGB(fields[1])
			}
		}
	}

	// Container count: one non-empty machinectl listing line per machine.
	if out, err := exec.Command("machinectl", "list", "--no-legend", "--no-pager").Output(); err == nil {
		count := 0
		for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
			if strings.TrimSpace(line) != "" {
				count++
			}
		}
		res.ContainerCount = count
	}

	// Workload count from volt state.
	if data, err := os.ReadFile("/var/lib/volt/workload-state.json"); err == nil {
		// Quick count of workload entries. NOTE(review): counting raw
		// `"id"` substrings is fragile — any other "id" key in the JSON
		// inflates the count; consider parsing the file properly.
		count := strings.Count(string(data), `"id"`)
		res.WorkloadCount = count
	}

	return res
}
|
||||
|
||||
// GetSystemInfo returns the OS pretty-name (from /etc/os-release) and the
// kernel release (from `uname -r`). Either value may be empty when the
// corresponding probe fails.
func GetSystemInfo() (osInfo, kernelVersion string) {
	if out, err := exec.Command("uname", "-r").Output(); err == nil {
		kernelVersion = strings.TrimSpace(string(out))
	}
	data, err := os.ReadFile("/etc/os-release")
	if err != nil {
		return
	}
	for _, line := range strings.Split(string(data), "\n") {
		if strings.HasPrefix(line, "PRETTY_NAME=") {
			osInfo = strings.Trim(strings.TrimPrefix(line, "PRETTY_NAME="), "\"")
			return
		}
	}
	return
}
|
||||
|
||||
// FormatResources returns a human-readable resource summary.
|
||||
func FormatResources(r NodeResources) string {
|
||||
memPct := float64(0)
|
||||
if r.MemoryTotalMB > 0 {
|
||||
memPct = float64(r.MemoryUsedMB) / float64(r.MemoryTotalMB) * 100
|
||||
}
|
||||
diskPct := float64(0)
|
||||
if r.DiskTotalGB > 0 {
|
||||
diskPct = float64(r.DiskUsedGB) / float64(r.DiskTotalGB) * 100
|
||||
}
|
||||
return fmt.Sprintf("CPU: %d cores | RAM: %dMB/%dMB (%.0f%%) | Disk: %dGB/%dGB (%.0f%%) | Containers: %d",
|
||||
r.CPUCores,
|
||||
r.MemoryUsedMB, r.MemoryTotalMB, memPct,
|
||||
r.DiskUsedGB, r.DiskTotalGB, diskPct,
|
||||
r.ContainerCount,
|
||||
)
|
||||
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────

// parseMemInfoKB extracts the numeric kB value from a /proc/meminfo line
// such as "MemTotal:       16384000 kB". Returns 0 on malformed input.
func parseMemInfoKB(line string) int64 {
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return 0
	}
	val, _ := strconv.ParseInt(fields[1], 10, 64)
	return val
}
|
||||
|
||||
// parseGB parses a df size field like "50G" into an integer gigabyte count.
// Returns 0 when the value is not numeric.
func parseGB(s string) int64 {
	val, _ := strconv.ParseInt(strings.TrimSuffix(s, "G"), 10, 64)
	return val
}
|
||||
195
pkg/cluster/scheduler.go.bak
Normal file
195
pkg/cluster/scheduler.go.bak
Normal file
@@ -0,0 +1,195 @@
|
||||
/*
|
||||
Volt Cluster — Workload Scheduler.
|
||||
|
||||
Implements scheduling strategies for assigning workloads to cluster nodes.
|
||||
The scheduler considers:
|
||||
- Resource availability (CPU, memory, disk)
|
||||
- Label selectors and affinity rules
|
||||
- Node health status
|
||||
- Current workload distribution (spread/pack strategies)
|
||||
|
||||
Strategies:
|
||||
- BinPack: Pack workloads onto fewest nodes (maximize density)
|
||||
- Spread: Distribute evenly across nodes (maximize availability)
|
||||
- Manual: Explicit node selection by name/label
|
||||
|
||||
Copyright (c) Armored Gates LLC. All rights reserved.
|
||||
AGPSL v5 — Source-available. Anti-competition clauses apply.
|
||||
*/
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// ── Strategy ─────────────────────────────────────────────────────────────────

// ScheduleStrategy defines how workloads are assigned to nodes.
type ScheduleStrategy string

const (
	// StrategyBinPack packs workloads onto the fewest nodes (maximize density).
	StrategyBinPack ScheduleStrategy = "binpack"
	// StrategySpread distributes workloads evenly across nodes (maximize availability).
	StrategySpread ScheduleStrategy = "spread"
	// StrategyManual takes the first eligible node matching the selector.
	StrategyManual ScheduleStrategy = "manual"
)
|
||||
|
||||
// ── Scheduler ────────────────────────────────────────────────────────────────

// Scheduler assigns workloads to nodes based on a configurable strategy.
type Scheduler struct {
	strategy ScheduleStrategy // placement strategy; empty defaults to binpack
}
|
||||
|
||||
// NewScheduler creates a scheduler with the given strategy.
|
||||
func NewScheduler(strategy ScheduleStrategy) *Scheduler {
|
||||
if strategy == "" {
|
||||
strategy = StrategyBinPack
|
||||
}
|
||||
return &Scheduler{strategy: strategy}
|
||||
}
|
||||
|
||||
// SelectNode chooses the best node for a workload based on the current strategy.
|
||||
// Returns the selected NodeInfo or an error if no suitable node exists.
|
||||
func (s *Scheduler) SelectNode(
|
||||
nodes []*NodeInfo,
|
||||
required WorkloadResources,
|
||||
selector map[string]string,
|
||||
existingSchedule []*ScheduledWorkload,
|
||||
) (*NodeInfo, error) {
|
||||
|
||||
// Filter to eligible nodes
|
||||
eligible := s.filterEligible(nodes, required, selector)
|
||||
if len(eligible) == 0 {
|
||||
return nil, fmt.Errorf("no eligible nodes: checked %d nodes, none meet resource/label requirements", len(nodes))
|
||||
}
|
||||
|
||||
switch s.strategy {
|
||||
case StrategySpread:
|
||||
return s.selectSpread(eligible, existingSchedule), nil
|
||||
case StrategyBinPack:
|
||||
return s.selectBinPack(eligible), nil
|
||||
case StrategyManual:
|
||||
// Manual strategy returns the first eligible node matching the selector
|
||||
return eligible[0], nil
|
||||
default:
|
||||
return s.selectBinPack(eligible), nil
|
||||
}
|
||||
}
|
||||
|
||||
// filterEligible returns nodes that are healthy, match labels, and have sufficient resources.
|
||||
func (s *Scheduler) filterEligible(nodes []*NodeInfo, required WorkloadResources, selector map[string]string) []*NodeInfo {
|
||||
var eligible []*NodeInfo
|
||||
|
||||
for _, node := range nodes {
|
||||
// Must be ready
|
||||
if node.Status != NodeStatusReady {
|
||||
continue
|
||||
}
|
||||
|
||||
// Must match label selector
|
||||
if !matchLabels(node.Labels, selector) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Must have sufficient resources
|
||||
availMem := node.Resources.MemoryTotalMB - node.Resources.MemoryUsedMB
|
||||
if required.MemoryMB > 0 && availMem < required.MemoryMB {
|
||||
continue
|
||||
}
|
||||
|
||||
// CPU check (basic — just core count)
|
||||
if required.CPUCores > 0 && node.Resources.CPUCores < required.CPUCores {
|
||||
continue
|
||||
}
|
||||
|
||||
// Disk check
|
||||
availDisk := (node.Resources.DiskTotalGB - node.Resources.DiskUsedGB) * 1024 // convert to MB
|
||||
if required.DiskMB > 0 && availDisk < required.DiskMB {
|
||||
continue
|
||||
}
|
||||
|
||||
eligible = append(eligible, node)
|
||||
}
|
||||
|
||||
return eligible
|
||||
}
|
||||
|
||||
// selectBinPack picks the node with the LEAST available memory (pack tight).
|
||||
func (s *Scheduler) selectBinPack(nodes []*NodeInfo) *NodeInfo {
|
||||
sort.Slice(nodes, func(i, j int) bool {
|
||||
availI := nodes[i].Resources.MemoryTotalMB - nodes[i].Resources.MemoryUsedMB
|
||||
availJ := nodes[j].Resources.MemoryTotalMB - nodes[j].Resources.MemoryUsedMB
|
||||
return availI < availJ // least available first
|
||||
})
|
||||
return nodes[0]
|
||||
}
|
||||
|
||||
// selectSpread picks the node with the fewest currently scheduled workloads.
|
||||
func (s *Scheduler) selectSpread(nodes []*NodeInfo, schedule []*ScheduledWorkload) *NodeInfo {
|
||||
// Count workloads per node
|
||||
counts := make(map[string]int)
|
||||
for _, sw := range schedule {
|
||||
if sw.Status == "running" || sw.Status == "pending" {
|
||||
counts[sw.NodeID]++
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by workload count (ascending)
|
||||
sort.Slice(nodes, func(i, j int) bool {
|
||||
return counts[nodes[i].NodeID] < counts[nodes[j].NodeID]
|
||||
})
|
||||
|
||||
return nodes[0]
|
||||
}
|
||||
|
||||
// ── Scoring (for future extensibility) ───────────────────────────────────────

// NodeScore represents a scored node for scheduling decisions.
type NodeScore struct {
	Node  *NodeInfo // the candidate node
	Score float64   // suitability score; higher is better
}
|
||||
|
||||
// ScoreNodes evaluates and ranks all eligible nodes for a workload.
|
||||
// Higher scores are better.
|
||||
func ScoreNodes(nodes []*NodeInfo, required WorkloadResources) []NodeScore {
|
||||
var scores []NodeScore
|
||||
|
||||
for _, node := range nodes {
|
||||
if node.Status != NodeStatusReady {
|
||||
continue
|
||||
}
|
||||
|
||||
score := 0.0
|
||||
|
||||
// Resource availability score (0-50 points)
|
||||
if node.Resources.MemoryTotalMB > 0 {
|
||||
memPct := float64(node.Resources.MemoryTotalMB-node.Resources.MemoryUsedMB) / float64(node.Resources.MemoryTotalMB)
|
||||
score += memPct * 50
|
||||
}
|
||||
|
||||
// CPU headroom score (0-25 points)
|
||||
if node.Resources.CPUCores > required.CPUCores {
|
||||
score += 25
|
||||
}
|
||||
|
||||
// Health score (0-25 points)
|
||||
if node.MissedBeats == 0 {
|
||||
score += 25
|
||||
} else {
|
||||
score += float64(25-node.MissedBeats*5)
|
||||
if score < 0 {
|
||||
score = 0
|
||||
}
|
||||
}
|
||||
|
||||
scores = append(scores, NodeScore{Node: node, Score: score})
|
||||
}
|
||||
|
||||
sort.Slice(scores, func(i, j int) bool {
|
||||
return scores[i].Score > scores[j].Score
|
||||
})
|
||||
|
||||
return scores
|
||||
}
|
||||
Reference in New Issue
Block a user