Volt CLI: source-available under AGPSL v5.0

Complete infrastructure platform CLI:
- Container runtime (systemd-nspawn)
- VoltVisor VMs (Neutron Stardust / QEMU)
- Stellarium CAS (content-addressed storage)
- ORAS Registry
- GitOps integration
- Landlock LSM security
- Compose orchestration
- Mesh networking

Copyright (c) Armored Gates LLC. All rights reserved.
Licensed under AGPSL v5.0
This commit is contained in:
Karl Clinger
2026-03-21 00:30:23 -05:00
commit 81ad0b597c
106 changed files with 35984 additions and 0 deletions

561
pkg/cluster/control.go.bak Normal file
View File

@@ -0,0 +1,561 @@
/*
Volt Cluster — Native control plane for multi-node orchestration.
Replaces the thin kubectl wrapper with a native clustering system built
specifically for Volt's workload model (containers, hybrid-native, VMs).
Architecture:
- Control plane: single leader node running volt-control daemon
- Workers: nodes that register via `volt cluster join`
- Communication: gRPC-over-mesh (WireGuard) or plain HTTPS
- State: JSON-based on-disk store (no etcd dependency)
- Health: heartbeat-based with configurable failure detection
The control plane is responsible for:
- Node registration and deregistration
- Health monitoring (heartbeat processing)
- Workload scheduling (resource-based, label selectors)
- Workload state sync across nodes
Copyright (c) Armored Gates LLC. All rights reserved.
AGPSL v5 — Source-available. Anti-competition clauses apply.
*/
package cluster
import (
"encoding/json"
"fmt"
"os"
"sync"
"time"
)
// ── Constants ────────────────────────────────────────────────────────────────
const (
	// DefaultHeartbeatInterval is how often workers are expected to report in
	// (stored per-cluster in ClusterState.HeartbeatInterval).
	DefaultHeartbeatInterval = 10 * time.Second
	// DefaultFailureThreshold is the number of missed heartbeats before a node
	// is marked unhealthy (see CheckHealth and NodeInfo.IsHealthy).
	DefaultFailureThreshold = 3
	// DefaultAPIPort is the control-plane API port used when InitCluster is
	// given a port of 0.
	DefaultAPIPort = 9443
	// On-disk locations for the JSON-backed cluster state (no etcd dependency).
	ClusterStateDir   = "/var/lib/volt/cluster"
	ClusterStateFile  = "/var/lib/volt/cluster/state.json"
	NodesStateFile    = "/var/lib/volt/cluster/nodes.json"
	ScheduleStateFile = "/var/lib/volt/cluster/schedule.json"
)
// ── Node ─────────────────────────────────────────────────────────────────────
// NodeStatus represents the health state of a cluster node.
type NodeStatus string
const (
	NodeStatusReady    NodeStatus = "ready"     // healthy; eligible for scheduling (findBestNode)
	NodeStatusNotReady NodeStatus = "not-ready" // missed >= DefaultFailureThreshold heartbeats (CheckHealth)
	NodeStatusJoining  NodeStatus = "joining"   // presumably a join-handshake state — never set in this file; confirm against callers
	NodeStatusDraining NodeStatus = "draining"  // no new workloads; running ones reset to pending (DrainNode)
	NodeStatusRemoved  NodeStatus = "removed"   // tombstone; skipped by health checks
)
// NodeResources describes the capacity and usage of a node.
// Workers report these values in heartbeats (ProcessHeartbeat); the scheduler
// uses the memory fields to compute free memory (findBestNode).
type NodeResources struct {
	CPUCores       int   `json:"cpu_cores"` // total cores; note there is no in-use core counter
	MemoryTotalMB  int64 `json:"memory_total_mb"`
	MemoryUsedMB   int64 `json:"memory_used_mb"`
	DiskTotalGB    int64 `json:"disk_total_gb"`
	DiskUsedGB     int64 `json:"disk_used_gb"`
	ContainerCount int   `json:"container_count"`
	WorkloadCount  int   `json:"workload_count"`
}
// NodeInfo represents a registered cluster node.
// Instances are stored in ControlPlane.nodes keyed by NodeID and persisted
// to NodesStateFile as JSON.
type NodeInfo struct {
	NodeID        string            `json:"node_id"`
	Name          string            `json:"name"`                // human-friendly name; GetNode also resolves by this
	MeshIP        string            `json:"mesh_ip"`             // WireGuard mesh address
	PublicIP      string            `json:"public_ip,omitempty"`
	Status        NodeStatus        `json:"status"`
	Labels        map[string]string `json:"labels,omitempty"`    // matched against scheduler selectors (matchLabels)
	Resources     NodeResources     `json:"resources"`           // last reported usage/capacity
	LastHeartbeat time.Time         `json:"last_heartbeat"`      // refreshed by ProcessHeartbeat
	JoinedAt      time.Time         `json:"joined_at"`
	MissedBeats   int               `json:"missed_beats"`        // incremented by CheckHealth, reset on heartbeat
	VoltVersion   string            `json:"volt_version,omitempty"`
	KernelVersion string            `json:"kernel_version,omitempty"`
	OS            string            `json:"os,omitempty"`
	Region        string            `json:"region,omitempty"`
}
// IsHealthy reports whether the node is ready and still within the
// heartbeat failure threshold.
func (n *NodeInfo) IsHealthy() bool {
	if n.Status != NodeStatusReady {
		return false
	}
	return n.MissedBeats < DefaultFailureThreshold
}
// ── Cluster State ────────────────────────────────────────────────────────────
// ClusterRole indicates this node's role in the cluster.
type ClusterRole string
const (
	RoleControl ClusterRole = "control" // runs the control plane; may register nodes and schedule workloads
	RoleWorker  ClusterRole = "worker"  // joined via JoinCluster; runs assigned workloads
	RoleNone    ClusterRole = "none"    // not part of any cluster (Role() when state is nil)
)
// ClusterState is the persistent on-disk cluster membership state for this node.
// Written to ClusterStateFile by saveState; nil until InitCluster/JoinCluster.
type ClusterState struct {
	ClusterID         string        `json:"cluster_id"`
	Role              ClusterRole   `json:"role"`
	NodeID            string        `json:"node_id"`
	NodeName          string        `json:"node_name"`
	ControlURL        string        `json:"control_url"` // https://<meshIP>:<apiPort> of the control plane
	APIPort           int           `json:"api_port"`
	JoinedAt          time.Time     `json:"joined_at"`
	// time.Duration marshals to JSON as an integer nanosecond count,
	// not a human-readable string like "10s".
	HeartbeatInterval time.Duration `json:"heartbeat_interval"`
}
// ── Scheduled Workload ───────────────────────────────────────────────────────
// ScheduledWorkload represents a workload assigned to a node by the scheduler.
// Entries live in ControlPlane.schedule and persist to ScheduleStateFile.
type ScheduledWorkload struct {
	WorkloadID   string            `json:"workload_id"`
	NodeID       string            `json:"node_id"`   // cleared when the workload is marked for rescheduling (DrainNode)
	NodeName     string            `json:"node_name"`
	Mode         string            `json:"mode"` // container, hybrid-native, etc.
	ManifestPath string            `json:"manifest_path,omitempty"`
	Labels       map[string]string `json:"labels,omitempty"`
	Resources    WorkloadResources `json:"resources"`
	Status       string            `json:"status"` // pending, running, stopped, failed
	ScheduledAt  time.Time         `json:"scheduled_at"`
}
// WorkloadResources describes the resource requirements for a workload.
// Only MemoryMB is currently enforced by the scheduler; CPUCores and DiskMB
// are carried but not checked in this file.
type WorkloadResources struct {
	CPUCores int   `json:"cpu_cores"`
	MemoryMB int64 `json:"memory_mb"`
	DiskMB   int64 `json:"disk_mb,omitempty"`
}
// ── Control Plane ────────────────────────────────────────────────────────────
// ControlPlane manages cluster state, node registration, and scheduling.
// All fields are guarded by mu; accessor methods hand out copies so callers
// never hold references into guarded state.
type ControlPlane struct {
	state    *ClusterState        // this node's membership record; nil before init/join
	nodes    map[string]*NodeInfo // registered nodes keyed by NodeID (control plane only)
	schedule []*ScheduledWorkload // workload assignments (control plane only)
	mu       sync.RWMutex
}
// NewControlPlane creates or loads a control plane instance.
// Existing on-disk state (cluster, nodes, schedule) is loaded best-effort;
// missing or unreadable files simply leave the corresponding field empty.
func NewControlPlane() *ControlPlane {
	cp := &ControlPlane{nodes: map[string]*NodeInfo{}}
	for _, load := range []func(){cp.loadState, cp.loadNodes, cp.loadSchedule} {
		load()
	}
	return cp
}
// IsInitialized returns true if the cluster has been initialized
// (i.e. a cluster ID has been recorded via init or join).
func (cp *ControlPlane) IsInitialized() bool {
	cp.mu.RLock()
	defer cp.mu.RUnlock()
	if cp.state == nil {
		return false
	}
	return cp.state.ClusterID != ""
}
// State returns a copy of the cluster state, or nil if this node is not
// part of a cluster. The copy keeps callers from mutating guarded state.
func (cp *ControlPlane) State() *ClusterState {
	cp.mu.RLock()
	defer cp.mu.RUnlock()
	if cp.state == nil {
		return nil
	}
	st := *cp.state // renamed from "copy", which shadowed the builtin
	return &st
}
// Role returns this node's cluster role, or RoleNone when the node has
// not been initialized into a cluster.
func (cp *ControlPlane) Role() ClusterRole {
	cp.mu.RLock()
	defer cp.mu.RUnlock()
	if s := cp.state; s != nil {
		return s.Role
	}
	return RoleNone
}
// Nodes returns all registered nodes. Each entry is a copy, so callers may
// mutate the result without affecting control-plane state.
func (cp *ControlPlane) Nodes() []*NodeInfo {
	cp.mu.RLock()
	defer cp.mu.RUnlock()
	result := make([]*NodeInfo, 0, len(cp.nodes))
	for _, n := range cp.nodes {
		node := *n // renamed from "copy", which shadowed the builtin
		result = append(result, &node)
	}
	return result
}
// GetNode returns a copy of a node matched by ID (fast path) or, failing
// that, by name. Returns nil when no node matches.
func (cp *ControlPlane) GetNode(idOrName string) *NodeInfo {
	cp.mu.RLock()
	defer cp.mu.RUnlock()
	if n, ok := cp.nodes[idOrName]; ok {
		node := *n // renamed from "copy", which shadowed the builtin
		return &node
	}
	// Fall back to a linear scan by human-friendly name.
	for _, n := range cp.nodes {
		if n.Name == idOrName {
			node := *n
			return &node
		}
	}
	return nil
}
// Schedule returns the current workload schedule. Entries are copies, so
// callers may mutate them without affecting control-plane state.
func (cp *ControlPlane) Schedule() []*ScheduledWorkload {
	cp.mu.RLock()
	defer cp.mu.RUnlock()
	result := make([]*ScheduledWorkload, len(cp.schedule))
	for i, sw := range cp.schedule {
		item := *sw // renamed from "copy", which shadowed the builtin
		result[i] = &item
	}
	return result
}
// ── Init ─────────────────────────────────────────────────────────────────────
// InitCluster initializes this node as the cluster control plane and
// registers it as the first (ready) member node.
//
// apiPort 0 selects DefaultAPIPort. Returns an error if this node already
// belongs to a cluster or if persisting the new state fails.
func (cp *ControlPlane) InitCluster(clusterID, nodeName, meshIP string, apiPort int) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	if cp.state != nil && cp.state.ClusterID != "" {
		return fmt.Errorf("already part of cluster %q", cp.state.ClusterID)
	}
	if apiPort == 0 {
		apiPort = DefaultAPIPort
	}
	// Take a single timestamp so JoinedAt and LastHeartbeat agree exactly
	// (the original called time.Now() separately for each field).
	now := time.Now().UTC()
	cp.state = &ClusterState{
		ClusterID:         clusterID,
		Role:              RoleControl,
		NodeID:            clusterID + "-control",
		NodeName:          nodeName,
		ControlURL:        fmt.Sprintf("https://%s:%d", meshIP, apiPort),
		APIPort:           apiPort,
		JoinedAt:          now,
		HeartbeatInterval: DefaultHeartbeatInterval,
	}
	// Register self as a node — the control node is itself a cluster member.
	cp.nodes[cp.state.NodeID] = &NodeInfo{
		NodeID:        cp.state.NodeID,
		Name:          nodeName,
		MeshIP:        meshIP,
		Status:        NodeStatusReady,
		Labels:        map[string]string{"role": "control"},
		LastHeartbeat: now,
		JoinedAt:      now,
	}
	if err := cp.saveState(); err != nil {
		return err
	}
	return cp.saveNodes()
}
// ── Join ─────────────────────────────────────────────────────────────────────
// JoinCluster registers this node as a worker in an existing cluster.
// Only local membership state is written here; the control plane learns of
// this node separately via RegisterNode.
//
// NOTE(review): the meshIP parameter is accepted but never used in this
// function — confirm whether callers expect it to be recorded.
func (cp *ControlPlane) JoinCluster(clusterID, controlURL, nodeID, nodeName, meshIP string) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	if cp.state != nil && cp.state.ClusterID != "" {
		return fmt.Errorf("already part of cluster %q — run 'volt cluster leave' first", cp.state.ClusterID)
	}
	state := ClusterState{
		ClusterID:         clusterID,
		Role:              RoleWorker,
		NodeID:            nodeID,
		NodeName:          nodeName,
		ControlURL:        controlURL,
		JoinedAt:          time.Now().UTC(),
		HeartbeatInterval: DefaultHeartbeatInterval,
	}
	cp.state = &state
	return cp.saveState()
}
// ── Node Registration ────────────────────────────────────────────────────────
// RegisterNode adds a new worker node to the cluster (control plane only).
// The node is marked ready with fresh join/heartbeat timestamps and persisted.
// Rejects registration when this node is not the control plane, or when the
// node is nil / has an empty NodeID (which would be stored under key "").
func (cp *ControlPlane) RegisterNode(node *NodeInfo) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	if cp.state == nil || cp.state.Role != RoleControl {
		return fmt.Errorf("not the control plane — cannot register nodes")
	}
	if node == nil || node.NodeID == "" {
		return fmt.Errorf("cannot register node: missing node ID")
	}
	now := time.Now().UTC()
	node.Status = NodeStatusReady
	node.JoinedAt = now
	node.LastHeartbeat = now
	cp.nodes[node.NodeID] = node
	return cp.saveNodes()
}
// DeregisterNode removes a node from the cluster and persists the change.
// It is an error to deregister an unknown node ID.
func (cp *ControlPlane) DeregisterNode(nodeID string) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	if _, ok := cp.nodes[nodeID]; !ok {
		return fmt.Errorf("node %q not found", nodeID)
	}
	delete(cp.nodes, nodeID)
	return cp.saveNodes()
}
// ── Heartbeat ────────────────────────────────────────────────────────────────
// ProcessHeartbeat updates a node's health status: it refreshes the heartbeat
// timestamp, zeroes the missed-beat counter, records the reported resource
// usage, and flips a not-ready node back to ready. Unknown node IDs are
// rejected.
func (cp *ControlPlane) ProcessHeartbeat(nodeID string, resources NodeResources) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	node, ok := cp.nodes[nodeID]
	if !ok {
		return fmt.Errorf("node %q not registered", nodeID)
	}
	node.LastHeartbeat = time.Now().UTC()
	node.MissedBeats = 0
	node.Resources = resources
	if node.Status == NodeStatusNotReady {
		node.Status = NodeStatusReady
	}
	return cp.saveNodes()
}
// CheckHealth evaluates all nodes and marks those with missed heartbeats.
// A node whose last heartbeat is older than FailureThreshold × HeartbeatInterval
// accrues one missed beat per call; once MissedBeats reaches the threshold the
// node is marked not-ready. Returns the IDs of nodes currently past the
// threshold — note that already-unhealthy nodes are re-reported on every call.
// Removed and draining nodes are skipped.
func (cp *ControlPlane) CheckHealth() []string {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	var unhealthy []string
	threshold := time.Duration(DefaultFailureThreshold) * DefaultHeartbeatInterval
	for _, node := range cp.nodes {
		if node.Status == NodeStatusRemoved || node.Status == NodeStatusDraining {
			continue
		}
		if time.Since(node.LastHeartbeat) > threshold {
			node.MissedBeats++
			if node.MissedBeats >= DefaultFailureThreshold {
				node.Status = NodeStatusNotReady
				unhealthy = append(unhealthy, node.NodeID)
			}
		}
	}
	// Best-effort persist: the signature carries no error, so an I/O failure
	// here only delays durability until the next successful save.
	_ = cp.saveNodes()
	return unhealthy
}
// ── Drain ────────────────────────────────────────────────────────────────────
// DrainNode marks a node for draining (no new workloads, existing ones rescheduled).
// Running workloads on the node are reset to pending with their node
// assignment cleared so the scheduler can place them elsewhere.
func (cp *ControlPlane) DrainNode(nodeID string) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	node, exists := cp.nodes[nodeID]
	if !exists {
		return fmt.Errorf("node %q not found", nodeID)
	}
	node.Status = NodeStatusDraining
	// Find workloads on this node and mark for rescheduling
	for _, sw := range cp.schedule {
		if sw.NodeID == nodeID && sw.Status == "running" {
			sw.Status = "pending" // will be rescheduled
			sw.NodeID = ""
			sw.NodeName = ""
		}
	}
	// Previously the node-save error was silently dropped; surface it so a
	// failed persist of the draining status is not masked.
	if err := cp.saveNodes(); err != nil {
		return err
	}
	return cp.saveSchedule()
}
// ── Leave ────────────────────────────────────────────────────────────────────
// LeaveCluster removes this node from the cluster. A control-plane node also
// clears its membership map and schedule. Removal of the state files is
// best-effort; leaving succeeds even if they cannot be deleted.
func (cp *ControlPlane) LeaveCluster() error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	if cp.state == nil {
		return fmt.Errorf("not part of any cluster")
	}
	if cp.state.Role == RoleControl {
		// Control plane: drop all cluster-wide records too.
		cp.nodes = map[string]*NodeInfo{}
		cp.schedule = nil
		_ = os.Remove(NodesStateFile)    // best-effort cleanup
		_ = os.Remove(ScheduleStateFile) // best-effort cleanup
	}
	cp.state = nil
	_ = os.Remove(ClusterStateFile) // best-effort cleanup
	return nil
}
// ── Scheduling ───────────────────────────────────────────────────────────────
// ScheduleWorkload assigns a workload to a node based on resource availability
// and label selectors. Only the control plane may schedule. On success the
// workload is stamped pending, appended to the schedule, and persisted.
func (cp *ControlPlane) ScheduleWorkload(workload *ScheduledWorkload, nodeSelector map[string]string) error {
	cp.mu.Lock()
	defer cp.mu.Unlock()
	if cp.state == nil || cp.state.Role != RoleControl {
		return fmt.Errorf("not the control plane — cannot schedule workloads")
	}
	target := cp.findBestNode(workload.Resources, nodeSelector)
	if target == nil {
		return fmt.Errorf("no suitable node found for workload %q (required: %dMB RAM, %d CPU cores)",
			workload.WorkloadID, workload.Resources.MemoryMB, workload.Resources.CPUCores)
	}
	workload.NodeID = target.NodeID
	workload.NodeName = target.Name
	workload.Status = "pending"
	workload.ScheduledAt = time.Now().UTC()
	cp.schedule = append(cp.schedule, workload)
	return cp.saveSchedule()
}
// findBestNode selects the best available node for a workload based on
// resource availability and label matching. Uses a "most free memory"
// (spread / least-loaded) strategy: among ready, selector-matching nodes
// with enough free memory, the one with the largest free memory wins.
//
// NOTE(review): only memory is checked against the request — required.CPUCores
// is never consulted (NodeResources tracks total cores but not cores in use).
// Caller must hold cp.mu.
func (cp *ControlPlane) findBestNode(required WorkloadResources, selector map[string]string) *NodeInfo {
	var best *NodeInfo
	var bestScore int64 = -1
	for _, node := range cp.nodes {
		// Skip unhealthy/draining nodes
		if node.Status != NodeStatusReady {
			continue
		}
		// Check label selector
		if !matchLabels(node.Labels, selector) {
			continue
		}
		// Check resource availability (memory only; see NOTE above)
		availMem := node.Resources.MemoryTotalMB - node.Resources.MemoryUsedMB
		if required.MemoryMB > 0 && availMem < required.MemoryMB {
			continue
		}
		// Score: prefer the node with the most available memory (spread)
		score := availMem
		if best == nil || score > bestScore {
			best = node
			bestScore = score
		}
	}
	return best
}
// matchLabels reports whether nodeLabels satisfies every key/value pair in
// selector. A nil or empty selector matches any node. An absent key is
// treated as the empty string, so a selector value of "" matches a node
// that lacks the key entirely.
func matchLabels(nodeLabels, selector map[string]string) bool {
	for key, want := range selector {
		// Map access on a nil or missing key yields "", preserving the
		// original "missing key == empty value" semantics.
		if got := nodeLabels[key]; got != want {
			return false
		}
	}
	return true
}
// ── Persistence ──────────────────────────────────────────────────────────────
// loadState reads the on-disk cluster state, if present. Missing or
// unparsable files are ignored, leaving cp.state nil.
func (cp *ControlPlane) loadState() {
	raw, err := os.ReadFile(ClusterStateFile)
	if err != nil {
		return
	}
	state := new(ClusterState)
	if json.Unmarshal(raw, state) == nil {
		cp.state = state
	}
}
// saveState persists this node's membership record to ClusterStateFile.
// Caller must hold cp.mu.
func (cp *ControlPlane) saveState() error {
	// Previously the MkdirAll error was ignored, letting WriteFile fail with
	// a less direct error (or masking a permissions problem). Surface it.
	if err := os.MkdirAll(ClusterStateDir, 0755); err != nil {
		return fmt.Errorf("creating cluster state dir: %w", err)
	}
	data, err := json.MarshalIndent(cp.state, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(ClusterStateFile, data, 0644)
}
// loadNodes reads the persisted node registry, if present. Missing or
// unparsable files are ignored.
func (cp *ControlPlane) loadNodes() {
	data, err := os.ReadFile(NodesStateFile)
	if err != nil {
		return
	}
	var nodes map[string]*NodeInfo
	if err := json.Unmarshal(data, &nodes); err != nil {
		return
	}
	// Guard against a file containing JSON "null": unmarshaling leaves the
	// map nil, and assigning nil to cp.nodes would make later registrations
	// panic on map write (InitCluster/RegisterNode assume a non-nil map).
	if nodes == nil {
		return
	}
	cp.nodes = nodes
}
// saveNodes persists the node registry to NodesStateFile.
// Caller must hold cp.mu.
func (cp *ControlPlane) saveNodes() error {
	// Surface the MkdirAll error instead of silently ignoring it.
	if err := os.MkdirAll(ClusterStateDir, 0755); err != nil {
		return fmt.Errorf("creating cluster state dir: %w", err)
	}
	data, err := json.MarshalIndent(cp.nodes, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(NodesStateFile, data, 0644)
}
// loadSchedule reads the persisted workload schedule, if present. Missing
// or unparsable files are ignored, leaving cp.schedule nil.
func (cp *ControlPlane) loadSchedule() {
	raw, err := os.ReadFile(ScheduleStateFile)
	if err != nil {
		return
	}
	var sched []*ScheduledWorkload
	if json.Unmarshal(raw, &sched) == nil {
		cp.schedule = sched
	}
}
// saveSchedule persists the workload schedule to ScheduleStateFile.
// Caller must hold cp.mu.
func (cp *ControlPlane) saveSchedule() error {
	// Surface the MkdirAll error instead of silently ignoring it.
	if err := os.MkdirAll(ClusterStateDir, 0755); err != nil {
		return fmt.Errorf("creating cluster state dir: %w", err)
	}
	data, err := json.MarshalIndent(cp.schedule, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(ScheduleStateFile, data, 0644)
}