Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
561
pkg/cluster/control.go.bak
Normal file
561
pkg/cluster/control.go.bak
Normal file
@@ -0,0 +1,561 @@
/*
Volt Cluster — Native control plane for multi-node orchestration.

Replaces the thin kubectl wrapper with a native clustering system built
specifically for Volt's workload model (containers, hybrid-native, VMs).

Architecture:
  - Control plane: single leader node running volt-control daemon
  - Workers: nodes that register via `volt cluster join`
  - Communication: gRPC-over-mesh (WireGuard) or plain HTTPS
  - State: JSON-based on-disk store (no etcd dependency)
  - Health: heartbeat-based with configurable failure detection

The control plane is responsible for:
  - Node registration and deregistration
  - Health monitoring (heartbeat processing)
  - Workload scheduling (resource-based, label selectors)
  - Workload state sync across nodes

Copyright (c) Armored Gates LLC. All rights reserved.
AGPSL v5 — Source-available. Anti-competition clauses apply.
*/
package cluster
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── Constants ────────────────────────────────────────────────────────────────

const (
	// DefaultHeartbeatInterval is how often workers are expected to send
	// a heartbeat to the control plane.
	DefaultHeartbeatInterval = 10 * time.Second
	// DefaultFailureThreshold is the number of missed heartbeats tolerated
	// before a node is marked unhealthy (see CheckHealth and IsHealthy).
	DefaultFailureThreshold = 3 // missed heartbeats before marking unhealthy
	// DefaultAPIPort is the control-plane API port used when none is given.
	DefaultAPIPort = 9443

	// On-disk locations of the persisted cluster state (JSON files).
	ClusterStateDir   = "/var/lib/volt/cluster"
	ClusterStateFile  = "/var/lib/volt/cluster/state.json"
	NodesStateFile    = "/var/lib/volt/cluster/nodes.json"
	ScheduleStateFile = "/var/lib/volt/cluster/schedule.json"
)
|
||||
|
||||
// ── Node ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// NodeStatus represents the health state of a cluster node.
type NodeStatus string

const (
	NodeStatusReady    NodeStatus = "ready"     // healthy; eligible for scheduling
	NodeStatusNotReady NodeStatus = "not-ready" // missed too many heartbeats (see CheckHealth)
	NodeStatusJoining  NodeStatus = "joining"   // presumably set during registration — not set in this file
	NodeStatusDraining NodeStatus = "draining"  // no new workloads; existing ones rescheduled (see DrainNode)
	NodeStatusRemoved  NodeStatus = "removed"   // presumably a tombstone state — not set in this file
)
|
||||
|
||||
// NodeResources describes the capacity and usage of a node, as reported
// by the node itself in heartbeats (see ProcessHeartbeat).
type NodeResources struct {
	CPUCores      int   `json:"cpu_cores"`       // total core count
	MemoryTotalMB int64 `json:"memory_total_mb"` // total RAM, MB
	MemoryUsedMB  int64 `json:"memory_used_mb"`  // currently used RAM, MB
	DiskTotalGB   int64 `json:"disk_total_gb"`   // total disk, GB
	DiskUsedGB    int64 `json:"disk_used_gb"`    // currently used disk, GB
	ContainerCount int  `json:"container_count"` // running containers on the node
	WorkloadCount  int  `json:"workload_count"`  // scheduled workloads on the node
}
|
||||
|
||||
// NodeInfo represents a registered cluster node as tracked by the
// control plane and persisted to NodesStateFile.
type NodeInfo struct {
	NodeID        string            `json:"node_id"`             // unique ID; map key in ControlPlane.nodes
	Name          string            `json:"name"`                // human-readable name (GetNode falls back to this)
	MeshIP        string            `json:"mesh_ip"`             // WireGuard/mesh address
	PublicIP      string            `json:"public_ip,omitempty"` // optional public address
	Status        NodeStatus        `json:"status"`
	Labels        map[string]string `json:"labels,omitempty"` // matched against scheduler selectors
	Resources     NodeResources     `json:"resources"`        // last reported capacity/usage
	LastHeartbeat time.Time         `json:"last_heartbeat"`
	JoinedAt      time.Time         `json:"joined_at"`
	MissedBeats   int               `json:"missed_beats"` // consecutive health-check failures; reset on heartbeat
	VoltVersion   string            `json:"volt_version,omitempty"`
	KernelVersion string            `json:"kernel_version,omitempty"`
	OS            string            `json:"os,omitempty"`
	Region        string            `json:"region,omitempty"`
}
|
||||
|
||||
// IsHealthy returns true if the node is responding to heartbeats.
|
||||
func (n *NodeInfo) IsHealthy() bool {
|
||||
return n.Status == NodeStatusReady && n.MissedBeats < DefaultFailureThreshold
|
||||
}
|
||||
|
||||
// ── Cluster State ────────────────────────────────────────────────────────────
|
||||
|
||||
// ClusterRole indicates this node's role in the cluster.
type ClusterRole string

const (
	RoleControl ClusterRole = "control" // runs the control plane (set by InitCluster)
	RoleWorker  ClusterRole = "worker"  // registered worker (set by JoinCluster)
	RoleNone    ClusterRole = "none"    // not part of any cluster
)
|
||||
|
||||
// ClusterState is the persistent on-disk cluster membership state for this
// node, serialized to ClusterStateFile.
type ClusterState struct {
	ClusterID  string      `json:"cluster_id"`
	Role       ClusterRole `json:"role"`
	NodeID     string      `json:"node_id"`
	NodeName   string      `json:"node_name"`
	ControlURL string      `json:"control_url"` // where workers reach the control plane
	APIPort    int         `json:"api_port"`    // only populated on the control node (InitCluster)
	JoinedAt   time.Time   `json:"joined_at"`
	// NOTE: time.Duration serializes to JSON as an integer nanosecond count.
	HeartbeatInterval time.Duration `json:"heartbeat_interval"`
}
|
||||
|
||||
// ── Scheduled Workload ───────────────────────────────────────────────────────
|
||||
|
||||
// ScheduledWorkload represents a workload assigned to a node by the
// scheduler, persisted to ScheduleStateFile.
type ScheduledWorkload struct {
	WorkloadID   string            `json:"workload_id"`
	NodeID       string            `json:"node_id"`   // cleared when the workload awaits rescheduling (DrainNode)
	NodeName     string            `json:"node_name"`
	Mode         string            `json:"mode"` // container, hybrid-native, etc.
	ManifestPath string            `json:"manifest_path,omitempty"`
	Labels       map[string]string `json:"labels,omitempty"`
	Resources    WorkloadResources `json:"resources"` // requirements used by findBestNode
	Status       string            `json:"status"`    // pending, running, stopped, failed
	ScheduledAt  time.Time         `json:"scheduled_at"`
}
|
||||
|
||||
// WorkloadResources describes the resource requirements for a workload.
// Zero values mean "no requirement" (the scheduler skips that check).
type WorkloadResources struct {
	CPUCores int   `json:"cpu_cores"`
	MemoryMB int64 `json:"memory_mb"`
	DiskMB   int64 `json:"disk_mb,omitempty"`
}
|
||||
|
||||
// ── Control Plane ────────────────────────────────────────────────────────────
|
||||
|
||||
// ControlPlane manages cluster state, node registration, and scheduling.
// All exported methods take mu, so the type is safe for concurrent use;
// the unexported load*/save* helpers assume the caller holds the lock.
type ControlPlane struct {
	state    *ClusterState        // this node's membership record; nil until InitCluster/JoinCluster
	nodes    map[string]*NodeInfo // registered nodes keyed by NodeID
	schedule []*ScheduledWorkload // workload→node assignments
	mu       sync.RWMutex
}
|
||||
|
||||
// NewControlPlane creates or loads a control plane instance.
|
||||
func NewControlPlane() *ControlPlane {
|
||||
cp := &ControlPlane{
|
||||
nodes: make(map[string]*NodeInfo),
|
||||
}
|
||||
cp.loadState()
|
||||
cp.loadNodes()
|
||||
cp.loadSchedule()
|
||||
return cp
|
||||
}
|
||||
|
||||
// IsInitialized returns true if the cluster has been initialized.
|
||||
func (cp *ControlPlane) IsInitialized() bool {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
return cp.state != nil && cp.state.ClusterID != ""
|
||||
}
|
||||
|
||||
// State returns a copy of the cluster state.
|
||||
func (cp *ControlPlane) State() *ClusterState {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
if cp.state == nil {
|
||||
return nil
|
||||
}
|
||||
copy := *cp.state
|
||||
return ©
|
||||
}
|
||||
|
||||
// Role returns this node's cluster role.
|
||||
func (cp *ControlPlane) Role() ClusterRole {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
if cp.state == nil {
|
||||
return RoleNone
|
||||
}
|
||||
return cp.state.Role
|
||||
}
|
||||
|
||||
// Nodes returns all registered nodes.
|
||||
func (cp *ControlPlane) Nodes() []*NodeInfo {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
result := make([]*NodeInfo, 0, len(cp.nodes))
|
||||
for _, n := range cp.nodes {
|
||||
copy := *n
|
||||
result = append(result, ©)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetNode returns a node by ID or name.
|
||||
func (cp *ControlPlane) GetNode(idOrName string) *NodeInfo {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
if n, ok := cp.nodes[idOrName]; ok {
|
||||
copy := *n
|
||||
return ©
|
||||
}
|
||||
// Try by name
|
||||
for _, n := range cp.nodes {
|
||||
if n.Name == idOrName {
|
||||
copy := *n
|
||||
return ©
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Schedule returns the current workload schedule.
|
||||
func (cp *ControlPlane) Schedule() []*ScheduledWorkload {
|
||||
cp.mu.RLock()
|
||||
defer cp.mu.RUnlock()
|
||||
result := make([]*ScheduledWorkload, len(cp.schedule))
|
||||
for i, sw := range cp.schedule {
|
||||
copy := *sw
|
||||
result[i] = ©
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ── Init ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// InitCluster initializes this node as the cluster control plane.
|
||||
func (cp *ControlPlane) InitCluster(clusterID, nodeName, meshIP string, apiPort int) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state != nil && cp.state.ClusterID != "" {
|
||||
return fmt.Errorf("already part of cluster %q", cp.state.ClusterID)
|
||||
}
|
||||
|
||||
if apiPort == 0 {
|
||||
apiPort = DefaultAPIPort
|
||||
}
|
||||
|
||||
cp.state = &ClusterState{
|
||||
ClusterID: clusterID,
|
||||
Role: RoleControl,
|
||||
NodeID: clusterID + "-control",
|
||||
NodeName: nodeName,
|
||||
ControlURL: fmt.Sprintf("https://%s:%d", meshIP, apiPort),
|
||||
APIPort: apiPort,
|
||||
JoinedAt: time.Now().UTC(),
|
||||
HeartbeatInterval: DefaultHeartbeatInterval,
|
||||
}
|
||||
|
||||
// Register self as a node
|
||||
cp.nodes[cp.state.NodeID] = &NodeInfo{
|
||||
NodeID: cp.state.NodeID,
|
||||
Name: nodeName,
|
||||
MeshIP: meshIP,
|
||||
Status: NodeStatusReady,
|
||||
Labels: map[string]string{"role": "control"},
|
||||
LastHeartbeat: time.Now().UTC(),
|
||||
JoinedAt: time.Now().UTC(),
|
||||
}
|
||||
|
||||
if err := cp.saveState(); err != nil {
|
||||
return err
|
||||
}
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// ── Join ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// JoinCluster registers this node as a worker in an existing cluster.
|
||||
func (cp *ControlPlane) JoinCluster(clusterID, controlURL, nodeID, nodeName, meshIP string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state != nil && cp.state.ClusterID != "" {
|
||||
return fmt.Errorf("already part of cluster %q — run 'volt cluster leave' first", cp.state.ClusterID)
|
||||
}
|
||||
|
||||
cp.state = &ClusterState{
|
||||
ClusterID: clusterID,
|
||||
Role: RoleWorker,
|
||||
NodeID: nodeID,
|
||||
NodeName: nodeName,
|
||||
ControlURL: controlURL,
|
||||
JoinedAt: time.Now().UTC(),
|
||||
HeartbeatInterval: DefaultHeartbeatInterval,
|
||||
}
|
||||
|
||||
return cp.saveState()
|
||||
}
|
||||
|
||||
// ── Node Registration ────────────────────────────────────────────────────────
|
||||
|
||||
// RegisterNode adds a new worker node to the cluster (control plane only).
|
||||
func (cp *ControlPlane) RegisterNode(node *NodeInfo) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state == nil || cp.state.Role != RoleControl {
|
||||
return fmt.Errorf("not the control plane — cannot register nodes")
|
||||
}
|
||||
|
||||
node.Status = NodeStatusReady
|
||||
node.JoinedAt = time.Now().UTC()
|
||||
node.LastHeartbeat = time.Now().UTC()
|
||||
cp.nodes[node.NodeID] = node
|
||||
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// DeregisterNode removes a node from the cluster.
|
||||
func (cp *ControlPlane) DeregisterNode(nodeID string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if _, exists := cp.nodes[nodeID]; !exists {
|
||||
return fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
delete(cp.nodes, nodeID)
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// ── Heartbeat ────────────────────────────────────────────────────────────────
|
||||
|
||||
// ProcessHeartbeat updates a node's health status.
|
||||
func (cp *ControlPlane) ProcessHeartbeat(nodeID string, resources NodeResources) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
node, exists := cp.nodes[nodeID]
|
||||
if !exists {
|
||||
return fmt.Errorf("node %q not registered", nodeID)
|
||||
}
|
||||
|
||||
node.LastHeartbeat = time.Now().UTC()
|
||||
node.MissedBeats = 0
|
||||
node.Resources = resources
|
||||
if node.Status == NodeStatusNotReady {
|
||||
node.Status = NodeStatusReady
|
||||
}
|
||||
|
||||
return cp.saveNodes()
|
||||
}
|
||||
|
||||
// CheckHealth evaluates all nodes and marks those with missed heartbeats.
|
||||
func (cp *ControlPlane) CheckHealth() []string {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
var unhealthy []string
|
||||
threshold := time.Duration(DefaultFailureThreshold) * DefaultHeartbeatInterval
|
||||
|
||||
for _, node := range cp.nodes {
|
||||
if node.Status == NodeStatusRemoved || node.Status == NodeStatusDraining {
|
||||
continue
|
||||
}
|
||||
if time.Since(node.LastHeartbeat) > threshold {
|
||||
node.MissedBeats++
|
||||
if node.MissedBeats >= DefaultFailureThreshold {
|
||||
node.Status = NodeStatusNotReady
|
||||
unhealthy = append(unhealthy, node.NodeID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cp.saveNodes()
|
||||
return unhealthy
|
||||
}
|
||||
|
||||
// ── Drain ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// DrainNode marks a node for draining (no new workloads, existing ones rescheduled).
|
||||
func (cp *ControlPlane) DrainNode(nodeID string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
node, exists := cp.nodes[nodeID]
|
||||
if !exists {
|
||||
return fmt.Errorf("node %q not found", nodeID)
|
||||
}
|
||||
|
||||
node.Status = NodeStatusDraining
|
||||
|
||||
// Find workloads on this node and mark for rescheduling
|
||||
for _, sw := range cp.schedule {
|
||||
if sw.NodeID == nodeID && sw.Status == "running" {
|
||||
sw.Status = "pending" // will be rescheduled
|
||||
sw.NodeID = ""
|
||||
sw.NodeName = ""
|
||||
}
|
||||
}
|
||||
|
||||
cp.saveNodes()
|
||||
return cp.saveSchedule()
|
||||
}
|
||||
|
||||
// ── Leave ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// LeaveCluster removes this node from the cluster.
|
||||
func (cp *ControlPlane) LeaveCluster() error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state == nil {
|
||||
return fmt.Errorf("not part of any cluster")
|
||||
}
|
||||
|
||||
// If control plane, clean up
|
||||
if cp.state.Role == RoleControl {
|
||||
cp.nodes = make(map[string]*NodeInfo)
|
||||
cp.schedule = nil
|
||||
os.Remove(NodesStateFile)
|
||||
os.Remove(ScheduleStateFile)
|
||||
}
|
||||
|
||||
cp.state = nil
|
||||
os.Remove(ClusterStateFile)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Scheduling ───────────────────────────────────────────────────────────────
|
||||
|
||||
// ScheduleWorkload assigns a workload to a node based on resource availability
|
||||
// and label selectors.
|
||||
func (cp *ControlPlane) ScheduleWorkload(workload *ScheduledWorkload, nodeSelector map[string]string) error {
|
||||
cp.mu.Lock()
|
||||
defer cp.mu.Unlock()
|
||||
|
||||
if cp.state == nil || cp.state.Role != RoleControl {
|
||||
return fmt.Errorf("not the control plane — cannot schedule workloads")
|
||||
}
|
||||
|
||||
// Find best node
|
||||
bestNode := cp.findBestNode(workload.Resources, nodeSelector)
|
||||
if bestNode == nil {
|
||||
return fmt.Errorf("no suitable node found for workload %q (required: %dMB RAM, %d CPU cores)",
|
||||
workload.WorkloadID, workload.Resources.MemoryMB, workload.Resources.CPUCores)
|
||||
}
|
||||
|
||||
workload.NodeID = bestNode.NodeID
|
||||
workload.NodeName = bestNode.Name
|
||||
workload.Status = "pending"
|
||||
workload.ScheduledAt = time.Now().UTC()
|
||||
|
||||
cp.schedule = append(cp.schedule, workload)
|
||||
|
||||
return cp.saveSchedule()
|
||||
}
|
||||
|
||||
// findBestNode selects the best available node for a workload based on
|
||||
// resource availability and label matching. Uses a simple "least loaded" strategy.
|
||||
func (cp *ControlPlane) findBestNode(required WorkloadResources, selector map[string]string) *NodeInfo {
|
||||
var best *NodeInfo
|
||||
var bestScore int64 = -1
|
||||
|
||||
for _, node := range cp.nodes {
|
||||
// Skip unhealthy/draining nodes
|
||||
if node.Status != NodeStatusReady {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check label selector
|
||||
if !matchLabels(node.Labels, selector) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check resource availability
|
||||
availMem := node.Resources.MemoryTotalMB - node.Resources.MemoryUsedMB
|
||||
if required.MemoryMB > 0 && availMem < required.MemoryMB {
|
||||
continue
|
||||
}
|
||||
|
||||
// Score: prefer nodes with more available resources (simple bin-packing)
|
||||
score := availMem
|
||||
if best == nil || score > bestScore {
|
||||
best = node
|
||||
bestScore = score
|
||||
}
|
||||
}
|
||||
|
||||
return best
|
||||
}
|
||||
|
||||
// matchLabels reports whether every key/value pair in selector is present
// with the same value in nodeLabels. An empty (or nil) selector matches
// every node; a nil nodeLabels map matches only an empty selector.
func matchLabels(nodeLabels, selector map[string]string) bool {
	for key, want := range selector {
		if got, ok := nodeLabels[key]; !ok || got != want {
			return false
		}
	}
	return true
}
|
||||
|
||||
// ── Persistence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (cp *ControlPlane) loadState() {
|
||||
data, err := os.ReadFile(ClusterStateFile)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
var state ClusterState
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return
|
||||
}
|
||||
cp.state = &state
|
||||
}
|
||||
|
||||
func (cp *ControlPlane) saveState() error {
|
||||
os.MkdirAll(ClusterStateDir, 0755)
|
||||
data, err := json.MarshalIndent(cp.state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(ClusterStateFile, data, 0644)
|
||||
}
|
||||
|
||||
func (cp *ControlPlane) loadNodes() {
|
||||
data, err := os.ReadFile(NodesStateFile)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
var nodes map[string]*NodeInfo
|
||||
if err := json.Unmarshal(data, &nodes); err != nil {
|
||||
return
|
||||
}
|
||||
cp.nodes = nodes
|
||||
}
|
||||
|
||||
func (cp *ControlPlane) saveNodes() error {
|
||||
os.MkdirAll(ClusterStateDir, 0755)
|
||||
data, err := json.MarshalIndent(cp.nodes, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(NodesStateFile, data, 0644)
|
||||
}
|
||||
|
||||
func (cp *ControlPlane) loadSchedule() {
|
||||
data, err := os.ReadFile(ScheduleStateFile)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
var schedule []*ScheduledWorkload
|
||||
if err := json.Unmarshal(data, &schedule); err != nil {
|
||||
return
|
||||
}
|
||||
cp.schedule = schedule
|
||||
}
|
||||
|
||||
func (cp *ControlPlane) saveSchedule() error {
|
||||
os.MkdirAll(ClusterStateDir, 0755)
|
||||
data, err := json.MarshalIndent(cp.schedule, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(ScheduleStateFile, data, 0644)
|
||||
}
|
||||
Reference in New Issue
Block a user