Skip to content

Commit 291e423

Browse files
committed
controller: vm state transition refactoring
1 parent 5596052 commit 291e423

2 files changed

Lines changed: 123 additions & 9 deletions

File tree

controller/health.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,18 @@ func (c *Controller) CheckComponentHealth(component *v1.Component) (bool, error)
8080

8181
// CheckVMHealth performs ping/SSH health check for a VM
8282
func (c *Controller) CheckVMHealth(vm *v1.VirtualMachine) (bool, error) {
83-
// If VM is marked Removing, do not update state during health checks.
84-
if vm.Status.ObservedState == "Removing" {
83+
// State guard: only run health checks for states that require them.
84+
switch vm.Status.ObservedState {
85+
case "Removing", "", "Created", "Starting", "Error", "VMNotFound":
86+
// Not ready, in error, or already gone — skip.
8587
return false, nil
88+
case "Running":
89+
// No health checks configured; VM health is determined by CS hypervisor state alone.
90+
return true, nil
8691
}
92+
// States that reach here: Started, IPNotFound, Healthy, Unhealthy — plus any state not matched above (e.g. Stopped).
8793

88-
// Skip if VM not created in CloudStack
94+
// Skip if VM has no CloudStack ID yet.
8995
if vm.CloudStackID == "" {
9096
return false, nil
9197
}
@@ -128,18 +134,24 @@ func (c *Controller) CheckVMHealth(vm *v1.VirtualMachine) (bool, error) {
128134
}
129135
}
130136

131-
// If still no checks defined, consider the VM healthy if it has an IP and is running in CloudStack.
137+
// No health checks configured: mark as Running (healthy by hypervisor state alone).
132138
if len(checks) == 0 {
133-
log.Printf("Health check passed for VM %s (id=%s): no health checks defined, defaulting to healthy", vm.Metadata.Name, vm.CloudStackID)
134-
vm.Status.ObservedState = "Healthy"
139+
log.Printf("VM %s (id=%s): no health checks defined, marking as Running", vm.Metadata.Name, vm.CloudStackID)
140+
vm.Status.ObservedState = "Running"
135141
vm.Status.Ready = true
136142
vm.Status.LastChecked = time.Now()
137143
return true, db.DB.Save(vm).Error
138144
}
139145

140146
if vmIP == "" {
141147
log.Printf("no IP address found for VM %s (id=%s)", vm.Metadata.Name, vm.CloudStackID)
142-
vm.Status.ObservedState = "IPNotFound"
148+
// IPNotFound is transient during startup (Started → IPNotFound).
149+
// Once the VM has been Healthy/Unhealthy a missing IP is a health failure.
150+
if vm.Status.ObservedState == "Started" || vm.Status.ObservedState == "IPNotFound" {
151+
vm.Status.ObservedState = "IPNotFound"
152+
} else {
153+
vm.Status.ObservedState = "Unhealthy"
154+
}
143155
vm.Status.Ready = false
144156
vm.Status.LastChecked = time.Now()
145157
return false, db.DB.Save(vm).Error

controller/reconcile.go

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,18 @@ func (c *Controller) ReconcileComponent(comp *v1.Component) error {
184184
return nil
185185
}
186186

187+
// Advance the state machine for each VM in this component. This is
188+
// required for paths (e.g. waitForComponentHealth) that call
189+
// ReconcileComponent directly without first calling ReconcileVM.
190+
var compVMs []v1.VirtualMachine
191+
if err := db.DB.Where("component = ? AND (observed_state IS NULL OR observed_state <> ?)", comp.Metadata.Name, "Removing").Find(&compVMs).Error; err == nil {
192+
for _, vm := range compVMs {
193+
if err := c.ReconcileVM(&vm); err != nil {
194+
log.Printf("ReconcileComponent: failed to reconcile VM %s: %v", vm.Metadata.Name, err)
195+
}
196+
}
197+
}
198+
187199
// Check component health
188200
healthy, err := c.CheckComponentHealth(comp)
189201
if err != nil {
@@ -220,6 +232,32 @@ func (c *Controller) ReconcileVM(vm *v1.VirtualMachine) error {
220232
return err
221233
}
222234

235+
// VMNotFound: the CS VM is gone — clear the stale ID and recreate.
236+
if vm.Status.ObservedState == "VMNotFound" {
237+
log.Printf("ReconcileVM: VM %s not found in CloudStack, clearing ID and recreating", vm.Metadata.Name)
238+
vm.CloudStackID = ""
239+
vm.Status.ObservedState = "Created"
240+
vm.Status.Ready = false
241+
if err := db.DB.Save(vm).Error; err != nil {
242+
return err
243+
}
244+
}
245+
246+
// Stopped: start the VM in CloudStack.
247+
if vm.Status.ObservedState == "Stopped" {
248+
log.Printf("ReconcileVM: VM %s is Stopped, starting it", vm.Metadata.Name)
249+
sp := c.csClient.VirtualMachine.NewStartVirtualMachineParams(vm.CloudStackID)
250+
if _, err := c.csClient.VirtualMachine.StartVirtualMachine(sp); err != nil {
251+
log.Printf("ReconcileVM: failed to start VM %s: %v", vm.Metadata.Name, err)
252+
return err
253+
}
254+
vm.Status.ObservedState = "Starting"
255+
vm.Status.Ready = false
256+
if err := db.DB.Save(vm).Error; err != nil {
257+
return err
258+
}
259+
}
260+
223261
// Check if VM exists; if not, create it
224262
if vm.CloudStackID == "" {
225263
if id, err := handlers.ApplyVirtualMachineManaged(vm, true); err != nil {
@@ -249,6 +287,21 @@ func (c *Controller) ReconcileVM(vm *v1.VirtualMachine) error {
249287
return err
250288
}
251289

290+
// vmHasHealthChecks returns true if the VM has health checks defined either
291+
// on its own spec or inherited from its owning component.
292+
func vmHasHealthChecks(vm *v1.VirtualMachine) bool {
293+
if len(vm.Spec.HealthChecks) > 0 {
294+
return true
295+
}
296+
if vm.Component != "" {
297+
var comp v1.Component
298+
if db.DB.Where("name = ?", vm.Component).First(&comp).Error == nil {
299+
return len(comp.Spec.HealthChecks) > 0
300+
}
301+
}
302+
return false
303+
}
304+
252305
// populateObservedSpec queries CloudStack for VM details and fills ObservedSpec
253306
func (c *Controller) populateObservedSpec(vm *v1.VirtualMachine) error {
254307
// Use SDK to list by id or name
@@ -264,6 +317,13 @@ func (c *Controller) populateObservedSpec(vm *v1.VirtualMachine) error {
264317
return err
265318
}
266319
if resp == nil || len(resp.VirtualMachines) == 0 {
320+
// VM had a CloudStack ID but is no longer found — mark as VMNotFound.
321+
if vm.CloudStackID != "" && vm.Status.ObservedState != "Removing" {
322+
vm.Status.ObservedState = "VMNotFound"
323+
vm.Status.Ready = false
324+
vm.Status.LastChecked = time.Now()
325+
return db.DB.Save(vm).Error
326+
}
267327
return nil
268328
}
269329

@@ -367,9 +427,51 @@ func (c *Controller) populateObservedSpec(vm *v1.VirtualMachine) error {
367427
}
368428
}
369429

370-
// Record observed state
430+
// State machine: map CloudStack hypervisor state to controller-managed states.
431+
//
432+
// "" / Created ──(CS ID assigned)──► Starting
433+
// Starting ──(CS Running)──► Started (has health checks) | Running (no checks)
434+
// Starting ──(CS Error)──► Error
435+
// Starting ──(CS Stopped)──► Stopped (reconciler will start it)
436+
// Error ──(CS Running)──► Started (has health checks) | Running (no checks) (recovered; handled like Starting)
437+
// Started/IPNotFound/Running/Healthy/Unhealthy ──(CS Stopped)──► Stopped
438+
// Started/IPNotFound/Running/Healthy/Unhealthy ──(CS other non-Running)──► state unchanged
439+
// Removing ──── never overwritten
440+
// VMNotFound ──── never overwritten (handled above)
371441
if v.State != "" {
372-
vm.Status.ObservedState = v.State
442+
current := vm.Status.ObservedState
443+
switch current {
444+
case "Removing", "VMNotFound":
445+
// Terminal / in-progress states — never overwrite.
446+
case "", "Created":
447+
// Initial: CloudStack ID just assigned.
448+
vm.Status.ObservedState = "Starting"
449+
case "Starting", "Error":
450+
switch v.State {
451+
case "Running":
452+
if vmHasHealthChecks(vm) {
453+
vm.Status.ObservedState = "Started"
454+
} else {
455+
vm.Status.ObservedState = "Running"
456+
vm.Status.Ready = true
457+
}
458+
case "Error":
459+
vm.Status.ObservedState = "Error"
460+
vm.Status.Ready = false
461+
case "Stopped":
462+
vm.Status.ObservedState = "Stopped"
463+
vm.Status.Ready = false
464+
// All other CS transient states ("Starting", etc.) stay in current state.
465+
}
466+
case "Started", "IPNotFound", "Running", "Healthy", "Unhealthy":
467+
// Stable / health-check states.
468+
// Only act on Stopped — everything else (transient CS states) leaves
469+
// the controller state untouched so health checks continue normally.
470+
if v.State == "Stopped" {
471+
vm.Status.ObservedState = "Stopped"
472+
vm.Status.Ready = false
473+
}
474+
}
373475
}
374476

375477
vm.ObservedSpec = obs

0 commit comments

Comments
 (0)