Skip to content

Commit 5446ffc

Browse files
committed
Wire spec-driven replicas and session storage validation for MCPServer
- resolveDeploymentReplicas: nil-passthrough preserves HPA compatibility, stdio cap enforced at 1 as defense-in-depth
- Add terminationGracePeriodSeconds (30s default) to proxy runner pod spec
- validateStdioReplicaCap: Warning condition when stdio + spec.replicas > 1
- validateSessionStorageForReplicas: Warning condition when replicas > 1 without Redis session storage configured

Closes #4217
1 parent f4931c0 commit 5446ffc

6 files changed

Lines changed: 422 additions & 71 deletions

File tree

cmd/thv-operator/api/v1alpha1/mcpserver_types.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,28 @@ const (
7979
ConditionReasonExternalAuthConfigMultiUpstream = "MultiUpstreamNotSupported"
8080
)
8181

82+
// ConditionStdioReplicaCapped indicates spec.replicas was capped at 1 for stdio transport.
const ConditionStdioReplicaCapped = "StdioReplicaCapped"

// ConditionSessionStorageWarning indicates replicas > 1 but no Redis session storage is configured.
const ConditionSessionStorageWarning = "SessionStorageWarning"

// Reasons for the StdioReplicaCapped and SessionStorageWarning conditions.
const (
	// ConditionReasonStdioReplicaCapped is set when spec.replicas > 1 for a stdio transport.
	ConditionReasonStdioReplicaCapped = "StdioTransportCapAt1"
	// ConditionReasonStdioReplicaCapNotActive is set when the stdio replica cap does not apply.
	ConditionReasonStdioReplicaCapNotActive = "StdioReplicaCapNotActive"

	// ConditionReasonSessionStorageMissing is set when replicas > 1 and no Redis session storage is configured.
	ConditionReasonSessionStorageMissing = "SessionStorageMissingForReplicas"
	// ConditionReasonSessionStorageConfigured is set when replicas > 1 and Redis session storage is configured.
	ConditionReasonSessionStorageConfigured = "SessionStorageConfigured"
	// ConditionReasonSessionStorageNotApplicable is set when replicas is nil or <= 1 and the warning is not active.
	ConditionReasonSessionStorageNotApplicable = "SessionStorageWarningNotApplicable"
)
103+
82104
// MCPServerSpec defines the desired state of MCPServer
83105
type MCPServerSpec struct {
84106
// Image is the container image for the MCP server

cmd/thv-operator/api/v1alpha1/virtualmcpserver_types_test.go

Lines changed: 0 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -535,58 +535,3 @@ func TestVirtualMCPServerSpecScalingFieldsJSONRoundtrip(t *testing.T) {
535535
})
536536
}
537537
}
538-
539-
func TestVirtualMCPServerSpecScalingFieldsJSONRoundtrip(t *testing.T) {
540-
t.Parallel()
541-
542-
replicas := int32(2)
543-
544-
tests := []struct {
545-
name string
546-
spec VirtualMCPServerSpec
547-
wantKeys []string
548-
wantAbsent []string
549-
}{
550-
{
551-
name: "nil replicas are omitted",
552-
spec: VirtualMCPServerSpec{
553-
IncomingAuth: &IncomingAuthConfig{Type: "anonymous"},
554-
},
555-
wantAbsent: []string{`"replicas"`, `"sessionStorage"`},
556-
},
557-
{
558-
name: "set replicas are serialized",
559-
spec: VirtualMCPServerSpec{
560-
IncomingAuth: &IncomingAuthConfig{Type: "anonymous"},
561-
Replicas: &replicas,
562-
},
563-
wantKeys: []string{`"replicas":2`},
564-
},
565-
{
566-
name: "sessionStorage is serialized when set",
567-
spec: VirtualMCPServerSpec{
568-
IncomingAuth: &IncomingAuthConfig{Type: "anonymous"},
569-
SessionStorage: &SessionStorageConfig{
570-
Provider: "redis",
571-
Address: "redis:6379",
572-
},
573-
},
574-
wantKeys: []string{`"sessionStorage"`, `"provider":"redis"`},
575-
},
576-
}
577-
578-
for _, tc := range tests {
579-
t.Run(tc.name, func(t *testing.T) {
580-
t.Parallel()
581-
b, err := json.Marshal(tc.spec)
582-
require.NoError(t, err)
583-
out := string(b)
584-
for _, key := range tc.wantKeys {
585-
assert.Contains(t, out, key)
586-
}
587-
for _, key := range tc.wantAbsent {
588-
assert.NotContains(t, out, key)
589-
}
590-
})
591-
}
592-
}

cmd/thv-operator/controllers/mcpserver_controller.go

Lines changed: 111 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,10 @@ const (
129129
authzLabelValueInline = "inline"
130130
)
131131

132+
const (
	// defaultTerminationGracePeriodSeconds is the termination grace period applied to proxy runner pods.
	defaultTerminationGracePeriodSeconds = int64(30)
	// stdioTransport is the transport value that requires exactly one replica.
	stdioTransport = "stdio"
)
135+
132136
// detectPlatform detects the Kubernetes platform type (Kubernetes vs OpenShift)
133137
// It uses the shared platform detector to ensure detection is only performed once and cached
134138
func (r *MCPServerReconciler) detectPlatform(ctx context.Context) (kubernetes.Platform, error) {
@@ -189,6 +193,10 @@ func (r *MCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
189193
// Validate CABundleRef if specified
190194
r.validateCABundleRef(ctx, mcpServer)
191195

196+
// Validate stdio replica cap and session storage requirements
197+
r.validateStdioReplicaCap(ctx, mcpServer)
198+
r.validateSessionStorageForReplicas(ctx, mcpServer)
199+
192200
// Validate PodTemplateSpec early - before other validations
193201
// This ensures we fail fast if the spec is invalid
194202
if !r.validateAndUpdatePodTemplateStatus(ctx, mcpServer) {
@@ -392,7 +400,7 @@ func (r *MCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
392400
// Enforce stdio transport replica cap: stdio requires 1:1 proxy-to-backend
393401
// connections and cannot scale beyond 1. Other transports are hands-off
394402
// to allow HPAs, KEDA, or manual kubectl scale to manage replicas freely.
395-
if mcpServer.Spec.Transport == "stdio" &&
403+
if mcpServer.Spec.Transport == stdioTransport &&
396404
deployment.Spec.Replicas != nil && *deployment.Spec.Replicas > 1 {
397405
deployment.Spec.Replicas = int32Ptr(1)
398406
err = r.Update(ctx, deployment)
@@ -450,13 +458,18 @@ func (r *MCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
450458

451459
// Check if the deployment spec changed
452460
if r.deploymentNeedsUpdate(ctx, deployment, mcpServer, runConfigChecksum) {
453-
// Update template and metadata only — preserve Spec.Replicas so that
454-
// HPAs, KEDA, and manual scaling are not overwritten by the controller.
461+
// Update template and metadata. Also sync Spec.Replicas when spec.replicas is
462+
// explicitly set — this makes the operator authoritative for spec-driven scaling.
463+
// When spec.replicas is nil, preserve the live count so HPAs, KEDA, and manual
464+
// kubectl scale remain in control.
455465
newDeployment := r.deploymentForMCPServer(ctx, mcpServer, runConfigChecksum)
456466
deployment.Spec.Template = newDeployment.Spec.Template
457467
deployment.Spec.Selector = newDeployment.Spec.Selector
458468
deployment.Labels = newDeployment.Labels
459469
deployment.Annotations = ctrlutil.MergeAnnotations(newDeployment.Annotations, deployment.Annotations)
470+
if newDeployment.Spec.Replicas != nil {
471+
deployment.Spec.Replicas = newDeployment.Spec.Replicas
472+
}
460473
err = r.Update(ctx, deployment)
461474
if err != nil {
462475
ctxLogger.Error(err, "Failed to update Deployment",
@@ -931,7 +944,6 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
931944
ctx context.Context, m *mcpv1alpha1.MCPServer, runConfigChecksum string,
932945
) *appsv1.Deployment {
933946
ls := labelsForMCPServer(m.Name)
934-
replicas := int32(1)
935947

936948
// Prepare container args
937949
args := []string{"run"}
@@ -1196,7 +1208,7 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
11961208
Annotations: deploymentAnnotations,
11971209
},
11981210
Spec: appsv1.DeploymentSpec{
1199-
Replicas: &replicas,
1211+
Replicas: resolveDeploymentReplicas(m.Spec.Transport, m.Spec.Replicas),
12001212
Selector: &metav1.LabelSelector{
12011213
MatchLabels: ls, // Keep original labels for selector
12021214
},
@@ -1206,8 +1218,9 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
12061218
Annotations: deploymentTemplateAnnotations,
12071219
},
12081220
Spec: corev1.PodSpec{
1209-
ServiceAccountName: ctrlutil.ProxyRunnerServiceAccountName(m.Name),
1210-
ImagePullSecrets: imagePullSecrets,
1221+
ServiceAccountName: ctrlutil.ProxyRunnerServiceAccountName(m.Name),
1222+
ImagePullSecrets: imagePullSecrets,
1223+
TerminationGracePeriodSeconds: int64Ptr(defaultTerminationGracePeriodSeconds),
12111224
Containers: []corev1.Container{{
12121225
Image: getToolhiveRunnerImage(),
12131226
Name: "toolhive",
@@ -1694,6 +1707,15 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate(
16941707
return true
16951708
}
16961709

1710+
// Check if spec.replicas has changed. Only compare when spec.replicas is non-nil;
1711+
// nil means hands-off mode (HPA/KEDA manages replicas) and the live count is authoritative.
1712+
expectedReplicas := resolveDeploymentReplicas(mcpServer.Spec.Transport, mcpServer.Spec.Replicas)
1713+
if expectedReplicas != nil {
1714+
if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != *expectedReplicas {
1715+
return true
1716+
}
1717+
}
1718+
16971719
return false
16981720
}
16991721

@@ -1879,6 +1901,88 @@ func int32Ptr(i int32) *int32 {
18791901
return &i
18801902
}
18811903

1904+
// int64Ptr returns a pointer to a copy of the given int64 value.
func int64Ptr(i int64) *int64 {
	v := i
	return &v
}
1908+
1909+
// resolveDeploymentReplicas returns the replica count to set on Deployment.Spec.Replicas.
1910+
// Returns nil when spec.replicas is nil (hands-off mode for HPA/KEDA).
1911+
// Enforces stdio cap at 1 as defense-in-depth (reconciler also enforces this via status condition).
1912+
func resolveDeploymentReplicas(mcpTransport string, specReplicas *int32) *int32 {
1913+
if specReplicas == nil {
1914+
return nil
1915+
}
1916+
if mcpTransport == stdioTransport && *specReplicas > 1 {
1917+
return int32Ptr(1)
1918+
}
1919+
return specReplicas
1920+
}
1921+
1922+
// setStdioReplicaCappedCondition sets the StdioReplicaCapped status condition
1923+
func setStdioReplicaCappedCondition(mcpServer *mcpv1alpha1.MCPServer, status metav1.ConditionStatus, reason, message string) {
1924+
meta.SetStatusCondition(&mcpServer.Status.Conditions, metav1.Condition{
1925+
Type: mcpv1alpha1.ConditionStdioReplicaCapped,
1926+
Status: status,
1927+
Reason: reason,
1928+
Message: message,
1929+
ObservedGeneration: mcpServer.Generation,
1930+
})
1931+
}
1932+
1933+
// validateStdioReplicaCap checks if spec.replicas > 1 for stdio transport and sets a warning condition.
1934+
// The deployment builder enforces the cap at 1 as defense-in-depth.
1935+
// Clears the condition when transport or replica count no longer violates the cap.
1936+
func (r *MCPServerReconciler) validateStdioReplicaCap(ctx context.Context, mcpServer *mcpv1alpha1.MCPServer) {
1937+
if mcpServer.Spec.Transport == stdioTransport && mcpServer.Spec.Replicas != nil && *mcpServer.Spec.Replicas > 1 {
1938+
setStdioReplicaCappedCondition(mcpServer, metav1.ConditionTrue,
1939+
mcpv1alpha1.ConditionReasonStdioReplicaCapped,
1940+
"stdio transport requires exactly 1 replica; deployment will use 1 regardless of spec.replicas")
1941+
} else {
1942+
setStdioReplicaCappedCondition(mcpServer, metav1.ConditionFalse,
1943+
mcpv1alpha1.ConditionReasonStdioReplicaCapNotActive,
1944+
"stdio replica cap is not active")
1945+
}
1946+
if err := r.Status().Update(ctx, mcpServer); err != nil {
1947+
log.FromContext(ctx).Error(err, "Failed to update MCPServer status after stdio replica cap validation")
1948+
}
1949+
}
1950+
1951+
// setSessionStorageCondition sets the SessionStorageWarning status condition
1952+
func setSessionStorageCondition(mcpServer *mcpv1alpha1.MCPServer, status metav1.ConditionStatus, reason, message string) {
1953+
meta.SetStatusCondition(&mcpServer.Status.Conditions, metav1.Condition{
1954+
Type: mcpv1alpha1.ConditionSessionStorageWarning,
1955+
Status: status,
1956+
Reason: reason,
1957+
Message: message,
1958+
ObservedGeneration: mcpServer.Generation,
1959+
})
1960+
}
1961+
1962+
// validateSessionStorageForReplicas emits a Warning condition when replicas > 1 but session storage
1963+
// is not configured with a Redis backend. The deployment still proceeds; this is advisory only.
1964+
// Clears the condition when replicas drop back to nil or <= 1.
1965+
func (r *MCPServerReconciler) validateSessionStorageForReplicas(ctx context.Context, mcpServer *mcpv1alpha1.MCPServer) {
1966+
if mcpServer.Spec.Replicas != nil && *mcpServer.Spec.Replicas > 1 {
1967+
if mcpServer.Spec.SessionStorage == nil || mcpServer.Spec.SessionStorage.Provider != "redis" {
1968+
setSessionStorageCondition(mcpServer, metav1.ConditionTrue,
1969+
mcpv1alpha1.ConditionReasonSessionStorageMissing,
1970+
"replicas > 1 but sessionStorage.provider is not redis; sessions are not shared across replicas")
1971+
} else {
1972+
setSessionStorageCondition(mcpServer, metav1.ConditionFalse,
1973+
mcpv1alpha1.ConditionReasonSessionStorageConfigured,
1974+
"Redis session storage is configured")
1975+
}
1976+
} else {
1977+
setSessionStorageCondition(mcpServer, metav1.ConditionFalse,
1978+
mcpv1alpha1.ConditionReasonSessionStorageNotApplicable,
1979+
"session storage warning is not active")
1980+
}
1981+
if err := r.Status().Update(ctx, mcpServer); err != nil {
1982+
log.FromContext(ctx).Error(err, "Failed to update MCPServer status after session storage validation")
1983+
}
1984+
}
1985+
18821986
// SetupWithManager sets up the controller with the Manager.
18831987
func (r *MCPServerReconciler) SetupWithManager(mgr ctrl.Manager) error {
18841988
// Create a handler that maps MCPExternalAuthConfig changes to MCPServer reconciliation requests

cmd/thv-operator/controllers/mcpserver_pod_template_test.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,3 @@ func TestProxyRunnerStructuredLogsEnvVar(t *testing.T) {
404404
func boolPtr(b bool) *bool {
405405
return &b
406406
}
407-
408-
func int64Ptr(i int64) *int64 {
409-
return &i
410-
}

0 commit comments

Comments
 (0)