MetaCell · filippomc · May 6, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/applications/nfsserver/Dockerfile b/applications/nfsserver/Dockerfile
@@ -4,22 +4,32 @@ FROM golang:1.26.2 AS provisioner
 WORKDIR /usr/src/app
 COPY nfs-subdir-external-provisioner ./
 
-RUN make
+RUN go mod tidy && make
+
+# compile nfsvol (mount manager / watchdog)
+FROM golang:1.26.2 AS nfsvol
+
+WORKDIR /usr/src/nfsvol
+COPY nfsvol/ ./
+RUN go mod tidy && go build -o /usr/local/bin/nfsvol .
 
 #
-FROM k8s.gcr.io/volume-nfs:0.8
+# Replaces the old k8s.gcr.io/volume-nfs:0.8 base image. The Rocky Linux
+# minimal image is not assumed to ship any NFS tooling, so nfs-utils and
+# rpcbind are installed below, together with e2fsprogs for mkfs.ext4.
+FROM rockylinux/rockylinux:10.1-minimal
 
-RUN yum install -y \
-    e2fsprogs \
-    nfs-utils \
-    rpcbind \
-    && yum clean all \
-    && rm -rf /var/cache/yum
+RUN microdnf install -y \
+  e2fsprogs \
+  rpcbind \
+  nfs-utils \
+  && microdnf clean all
 
 COPY --from=provisioner /usr/src/app/bin/nfs-subdir-external-provisioner /usr/local/bin/nfs-subdir-external-provisioner
+COPY --from=nfsvol /usr/local/bin/nfsvol /usr/local/bin/nfsvol
 
 COPY resources/*.sh /usr/local/bin/
-RUN chmod +x /usr/local/bin/*.sh
+RUN chmod +x /usr/local/bin/*.sh /usr/local/bin/nfsvol
 
 RUN echo "/exports *(rw,fsid=0,insecure,no_subtree_check,no_root_squash,crossmnt)" > /etc/exports
 

diff --git a/applications/nfsserver/README-PROD.md b/applications/nfsserver/README-PROD.md
@@ -0,0 +1,106 @@
+# NFS Server — Production Caveats
+
+This NFS server is a pragmatic, single-replica design that trades some
+durability for operational simplicity. The defaults favour **client-pod
+availability over write durability**: during a server outage, applications
+see `EIO` and are expected to retry, rather than hanging indefinitely.
+
+Read this before relying on it for production workloads.
+
+## Architecture
+
+- One NFS server pod (single-replica Deployment, `strategy: Recreate`).
+- Backing storage: one RWO PVC (`nfs-exports`) holding per-PVC ext4 loopback
+  files at `/exports/<pv>.quota`, mounted at `/exports/<pv>`.
+- Clients mount via a stable ClusterIP Service + DNS FQDN, with
+  `soft,nolock,local_lock=all,nfsvers=3`.
+- Per-PVC exports are written to `/etc/exports.d/` with a deterministic
+  SHA-256-derived fsid, so client file handles survive server pod restarts.
+
+## Outage behaviour
+
+| Trigger | Outage duration (for active client pods) | Client-visible error |
+|---|---|---|
+| `kubectl delete pod nfs-server-...` | seconds (kube reschedules immediately) | brief `EIO`, then resumes transparently (fsid stable) |
+| `kubectl rollout restart` | seconds — Recreate waits for old pod first | brief `EIO`, then resumes |
+| Graceful node drain of NFS node | ~30–60 s (PVC detach + reattach) | brief `EIO`, then resumes |
+| **Ungraceful node loss** (node crash, network partition) | **up to ~6 minutes** (force-detach timeout) | `EIO` repeatedly until pod reattaches on another node |
+| Loopback goes stale on same host (rare) | up to 30 s (watchdog period) | transparent, clients do not notice |
+
+The ~6 minute ungraceful-loss window is inherent to RWO storage with
+cloud-provider CSI drivers and cannot be eliminated without switching to a
+different storage strategy (see "Not suitable for" below).
+
+## Application requirements
+
+Applications that use PVCs from this provisioner **must**:
+
+- Tolerate `EIO` on reads and writes. Retry with backoff. The current config
+  uses `soft` mount semantics — I/O returns an error rather than hanging.
+- Not rely on file locking (`flock`, `fcntl` byte-range locks) across pods.
+  `nolock,local_lock=all` keeps every lock local to the client. Shared-writer
+  workloads (e.g. SQLite, cooperating text editors) will race silently.
+- Not assume write-through durability during an outage. In-flight writes
+  that return `EIO` may or may not have reached disk.
+
+## What these caveats rule out
+
+This backend is **not suitable for**:
+
+- Databases that require fsync durability semantics (use a proper database
+  PVC, not NFS).
+- Workloads with multiple writers to the same file across nodes.
+- Strict HA requirements (no failover during ungraceful node loss).
+- Large cross-region deployments (single RWO PVC is region-local).
+
+It **is suitable for**:
+
+- Shared read-only / append-only data between pods (logs, content).
+- Cache / scratch volumes where a brief `EIO` is retryable.
+- Shared artifact storage between producer and consumer pods.
+
+## Operator responsibilities
+
+### Backup
+
+`nfs-exports` is a single cloud PVC with no built-in backup. If lost, every
+NFS-backed PVC in the cluster is lost. Operators must:
+
+- Schedule snapshots of `nfs-exports` (cloud-provider-specific).
+- Store snapshots in a separate region/account for real DR.
+
+Neither `nfsvol` nor the provisioner automates this. This is intentional:
+DR policy is a per-deployment decision.
+
+### Graceful node migration
+
+To move the NFS server pod to a different node:
+
+1. Cordon any nodes the pod must not be rescheduled onto, as needed.
+2. Either cordon+drain the source node (standard flow), or
+3. `kubectl delete pod nfs-server-...` — the pod terminates, PVC detaches,
+   and a new pod schedules on any eligible node.
+
+With `podDisruptionBudget.enabled: true` in `values.yaml`, `kubectl drain`
+is blocked by the PDB, so option 2 above is unavailable. This is
+intentional: it forces the operator to use the explicit delete-pod flow, so
+automated tooling cannot evict the NFS server unnoticed.
+
+### Monitoring
+
+- `/healthz` on port 8080 exposes the watchdog health (consumed by the
+  startup and liveness probes); the readiness probe uses `/ready`.
+- Watch for `watchdog: remount failed` log lines — indicates the loopback
+  layer is inconsistent with `/exports/*.quota`.
+- Watch for `mount-all: N of M mounts failed` at startup.
+
+## Not addressed by this iteration
+
+- Multi-region / cross-cluster replication.
+- Automated snapshot scheduling.
+- Active/passive HA (would need shared block storage + fencing, or a move
+  to a managed NFS service — EFS, Filestore, Azure Files).
+
+If any of those become requirements, switch to a managed NFS or a proper
+CSI driver. This backend was designed for small, single-region, best-effort
+shared storage.
diff --git a/applications/nfsserver/deploy/templates/nfs-server.yaml b/applications/nfsserver/deploy/templates/nfs-server.yaml
@@ -19,6 +19,11 @@ metadata:
     app: nfs-server
     usesvolume: nfs-exports
 spec:
+  # Recreate strategy: with an RWO PVC, a surge pod can never attach while the
+  # old pod holds nfs-exports, so RollingUpdate would deadlock. Single-replica
+  # by design.
+  strategy:
+    type: Recreate
   selector:
     matchLabels:
       app: nfs-server
@@ -29,6 +34,7 @@ spec:
         app: nfs-server
         usesvolume: nfs-exports
     spec:
+      serviceAccountName: {{ template "nfs-subdir-external-provisioner.serviceAccountName" . }}
       affinity:
         podAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
@@ -53,13 +59,39 @@ spec:
           value: {{ .Values.apps.nfsserver.nfs.path }}
         - name: PROVISIONER_NAME
           value: {{ printf "%s-nfs-provisioner" .Values.namespace }}
+        # Single-replica Deployment backed by an RWO PVC — only one pod can
+        # ever mount /exports, so leader election is unnecessary and would
+        # otherwise need v1 Endpoints RBAC (deprecated in k8s 1.33+).
+        - name: ENABLE_LEADER_ELECTION
+          value: "false"
         ports:
         - name: nfs
           containerPort: 2049
         - name: mountd
           containerPort: 20048
         - name: rpcbind
           containerPort: 111
+        - name: healthz
+          containerPort: 8080
+        startupProbe:
+          httpGet:
+            path: /healthz
+            port: 8080
+          initialDelaySeconds: 15
+          periodSeconds: 30
+          failureThreshold: 240
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          periodSeconds: 15
+          failureThreshold: 3
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8080
+          periodSeconds: 30
+          failureThreshold: 5
         securityContext:
           privileged: true
         volumeMounts:

diff --git a/applications/nfsserver/deploy/templates/pdb.yaml b/applications/nfsserver/deploy/templates/pdb.yaml
@@ -0,0 +1,21 @@
+{{- if .Values.apps.nfsserver.podDisruptionBudget.enabled }}
+# A PDB with minAvailable: 1 on a single-replica RWO-PVC-backed Deployment
+# blocks kubectl drain / automated node-group rotations — by design. It is a
+# guardrail forcing the operator to consciously migrate the NFS server pod
+# (terminate → wait for PVC detach → new pod attaches on target node) rather
+# than letting tooling evict it blindly.
+#
+# This does NOT help with ungraceful node loss (which goes via the ~6-minute
+# force-detach path, not the voluntary-disruption path).
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: nfs-server
+  labels:
+    app: nfs-server
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app: nfs-server
+{{- end }}
diff --git a/applications/nfsserver/deploy/values.yaml b/applications/nfsserver/deploy/values.yaml
@@ -4,7 +4,7 @@ harness:
     auto: false
   deployment:
     auto: false
-    image: gcr.io/metacellllc/cloudharness/nfsserver:1.0
+    # image: gcr.io/metacellllc/cloudharness/nfsserver:1.0
 
 
 # nfs server pvc disk size (/exports)
@@ -32,8 +32,15 @@ nfs:
   useDNS: true
   path: /exports     # /exports
   mountOptions:
+  # `soft` overrides the Linux client default (`hard`). Without it, applications
+  # block in uninterruptible sleep when the NFS server is unreachable. With
+  # `soft`, I/O returns EIO after a few retries — applications stay alive and
+  # can retry. This matches the availability-over-integrity tradeoff.
+  - soft
   - nolock
   - local_lock=all
+  - timeo=50         # 5 s initial timeout (unit is 0.1 s); grows on each retry
+  - retrans=2        # 2 retries → ~15 s before EIO during server downtime
   volumeName: nfs-subdir-external-provisioner-root
   # Reclaim policy for the main nfs volume
   reclaimPolicy: Retain
@@ -84,6 +91,14 @@ leaderElection:
   # When set to false leader election will be disabled
   enabled: true
 
+# PodDisruptionBudget guardrail against accidental evictions.
+# Off by default — enabling it blocks `kubectl drain` of the NFS server's
+# node until the operator terminates the pod manually (RWO PVC means there
+# is no surge pod that could satisfy minAvailable=1 while the old one runs).
+# Does NOT help with ungraceful node loss (force-detach path).
+podDisruptionBudget:
+  enabled: false
+
 ## For RBAC support:
 rbac:
   # Specifies whether RBAC resources should be created