diff --git a/agent/internal/agent/handlers.go b/agent/internal/agent/handlers.go index 5423705..9cba7a6 100644 --- a/agent/internal/agent/handlers.go +++ b/agent/internal/agent/handlers.go @@ -138,7 +138,7 @@ func (a *Agent) ProcessBuild(item agenthttp.WorkQueueItem) error { } log.Printf("[build] starting build %s for commit %s (timeout: %d minutes)", Truncate(payload.BuildID, 8), Truncate(buildDetails.Build.CommitSha, 8), timeoutMinutes) - if err := a.Client.UpdateBuildStatus(payload.BuildID, "cloning", ""); err != nil { + if err := a.Client.UpdateBuildStatus(payload.BuildID, "cloning", "", ""); err != nil { log.Printf("[build] failed to update status to cloning: %v", err) } @@ -165,6 +165,7 @@ func (a *Agent) ProcessBuild(item agenthttp.WorkQueueItem) error { CloneURL: buildDetails.CloneURL, CommitSha: buildDetails.Build.CommitSha, Branch: buildDetails.Build.Branch, + ImageRepository: buildDetails.ImageRepository, ImageURI: buildDetails.ImageURI, ServiceID: buildDetails.Build.ServiceID, ProjectID: buildDetails.Build.ProjectID, @@ -174,7 +175,7 @@ func (a *Agent) ProcessBuild(item agenthttp.WorkQueueItem) error { } onStatusChange := func(status string) { - if err := a.Client.UpdateBuildStatus(payload.BuildID, status, ""); err != nil { + if err := a.Client.UpdateBuildStatus(payload.BuildID, status, "", buildConfig.ResolvedCommitSha); err != nil { log.Printf("[build] failed to update status to %s: %v", status, err) } } @@ -184,14 +185,14 @@ func (a *Agent) ProcessBuild(item agenthttp.WorkQueueItem) error { err = a.Builder.Build(ctx, buildConfig, checkCancelled, onStatusChange) if err != nil { log.Printf("[build] build %s failed: %v", Truncate(payload.BuildID, 8), err) - if updateErr := a.Client.UpdateBuildStatus(payload.BuildID, "failed", err.Error()); updateErr != nil { + if updateErr := a.Client.UpdateBuildStatus(payload.BuildID, "failed", err.Error(), buildConfig.ResolvedCommitSha); updateErr != nil { log.Printf("[build] failed to update status to failed: %v", updateErr) } return err } log.Printf("[build] build %s completed successfully", Truncate(payload.BuildID, 8)) - if err := a.Client.UpdateBuildStatus(payload.BuildID, "completed", ""); err != nil { + if err := a.Client.UpdateBuildStatus(payload.BuildID, "completed", "", buildConfig.ResolvedCommitSha); err != nil { log.Printf("[build] failed to update status to completed: %v", err) } diff --git a/agent/internal/build/build.go b/agent/internal/build/build.go index 95def39..d846ffb 100644 --- a/agent/internal/build/build.go +++ b/agent/internal/build/build.go @@ -20,16 +20,18 @@ import ( ) type Config struct { - BuildID string - CloneURL string - CommitSha string - Branch string - ImageURI string - ServiceID string - ProjectID string - RootDir string - Secrets map[string]string - TargetPlatforms []string + BuildID string + CloneURL string + CommitSha string + Branch string + ImageRepository string + ImageURI string + ResolvedCommitSha string + ServiceID string + ProjectID string + RootDir string + Secrets map[string]string + TargetPlatforms []string } type LogSender interface { @@ -68,6 +70,11 @@ func (b *Builder) Build(ctx context.Context, config *Config, checkCancelled func return fmt.Errorf("clone failed: %w", err) } + if config.CommitSha == "HEAD" && config.ImageRepository != "" && config.ResolvedCommitSha != "" { + config.ImageURI = fmt.Sprintf("%s:%s", config.ImageRepository, config.ResolvedCommitSha) + b.sendLog(config, fmt.Sprintf("Resolved image tag %s", config.ImageURI)) + } + if checkCancelled() { return fmt.Errorf("build cancelled") } @@ -165,9 +172,30 @@ func (b *Builder) clone(ctx context.Context, config *Config, buildDir string) er } b.sendLog(config, "Clone completed") + resolvedCommitSha, err := b.resolveCommitSha(ctx, config, buildDir) + if err != nil { + return err + } + config.ResolvedCommitSha = resolvedCommitSha + b.sendLog(config, fmt.Sprintf("Resolved commit %s", truncateStr(resolvedCommitSha, 8))) return nil } +func (b *Builder) resolveCommitSha(ctx context.Context, config *Config, buildDir string) (string, error) { + cmd := exec.CommandContext(ctx, "git", "-C", buildDir, "rev-parse", "HEAD") + output, err := b.runCommand(cmd, config) + if err != nil { + return "", fmt.Errorf("failed to resolve commit sha: %s: %w", output, err) + } + + resolvedCommitSha := strings.TrimSpace(output) + if resolvedCommitSha == "" { + return "", fmt.Errorf("resolved commit sha is empty") + } + + return resolvedCommitSha, nil +} + func (b *Builder) buildAndPush(ctx context.Context, config *Config, buildDir string) error { contextDir := buildDir if config.RootDir != "" { diff --git a/agent/internal/http/client.go b/agent/internal/http/client.go index f375d10..418ee6a 100644 --- a/agent/internal/http/client.go +++ b/agent/internal/http/client.go @@ -267,6 +267,7 @@ type BuildDetails struct { ProjectID string `json:"projectId"` } `json:"build"` CloneURL string `json:"cloneUrl"` + ImageRepository string `json:"imageRepository"` ImageURI string `json:"imageUri"` RootDir string `json:"rootDir"` Secrets map[string]string `json:"secrets"` @@ -301,13 +302,16 @@ func (c *Client) GetBuild(buildID string) (*BuildDetails, error) { return &result, nil } -func (c *Client) UpdateBuildStatus(buildID, status, errorMsg string) error { +func (c *Client) UpdateBuildStatus(buildID, status, errorMsg, resolvedCommitSha string) error { payload := map[string]string{ "status": status, } if errorMsg != "" { payload["error"] = errorMsg } + if resolvedCommitSha != "" { + payload["resolvedCommitSha"] = resolvedCommitSha + } body, err := json.Marshal(payload) if err != nil { diff --git a/docs/deployments/compose.mdx b/docs/deployments/compose.mdx index d3db646..d03e8e7 100644 --- a/docs/deployments/compose.mdx +++ b/docs/deployments/compose.mdx @@ -32,3 +32,5 @@ Each service in the compose file becomes a separate Techulus Cloud service withi ## Stateful Services If a service in the compose file defines volumes, it is automatically marked as stateful. Stateful services are limited to 1 replica and pinned to a single server. + +Imported stateful services use single-server local storage. Techulus Cloud does not currently provide replicated volumes or automatic failover for these services. Avoid importing production databases unless you accept the single-node storage risk and have an external backup and recovery plan. diff --git a/docs/index.mdx b/docs/index.mdx index 526b1ef..c42edd8 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -22,11 +22,11 @@ All communication between nodes happens over an encrypted **WireGuard mesh netwo - **Build from source** — push code and build with Railpack or your own Dockerfile. Or deploy pre-built images. - **GitHub auto-deploy** — connect a repo and deploy on every push. - **Automatic HTTPS** — TLS certificates are provisioned and renewed automatically via Let's Encrypt. -- **Persistent volumes** — attach named volumes for stateful workloads with scheduled backups to S3-compatible storage. +- **Persistent volumes** — attach named local volumes for stateful workloads with scheduled backups to S3-compatible storage. Replicated storage and HA failover are not yet supported. - **Service discovery** — containers resolve each other by name using `.internal` domains. - **Multi-environment** — run production, staging, and dev within the same project. - **GeoDNS** — route users to the nearest proxy node with automatic failover. -- **TCP/UDP proxy** — expose non-HTTP services like databases or game servers. +- **TCP/UDP proxy** — expose non-HTTP services like game servers or custom protocols. ## Next Steps diff --git a/docs/infrastructure/backups.mdx b/docs/infrastructure/backups.mdx index 006a54d..9400a99 100644 --- a/docs/infrastructure/backups.mdx +++ b/docs/infrastructure/backups.mdx @@ -5,6 +5,8 @@ description: "Automated database backups to S3-compatible storage." Techulus Cloud can automatically back up databases running in your containers to S3-compatible storage. Backups are triggered on a schedule or manually from the web UI. +> **Backups are not high availability:** Backups provide point-in-time disaster recovery only. If a server hosting a stateful service is lost, any writes after the last successful backup may be lost, and recovery requires restoring data before the service can run elsewhere. Backups do not provide replicated storage or automatic failover. + ## Supported Databases The agent detects the database type from the container image name and runs the appropriate dump command: diff --git a/docs/networking/service-discovery.mdx b/docs/networking/service-discovery.mdx index 7b83f0f..062d447 100644 --- a/docs/networking/service-discovery.mdx +++ b/docs/networking/service-discovery.mdx @@ -30,3 +30,5 @@ If you have a `postgres` service and a `web` service, the web service can connec ``` postgres://user:pass@postgres.internal:5432/mydb ``` + +For production databases, prefer an external managed database or another HA database setup. Techulus Cloud stateful volumes are single-server local storage and do not currently provide replicated storage or automatic failover. diff --git a/docs/networking/tcp-udp-proxy.mdx b/docs/networking/tcp-udp-proxy.mdx index dfcd6ed..4975291 100644 --- a/docs/networking/tcp-udp-proxy.mdx +++ b/docs/networking/tcp-udp-proxy.mdx @@ -1,9 +1,11 @@ --- title: "TCP/UDP Proxy" -description: "Expose non-HTTP services like databases and game servers." +description: "Expose non-HTTP services like game servers and custom protocols." --- -Not every service speaks HTTP. Techulus Cloud supports exposing raw TCP and UDP ports through proxy nodes for services like databases, game servers, or custom protocols. +Not every service speaks HTTP. Techulus Cloud supports exposing raw TCP and UDP ports through proxy nodes for services like game servers or custom protocols. + +> **Database exposure warning:** Database ports should usually remain private on the WireGuard network. Public database access increases security risk, and Techulus Cloud does not currently provide HA storage or automatic failover for production databases. ## Configuration @@ -20,7 +22,7 @@ Traffic is routed from the proxy node's external port through the WireGuard mesh ## TLS Passthrough -For TCP services that handle their own TLS (e.g., a database with native SSL), enable **TLS passthrough**. This forwards the encrypted connection directly to the container without Traefik terminating TLS. +For TCP services that handle their own TLS, enable **TLS passthrough**. This forwards the encrypted connection directly to the container without Traefik terminating TLS. ## Firewall diff --git a/docs/services/scaling.mdx b/docs/services/scaling.mdx index 420b491..001797b 100644 --- a/docs/services/scaling.mdx +++ b/docs/services/scaling.mdx @@ -17,7 +17,9 @@ When auto-placement is disabled, you manually configure how many replicas run on ## Server Pinning -Stateful services (those with [volumes](/services/volumes)) are automatically pinned to a single server. This ensures the container always has access to its persistent data. +Stateful services (those with [volumes](/services/volumes)) are automatically pinned to a single server. This ensures the container always mounts the same local data path and avoids accidentally starting on a server that does not have the volume. + +Pinning does not provide high availability. Volume data is not replicated across servers, and if the pinned server is lost, the service must be recovered from completed backups. You can also manually lock any service to a specific server by setting the locked server. This is useful for workloads that need to run on a particular machine. @@ -25,4 +27,5 @@ You can also manually lock any service to a specific server by setting the locke - Stateful services are limited to 1 replica. - Stateful services cannot use auto-placement — they are always pinned to their locked server. +- Stateful services do not automatically fail over to another server. - Maximum 10 replicas per service. diff --git a/docs/services/volumes.mdx b/docs/services/volumes.mdx index fec75aa..474d437 100644 --- a/docs/services/volumes.mdx +++ b/docs/services/volumes.mdx @@ -5,6 +5,8 @@ description: "Persistent storage for stateful services." Volumes provide persistent storage that survives container restarts and redeployments. +> **Stateful storage warning:** Volumes are stored on a single server's local filesystem. Techulus Cloud does not currently provide replicated volumes, automatic storage failover, or high availability for stateful services. If the server hosting a volume is lost, data can only be recovered from completed backups. We do not recommend running production databases on Techulus Cloud until HA storage and failover are implemented, unless you accept this risk and maintain an external recovery plan. + ## Adding Volumes Each volume has a name and a container path: @@ -14,7 +16,7 @@ Each volume has a name and a container path: | Name | Unique identifier for the volume | | Container path | Where the volume is mounted inside the container (e.g., `/var/lib/postgresql/data`) | -When you add a volume, the service automatically becomes **stateful**. Stateful services are locked to a single server and limited to 1 replica. When the last volume is removed, the service reverts to stateless. +When you add a volume, the service automatically becomes **stateful**. Stateful services are locked to a single server and limited to 1 replica so the container always mounts the same local data path. When the last volume is removed, the service reverts to stateless. ## Volume Backups @@ -44,4 +46,6 @@ You can restore a volume from any completed backup. The restore process download - Services with volumes are locked to a single server — they cannot be auto-placed across multiple nodes. - Replica count is fixed at 1 for stateful services. -- Volume data lives on the host filesystem. If the server is lost, data is only recoverable from backups. +- Volume data lives on the host filesystem and is not replicated to other servers. +- If the server is lost, data is only recoverable from completed backups. +- Backups are point-in-time recovery, not high availability or automatic failover. diff --git a/web/actions/builds.ts b/web/actions/builds.ts index ee2f6de..dc6a2cd 100644 --- a/web/actions/builds.ts +++ b/web/actions/builds.ts @@ -1,6 +1,6 @@ "use server"; -import { eq, desc } from "drizzle-orm"; +import { eq } from "drizzle-orm"; import { db } from "@/db"; import { builds, githubRepos, services } from "@/db/schema"; import { inngest } from "@/lib/inngest/client"; @@ -93,22 +93,14 @@ export async function triggerBuild( .where(eq(githubRepos.serviceId, serviceId)); if (githubRepo) { - const [latestBuild] = await db - .select() - .from(builds) - .where(eq(builds.serviceId, serviceId)) - .orderBy(desc(builds.createdAt)) - .limit(1); - await inngest.send( inngestEvents.buildTrigger.create({ serviceId, trigger, githubRepoId: githubRepo.id, - commitSha: latestBuild?.commitSha || "HEAD", - commitMessage: latestBuild?.commitMessage || triggerMessage, - branch: latestBuild?.branch || githubRepo.deployBranch || "main", - author: latestBuild?.author ?? undefined, + commitSha: "HEAD", + commitMessage: triggerMessage, + branch: githubRepo.deployBranch || githubRepo.defaultBranch || "main", }), ); diff --git a/web/app/api/v1/agent/builds/[id]/route.ts b/web/app/api/v1/agent/builds/[id]/route.ts index a261266..ea2d4cb 100644 --- a/web/app/api/v1/agent/builds/[id]/route.ts +++ b/web/app/api/v1/agent/builds/[id]/route.ts @@ -1,20 +1,20 @@ -import { NextRequest, NextResponse } from "next/server"; +import { and, eq } from "drizzle-orm"; +import { type NextRequest, NextResponse } from "next/server"; import { db } from "@/db"; +import { getSetting } from "@/db/queries"; import { builds, - githubRepos, githubInstallations, - services, + githubRepos, projects, secrets, + services, } from "@/db/schema"; -import { eq, and } from "drizzle-orm"; import { verifyAgentRequest } from "@/lib/agent-auth"; -import { getInstallationToken, buildCloneUrl } from "@/lib/github"; -import { getSetting } from "@/db/queries"; +import { buildCloneUrl, getInstallationToken } from "@/lib/github"; import { - SETTING_KEYS, DEFAULT_BUILD_TIMEOUT_MINUTES, + SETTING_KEYS, } from "@/lib/settings-keys"; export async function GET( @@ -75,8 +75,9 @@ export async function GET( { status: 500 }, ); } + const imageRepository = `${registryHost}/${project.id}/${service.id}`; const commitSha = build.commitSha === "HEAD" ? "latest" : build.commitSha; - const imageUri = `${registryHost}/${project.id}/${service.id}:${commitSha}`; + const imageUri = `${imageRepository}:${commitSha}`; let cloneUrl: string; @@ -131,7 +132,7 @@ export async function GET( } else if (service.githubRepoUrl) { cloneUrl = service.githubRepoUrl; if (!cloneUrl.endsWith(".git")) { - cloneUrl = cloneUrl + ".git"; + cloneUrl = `${cloneUrl}.git`; } } else { return NextResponse.json( @@ -168,6 +169,7 @@ export async function GET( projectId: project.id, }, cloneUrl, + imageRepository, imageUri, rootDir: service.githubRootDir || "", secrets: secretsMap, diff --git a/web/app/api/v1/agent/builds/[id]/status/route.ts b/web/app/api/v1/agent/builds/[id]/status/route.ts index f0e0a2f..84586ff 100644 --- a/web/app/api/v1/agent/builds/[id]/status/route.ts +++ b/web/app/api/v1/agent/builds/[id]/status/route.ts @@ -1,24 +1,25 @@ -import { NextRequest, NextResponse } from "next/server"; +import { and, eq } from "drizzle-orm"; +import { type NextRequest, NextResponse } from "next/server"; +import { deployService } from "@/actions/projects"; import { db } from "@/db"; import { builds, - services, + githubRepos, projects, serviceReplicas, - githubRepos, + services, } from "@/db/schema"; -import { eq, and } from "drizzle-orm"; import { verifyAgentRequest } from "@/lib/agent-auth"; -import { deployService } from "@/actions/projects"; -import { updateGitHubDeploymentStatus } from "@/lib/github"; import { sendBuildFailureAlert } from "@/lib/email"; -import { enqueueWork } from "@/lib/work-queue"; +import { updateGitHubDeploymentStatus } from "@/lib/github"; import { inngest } from "@/lib/inngest/client"; import { inngestEvents } from "@/lib/inngest/events"; +import { enqueueWork } from "@/lib/work-queue"; type StatusUpdate = { status: "cloning" | "building" | "pushing" | "completed" | "failed"; error?: string; + resolvedCommitSha?: string; }; type BuildCompletedEventData = { @@ -84,6 +85,15 @@ export async function POST( updateData.error = update.error; } + const effectiveCommitSha = + build.commitSha === "HEAD" && update.resolvedCommitSha + ? update.resolvedCommitSha + : build.commitSha; + + if (build.commitSha === "HEAD" && update.resolvedCommitSha) { + updateData.commitSha = update.resolvedCommitSha; + } + await db.update(builds).set(updateData).where(eq(builds.id, buildId)); if (build.githubDeploymentId && build.githubRepoId) { @@ -168,7 +178,7 @@ export async function POST( if (update.status === "completed") { console.log( - `[build:status] build ${buildId.slice(0, 8)} completed, targetPlatform=${build.targetPlatform}, serviceId=${build.serviceId}, commitSha=${build.commitSha?.slice(0, 8)}`, + `[build:status] build ${buildId.slice(0, 8)} completed, targetPlatform=${build.targetPlatform}, serviceId=${build.serviceId}, commitSha=${effectiveCommitSha.slice(0, 8)}`, ); const service = await db @@ -198,7 +208,8 @@ export async function POST( { status: 500 }, ); } - const commitSha = build.commitSha === "HEAD" ? "latest" : build.commitSha; + const commitSha = + effectiveCommitSha === "HEAD" ? "latest" : effectiveCommitSha; const baseImageUri = `${registryHost}/${project.id}/${service.id}:${commitSha}`; if (build.targetPlatform) { @@ -265,7 +276,7 @@ export async function POST( if (allCompleted && groupBuilds.length > 0) { console.log( - `[build:complete] all ${groupBuilds.length} platform builds completed for ${build.serviceId}@${build.commitSha.slice(0, 8)}`, + `[build:complete] all ${groupBuilds.length} platform builds completed for ${build.serviceId}@${commitSha.slice(0, 8)}`, ); const images = groupBuilds.map((b) => { diff --git a/web/app/api/v1/agent/expected-state/route.ts b/web/app/api/v1/agent/expected-state/route.ts index 2a82bfc..f364cd4 100644 --- a/web/app/api/v1/agent/expected-state/route.ts +++ b/web/app/api/v1/agent/expected-state/route.ts @@ -1,19 +1,19 @@ -import { NextRequest, NextResponse } from "next/server"; +import { and, eq, inArray } from "drizzle-orm"; +import { type NextRequest, NextResponse } from "next/server"; import { db } from "@/db"; import { - deployments, deploymentPorts, - services, - servicePorts, - serviceVolumes, + deployments, secrets, servers, + servicePorts, + services, + serviceVolumes, } from "@/db/schema"; -import { eq, and, inArray } from "drizzle-orm"; +import { getAllCertificatesForDomains } from "@/lib/acme-manager"; import { verifyAgentRequest } from "@/lib/agent-auth"; import { slugify } from "@/lib/utils"; import { getWireGuardPeers } from "@/lib/wireguard"; -import { getAllCertificatesForDomains } from "@/lib/acme-manager"; const EXPECTED_STATUSES = [ "pending", @@ -25,8 +25,8 @@ const EXPECTED_STATUSES = [ "unknown", ] as const; -const ROUTABLE_STATUSES = ["healthy", "running", "unknown"] as const; -const DNS_STATUSES = ["healthy", "running", "unknown"] as const; +const ROUTABLE_STATUSES = ["healthy", "running"] as const; +const DNS_STATUSES = ["healthy", "running"] as const; export async function GET(request: NextRequest) { const auth = await verifyAgentRequest(request); diff --git a/web/app/globals.css b/web/app/globals.css index 72649a7..b07102b 100644 --- a/web/app/globals.css +++ b/web/app/globals.css @@ -74,6 +74,19 @@ --color-destructive-foreground: var(--destructive-foreground); } +@theme { + --animate-shimmer: shimmer 1.4s ease-in-out infinite; + + @keyframes shimmer { + 0% { + transform: translateX(0); + } + 100% { + transform: translateX(300%); + } + } +} + @layer theme { :root { font-feature-settings: diff --git a/web/app/page.tsx b/web/app/page.tsx index bdc6fe0..a1b3f1e 100644 --- a/web/app/page.tsx +++ b/web/app/page.tsx @@ -1,16 +1,40 @@ import { Suspense } from "react"; import { SignInPage } from "@/components/auth/sign-in-page"; -import { Spinner } from "@/components/ui/spinner"; +import { Skeleton } from "@/components/ui/skeleton"; -export default function Page() { +function RootPageSkeleton() { return ( - - +
+ +
+ Loading +
+
+ ); +} + +export default function Page() { + return ( + }> ); diff --git a/web/components/builds/build-details.tsx b/web/components/builds/build-details.tsx index 0f0b27f..f48b343 100644 --- a/web/components/builds/build-details.tsx +++ b/web/components/builds/build-details.tsx @@ -18,6 +18,7 @@ import { useState } from "react"; import { toast } from "sonner"; import useSWR from "swr"; import { cancelBuild, retryBuild } from "@/actions/builds"; +import { LogViewer } from "@/components/logs/log-viewer"; import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { @@ -29,7 +30,6 @@ import { import type { Build, BuildStatus, GithubRepo, Service } from "@/db/types"; import { formatRelativeTime } from "@/lib/date"; import { fetcher } from "@/lib/fetcher"; -import { LogViewer } from "@/components/logs/log-viewer"; type BuildWithDates = Omit< Build, @@ -310,7 +310,7 @@ export function BuildDetails({ Build Failed -
+						
 							{build.error}
 						
diff --git a/web/components/builds/builds-viewer.tsx b/web/components/builds/builds-viewer.tsx index 8d53766..d8741ef 100644 --- a/web/components/builds/builds-viewer.tsx +++ b/web/components/builds/builds-viewer.tsx @@ -285,7 +285,7 @@ export function BuildsViewer({ )} {build.error && ( -
+									
 										{build.error}
 									
)} diff --git a/web/components/ui/item.tsx b/web/components/ui/item.tsx index 20d7662..a5aa71c 100644 --- a/web/components/ui/item.tsx +++ b/web/components/ui/item.tsx @@ -1,10 +1,9 @@ -import * as React from "react"; import { mergeProps } from "@base-ui/react/merge-props"; import { useRender } from "@base-ui/react/use-render"; import { cva, type VariantProps } from "class-variance-authority"; - -import { cn } from "@/lib/utils"; +import type * as React from "react"; import { Separator } from "@/components/ui/separator"; +import { cn } from "@/lib/utils"; function ItemGroup({ className, ...props }: React.ComponentProps<"div">) { return ( @@ -117,7 +116,7 @@ function ItemContent({ className, ...props }: React.ComponentProps<"div">) {
) {
({ a:hover]:text-primary line-clamp-2 font-normal [&>a]:underline [&>a]:underline-offset-4", + "text-muted-foreground text-left text-sm leading-normal group-data-[size=xs]/item:text-xs [&>a:hover]:text-primary line-clamp-2 min-w-0 font-normal [&>a]:underline [&>a]:underline-offset-4", className, )} {...props} diff --git a/web/components/ui/skeleton.tsx b/web/components/ui/skeleton.tsx new file mode 100644 index 0000000..f62db37 --- /dev/null +++ b/web/components/ui/skeleton.tsx @@ -0,0 +1,16 @@ +import { cn } from "@/lib/utils"; + +function Skeleton({ className, ...props }: React.ComponentProps<"div">) { + return ( +
+ ); +} + +export { Skeleton }; diff --git a/web/lib/inngest/functions/rollout-helpers.ts b/web/lib/inngest/functions/rollout-helpers.ts index e1cb200..8a88940 100644 --- a/web/lib/inngest/functions/rollout-helpers.ts +++ b/web/lib/inngest/functions/rollout-helpers.ts @@ -1,6 +1,7 @@ import { randomUUID } from "node:crypto"; import { and, eq, inArray } from "drizzle-orm"; import { db } from "@/db"; +import { getService } from "@/db/queries"; import { deploymentPorts, deployments, @@ -11,12 +12,14 @@ import { services, serviceVolumes, } from "@/db/schema"; -import { calculateSpreadPlacement } from "@/lib/placement"; import { getCertificate, issueCertificate } from "@/lib/acme-manager"; +import { + calculateResourceAwarePlacement, + replaceServiceReplicaPlacements, +} from "@/lib/placement"; +import { buildCurrentConfig } from "@/lib/service-config"; import { assignContainerIp } from "@/lib/wireguard"; import { enqueueWork } from "@/lib/work-queue"; -import { buildCurrentConfig } from "@/lib/service-config"; -import { getService } from "@/db/queries"; const PORT_RANGE_START = 30000; const PORT_RANGE_END = 32767; @@ -96,20 +99,12 @@ export async function calculateServicePlacements( throw new Error("Maximum 10 replicas allowed"); } - const calculatedPlacements = await calculateSpreadPlacement(totalReplicas); + const calculatedPlacements = await calculateResourceAwarePlacement( + service, + totalReplicas, + ); - await db - .delete(serviceReplicas) - .where(eq(serviceReplicas.serviceId, service.id)); - - for (const placement of calculatedPlacements) { - await db.insert(serviceReplicas).values({ - id: randomUUID(), - serviceId: service.id, - serverId: placement.serverId, - count: placement.count, - }); - } + await replaceServiceReplicaPlacements(service.id, calculatedPlacements); placements = calculatedPlacements.map((p) => ({ serverId: p.serverId, @@ -331,7 +326,10 @@ export async function createDeploymentRecords( for (const placement of placements) { if (placement.replicas <= 0) continue; - const server = serverMap.get(placement.serverId)!; + const server = serverMap.get(placement.serverId); + if (!server) { + throw new Error(`Server ${placement.serverId} not found`); + } for (let i = 0; i < placement.replicas; i++) { replicaIndex++; diff --git a/web/lib/placement-planner.ts b/web/lib/placement-planner.ts new file mode 100644 index 0000000..e46f5e7 --- /dev/null +++ b/web/lib/placement-planner.ts @@ -0,0 +1,312 @@ +import { createHash } from "node:crypto"; + +export type PlacementResult = { serverId: string; count: number }[]; + +export type PlacementServerSnapshot = { + id: string; + status: string; + wireguardIp: string | null; + resourcesCpu: number | null; + resourcesMemory: number | null; + resourcesDisk: number | null; + healthStats?: { + cpuUsagePercent: number; + memoryUsagePercent: number; + memoryUsedMb: number; + diskUsagePercent: number; + diskUsedGb: number; + } | null; + containerHealth?: { + runtimeResponsive: boolean; + runningContainers: number; + stoppedContainers: number; + storageUsedGb: number; + } | null; +}; + +export type ReplicaAllocationSnapshot = { + serverId: string; + serviceId: string; + resourceCpuLimit: number | null; + resourceMemoryLimitMb: number | null; + count: number; +}; + +export type PlacementPlanInput = { + serviceId: string; + totalReplicas: number; + resourceCpuLimit: number | null; + resourceMemoryLimitMb: number | null; + servers: PlacementServerSnapshot[]; + existingReplicas: ReplicaAllocationSnapshot[]; + excludeServerIds?: string[]; +}; + +type ProjectedServerLoad = { + cpu: number; + memoryMb: number; + existingReplicas: number; + assignedServiceReplicas: number; +}; + +type CandidateScore = { + server: PlacementServerSnapshot; + score: number; + hashScore: number; +}; + +const EPSILON = 0.000001; +const DOMINANT_UTILIZATION_WEIGHT = 100; +const LIVE_CPU_PRESSURE_WEIGHT = 20; +const LIVE_MEMORY_PRESSURE_WEIGHT = 20; +const LIVE_DISK_PRESSURE_WEIGHT = 15; +const EXISTING_REPLICA_WEIGHT = 2; +const RUNNING_CONTAINER_WEIGHT = 0.5; +const UNRESPONSIVE_RUNTIME_PENALTY = 100; + +export function calculateResourceAwarePlacementFromSnapshot({ + serviceId, + totalReplicas, + resourceCpuLimit, + resourceMemoryLimitMb, + servers, + existingReplicas, + excludeServerIds, +}: PlacementPlanInput): PlacementResult { + if (totalReplicas < 1) { + throw new Error("At least one replica is required"); + } + + const excludedIds = new Set(excludeServerIds ?? []); + const eligibleServers = servers.filter( + (server) => + server.status === "online" && + server.wireguardIp !== null && + !excludedIds.has(server.id), + ); + + if (eligibleServers.length === 0) { + throw new Error("No healthy servers available for placement"); + } + + const projectedLoad = buildInitialProjectedLoad( + serviceId, + eligibleServers, + existingReplicas, + ); + const assignments: string[] = []; + + for (let replicaIndex = 0; replicaIndex < totalReplicas; replicaIndex++) { + const fittingCandidates = eligibleServers.filter((server) => + canFit(server, getProjectedLoad(projectedLoad, server.id), { + resourceCpuLimit, + resourceMemoryLimitMb, + }), + ); + + if (fittingCandidates.length === 0) { + throw new Error( + "No eligible servers have enough resources for placement", + ); + } + + const unassignedCandidates = fittingCandidates.filter( + (server) => + getProjectedLoad(projectedLoad, server.id).assignedServiceReplicas === + 0, + ); + const candidates = + unassignedCandidates.length > 0 + ? unassignedCandidates + : fittingCandidates; + + const rankedCandidates = candidates + .map((server) => ({ + server, + score: scoreServer(server, getProjectedLoad(projectedLoad, server.id), { + resourceCpuLimit, + resourceMemoryLimitMb, + }), + hashScore: rendezvousHashScore(serviceId, replicaIndex, server.id), + })) + .sort(compareCandidates); + + const selected = rankedCandidates[0].server; + assignments.push(selected.id); + + const selectedLoad = getProjectedLoad(projectedLoad, selected.id); + selectedLoad.cpu += resourceCpuLimit ?? 0; + selectedLoad.memoryMb += resourceMemoryLimitMb ?? 0; + selectedLoad.existingReplicas += 1; + selectedLoad.assignedServiceReplicas += 1; + } + + return groupAssignments(assignments); +} + +function buildInitialProjectedLoad( + serviceId: string, + servers: PlacementServerSnapshot[], + existingReplicas: ReplicaAllocationSnapshot[], +) { + const projectedLoad = new Map(); + const eligibleServerIds = new Set(servers.map((server) => server.id)); + + for (const server of servers) { + projectedLoad.set(server.id, { + cpu: 0, + memoryMb: 0, + existingReplicas: 0, + assignedServiceReplicas: 0, + }); + } + + for (const replica of existingReplicas) { + if ( + replica.serviceId === serviceId || + !eligibleServerIds.has(replica.serverId) + ) { + continue; + } + + const count = Math.max(0, replica.count); + const load = getProjectedLoad(projectedLoad, replica.serverId); + load.cpu += (replica.resourceCpuLimit ?? 0) * count; + load.memoryMb += (replica.resourceMemoryLimitMb ?? 0) * count; + load.existingReplicas += count; + } + + return projectedLoad; +} + +function getProjectedLoad( + projectedLoad: Map, + serverId: string, +) { + const load = projectedLoad.get(serverId); + if (!load) { + throw new Error(`Missing projected load for server ${serverId}`); + } + return load; +} + +function canFit( + server: PlacementServerSnapshot, + load: ProjectedServerLoad, + request: Pick< + PlacementPlanInput, + "resourceCpuLimit" | "resourceMemoryLimitMb" + >, +) { + const cpuCapacity = server.resourcesCpu; + const memoryCapacity = server.resourcesMemory; + const hasKnownCpuCapacity = cpuCapacity !== null && cpuCapacity > 0; + const hasKnownMemoryCapacity = memoryCapacity !== null && memoryCapacity > 0; + + if ( + request.resourceCpuLimit !== null && + hasKnownCpuCapacity && + load.cpu + request.resourceCpuLimit > cpuCapacity + EPSILON + ) { + return false; + } + + if ( + request.resourceMemoryLimitMb !== null && + hasKnownMemoryCapacity && + load.memoryMb + request.resourceMemoryLimitMb > memoryCapacity + EPSILON + ) { + return false; + } + + return true; +} + +function scoreServer( + server: PlacementServerSnapshot, + load: ProjectedServerLoad, + request: Pick< + PlacementPlanInput, + "resourceCpuLimit" | "resourceMemoryLimitMb" + >, +) { + const projectedCpu = + load.cpu + + (request.resourceCpuLimit === null ? 0 : request.resourceCpuLimit); + const projectedMemoryMb = + load.memoryMb + + (request.resourceMemoryLimitMb === null + ? 0 + : request.resourceMemoryLimitMb); + + const cpuCapacity = server.resourcesCpu; + const memoryCapacity = server.resourcesMemory; + const cpuUtilization = + cpuCapacity !== null && cpuCapacity > 0 ? projectedCpu / cpuCapacity : 0; + const memoryUtilization = + memoryCapacity !== null && memoryCapacity > 0 + ? projectedMemoryMb / memoryCapacity + : 0; + const dominantUtilization = Math.max(cpuUtilization, memoryUtilization); + + const liveCpuPressure = percentToRatio(server.healthStats?.cpuUsagePercent); + const liveMemoryPressure = percentToRatio( + server.healthStats?.memoryUsagePercent, + ); + const liveDiskPressure = percentToRatio(server.healthStats?.diskUsagePercent); + const runtimePenalty = + server.containerHealth?.runtimeResponsive === false + ? UNRESPONSIVE_RUNTIME_PENALTY + : 0; + const containerCountPenalty = + (server.containerHealth?.runningContainers ?? 0) * RUNNING_CONTAINER_WEIGHT; + + return ( + dominantUtilization * DOMINANT_UTILIZATION_WEIGHT + + liveCpuPressure * LIVE_CPU_PRESSURE_WEIGHT + + liveMemoryPressure * LIVE_MEMORY_PRESSURE_WEIGHT + + liveDiskPressure * LIVE_DISK_PRESSURE_WEIGHT + + load.existingReplicas * EXISTING_REPLICA_WEIGHT + + containerCountPenalty + + runtimePenalty + ); +} + +function percentToRatio(value: number | undefined) { + if (value === undefined || Number.isNaN(value)) return 0; + return Math.max(0, value) / 100; +} + +function rendezvousHashScore( + serviceId: string, + replicaIndex: number, + serverId: string, +) { + const digest = createHash("sha256") + .update(`${serviceId}:${replicaIndex}:${serverId}`) + .digest("hex") + .slice(0, 12); + return Number.parseInt(digest, 16); +} + +function compareCandidates(a: CandidateScore, b: CandidateScore) { + if (Math.abs(a.score - b.score) > EPSILON) { + return a.score - b.score; + } + + if (a.hashScore !== b.hashScore) { + return b.hashScore - a.hashScore; + } + + return a.server.id.localeCompare(b.server.id); +} + +function groupAssignments(assignments: string[]): PlacementResult { + const placements = new Map(); + + for (const serverId of assignments) { + placements.set(serverId, (placements.get(serverId) ?? 0) + 1); + } + + return [...placements].map(([serverId, count]) => ({ serverId, count })); +} diff --git a/web/lib/placement.ts b/web/lib/placement.ts index 06a4f2b..b391290 100644 --- a/web/lib/placement.ts +++ b/web/lib/placement.ts @@ -1,61 +1,98 @@ -import { db } from "@/db"; -import { servers } from "@/db/schema"; +import { randomUUID } from "node:crypto"; import { and, eq, isNotNull } from "drizzle-orm"; -import { getSetting } from "@/db/queries"; +import { db } from "@/db"; +import { servers, serviceReplicas, services, settings } from "@/db/schema"; +import type { Service } from "@/db/types"; +import { + calculateResourceAwarePlacementFromSnapshot, + type PlacementResult, + type PlacementServerSnapshot, +} from "@/lib/placement-planner"; import { SETTING_KEYS } from "@/lib/settings-keys"; -export type PlacementResult = { serverId: string; count: number }[]; - -export async function getHealthyServers(excludeServerIds?: string[]) { - const allOnlineServers = await db - .select({ - id: servers.id, - name: servers.name, - wireguardIp: servers.wireguardIp, - }) - .from(servers) - .where(and(eq(servers.status, "online"), isNotNull(servers.wireguardIp))); - - const excludedFromWorkload = await getSetting( - SETTING_KEYS.SERVERS_EXCLUDED_FROM_WORKLOAD_PLACEMENT, - ); - - const allExcludedIds = new Set([ - ...(excludeServerIds ?? []), - ...(excludedFromWorkload ?? []), - ]); - - if (allExcludedIds.size > 0) { - return allOnlineServers.filter((s) => !allExcludedIds.has(s.id)); - } +export type { PlacementResult }; - return allOnlineServers; -} - -export async function calculateSpreadPlacement( +export async function calculateResourceAwarePlacement( + service: Pick, totalReplicas: number, excludeServerIds?: string[], ): Promise { - const healthyServers = await getHealthyServers(excludeServerIds); + const [candidateServers, allocatedReplicas, excludedFromWorkload] = + await Promise.all([ + db + .select({ + id: servers.id, + status: servers.status, + wireguardIp: servers.wireguardIp, + resourcesCpu: servers.resourcesCpu, + resourcesMemory: servers.resourcesMemory, + resourcesDisk: servers.resourcesDisk, + healthStats: servers.healthStats, + containerHealth: servers.containerHealth, + }) + .from(servers) + .where( + and(eq(servers.status, "online"), isNotNull(servers.wireguardIp)), + ), + db + .select({ + serverId: serviceReplicas.serverId, + serviceId: serviceReplicas.serviceId, + resourceCpuLimit: services.resourceCpuLimit, + resourceMemoryLimitMb: services.resourceMemoryLimitMb, + count: serviceReplicas.count, + }) + .from(serviceReplicas) + .innerJoin(services, eq(serviceReplicas.serviceId, services.id)), + getExcludedFromWorkloadPlacement(), + ]); - if (healthyServers.length === 0) { - throw new Error("No healthy servers available for placement"); - } + return calculateResourceAwarePlacementFromSnapshot({ + serviceId: service.id, + totalReplicas, + resourceCpuLimit: service.resourceCpuLimit, + resourceMemoryLimitMb: service.resourceMemoryLimitMb, + servers: candidateServers satisfies PlacementServerSnapshot[], + existingReplicas: allocatedReplicas, + excludeServerIds: [...(excludeServerIds ?? []), ...excludedFromWorkload], + }); +} - const baseCount = Math.floor(totalReplicas / healthyServers.length); - const remainder = totalReplicas % healthyServers.length; +async function getExcludedFromWorkloadPlacement(): Promise { + const row = await db + .select({ value: settings.value }) + .from(settings) + .where( + eq(settings.key, SETTING_KEYS.SERVERS_EXCLUDED_FROM_WORKLOAD_PLACEMENT), + ) + .then((result) => result[0]); + + const value = row?.value; + return Array.isArray(value) + ? value.filter( + (serverId): serverId is string => typeof serverId === "string", + ) + : []; +} - const placements: PlacementResult = []; +export async function replaceServiceReplicaPlacements( + serviceId: string, + placements: PlacementResult, +) { + await db.transaction(async (tx) => { + await tx + .delete(serviceReplicas) + .where(eq(serviceReplicas.serviceId, serviceId)); - for (let i = 0; i < healthyServers.length; i++) { - const count = baseCount + (i < remainder ? 1 : 0); - if (count > 0) { - placements.push({ - serverId: healthyServers[i].id, - count, - }); - } - } + if (placements.length === 0) return; - return placements; + await tx.insert(serviceReplicas).values( + placements.map((placement) => ({ + id: randomUUID(), + serviceId, + serverId: placement.serverId, + count: placement.count, + })), + ); + }); } diff --git a/web/lib/scheduler.ts b/web/lib/scheduler.ts index 8e87fdb..6da74f4 100644 --- a/web/lib/scheduler.ts +++ b/web/lib/scheduler.ts @@ -1,19 +1,20 @@ +import { CronExpressionParser } from "cron-parser"; +import { and, eq, inArray, isNotNull, lt, ne } from "drizzle-orm"; +import { triggerBuild } from "@/actions/builds"; +import { deployService } from "@/actions/projects"; import { db } from "@/db"; import { deployments, rollouts, servers, services, - serviceReplicas, workQueue, } from "@/db/schema"; -import { and, eq, inArray, isNotNull, lt, ne } from "drizzle-orm"; -import { randomUUID } from "node:crypto"; -import { CronExpressionParser } from "cron-parser"; -import { calculateSpreadPlacement } from "@/lib/placement"; -import { deployService } from "@/actions/projects"; -import { triggerBuild } from "@/actions/builds"; -import { sendServerOfflineAlert, sendDeploymentMovedAlert } from "@/lib/email"; +import { sendDeploymentMovedAlert, sendServerOfflineAlert } from "@/lib/email"; +import { + calculateResourceAwarePlacement, + replaceServiceReplicaPlacements, +} from "@/lib/placement"; const STALE_THRESHOLD_MS = 120_000; // 2 minutes @@ -32,6 +33,8 @@ export async function triggerRecoveryForOfflineServers( autoPlace: services.autoPlace, stateful: services.stateful, replicas: services.replicas, + resourceCpuLimit: services.resourceCpuLimit, + resourceMemoryLimitMb: services.resourceMemoryLimitMb, }) .from(deployments) .innerJoin(services, eq(deployments.serviceId, services.id)) @@ -65,23 +68,17 @@ export async function triggerRecoveryForOfflineServers( console.log(`[scheduler] recovering service ${serviceId}`); - const newPlacements = await calculateSpreadPlacement( + const newPlacements = await calculateResourceAwarePlacement( + { + id: service.serviceId, + resourceCpuLimit: service.resourceCpuLimit, + resourceMemoryLimitMb: service.resourceMemoryLimitMb, + }, service.replicas, offlineServerIds, ); - await db - .delete(serviceReplicas) - .where(eq(serviceReplicas.serviceId, serviceId)); - - for (const placement of newPlacements) { - await db.insert(serviceReplicas).values({ - id: randomUUID(), - serviceId, - serverId: placement.serverId, - count: placement.count, - }); - } + await replaceServiceReplicaPlacements(serviceId, newPlacements); await deployService(serviceId); diff --git a/web/package.json b/web/package.json index 4054f25..00d80f6 100644 --- a/web/package.json +++ b/web/package.json @@ -59,7 +59,7 @@ "drizzle-kit": "^0.31.8", "eslint": "^9", "eslint-config-next": "16.2.6", - "portless": "^0.11.1", + "portless": "^0.13.0", "tailwindcss": "^4", "tsx": "^4.19.2", "typescript": "^5" diff --git a/web/pnpm-lock.yaml b/web/pnpm-lock.yaml index 83e1808..6f676af 100644 --- a/web/pnpm-lock.yaml +++ b/web/pnpm-lock.yaml @@ -142,8 +142,8 @@ importers: specifier: 16.2.6 version: 16.2.6(@typescript-eslint/parser@8.54.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3) portless: - specifier: ^0.11.1 - version: 0.11.1 + specifier: ^0.13.0 + version: 0.13.0 tailwindcss: specifier: ^4 version: 4.1.18 @@ -4949,8 +4949,8 @@ packages: resolution: {integrity: sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==} engines: {node: '>=16.20.0'} - portless@0.11.1: - resolution: {integrity: sha512-A/U6sBo/aC+MDoLjVTzAUeB18KTzJQs7z+MnuyyCjJTOZoCh9xZzOjSqUPA+aWS0uS3UytWPrNLA5AO96TzuKQ==} + portless@0.13.0: + resolution: {integrity: sha512-PxMZ5BHH+ZZi9rTq4T8m003aZxh56qI0aBdNsxLn7jrTtvNJYBWYD3lc5PuQrom/aVh6z8iLb+tKUI6b7QNYQw==} engines: {node: '>=20'} os: [darwin, linux, win32] hasBin: true @@ -10916,7 +10916,7 @@ snapshots: pkce-challenge@5.0.1: {} - portless@0.11.1: {} + portless@0.13.0: {} possible-typed-array-names@1.1.0: {}