From 2e3c6c342a5c04839a86ad14b4da42129eb59df1 Mon Sep 17 00:00:00 2001
From: krazer <caseymcc@krazer.net>
Date: Sat, 2 May 2026 11:18:45 -0400
Subject: [PATCH 1/2] updates for vulkan devices

Co-authored-by: Copilot <copilot@github.com>
---
 docs/server.md                       |    6 +
 examples/server_config.json          |   26 +
 src/arbiterAI/arbiterAI.cpp          |    4 +-
 src/arbiterAI/arbiterAI.h            |    2 +-
 src/arbiterAI/hardwareDetector.cpp   |   50 +-
 src/arbiterAI/modelFitCalculator.h   |    2 +-
 src/arbiterAI/modelManager.cpp       |    1 +
 src/arbiterAI/modelManager.h         |    1 +
 src/arbiterAI/modelRuntime.cpp       |  450 ++++++-
 src/arbiterAI/modelRuntime.h         |   46 +-
 src/arbiterAI/providers/llama.cpp    |   90 +-
 src/arbiterAI/telemetryCollector.cpp |    2 +-
 src/server/dashboard.h               |  400 +++++-
 src/server/dashboardConfig.h         | 1792 ++++++++++++++++++++++++++
 src/server/main.cpp                  |  473 ++++++-
 src/server/routes.cpp                |  882 ++++++++++++-
 src/server/routes.h                  |   10 +
 tests/modelRuntimeTests.cpp          |    6 +-
 tests/telemetryCollectorTests.cpp    |    2 +-
 19 files changed, 4135 insertions(+), 110 deletions(-)
 create mode 100644 src/server/dashboardConfig.h
diff --git a/docs/server.md b/docs/server.md
index 9c55fe6..9ea331b 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -65,6 +65,11 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
     "models_dir": "/models",
     "default_model": "",
     "default_variant": "",
+    "startup_defaults": {
+      "cpu": {"model": "", "variant": ""},
+      "cuda": {"model": "", "variant": ""},
+      "vulkan": {"model": "", "variant": ""}
+    },
     "override_path": "",
     "ram_budget_mb": 0,
     "max_concurrent_downloads": 2,
@@ -99,6 +104,7 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
 | `models_dir` | `string` | `"/models"` | Directory for downloaded model files |
 | `default_model` | `string` | `""` | Model to load on startup |
 | `default_variant` | `string` | `""` | Default quantization variant (e.g., `Q4_K_M`) |
+| `startup_defaults` | `object` | `{}` | Per-accelerator startup defaults used on restart. Keys: `cpu`, `cuda`, `vulkan`, each with `model` and optional `variant`. If unset, the server falls back to `default_model` / `default_variant`. |
 | `override_path` | `string` | `""` | Path to write runtime model config overrides |
 | `ram_budget_mb` | `int` | `0` | Ready-model RAM budget in MB (`0` = auto 50%) |
 | `max_concurrent_downloads` | `int` | `2` | Maximum simultaneous model downloads |
diff --git a/examples/server_config.json b/examples/server_config.json
index 35ea631..60d4f19 100644
--- a/examples/server_config.json
+++ b/examples/server_config.json
@@ -10,6 +10,32 @@
 
     "default_model": "",
     "default_variant": "",
+    "startup_defaults": {
+        "cpu": {
+            "model": "",
+            "variant": ""
+        },
+        "cuda": {
+            "model": "",
+            "variant": ""
+        },
+        "vulkan": {
+            "model": "",
+            "variant": ""
+        }
+    },
+
+    "startup_models": [
+        {
+            "model": "Qwen3.5-27B",
+            "variant": "Q4_K_M",
+            "context_size": 248832,
+            "runtime_options": {
+                "flash_attn": true
+            },
+            "devices": [1]
+        }
+    ],
 
     "override_path": "",
 
diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp
index 892b93c..43d7bb7 100644
--- a/src/arbiterAI/arbiterAI.cpp
+++ b/src/arbiterAI/arbiterAI.cpp
@@ -533,14 +533,14 @@ ErrorCode ArbiterAI::getAvailableModels(std::vector<std::string>& models)
 // ========== Local Model Management ==========
 
 ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize,
-    const RuntimeOptions *optionsOverride)
+    const RuntimeOptions *optionsOverride, const std::vector<int> &targetDevices)
 {
     RuntimeOptions opts;
     if(optionsOverride)
     {
         opts=*optionsOverride;
     }
-    return ModelRuntime::instance().loadModel(model, variant, contextSize, opts);
+    return ModelRuntime::instance().loadModel(model, variant, contextSize, opts, targetDevices); // an empty targetDevices keeps the runtime's auto-selected GPU indices
 }
 
 ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant)
diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h
index 499ff2d..e5106fb 100644
--- a/src/arbiterAI/arbiterAI.h
+++ b/src/arbiterAI/arbiterAI.h
@@ -650,7 +650,7 @@ class ArbiterAI
      * @return ErrorCode indicating success, ModelDownloading, or failure
      */
     ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0,
-        const RuntimeOptions *optionsOverride=nullptr);
+        const RuntimeOptions *optionsOverride=nullptr, const std::vector<int> &targetDevices={});
 
     /**
      * @brief Download model files without loading into VRAM
diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp
index d0ac4de..7ca4685 100644
--- a/src/arbiterAI/hardwareDetector.cpp
+++ b/src/arbiterAI/hardwareDetector.cpp
@@ -560,7 +560,8 @@ void HardwareDetector::detectNvmlGpus()
             }
         }
 
-        spdlog::info("NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})",
+        spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+            "NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})",
             gpu.index, gpu.name, gpu.vramTotalMb, gpu.vramFreeMb, gpu.computeCapability);
 
         m_systemInfo.gpus.push_back(gpu);
@@ -764,8 +765,12 @@ void HardwareDetector::detectVulkanGpus()
 
             const VkPhysicalDeviceMemoryProperties &mp=memProps2.memoryProperties;
 
-            // Sum DEVICE_LOCAL heaps — on discrete GPUs this is dedicated VRAM,
-            // on UMA systems this is the GPU-accessible portion of system RAM.
+            // Collect DEVICE_LOCAL heap info for budget and usage tracking.
+            // Cards like the MI50 32GB expose multiple DEVICE_LOCAL heaps
+            // (e.g. CPU-visible BAR heap + GPU-only heap). The budget from
+            // VK_EXT_memory_budget is the authoritative measure of how much
+            // VRAM is actually allocatable — it accounts for BAR limitations,
+            // other processes, and driver reservations.
             uint64_t deviceLocalBudgetBytes=0;
             uint64_t deviceLocalUsageBytes=0;
             uint64_t deviceLocalSizeBytes=0;
@@ -799,20 +804,39 @@ void HardwareDetector::detectVulkanGpus()
 
             gpu.hasMemoryBudget=true;
 
-            // Budget is the best estimate of how much this process can allocate.
-            // On UMA, budget may be significantly larger than the raw heap size
-            // (driver exposes most of system RAM as available to the GPU).
             uint64_t budgetTotalMb=deviceLocalBudgetBytes/(1024ULL*1024ULL);
-            uint64_t budgetUsedMb=deviceLocalUsageBytes/(1024ULL*1024ULL);
             uint64_t heapSizeMb=deviceLocalSizeBytes/(1024ULL*1024ULL);
 
-            // Use the larger of heap size and budget for total — on some UMA
-            // drivers the budget exceeds the reported heap size.
-            uint64_t effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
+            uint64_t effectiveTotalMb;
             uint64_t effectiveFreeMb=(deviceLocalBudgetBytes>deviceLocalUsageBytes)
                 ? (deviceLocalBudgetBytes-deviceLocalUsageBytes)/(1024ULL*1024ULL)
                 : 0;
 
+            if(isIntegrated)
+            {
+                // UMA/integrated GPUs: budget may exceed heap size (driver
+                // exposes system RAM as GPU-accessible). Use the larger value.
+                effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
+            }
+            else
+            {
+                // Discrete GPUs: budget is the authoritative allocatable total.
+                // When a device has multiple DEVICE_LOCAL heaps (e.g. visible
+                // BAR heap + GPU-only heap), the budget for the BAR heap may
+                // be much smaller than its physical size if Resizable BAR is
+                // not enabled. Using heap size would over-report and cause
+                // model loads that overcommit VRAM and spill to system RAM.
+                effectiveTotalMb=budgetTotalMb;
+                if(budgetTotalMb<heapSizeMb*90/100) // budget is below 90% of the summed DEVICE_LOCAL heap size
+                {
+                    spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::warn, // warn on the first detection pass only; later refreshes log at debug
+                        "Vulkan GPU {}: allocatable budget ({}MB) is significantly less than "
+                        "physical VRAM ({}MB). This typically means Resizable BAR / Above 4G Decoding "
+                        "is not enabled in BIOS. Enable it to unlock the full VRAM.",
+                        gpu.index, budgetTotalMb, heapSizeMb);
+                }
+            }
+
             gpu.vramTotalMb=static_cast<int>(effectiveTotalMb);
             gpu.vramFreeMb=static_cast<int>(effectiveFreeMb);
 
@@ -826,8 +850,8 @@ void HardwareDetector::detectVulkanGpus()
             }
 
             spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
-                "Vulkan GPU {}: {} (budget: {}MB total, {}MB free, "
-                "heap size: {}MB, integrated={}, memoryBudget=true)",
+                "Vulkan GPU {}: {} (allocatable: {}MB, free: {}MB, "
+                "physical: {}MB, integrated={}, memoryBudget=true)",
                 gpu.index, gpu.name,
                 gpu.vramTotalMb, gpu.vramFreeMb,
                 static_cast<int>(heapSizeMb), gpu.unifiedMemory);
@@ -852,7 +876,7 @@ void HardwareDetector::detectVulkanGpus()
 
                 if(deviceLocal)
                 {
-                    vramTotalMb+=static_cast<int>(memProps.memoryHeaps[h].size/(1024*1024));
+                    vramTotalMb+=heapInfo.sizeMb;
                 }
             }
 
diff --git a/src/arbiterAI/modelFitCalculator.h b/src/arbiterAI/modelFitCalculator.h
index 6ed089a..cc62744 100644
--- a/src/arbiterAI/modelFitCalculator.h
+++ b/src/arbiterAI/modelFitCalculator.h
@@ -33,7 +33,6 @@ class ModelFitCalculator {
         const std::vector<ModelInfo> &models,
         const SystemInfo &hw);
 
-private:
     /// Sum free VRAM across a set of GPU indices.
     /// For unified memory GPUs, uses gpuAccessibleRamFreeMb when available.
     static int sumFreeVram(const SystemInfo &hw, const std::vector<int> &gpuIndices);
@@ -42,6 +41,7 @@ class ModelFitCalculator {
     /// For unified memory GPUs, uses gpuAccessibleRamMb when available.
     static int sumTotalVram(const SystemInfo &hw, const std::vector<int> &gpuIndices);
 
+private:
     /// Get all GPU indices from the system info.
     static std::vector<int> allGpuIndices(const SystemInfo &hw);
 
diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp
index d80cbe0..5c314d4 100644
--- a/src/arbiterAI/modelManager.cpp
+++ b/src/arbiterAI/modelManager.cpp
@@ -72,6 +72,7 @@ void RuntimeOptions::mergeFrom(const RuntimeOptions &other)
     if(other.swaFull.has_value()) swaFull=other.swaFull;
     if(other.nGpuLayers.has_value()) nGpuLayers=other.nGpuLayers;
     if(other.overrideTensor.has_value()) overrideTensor=other.overrideTensor;
+    if(other.vulkanNoHostVisibleVram.has_value()) vulkanNoHostVisibleVram=other.vulkanNoHostVisibleVram;
 }
 
 ModelManager &ModelManager::instance()
diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h
index f7ef48b..04309a1 100644
--- a/src/arbiterAI/modelManager.h
+++ b/src/arbiterAI/modelManager.h
@@ -51,6 +51,7 @@ struct RuntimeOptions {
     std::optional<bool> swaFull;                // --swa-full: full SWA (sliding window attention)
     std::optional<int> nGpuLayers;              // -ngl: number of GPU layers (99=all)
     std::optional<std::string> overrideTensor;  // -ot: tensor override pattern (e.g. "per_layer_token_embd.weight=CPU")
+    std::optional<bool> vulkanNoHostVisibleVram; // GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM: skip BAR-mapped heap, force device-local only
 
     /// Merge another set of options on top of this one (override only non-empty fields).
     void mergeFrom(const RuntimeOptions &other);
diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp
index 4b5df20..22774e4 100644
--- a/src/arbiterAI/modelRuntime.cpp
+++ b/src/arbiterAI/modelRuntime.cpp
@@ -9,6 +9,7 @@
 #include <ggml-backend.h>
 #include <spdlog/spdlog.h>
 #include <algorithm>
+#include <cstdlib>
 #include <filesystem>
 #include <thread>
 #include <regex>
@@ -72,8 +73,7 @@ void ModelRuntime::reset()
     }
 
     rt.m_models.clear();
-    rt.m_inferenceActive=false;
-    rt.m_inferenceModel.clear();
+    rt.m_activeInference.clear();
     while(!rt.m_pendingSwaps.empty())
     {
         rt.m_pendingSwaps.pop();
@@ -337,7 +337,8 @@ ErrorCode ModelRuntime::loadModel(
     const std::string &model,
     const std::string &variant,
     int contextSize,
-    const RuntimeOptions &optionsOverride)
+    const RuntimeOptions &optionsOverride,
+    const std::vector<int> &targetDevices)
 {
     std::lock_guard<std::mutex> lock(m_mutex);
 
@@ -428,6 +429,26 @@ ErrorCode ModelRuntime::loadModel(
         if(selectedVar)
         {
             ModelFit fit=ModelFitCalculator::calculateModelFit(modelInfo.value(), *selectedVar, hw);
+
+            // If caller specified target devices, override the auto-selected GPU indices
+            if(!targetDevices.empty())
+            {
+                fit.gpuIndices=targetDevices;
+                // Sum the free VRAM reported for the target devices only
+                int targetVram=ModelFitCalculator::sumFreeVram(hw, targetDevices);
+                if(targetVram>0)
+                {
+                    fit.canRun=true; // NOTE(review): any free VRAM on the targets overrides the fit verdict; targetVram is never compared with the variant's requirement
+                    fit.limitingFactor.clear();
+                    // NOTE(review): nothing is recalculated below — the guarded statement is a self-assignment (no-op)
+                    int targetTotalVram=ModelFitCalculator::sumTotalVram(hw, targetDevices);
+                    if(fit.maxContextSize<=0&&targetTotalVram>0)
+                    {
+                        fit.maxContextSize=fit.maxContextSize; // no-op: maxContextSize keeps the value produced by calculateModelFit()
+                    }
+                }
+            }
+
             if(!fit.canRun)
             {
                 m_lastLoadError.reason=(fit.limitingFactor=="ram")
@@ -444,8 +465,12 @@ ErrorCode ModelRuntime::loadModel(
                 return ErrorCode::ModelLoadError;
             }
 
-            // Evict if needed to make room
-            evictIfNeeded(selectedVar->minVramMb);
+            // Evict if needed to make room on each assigned GPU
+            for(int gpuIdx:fit.gpuIndices)
+            {
+                int perGpuVram=selectedVar->minVramMb/static_cast<int>(fit.gpuIndices.size());
+                evictIfNeeded(perGpuVram, gpuIdx);
+            }
 
             // Check if all model files exist, initiate async download for any missing ones
             std::vector<VariantDownload> allFiles=selectedVar->getAllFiles();
@@ -506,6 +531,19 @@ ErrorCode ModelRuntime::loadModel(
             entry.gpuIndices=fit.gpuIndices;
             entry.lastUsed=std::chrono::steady_clock::now();
 
+            // Distribute estimated VRAM usage across assigned GPUs
+            entry.perGpuVramMb.clear();
+            if(!fit.gpuIndices.empty())
+            {
+                int perGpu=fit.estimatedVramUsageMb/static_cast<int>(fit.gpuIndices.size());
+                int remainder=fit.estimatedVramUsageMb%static_cast<int>(fit.gpuIndices.size());
+
+                for(size_t i=0; i<fit.gpuIndices.size(); ++i)
+                {
+                    entry.perGpuVramMb[fit.gpuIndices[i]]=perGpu+(static_cast<int>(i)<remainder?1:0);
+                }
+            }
+
             // Actually load llama.cpp model for local providers
             if(modelInfo->provider=="llama")
             {
@@ -838,6 +876,7 @@ ErrorCode ModelRuntime::unloadModel(const std::string &model)
         entry.state=ModelState::Unloaded;
         entry.vramUsageMb=0;
         entry.ramUsageMb=0;
+        entry.perGpuVramMb.clear();
         spdlog::info("Model '{}' unloaded", model);
     }
 
@@ -880,7 +919,7 @@ ErrorCode ModelRuntime::swapModel(
     int contextSize,
     const RuntimeOptions &optionsOverride)
 {
-    if(m_inferenceActive)
+    if(!m_activeInference.empty())
     {
         // Queue the swap for when inference completes
         std::lock_guard<std::mutex> lock(m_mutex);
@@ -922,6 +961,7 @@ ErrorCode ModelRuntime::swapModel(
                     pair.second.state=ModelState::Unloaded;
                     pair.second.vramUsageMb=0;
                     pair.second.ramUsageMb=0;
+                    pair.second.perGpuVramMb.clear();
                 }
             }
         }
@@ -1127,9 +1167,72 @@ std::vector<std::string> ModelRuntime::resolveBackendPriority(const ModelInfo &m
     return priority;
 }
 
-void ModelRuntime::evictIfNeeded(int requiredVramMb)
+void ModelRuntime::evictIfNeeded(int requiredVramMb, int gpuIndex)
 {
-    // Calculate current VRAM usage across all loaded models
+    if(gpuIndex>=0)
+    {
+        // Per-GPU eviction: only consider models on this specific GPU
+        // (getEstimatedFreeVramMb() already subtracts the VRAM committed on it).
+        int estimatedFree=getEstimatedFreeVramMb(gpuIndex);
+
+        if(estimatedFree>=requiredVramMb)
+        {
+            return; // enough VRAM on this GPU
+        }
+
+        int needToFree=requiredVramMb-estimatedFree;
+
+        struct EvictCandidate {
+            std::string model;
+            int vramOnGpu;
+            std::chrono::steady_clock::time_point lastUsed;
+        };
+        // Candidates: loaded, unpinned, not currently inferring, and holding a share on this GPU.
+        std::vector<EvictCandidate> candidates;
+        for(const auto &pair:m_models)
+        {
+            if(pair.second.state==ModelState::Loaded&&
+                !pair.second.pinned&&
+                !m_activeInference.count(pair.first))
+            {
+                auto gpuIt=pair.second.perGpuVramMb.find(gpuIndex);
+                if(gpuIt!=pair.second.perGpuVramMb.end()&&gpuIt->second>0)
+                {
+                    candidates.push_back({pair.first, gpuIt->second, pair.second.lastUsed});
+                }
+            }
+        }
+        // Least recently used first.
+        std::sort(candidates.begin(), candidates.end(),
+            [](const EvictCandidate &a, const EvictCandidate &b)
+            {
+                return a.lastUsed<b.lastUsed;
+            });
+        // Best effort: stops once enough is reclaimed, or when the candidates run out.
+        int freed=0;
+        for(const EvictCandidate &candidate:candidates)
+        {
+            if(freed>=needToFree)
+            {
+                break;
+            }
+
+            auto it=m_models.find(candidate.model);
+            if(it!=m_models.end())
+            {
+                freeLlamaModel(it->second);
+                it->second.state=ModelState::Unloaded;
+                it->second.vramUsageMb=0;
+                it->second.ramUsageMb=0;
+                it->second.perGpuVramMb.clear(); // the model gives up its share on every GPU, not only gpuIndex
+                freed+=candidate.vramOnGpu; // only the share on gpuIndex counts toward needToFree
+                spdlog::info("Evicted model '{}' to free {}MB VRAM on GPU {}", candidate.model, candidate.vramOnGpu, gpuIndex);
+            }
+        }
+        return;
+    }
+
+    // Global eviction (legacy path): sum across all GPUs
     int currentVramUsage=0;
     for(const auto &pair:m_models)
     {
@@ -1161,7 +1264,7 @@ void ModelRuntime::evictIfNeeded(int requiredVramMb)
     {
         if(pair.second.state==ModelState::Loaded&&
             !pair.second.pinned&&
-            pair.first!=m_inferenceModel)
+            !m_activeInference.count(pair.first))
         {
             candidates.push_back({pair.first, pair.second.estimatedVramUsageMb, pair.second.lastUsed});
         }
@@ -1189,6 +1292,7 @@ void ModelRuntime::evictIfNeeded(int requiredVramMb)
             it->second.state=ModelState::Unloaded;
             it->second.vramUsageMb=0;
             it->second.ramUsageMb=0;
+            it->second.perGpuVramMb.clear();
             freed+=candidate.vramMb;
             spdlog::info("Evicted model '{}' to free {}MB VRAM", candidate.model, candidate.vramMb);
         }
@@ -1197,8 +1301,7 @@ void ModelRuntime::evictIfNeeded(int requiredVramMb)
 
 void ModelRuntime::beginInference(const std::string &model)
 {
-    m_inferenceActive=true;
-    m_inferenceModel=model;
+    m_activeInference.insert(model);
 
     std::lock_guard<std::mutex> lock(m_mutex);
     auto it=m_models.find(model);
@@ -1208,27 +1311,76 @@ void ModelRuntime::beginInference(const std::string &model)
     }
 }
 
-void ModelRuntime::endInference()
+void ModelRuntime::endInference(const std::string &model)
 {
     // Record usage for storage tracking
-    if(!m_inferenceModel.empty())
+    if(!model.empty())
     {
         std::lock_guard<std::mutex> lock(m_mutex);
-        auto it=m_models.find(m_inferenceModel);
+        auto it=m_models.find(model);
         if(it!=m_models.end())
         {
-            StorageManager::instance().recordUsage(m_inferenceModel, it->second.variant);
+            StorageManager::instance().recordUsage(model, it->second.variant);
         }
     }
 
-    m_inferenceActive=false;
-    m_inferenceModel.clear();
-    drainPendingSwaps();
+    m_activeInference.erase(model); // NOTE(review): erase-by-key drops the entry even if another inference on the same model is still running — confirm
+    // NOTE(review): m_activeInference is touched here after m_mutex has been released — confirm it is safe for concurrent access.
+    if(m_activeInference.empty()) // queued swaps are drained only once no inference remains active
+    {
+        drainPendingSwaps();
+    }
 }
 
 bool ModelRuntime::isInferenceActive() const
 {
-    return m_inferenceActive;
+    return !m_activeInference.empty(); // true while at least one model is registered in the active set
+}
+
+bool ModelRuntime::isInferenceActive(const std::string &model) const
+{
+    return m_activeInference.count(model)>0; // true only when this specific model is in the active set
+}
+
+int ModelRuntime::getActiveInferenceCount() const
+{
+    return static_cast<int>(m_activeInference.size()); // number of entries in the active set, narrowed to int
+}
+
+int ModelRuntime::getCommittedVramMb(int gpuIndex) const
+{
+    // Add up the VRAM that models in the Loaded state have booked on gpuIndex.
+    int bookedMb=0;
+    for(const auto &slot:m_models)
+    {
+        const auto &loaded=slot.second;
+        if(loaded.state==ModelState::Loaded)
+        {
+            const auto share=loaded.perGpuVramMb.find(gpuIndex);
+            if(share!=loaded.perGpuVramMb.end())
+            {
+                bookedMb+=share->second;
+            }
+        }
+    }
+    return bookedMb;
+}
+
+int ModelRuntime::getEstimatedFreeVramMb(int gpuIndex) const
+{
+    // Capacity minus this runtime's own bookings; returns 0 for an out-of-range index and never goes negative.
+    const SystemInfo hw=HardwareDetector::instance().getSystemInfo();
+    const int gpuCount=static_cast<int>(hw.gpus.size());
+    if(gpuIndex<0||gpuIndex>=gpuCount)
+    {
+        return 0;
+    }
+
+    const GpuInfo &gpu=hw.gpus[gpuIndex];
+    const bool useSharedPool=gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0;
+    const int capacityMb=useSharedPool ? gpu.gpuAccessibleRamMb : gpu.vramTotalMb;
+    const int headroomMb=capacityMb-getCommittedVramMb(gpuIndex);
+    return headroomMb>0 ? headroomMb : 0;
 }
 
 std::string ModelRuntime::selectBestVariant(const ModelInfo &model) const
@@ -1401,6 +1553,21 @@ ErrorCode ModelRuntime::loadLlamaModel(
     const RuntimeOptions &options,
     const std::vector<std::string> &backendPriority)
 {
+    // Apply Vulkan environment variable overrides before backend init.
+    // These are read by ggml-vulkan.cpp via getenv() during device initialization — NOTE(review): confirm a change still takes effect once the backend is already initialized.
+    if(options.vulkanNoHostVisibleVram.has_value())
+    {
+        if(options.vulkanNoHostVisibleVram.value())
+        {
+            setenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM", "1", 1); // NOTE(review): POSIX-only call; the environment is process-wide, so this stays set for later loads
+            spdlog::info("Set GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM=1 for model '{}'", model);
+        }
+        else
+        {
+            unsetenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM"); // an explicit false clears the variable; an unset option leaves it as it was
+        }
+    }
+
     initLlamaBackend();
 
     // Log available backend devices matching backendPriority for diagnostics.
@@ -1533,8 +1700,174 @@ ErrorCode ModelRuntime::loadLlamaModel(
             }
         }
 
-        // NOTE: mparams.devices is intentionally left as NULL (default).
-        // See comment above about why explicit device lists hurt UMA performance.
+        // NOTE: mparams.devices is intentionally left as NULL (default) on
+        // single-GPU / UMA systems. See comment above about why explicit device
+        // lists hurt UMA performance. On multi-GPU discrete systems, we target
+        // specific GPU(s) via mparams.devices — but NEVER include the CPU device,
+        // which would cause llama.cpp to split tensors across GPU and CPU equally.
+        std::vector<ggml_backend_dev_t> targetDevices;
+
+        SystemInfo hw=HardwareDetector::instance().getSystemInfo();
+        bool isMultiGpuDiscrete=hw.gpus.size()>1;
+        bool hasUmaGpu=false;
+        for(const GpuInfo &gpu:hw.gpus)
+        {
+            if(gpu.unifiedMemory)
+            {
+                hasUmaGpu=true;
+                break;
+            }
+        }
+
+        if(isMultiGpuDiscrete&&!hasUmaGpu&&!gpuIndices.empty())
+        {
+            // Build mapping from hardware detector GPU index to ggml backend device.
+            // Match by comparing device descriptions since index spaces differ:
+            // HW detector may skip duplicates (e.g. RTX 3060 via Vulkan when CUDA is primary)
+            // while ggml enumerates all backend devices.
+
+            size_t devCount=ggml_backend_dev_count();
+            struct GgmlGpuDev
+            {
+                ggml_backend_dev_t dev;
+                std::string name;
+                std::string description;
+            };
+            std::vector<GgmlGpuDev> ggmlGpus;
+
+            for(size_t i=0; i<devCount; ++i)
+            {
+                ggml_backend_dev_t dev=ggml_backend_dev_get(i);
+                enum ggml_backend_dev_type devType=ggml_backend_dev_type(dev);
+
+                if(devType==GGML_BACKEND_DEVICE_TYPE_CPU)
+                    continue;
+
+                GgmlGpuDev entry;
+                entry.dev=dev;
+                entry.name=ggml_backend_dev_name(dev);
+
+                entry.description=ggml_backend_dev_description(dev);
+
+                ggmlGpus.push_back(entry);
+                spdlog::debug("ggml GPU device: name='{}' desc='{}'", entry.name, entry.description);
+            }
+
+            // For each requested HW GPU index, find the matching ggml device
+            // by matching the HW GPU name against ggml device description or name
+            for(int idx:gpuIndices)
+            {
+                // Find the HW GPU info for this index
+                const GpuInfo *hwGpu=nullptr;
+                for(const GpuInfo &gpu:hw.gpus)
+                {
+                    if(gpu.index==idx)
+                    {
+                        hwGpu=&gpu;
+                        break;
+                    }
+                }
+
+                if(!hwGpu)
+                {
+                    spdlog::warn("GPU index {} not found in hardware info for model '{}'", idx, model);
+                    continue;
+                }
+
+                // Match against ggml devices by checking if the HW name appears
+                // in the ggml device description, preferring same-backend matches
+                ggml_backend_dev_t bestMatch=nullptr;
+                std::string bestMatchName;
+
+                // Determine expected ggml name prefix for this backend
+                std::string expectedPrefix;
+                if(hwGpu->backend==GpuBackend::CUDA) expectedPrefix="CUDA";
+                else if(hwGpu->backend==GpuBackend::Vulkan) expectedPrefix="Vulkan";
+
+                for(const GgmlGpuDev &ggmlDev:ggmlGpus)
+                {
+                    // Check backend match first
+                    if(!expectedPrefix.empty()&&ggmlDev.name.find(expectedPrefix)==std::string::npos)
+                        continue;
+
+                    // Check if HW GPU name appears in ggml description
+                    // HW name: "AMD Instinct MI50/MI60 (RADV VEGA20)"
+                    // ggml desc: "AMD RADV VEGA20" or similar
+                    // Try matching key substrings
+                    bool matches=false;
+
+                    // Extract key identifiers from both names for matching
+                    if(ggmlDev.description.find(hwGpu->name)!=std::string::npos)
+                    {
+                        matches=true;
+                    }
+                    else
+                    {
+                        // Try partial matching — extract words from HW name and check ggml desc
+                        // Look for distinctive substrings like "VEGA20", "MI50", "RTX 3060", etc.
+                        std::vector<std::string> keywords;
+                        std::string hwName=hwGpu->name;
+
+                        // Extract alphanumeric tokens from HW GPU name
+                        std::string token;
+                        for(char c:hwName)
+                        {
+                            if(std::isalnum(static_cast<unsigned char>(c))||(c=='-')) // cast first: a negative char value is undefined behaviour for std::isalnum
+                            {
+                                token+=c;
+                            }
+                            else if(!token.empty())
+                            {
+                                if(token.size()>=3) keywords.push_back(token); // tokens shorter than 3 characters are dropped
+                                token.clear();
+                            }
+                        }
+                        if(token.size()>=3) keywords.push_back(token);
+
+                        // Check if distinctive keywords from HW name appear in ggml description
+                        int matchCount=0;
+                        for(const std::string &kw:keywords)
+                        {
+                            if(ggmlDev.description.find(kw)!=std::string::npos)
+                            {
+                                ++matchCount;
+                            }
+                        }
+
+                        // Require at least 2 keyword matches or 1 match for short names
+                        if(matchCount>=2||(matchCount>=1&&keywords.size()<=2))
+                        {
+                            matches=true;
+                        }
+                    }
+
+                    if(matches)
+                    {
+                        bestMatch=ggmlDev.dev;
+                        bestMatchName=ggmlDev.name;
+                        break;
+                    }
+                }
+
+                if(bestMatch)
+                {
+                    targetDevices.push_back(bestMatch);
+                    spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}'",
+                        idx, hwGpu->name, bestMatchName, model);
+                }
+                else
+                {
+                    spdlog::warn("GPU hw[{}] '{}' could not be matched to any ggml device for model '{}'",
+                        idx, hwGpu->name, model);
+                }
+            }
+
+            if(!targetDevices.empty())
+            {
+                targetDevices.push_back(nullptr); // NULL terminator
+                mparams.devices=targetDevices.data();
+            }
+        }
 
         llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams);
         if(!llamaModel)
@@ -1649,6 +1982,7 @@ ErrorCode ModelRuntime::loadLlamaModel(
             return ErrorCode::ModelLoadError;
         }
 
+        std::string capturedLog=m_llamaLogCapture.str();
         endLlamaLogCapture();
 
         LoadedModel &entry=m_models[model];
@@ -1657,6 +1991,9 @@ ErrorCode ModelRuntime::loadLlamaModel(
         entry.maxContextSize=nativeContext;
         entry.contextSize=static_cast<int>(llama_n_ctx(llamaCtx));
 
+        // Parse per-device buffer allocations from llama.cpp log output
+        parseDeviceAllocations(entry, capturedLog);
+
         spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={}, ngl={}, flash_attn={}, mmap={}, backend_filter={})",
             model, entry.contextSize, entry.maxContextSize,
             options.nGpuLayers.value_or(99),
@@ -1693,6 +2030,77 @@ void ModelRuntime::freeLlamaModel(LoadedModel &entry)
     }
 }
 
+void ModelRuntime::parseDeviceAllocations(LoadedModel &entry, const std::string &logOutput)
+{
+    // Rebuilds entry.deviceAllocations, graphSplits and cpuMappedBufferMb from logOutput;
+    // values left over from an earlier parse are discarded first.
+    entry.deviceAllocations.clear();
+    entry.graphSplits=0;
+    entry.cpuMappedBufferMb=0;
+
+    // Compiled once; sizes must be well-formed decimals so std::stod never sees a bare ".".
+    // Parse: "load_tensors:   CPU_Mapped model buffer size =   682.03 MiB"
+    // Parse: "load_tensors:      Vulkan1 model buffer size = 15272.77 MiB"
+    static const std::regex modelBufRe(R"(load_tensors:\s+(\S+)\s+model buffer size\s*=\s*(\d+(?:\.\d+)?)\s*MiB)");
+    // Parse: "llama_kv_cache:    Vulkan1 KV buffer size =  8262.00 MiB"
+    static const std::regex kvBufRe(R"(llama_kv_cache:\s+(\S+)\s+KV buffer size\s*=\s*(\d+(?:\.\d+)?)\s*MiB)");
+    // Parse: "sched_reserve:    Vulkan1 compute buffer size =   801.28 MiB"
+    static const std::regex computeBufRe(R"(sched_reserve:\s+(\S+)\s+compute buffer size\s*=\s*(\d+(?:\.\d+)?)\s*MiB)");
+    // Parse: "sched_reserve: graph splits = 2"
+    static const std::regex graphSplitsRe(R"(sched_reserve:\s+graph splits\s*=\s*(\d+))");
+
+    std::istringstream stream(logOutput);
+    std::string line;
+    std::smatch match; // reused for every line; each regex_search overwrites it
+
+    while(std::getline(stream, line))
+    {
+        if(std::regex_search(line, match, modelBufRe))
+        {
+            std::string device=match[1].str();
+            int sizeMb=static_cast<int>(std::round(std::stod(match[2].str())));
+            if(device=="CPU_Mapped")
+            {
+                entry.cpuMappedBufferMb=sizeMb;
+            }
+            else
+            {
+                entry.deviceAllocations[device].deviceName=device;
+                entry.deviceAllocations[device].modelBufferMb=sizeMb;
+            }
+        }
+        else if(std::regex_search(line, match, kvBufRe))
+        {
+            std::string device=match[1].str();
+            int sizeMb=static_cast<int>(std::round(std::stod(match[2].str())));
+            entry.deviceAllocations[device].deviceName=device;
+            entry.deviceAllocations[device].kvCacheBufferMb=sizeMb;
+        }
+        else if(std::regex_search(line, match, computeBufRe))
+        {
+            std::string device=match[1].str();
+            int sizeMb=static_cast<int>(std::round(std::stod(match[2].str())));
+            // Skip host-side compute buffers
+            if(device.find("Host")!=std::string::npos)
+                continue;
+
+            entry.deviceAllocations[device].deviceName=device;
+            entry.deviceAllocations[device].computeBufferMb=sizeMb;
+        }
+        else if(std::regex_search(line, match, graphSplitsRe))
+        {
+            entry.graphSplits=std::stoi(match[1].str());
+        }
+    }
+
+    // Calculate totals for each device
+    for(auto &pair:entry.deviceAllocations)
+    {
+        DeviceAllocation &alloc=pair.second;
+        alloc.totalMb=alloc.modelBufferMb+alloc.kvCacheBufferMb+alloc.computeBufferMb;
+    }
+}
+
 llama_model *ModelRuntime::getLlamaModel(const std::string &model) const
 {
     std::lock_guard<std::mutex> lock(m_mutex);
diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h
index 79a7bda..d2abd14 100644
--- a/src/arbiterAI/modelRuntime.h
+++ b/src/arbiterAI/modelRuntime.h
@@ -9,6 +9,7 @@
 #include <string>
 #include <vector>
 #include <map>
+#include <set>
 #include <mutex>
 #include <atomic>
 #include <queue>
@@ -59,6 +60,14 @@ struct LoadErrorDetail {
     std::string llamaLog;       // raw llama.cpp log output captured during the load attempt
 };
 
+struct DeviceAllocation { // buffer sizes llama.cpp logged for one device during a model load
+    std::string deviceName; // device/buffer label captured from the log line, e.g. "Vulkan1"
+    int modelBufferMb=0;    // "model buffer size" value, rounded to whole MiB
+    int kvCacheBufferMb=0;  // "KV buffer size" value, rounded to whole MiB
+    int computeBufferMb=0;  // "compute buffer size" value, rounded to whole MiB
+    int totalMb=0;          // sum of the three buffer sizes above
+};
+
 struct LoadedModel {
     std::string modelName;
     std::string variant;
@@ -69,6 +78,10 @@ struct LoadedModel {
     int contextSize=0;
     int maxContextSize=0; // model's native/training context from GGUF metadata
     std::vector<int> gpuIndices;
+    std::map<int, int> perGpuVramMb; // gpu index → estimated VRAM usage on that GPU
+    std::map<std::string, DeviceAllocation> deviceAllocations; // device name → actual buffer allocations
+    int graphSplits=0;
+    int cpuMappedBufferMb=0;
     std::chrono::steady_clock::time_point lastUsed;
     bool pinned=false;
     llama_model *llamaModel=nullptr;
@@ -89,12 +102,14 @@ class ModelRuntime {
     /// @param variant   Quantization variant (empty = auto-select best fitting).
     /// @param contextSize  Context size (0 = use model default).
     /// @param optionsOverride  Optional runtime options to merge on top of model config defaults.
+    /// @param targetDevices  Optional GPU indices to target (empty = auto-select).
     /// @return ErrorCode::Success, ModelDownloading, ModelNotFound, ModelLoadError.
     ErrorCode loadModel(
         const std::string &model,
         const std::string &variant="",
         int contextSize=0,
-        const RuntimeOptions &optionsOverride=RuntimeOptions{});
+        const RuntimeOptions &optionsOverride=RuntimeOptions{},
+        const std::vector<int> &targetDevices={});
 
     /// Download model files without loading into VRAM.
     /// Launches an async background download that respects the concurrent
@@ -159,17 +174,24 @@ class ModelRuntime {
     std::vector<std::string> getDefaultBackendPriority() const;
 
     /// Evict least-recently-used non-pinned models to free VRAM.
-    void evictIfNeeded(int requiredVramMb);
+    /// When gpuIndex >= 0, only considers models on that specific GPU.
+    void evictIfNeeded(int requiredVramMb, int gpuIndex=-1);
 
-    /// Mark inference as started (blocks swap execution).
+    /// Mark inference as started on a model (blocks eviction of that model).
     void beginInference(const std::string &model);
 
-    /// Mark inference as completed and drain pending swaps.
-    void endInference();
+    /// Mark inference as completed on a model and drain pending swaps.
+    void endInference(const std::string &model);
 
-    /// Check if inference is currently active.
+    /// Check if any inference is currently active.
     bool isInferenceActive() const;
 
+    /// Check if inference is active on a specific model.
+    bool isInferenceActive(const std::string &model) const;
+
+    /// Get the number of models currently running inference.
+    int getActiveInferenceCount() const;
+
     /// Get the llama_model handle for a loaded local model.
     /// Returns nullptr if not loaded or not a local model.
     llama_model *getLlamaModel(const std::string &model) const;
@@ -189,6 +211,12 @@ class ModelRuntime {
     /// Public so the C-style callback can reach it; not intended for external use.
     void appendLlamaLog(const char *text);
 
+    /// Get the VRAM currently committed to loaded models on a specific GPU (MB).
+    int getCommittedVramMb(int gpuIndex) const;
+
+    /// Get the estimated free VRAM on a specific GPU accounting for loaded models (MB).
+    int getEstimatedFreeVramMb(int gpuIndex) const;
+
 private:
     ModelRuntime();
 
@@ -238,6 +266,9 @@ class ModelRuntime {
     /// Free llama.cpp resources for a model.
     void freeLlamaModel(LoadedModel &entry);
 
+    /// Parse per-device buffer allocations from llama.cpp log output.
+    void parseDeviceAllocations(LoadedModel &entry, const std::string &logOutput);
+
     /// Download a model file synchronously.
     /// @return true on success, false on failure.
     bool downloadModelFile(
@@ -251,8 +282,7 @@ class ModelRuntime {
     mutable std::mutex m_mutex;
     int m_readyRamBudgetMb=0;
     std::vector<std::string> m_defaultBackendPriority;
-    std::atomic<bool> m_inferenceActive{false};
-    std::string m_inferenceModel;
+    std::set<std::string> m_activeInference; // models currently running inference
     bool m_llamaInitialized=false;
 
     struct SwapRequest {
diff --git a/src/arbiterAI/providers/llama.cpp b/src/arbiterAI/providers/llama.cpp
index c3b1187..384f409 100644
--- a/src/arbiterAI/providers/llama.cpp
+++ b/src/arbiterAI/providers/llama.cpp
@@ -60,7 +60,7 @@ ErrorCode Llama::completion(const CompletionRequest &request,
     std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now();
     double totalTimeMs=std::chrono::duration<double, std::milli>(endTime-startTime).count();
 
-    runtime.endInference();
+    runtime.endInference(request.model);
 
     if(code==ErrorCode::Success)
     {
@@ -135,7 +135,7 @@ ErrorCode Llama::streamingCompletion(const CompletionRequest &request,
     std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now();
     double totalTimeMs=std::chrono::duration<double, std::milli>(endTime-startTime).count();
 
-    runtime.endInference();
+    runtime.endInference(request.model);
 
     if(code==ErrorCode::Success)
     {
@@ -209,24 +209,34 @@ ErrorCode Llama::getEmbeddings(const EmbeddingRequest &request,
     }
     tokens.resize(nTokens);
 
-    llama_batch batch=llama_batch_init(nTokens, 0, 1);
+    int nBatch=static_cast<int>(llama_n_batch(llamaCtx));
+    llama_batch batch=llama_batch_init(std::max(nBatch, 512), 0, 1);
 
-    batch.n_tokens=nTokens;
-    for(int32_t i=0; i<batch.n_tokens; i++)
+    for(int start=0; start<nTokens; start+=nBatch)
     {
-        batch.token[i]=tokens[i];
-        batch.pos[i]=i;
-        batch.n_seq_id[i]=1;
-        batch.seq_id[i][0]=0;
-        batch.logits[i]=0;
-    }
-    batch.logits[batch.n_tokens-1]=1;
+        int chunkSize=std::min(nBatch, nTokens-start);
+        bool isLastChunk=(start+chunkSize>=nTokens);
 
-    if(llama_decode(llamaCtx, batch)!=0)
-    {
-        spdlog::error("llama_decode failed for embeddings");
-        llama_batch_free(batch);
-        return ErrorCode::GenerationError;
+        batch.n_tokens=chunkSize;
+        for(int32_t i=0; i<chunkSize; i++)
+        {
+            batch.token[i]=tokens[start+i];
+            batch.pos[i]=start+i;
+            batch.n_seq_id[i]=1;
+            batch.seq_id[i][0]=0;
+            batch.logits[i]=0;
+        }
+        if(isLastChunk)
+        {
+            batch.logits[chunkSize-1]=1;
+        }
+
+        if(llama_decode(llamaCtx, batch)!=0)
+        {
+            spdlog::error("llama_decode failed for embeddings (chunk at offset {})", start);
+            llama_batch_free(batch);
+            return ErrorCode::GenerationError;
+        }
     }
 
     const float *embeddingsPtr=llama_get_embeddings(llamaCtx);
@@ -373,28 +383,38 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
     // Clear KV cache for fresh inference
     llama_memory_clear(llama_get_memory(ctx), true);
 
-    llama_batch batch=llama_batch_init(std::max(nTokens, 512), 0, 1);
+    int nBatch=static_cast<int>(llama_n_batch(ctx));
+    llama_batch batch=llama_batch_init(std::max(nBatch, 512), 0, 1);
 
-    // Fill batch with prompt tokens
-    batch.n_tokens=nTokens;
-    for(int32_t i=0; i<batch.n_tokens; i++)
-    {
-        batch.token[i]=tokensList[i];
-        batch.pos[i]=i;
-        batch.n_seq_id[i]=1;
-        batch.seq_id[i][0]=0;
-        batch.logits[i]=0;
-    }
-    batch.logits[batch.n_tokens-1]=1;
-
-    // Process prompt (timed)
+    // Process prompt (timed) — chunk into n_batch-sized pieces
     std::chrono::steady_clock::time_point promptStart=std::chrono::steady_clock::now();
 
-    if(llama_decode(ctx, batch)!=0)
+    for(int start=0; start<nTokens; start+=nBatch)
     {
-        spdlog::error("llama_decode failed during prompt processing");
-        llama_batch_free(batch);
-        return ErrorCode::GenerationError;
+        int chunkSize=std::min(nBatch, nTokens-start);
+        bool isLastChunk=(start+chunkSize>=nTokens);
+
+        batch.n_tokens=chunkSize;
+        for(int32_t i=0; i<chunkSize; i++)
+        {
+            batch.token[i]=tokensList[start+i];
+            batch.pos[i]=start+i;
+            batch.n_seq_id[i]=1;
+            batch.seq_id[i][0]=0;
+            batch.logits[i]=0;
+        }
+        // Only request logits for the very last token of the prompt
+        if(isLastChunk)
+        {
+            batch.logits[chunkSize-1]=1;
+        }
+
+        if(llama_decode(ctx, batch)!=0)
+        {
+            spdlog::error("llama_decode failed during prompt processing (chunk at offset {})", start);
+            llama_batch_free(batch);
+            return ErrorCode::GenerationError;
+        }
     }
 
     std::chrono::steady_clock::time_point promptEnd=std::chrono::steady_clock::now();
diff --git a/src/arbiterAI/telemetryCollector.cpp b/src/arbiterAI/telemetryCollector.cpp
index cd5266e..192631d 100644
--- a/src/arbiterAI/telemetryCollector.cpp
+++ b/src/arbiterAI/telemetryCollector.cpp
@@ -68,7 +68,7 @@ SystemSnapshot TelemetryCollector::getSnapshot() const
     snapshot.hardware=HardwareDetector::instance().getSystemInfo();
     snapshot.models=ModelRuntime::instance().getModelStates();
     snapshot.avgTokensPerSecond=getAvgTokensPerSecond();
-    snapshot.activeRequests=ModelRuntime::instance().isInferenceActive()?1:0;
+    snapshot.activeRequests=ModelRuntime::instance().getActiveInferenceCount();
 
     // Calculate average prompt/generation speeds over last 5 minutes
     std::chrono::system_clock::time_point cutoff=
diff --git a/src/server/dashboard.h b/src/server/dashboard.h
index 90d123c..a6469cd 100644
--- a/src/server/dashboard.h
+++ b/src/server/dashboard.h
@@ -37,11 +37,33 @@ body
     align-items: center;
     justify-content: space-between;
 }
+.header-title-block
+{
+    display: flex;
+    flex-direction: column;
+    gap: 6px;
+}
 .header h1
 {
     font-size: 20px;
     color: #7c8aff;
 }
+.header-links
+{
+    display: flex;
+    gap: 14px;
+    flex-wrap: wrap;
+}
+.header-link
+{
+    color: #7c8aff;
+    text-decoration: none;
+    font-size: 13px;
+}
+.header-link:hover
+{
+    text-decoration: underline;
+}
 .header .status
 {
     font-size: 13px;
@@ -482,6 +504,72 @@ td
 {
     accent-color: #7c8aff;
 }
+.settings-note
+{
+    font-size: 12px;
+    color: #888;
+    margin-bottom: 12px;
+}
+.settings-grid
+{
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
+    gap: 12px;
+}
+.settings-row
+{
+    display: flex;
+    flex-direction: column;
+    gap: 6px;
+}
+.settings-label-row
+{
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    font-size: 12px;
+}
+.settings-label-row label
+{
+    color: #ccc;
+    font-weight: 500;
+}
+.settings-hint
+{
+    color: #777;
+}
+.settings-select
+{
+    background: #11141c;
+    color: #ddd;
+    border: 1px solid #2a2d3a;
+    border-radius: 6px;
+    padding: 8px 10px;
+    font-size: 12px;
+}
+.settings-actions
+{
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    gap: 12px;
+    margin-top: 14px;
+    flex-wrap: wrap;
+}
+.settings-message
+{
+    min-height: 18px;
+    font-size: 12px;
+    color: #7c8aff;
+}
+.settings-message.error
+{
+    color: #ff8080;
+}
+.settings-message.success
+{
+    color: #7bd88f;
+}
 .card-header-row
 {
     display: flex;
@@ -509,7 +597,13 @@ td
 </head>
 <body>
 <div class="header">
-    <h1>ArbiterAI Dashboard <span id="versionBadge" class="version-badge"></span><span id="llamaBadge" class="version-badge"></span></h1>
+    <div class="header-title-block">
+        <h1>ArbiterAI Dashboard <span id="versionBadge" class="version-badge"></span><span id="llamaBadge" class="version-badge"></span></h1>
+        <div class="header-links">
+            <a href="/dashboard/config" class="header-link">Configuration</a>
+            <a href="/dashboard/storage" class="header-link">Downloaded Models</a>
+        </div>
+    </div>
     <div class="status"><span class="dot" id="statusDot"></span><span id="statusText">Connected</span></div>
 </div>
 <div class="container">
@@ -563,6 +657,7 @@ td
                     <th>State</th>
                     <th>Context</th>
                     <th>Max Context</th>
+                    <th>GPU(s)</th>
                     <th>VRAM (MB)</th>
                     <th>RAM (MB)</th>
                     <th>Pinned</th>
@@ -570,7 +665,7 @@ td
                 </tr>
             </thead>
             <tbody id="modelTable">
-                <tr><td colspan="8" style="color:#666;text-align:center;">No models loaded</td></tr>
+                <tr><td colspan="10" style="color:#666;text-align:center;">No models loaded</td></tr>
             </tbody>
         </table>
     </div>
@@ -648,6 +743,13 @@ const MAX_TPS_POINTS=60;
 let hasActiveDownloads=false;
 let logPanelOpen=true;
 let lastLogEpoch=0;
+let availableModelOptions=[];
+let serverConfigState=null;
+const STARTUP_ACCELERATORS=[
+    {key: "cpu", label: "CPU", selectId: "startupDefaultCpu", statusId: "startupDefaultCpuStatus"},
+    {key: "cuda", label: "CUDA", selectId: "startupDefaultCuda", statusId: "startupDefaultCudaStatus"},
+    {key: "vulkan", label: "Vulkan", selectId: "startupDefaultVulkan", statusId: "startupDefaultVulkanStatus"}
+];
 
 function toggleLogPanel()
 {
@@ -694,6 +796,220 @@ function escapeHtml(text)
     return el.innerHTML;
 }
 
+function encodeStartupModelValue(model, variant)
+{
+    return encodeURIComponent(JSON.stringify({model: model, variant: variant?variant:""})); // URI-encoded JSON, used as an <option> value
+}
+
+// Inverse of encodeStartupModelValue(). Anything that does not decode to an object
+// (empty value, bad escapes, invalid JSON, JSON null or a primitive) yields the empty entry.
+function decodeStartupModelValue(value)
+{
+    if(!value) return {model: "", variant: ""};
+    try
+    {
+        const parsed=JSON.parse(decodeURIComponent(value));
+        if(parsed&&typeof parsed==="object") return parsed;
+    }
+    catch(e) { /* malformed input: fall through to the empty entry */ }
+    return {model: "", variant: ""};
+}
+
+function formatStartupModelLabel(model, variant)
+{
+    return !variant?model:`${model} (${variant})`; // "model (variant)" only when a variant is set
+}
+
+function formatAcceleratorLabel(accelerator)
+{
+    const known=STARTUP_ACCELERATORS.filter((entry) => entry.key===accelerator)[0];
+    if(known) return known.label; // label from STARTUP_ACCELERATORS wins
+    if(accelerator==="legacy") return "Legacy Default";
+    return !accelerator?"Unknown":accelerator.toUpperCase(); // other keys are shown upper-cased
+}
+
+// Builds the de-duplicated list of {model, variant, sortKey, label} options, sorted by
+// sortKey (lower-cased name and variant). Entries without a model name are skipped.
+function buildAvailableModelOptions(models)
+{
+    const result=[];
+    if(!models) return result;
+
+    const seenKeys=new Set();
+    for(const entry of models)
+    {
+        const name=entry.model||"";
+        const variant=entry.variant||"";
+        // NUL keeps the name and variant parts of the key apart
+        const dedupeKey=name+"\u0000"+variant;
+        if(!name||seenKeys.has(dedupeKey)) continue;
+
+        seenKeys.add(dedupeKey);
+        result.push({
+            model: name,
+            variant: variant,
+            sortKey: name.toLowerCase()+"\u0000"+variant.toLowerCase(),
+            label: formatStartupModelLabel(name, variant)
+        });
+    }
+
+    result.sort((a, b) => a.sortKey.localeCompare(b.sortKey));
+    return result;
+}
+
+// Shows `text` in the #startupDefaultsMessage element, tagged with the "error" or
+// "success" class when `state` names one; does nothing if the element is absent.
+function showStartupDefaultsMessage(text, state)
+{
+    const messageEl=document.getElementById("startupDefaultsMessage");
+    if(!messageEl) return;
+
+    messageEl.textContent=text?text:"";
+    messageEl.className="settings-message";
+    if(state==="error"||state==="success") messageEl.classList.add(state);
+}
+
+// Returns the configured {model, variant} startup default for `accelerator`, with ""
+// substituted for anything missing from serverConfigState.startup_defaults.
+function getStartupDefaultEntry(accelerator)
+{
+    const defaults=serverConfigState?serverConfigState.startup_defaults:null;
+    const configured=defaults?defaults[accelerator]:null;
+    if(!configured) return {model: "", variant: ""};
+
+    return {
+        model: configured.model||"",
+        variant: configured.variant||""
+    };
+}
+
+// True when `accelerator` is listed in serverConfigState.detected_accelerators; while
+// that list is unavailable only "cpu" is reported as detected.
+function isAcceleratorDetected(accelerator)
+{
+    const detected=serverConfigState&&serverConfigState.detected_accelerators;
+    if(!detected) return accelerator==="cpu";
+
+    return detected.includes(accelerator);
+}
+
+// Re-renders the startup-default controls from serverConfigState and availableModelOptions.
+// Controls that are not on the page are skipped, like showStartupDefaultsMessage() does.
+function renderStartupSettings()
+{
+    if(!serverConfigState)
+    {
+        showStartupDefaultsMessage("Loading startup defaults...", "");
+        return;
+    }
+
+    const effective=serverConfigState.effective_startup_default||{};
+    const effectiveLabel=document.getElementById("startupDefaultsEffective");
+    if(effectiveLabel)
+    {
+        effectiveLabel.textContent=effective.model
+            ?`Next restart: ${formatAcceleratorLabel(effective.accelerator)} -> ${formatStartupModelLabel(effective.model, effective.variant||"")}`
+            :"Next restart: no startup default configured";
+    }
+
+    for(const accelerator of STARTUP_ACCELERATORS)
+    {
+        const select=document.getElementById(accelerator.selectId);
+        const status=document.getElementById(accelerator.statusId);
+        const selectedEntry=getStartupDefaultEntry(accelerator.key);
+        const selectedValue=selectedEntry.model?encodeStartupModelValue(selectedEntry.model, selectedEntry.variant):"";
+
+        let html=`<option value=""${selectedValue?"":" selected"}>Do not auto-load</option>`;
+        let hasSelected=!selectedValue;
+
+        for(const option of availableModelOptions)
+        {
+            const value=encodeStartupModelValue(option.model, option.variant);
+            const selected=value===selectedValue?" selected":"";
+            if(selected) hasSelected=true;
+            html+=`<option value="${value}"${selected}>${escapeHtml(option.label)}</option>`;
+        }
+
+        if(selectedValue&&!hasSelected)
+        {
+            html+=`<option value="${selectedValue}" selected>${escapeHtml(formatStartupModelLabel(selectedEntry.model, selectedEntry.variant)+(availableModelOptions.length?" [missing from catalog]":""))}</option>`;
+        }
+
+        if(select) select.innerHTML=html;
+        if(status) status.textContent=isAcceleratorDetected(accelerator.key)?"Detected":"Not detected";
+    }
+
+    const saveButton=document.getElementById("saveStartupDefaultsBtn");
+    if(saveButton) saveButton.disabled=false;
+}
+
+// Fetches the model catalog and the server config in parallel, keeps whichever
+// responses are truthy in availableModelOptions / serverConfigState, then re-renders
+// the startup-default controls.
+async function loadStartupSettings()
+{
+    const catalogRequest=fetchJson("/api/models");
+    const configRequest=fetchJson("/api/server/config");
+    const [catalog, config]=await Promise.all([catalogRequest, configRequest]);
+
+    const catalogModels=catalog?catalog.models:null;
+    if(catalogModels)
+    {
+        availableModelOptions=buildAvailableModelOptions(catalogModels);
+    }
+
+    if(config) serverConfigState=config;
+
+    renderStartupSettings();
+}
+
+// Collects the selected {model, variant} for every accelerator, PUTs them to
+// /api/server/config as startup_defaults and reports the outcome in the message area.
+async function saveStartupDefaults()
+{
+    const saveButton=document.getElementById("saveStartupDefaultsBtn");
+    const startupDefaults={};
+
+    for(const accelerator of STARTUP_ACCELERATORS)
+    {
+        const chosen=decodeStartupModelValue(document.getElementById(accelerator.selectId).value);
+        startupDefaults[accelerator.key]={model: chosen.model||"", variant: chosen.variant||""};
+    }
+
+    saveButton.disabled=true;
+    showStartupDefaultsMessage("Saving startup defaults...", "");
+
+    try
+    {
+        const request={
+            method: "PUT",
+            headers: {"Content-Type": "application/json"},
+            body: JSON.stringify({startup_defaults: startupDefaults})
+        };
+        const response=await fetch("/api/server/config", request);
+        const data=await response.json();
+        if(response.ok)
+        {
+            serverConfigState=data;
+            renderStartupSettings();
+            showStartupDefaultsMessage("Startup defaults saved. They will be used on the next server restart.", "success");
+        }
+        else
+        {
+            showStartupDefaultsMessage(data.error?.message||"Failed to save startup defaults.", "error");
+        }
+    }
+    catch(e)
+    {
+        console.error("Saving startup defaults failed:", e);
+        showStartupDefaultsMessage("Failed to save startup defaults.", "error");
+    }
+    finally
+    {
+        saveButton.disabled=false;
+    }
+}
+
 async function refreshLogs(force)
 {
     if(!logPanelOpen) return;
@@ -929,7 +1245,7 @@ function buildHeapTooltip(gpu)
     return lines.join("<br>");
 }
 
-function renderGpus(gpus)
+function renderGpus(gpus, models)
 {
     const el=document.getElementById("gpuList");
 
@@ -939,6 +1255,25 @@ function renderGpus(gpus)
         return;
     }
 
+    // Build per-GPU model assignments from loaded models
+    const gpuModels={};
+    if(models&&models.length>0)
+    {
+        for(const m of models)
+        {
+            if(m.state!=="Loaded") continue;
+            const indices=m.gpu_indices||[];
+            const perGpu=m.per_gpu_vram_mb||{};
+
+            for(const idx of indices)
+            {
+                if(!gpuModels[idx]) gpuModels[idx]=[];
+                const vram=perGpu[String(idx)]||m.estimated_vram_mb||0;
+                gpuModels[idx].push({name:m.model, vram:vram});
+            }
+        }
+    }
+
     let html="";
     for(const gpu of gpus)
     {
@@ -989,10 +1324,22 @@ function renderGpus(gpus)
         const overrideBtn=`<button class="btn" style="font-size:0.7em;padding:2px 6px;margin-left:6px;" onclick="promptVramOverride(${gpu.index}, ${memTotal})" title="Override reported VRAM">✏</button>`;
         const clearBtn=gpu.vram_overridden?`<button class="btn btn-danger" style="font-size:0.7em;padding:2px 6px;margin-left:2px;" onclick="clearVramOverride(${gpu.index})" title="Clear VRAM override">✕</button>`:"";
 
+        // Show models loaded on this GPU; names are HTML-escaped because the markup goes into innerHTML
+        let modelsHtml="";
+        const assignedModels=gpuModels[gpu.index]||[];
+        if(assignedModels.length>0)
+        {
+            const modelTags=assignedModels.map(m=>
+                `<span style="display:inline-block;background:#2a2d40;border:1px solid #444;border-radius:4px;padding:1px 6px;margin:2px 2px 0 0;font-size:0.8em;">${escapeHtml(m.name)} <span style="color:#7c8aff;">${m.vram}MB</span></span>`
+            ).join("");
+            modelsHtml=`<div style="margin-top:4px;">${modelTags}</div>`;
+        }
+
         html+=`<div class="gpu-row">
             <div class="gpu-label"><span>${gpu.name} (${gpu.backend})${gpu.unified_memory?" ⚡ Unified":""}${overrideTag}${overrideBtn}${clearBtn}</span>${memSpan}</div>
             <div class="gpu-bar"><div class="gpu-bar-fill gpu-bar-vram" style="width:${memPct.toFixed(1)}%"></div></div>
             ${utilHtml}
+            ${modelsHtml}
         </div>`;
     }
     el.innerHTML=html;
@@ -1004,7 +1351,7 @@ function renderModels(models)
 
     if(!models||models.length===0)
     {
-        el.innerHTML='<tr><td colspan="9" style="color:#666;text-align:center;">No models loaded</td></tr>';
+        el.innerHTML='<tr><td colspan="10" style="color:#666;text-align:center;">No models loaded</td></tr>';
         return;
     }
 
@@ -1020,13 +1367,52 @@ function renderModels(models)
         const ctxDisplay=m.context_size? m.context_size.toLocaleString() : "-";
         const maxCtxDisplay=m.max_context_size? m.max_context_size.toLocaleString() : "-";
 
+        let gpuDisplay="-";
+        if(m.gpu_indices&&m.gpu_indices.length>0)
+        {
+            gpuDisplay=m.gpu_indices.join(", ");
+        }
+
+        let vramDisplay=`${m.estimated_vram_mb||m.vram_usage_mb||0}`;
+
+        // Show device allocation breakdown if available (device names are HTML-escaped)
+        if(m.device_allocations&&Object.keys(m.device_allocations).length>0)
+        {
+            let allocHtml='<div style="margin-top:4px;font-size:0.82em;color:#aaa;">';
+            for(const [devKey,alloc] of Object.entries(m.device_allocations))
+            {
+                const devName=escapeHtml(alloc.device_name||devKey);
+                allocHtml+=`<div style="margin-bottom:2px;"><span style="color:#64b5f6;">${devName}</span>: `
+                    +`Model ${alloc.model_buffer_mb||0} MB`
+                    +` | KV ${alloc.kv_cache_buffer_mb||0} MB`
+                    +` | Compute ${alloc.compute_buffer_mb||0} MB`
+                    +` | <b>${alloc.total_mb||0} MB total</b></div>`;
+            }
+            if(m.cpu_mapped_buffer_mb&&m.cpu_mapped_buffer_mb>0)
+            {
+                allocHtml+=`<div style="margin-bottom:2px;"><span style="color:#ffb74d;">CPU</span>: ${m.cpu_mapped_buffer_mb} MB mapped</div>`;
+            }
+            if(m.graph_splits&&m.graph_splits>1)
+            {
+                allocHtml+=`<div style="color:#ce93d8;">Graph splits: ${m.graph_splits}</div>`;
+            }
+            allocHtml+='</div>';
+            vramDisplay+=allocHtml;
+        }
+        else if(m.per_gpu_vram_mb&&Object.keys(m.per_gpu_vram_mb).length>1)
+        {
+            const parts=Object.entries(m.per_gpu_vram_mb).map(([k,v])=>`GPU${k}:${v}`);
+            vramDisplay+=` <span style="color:#888;font-size:0.85em;">(${parts.join(", ")})</span>`;
+        }
+
         html+=`<tr>
             <td>${m.model}</td>
             <td>${m.variant||"-"}</td>
             <td><span class="badge ${stateClass(m.state)}">${m.state}</span></td>
             <td>${ctxDisplay}</td>
             <td>${maxCtxDisplay}</td>
-            <td>${m.vram_usage_mb||0}</td>
+            <td>${gpuDisplay}</td>
+            <td>${vramDisplay}</td>
             <td>${m.ram_usage_mb||0}</td>
             <td>${m.pinned?"Yes":"No"}</td>
             <td>${actions.join("")}</td>
@@ -1259,8 +1645,8 @@ async function refresh()
     }
 
     // GPUs
-    if(hw&&hw.gpus) renderGpus(hw.gpus);
-    else if(stats.hardware&&stats.hardware.gpus) renderGpus(stats.hardware.gpus);
+    if(hw&&hw.gpus) renderGpus(hw.gpus, stats.models||[]);
+    else if(stats.hardware&&stats.hardware.gpus) renderGpus(stats.hardware.gpus, stats.models||[]);
 
     // Models
     if(stats.models) renderModels(stats.models);
diff --git a/src/server/dashboardConfig.h b/src/server/dashboardConfig.h
new file mode 100644
index 0000000..af4f079
--- /dev/null
+++ b/src/server/dashboardConfig.h
@@ -0,0 +1,1792 @@
+#ifndef _ARBITERAI_SERVER_DASHBOARDCONFIG_H_
+#define _ARBITERAI_SERVER_DASHBOARDCONFIG_H_
+
+#include <string>
+
+namespace arbiterAI
+{
+namespace server
+{
+
+const std::string DASHBOARD_CONFIG_HTML=R"HTML(<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>ArbiterAI - Configuration</title>
+<style>
+*
+{
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+body
+{
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+    background: #0f1117;
+    color: #e0e0e0;
+    line-height: 1.6;
+}
+.header
+{
+    background: #1a1d27;
+    border-bottom: 1px solid #2a2d3a;
+    padding: 16px 24px;
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 16px;
+}
+.header-left
+{
+    display: flex;
+    align-items: center;
+    gap: 12px;
+    flex-wrap: wrap;
+}
+.header-title
+{
+    font-size: 20px;
+    color: #7c8aff;
+    font-weight: 600;
+}
+.header-link
+{
+    color: #7c8aff;
+    text-decoration: none;
+    font-size: 13px;
+}
+.header-link:hover
+{
+    text-decoration: underline;
+}
+.status
+{
+    font-size: 13px;
+    color: #888;
+}
+.status .dot
+{
+    display: inline-block;
+    width: 8px;
+    height: 8px;
+    border-radius: 50%;
+    background: #4caf50;
+    margin-right: 6px;
+    vertical-align: middle;
+}
+.version-badge
+{
+    font-size: 12px;
+    color: #888;
+    background: #2a2d3a;
+    padding: 2px 8px;
+    border-radius: 4px;
+    font-weight: normal;
+    vertical-align: middle;
+}
+.container
+{
+    max-width: 1440px;
+    margin: 0 auto;
+    padding: 24px;
+}
+.card
+{
+    background: #1a1d27;
+    border: 1px solid #2a2d3a;
+    border-radius: 10px;
+    padding: 18px;
+    margin-bottom: 20px;
+}
+.card h2
+{
+    font-size: 14px;
+    color: #888;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+    margin-bottom: 10px;
+}
+.card p
+{
+    color: #b8bcc8;
+    font-size: 14px;
+}
+.hero-title
+{
+    font-size: 24px;
+    color: #f3f5fb;
+    margin-bottom: 8px;
+}
+.hero-subtitle
+{
+    color: #9aa3b8;
+    font-size: 14px;
+    max-width: 880px;
+}
+.grid
+{
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
+    gap: 18px;
+}
+.startup-card
+{
+    display: flex;
+    flex-direction: column;
+    gap: 14px;
+    min-height: 420px;
+}
+.startup-header
+{
+    display: flex;
+    justify-content: space-between;
+    gap: 12px;
+    align-items: flex-start;
+}
+.startup-title
+{
+    font-size: 18px;
+    color: #f3f5fb;
+    margin-bottom: 4px;
+}
+.startup-subtitle
+{
+    color: #8891a4;
+    font-size: 13px;
+}
+.startup-memory
+{
+    color: #9aa3b8;
+    font-size: 12px;
+    text-align: right;
+}
+.startup-field-label
+{
+    display: block;
+    font-size: 12px;
+    color: #7f8799;
+    margin-bottom: 6px;
+    text-transform: uppercase;
+    letter-spacing: 0.4px;
+}
+.context-slider
+{
+    -webkit-appearance: none;
+    appearance: none;
+    width: 100%;
+    height: 8px;
+    border-radius: 4px;
+    background: #32384b;
+    outline: none;
+    cursor: pointer;
+}
+.context-slider::-webkit-slider-thumb
+{
+    -webkit-appearance: none;
+    appearance: none;
+    width: 18px;
+    height: 18px;
+    border-radius: 50%;
+    background: #7c8aff;
+    cursor: pointer;
+    border: 2px solid #1a1f2e;
+}
+.context-slider::-moz-range-thumb
+{
+    width: 18px;
+    height: 18px;
+    border-radius: 50%;
+    background: #7c8aff;
+    cursor: pointer;
+    border: 2px solid #1a1f2e;
+}
+.context-slider:disabled
+{
+    opacity: 0.3;
+    cursor: not-allowed;
+}
+.context-slider:disabled::-webkit-slider-thumb
+{
+    cursor: not-allowed;
+}
+.context-slider-value
+{
+    float: right;
+    font-size: 13px;
+    color: #f1f4ff;
+    font-weight: 600;
+}
+.vram-override-row
+{
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    padding: 10px;
+    border: 1px solid #32384b;
+    border-radius: 8px;
+    margin-bottom: 8px;
+    background: #111520;
+}
+.vram-override-row .override-tag
+{
+    color: #f0a;
+    font-size: 0.85em;
+}
+.picker
+{
+    border: 1px solid #32384b;
+    border-radius: 8px;
+    background: #111520;
+    overflow: hidden;
+}
+.picker summary
+{
+    list-style: none;
+    cursor: pointer;
+    padding: 12px;
+}
+.picker summary::-webkit-details-marker
+{
+    display: none;
+}
+.picker-summary-row
+{
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 10px;
+}
+.picker-summary-title
+{
+    font-size: 14px;
+    color: #f1f4ff;
+    font-weight: 500;
+}
+.picker-summary-meta
+{
+    color: #8e96a9;
+    font-size: 12px;
+    margin-top: 4px;
+}
+.picker-chevron
+{
+    color: #7c8aff;
+    font-size: 11px;
+}
+.picker-menu
+{
+    border-top: 1px solid #23293a;
+    max-height: 330px;
+    overflow-y: auto;
+    padding: 8px;
+    display: grid;
+    gap: 8px;
+}
+.picker-option
+{
+    width: 100%;
+    border: 1px solid #2f3648;
+    background: #171c28;
+    color: #e8ecf7;
+    border-radius: 8px;
+    text-align: left;
+    padding: 10px;
+    cursor: pointer;
+}
+.picker-option:hover
+{
+    background: #1d2332;
+}
+.picker-option.selected
+{
+    border-color: #7c8aff;
+    box-shadow: inset 0 0 0 1px rgba(124, 138, 255, 0.35);
+}
+.picker-option-top
+{
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    gap: 12px;
+    margin-bottom: 6px;
+}
+.picker-option-title
+{
+    font-size: 14px;
+    font-weight: 500;
+    color: #f1f4ff;
+}
+.picker-option-meta
+{
+    font-size: 12px;
+    color: #9aa3b8;
+}
+.picker-empty
+{
+    color: #8c93a5;
+    font-size: 13px;
+    padding: 12px;
+}
+.compat-badge
+{
+    display: inline-flex;
+    align-items: center;
+    gap: 6px;
+    padding: 3px 9px;
+    border-radius: 999px;
+    font-size: 11px;
+    font-weight: 700;
+    text-transform: uppercase;
+    letter-spacing: 0.45px;
+}
+.compat-outline
+{
+    border: 1px solid transparent; /* Must stay ABOVE the colour variants: the border shorthand resets border-color, so at equal specificity a later .compat-outline would blank their outline colours. */
+}
+.compat-likely
+{
+    background: rgba(76, 175, 80, 0.12);
+    color: #6fdd7b;
+    border-color: rgba(76, 175, 80, 0.28);
+}
+.compat-tight
+{
+    background: rgba(240, 192, 64, 0.12);
+    color: #f0c040;
+    border-color: rgba(240, 192, 64, 0.28);
+}
+.compat-unlikely
+{
+    background: rgba(255, 96, 96, 0.12);
+    color: #ff7d7d;
+    border-color: rgba(255, 96, 96, 0.28);
+}
+.compat-cloud
+{
+    background: rgba(124, 138, 255, 0.14);
+    color: #9eb0ff;
+    border-color: rgba(124, 138, 255, 0.28);
+}
+.compat-undetected
+{
+    background: rgba(138, 146, 167, 0.14);
+    color: #bac2d8;
+    border-color: rgba(138, 146, 167, 0.28);
+}
+.startup-note
+{
+    margin-top: auto;
+    padding: 12px;
+    background: #121723;
+    border: 1px solid #23293a;
+    border-radius: 8px;
+    color: #aeb6ca;
+    font-size: 13px;
+    min-height: 84px;
+}
+.runtime-opts-section
+{
+    border: 1px solid #23293a;
+    border-radius: 8px;
+    overflow: hidden;
+}
+.runtime-opts-toggle
+{
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    padding: 10px 12px;
+    background: #111520;
+    cursor: pointer;
+    font-size: 12px;
+    color: #8891a4;
+    text-transform: uppercase;
+    letter-spacing: 0.4px;
+    border: none;
+    width: 100%;
+}
+.runtime-opts-toggle:hover
+{
+    background: #161c2b;
+}
+.runtime-opts-body
+{
+    display: none;
+    padding: 12px;
+    background: #111520;
+    border-top: 1px solid #23293a;
+}
+.runtime-opts-body.open
+{
+    display: block;
+}
+.runtime-opts-row
+{
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 12px;
+    margin-bottom: 10px;
+}
+.runtime-opts-row:last-child
+{
+    margin-bottom: 0;
+}
+.runtime-opts-label
+{
+    font-size: 13px;
+    color: #b5bdd1;
+    white-space: nowrap;
+}
+.runtime-opts-select
+{
+    border: 1px solid #32384b;
+    border-radius: 6px;
+    background: #171c28;
+    color: #e0e0e0;
+    padding: 6px 10px;
+    font-size: 13px;
+    min-width: 120px;
+}
+.summary-line
+{
+    display: flex;
+    justify-content: space-between;
+    gap: 12px;
+    color: #b5bdd1;
+    font-size: 13px;
+    margin-top: 6px;
+}
+.save-row
+{
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 12px;
+    flex-wrap: wrap;
+}
+.btn
+{
+    padding: 10px 16px;
+    border: 1px solid #2d3450;
+    background: #202845;
+    color: #eef2ff;
+    border-radius: 8px;
+    cursor: pointer;
+    font-size: 14px;
+    font-weight: 600;
+}
+.btn:hover
+{
+    background: #273154;
+}
+.btn:disabled
+{
+    opacity: 0.45;
+    cursor: not-allowed;
+}
+.settings-message
+{
+    min-height: 20px;
+    font-size: 13px;
+    color: #97a0b5;
+}
+.settings-message.success
+{
+    color: #6fdd7b;
+}
+.settings-message.error
+{
+    color: #ff8d8d;
+}
+.effective-banner
+{
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 12px;
+    flex-wrap: wrap;
+}
+.effective-copy
+{
+    color: #c7d0e4;
+    font-size: 14px;
+}
+.effective-copy strong
+{
+    color: #f3f5fb;
+}
+@media (max-width: 760px)
+{
+    .header,
+    .effective-banner,
+    .startup-header,
+    .save-row
+    {
+        flex-direction: column;
+        align-items: flex-start;
+    }
+
+    .startup-memory
+    {
+        text-align: left;
+    }
+}
+</style>
+</head>
+<body>
+<div class="header">
+    <div class="header-left">
+        <a href="/dashboard" class="header-link">&larr; Dashboard</a>
+        <a href="/dashboard/storage" class="header-link">Downloaded Models</a>
+        <span class="header-title">Configuration</span>
+        <span id="versionBadge" class="version-badge"></span>
+    </div>
+    <div class="status"><span class="dot" id="statusDot"></span><span id="statusText">Connected</span></div>
+</div>
+<div class="container">
+    <div class="card">
+        <div class="hero-title">Startup Configuration</div>
+        <div class="hero-subtitle">This is the server configuration. Models listed here are loaded on server startup. Adding a model saves it to the config and loads it immediately. Removing a model unloads it and removes it from the config.</div>
+    </div>
+
+    <!-- ── New Startup Models UI ────────────────────────── -->
+    <div id="startupModelsSection">
+        <div class="card">
+            <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;">
+                <div>
+                    <div class="hero-title" style="font-size:16px;">Startup Models</div>
+                    <div class="hero-subtitle">Select models to load on startup and assign compute devices.</div>
+                </div>
+                <button class="btn" onclick="addStartupModel()">+ Add Model</button>
+            </div>
+            <div id="startupModelsList"></div>
+        </div>
+    </div>
+
+    <!-- ── Legacy Accelerator UI (hidden when startup_models active) ── -->
+    <div id="legacyAcceleratorSection" style="display:none;">
+    <div class="grid">
+        <div class="card startup-card">
+            <div class="startup-header">
+                <div>
+                    <div class="startup-title">CPU</div>
+                    <div class="startup-subtitle" id="startupSubtitleCpu">Loading hardware status...</div>
+                </div>
+                <div class="startup-memory" id="startupMemoryCpu"></div>
+            </div>
+            <div>
+                <label class="startup-field-label">Model</label>
+                <details class="picker" id="startupPickerCpu">
+                    <summary id="startupSummaryCpu"></summary>
+                    <div class="picker-menu" id="startupMenuCpu"></div>
+                </details>
+            </div>
+            <div>
+                <label class="startup-field-label">Context Size <span class="context-slider-value" id="startupContextLabelCpu">&mdash;</span></label>
+                <input type="range" class="context-slider" min="4096" max="131072" step="1024" value="4096" id="startupContextCpu" oninput="updateContextLabel('cpu')" onchange="refreshAcceleratorOptions('cpu')" disabled>
+            </div>
+            <div class="runtime-opts-section" id="runtimeOptsSectionCpu">
+                <button type="button" class="runtime-opts-toggle" onclick="toggleRuntimeOpts('cpu')">Runtime Options <span id="runtimeOptsChevronCpu">&#9654;</span></button>
+                <div class="runtime-opts-body" id="runtimeOptsBodyCpu">
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">KV Cache (K)</span>
+                        <select class="runtime-opts-select" id="runtimeKvkCpu">
+                            <option value="">Default</option>
+                            <option value="f16">f16</option>
+                            <option value="q8_0">q8_0</option>
+                            <option value="q4_0">q4_0</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">KV Cache (V)</span>
+                        <select class="runtime-opts-select" id="runtimeKvvCpu">
+                            <option value="">Default</option>
+                            <option value="f16">f16</option>
+                            <option value="q8_0">q8_0</option>
+                            <option value="q4_0">q4_0</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">Flash Attention</span>
+                        <select class="runtime-opts-select" id="runtimeFlashAttnCpu">
+                            <option value="">Default</option>
+                            <option value="true">Enabled</option>
+                            <option value="false">Disabled</option>
+                        </select>
+                    </div>
+                </div>
+            </div>
+            <div class="startup-note" id="startupNoteCpu"></div>
+        </div>
+
+        <div class="card startup-card">
+            <div class="startup-header">
+                <div>
+                    <div class="startup-title">CUDA GPU</div>
+                    <div class="startup-subtitle" id="startupSubtitleCuda">Loading hardware status...</div>
+                </div>
+                <div class="startup-memory" id="startupMemoryCuda"></div>
+            </div>
+            <div>
+                <label class="startup-field-label">Model</label>
+                <details class="picker" id="startupPickerCuda">
+                    <summary id="startupSummaryCuda"></summary>
+                    <div class="picker-menu" id="startupMenuCuda"></div>
+                </details>
+            </div>
+            <div>
+                <label class="startup-field-label">Context Size <span class="context-slider-value" id="startupContextLabelCuda">&mdash;</span></label>
+                <input type="range" class="context-slider" min="4096" max="131072" step="1024" value="4096" id="startupContextCuda" oninput="updateContextLabel('cuda')" onchange="refreshAcceleratorOptions('cuda')" disabled>
+            </div>
+            <div class="runtime-opts-section" id="runtimeOptsSectionCuda">
+                <button type="button" class="runtime-opts-toggle" onclick="toggleRuntimeOpts('cuda')">Runtime Options <span id="runtimeOptsChevronCuda">&#9654;</span></button>
+                <div class="runtime-opts-body" id="runtimeOptsBodyCuda">
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">KV Cache (K)</span>
+                        <select class="runtime-opts-select" id="runtimeKvkCuda">
+                            <option value="">Default</option>
+                            <option value="f16">f16</option>
+                            <option value="q8_0">q8_0</option>
+                            <option value="q4_0">q4_0</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">KV Cache (V)</span>
+                        <select class="runtime-opts-select" id="runtimeKvvCuda">
+                            <option value="">Default</option>
+                            <option value="f16">f16</option>
+                            <option value="q8_0">q8_0</option>
+                            <option value="q4_0">q4_0</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">Flash Attention</span>
+                        <select class="runtime-opts-select" id="runtimeFlashAttnCuda">
+                            <option value="">Default</option>
+                            <option value="true">Enabled</option>
+                            <option value="false">Disabled</option>
+                        </select>
+                    </div>
+                </div>
+            </div>
+            <div class="startup-note" id="startupNoteCuda"></div>
+        </div>
+
+        <div class="card startup-card">
+            <div class="startup-header">
+                <div>
+                    <div class="startup-title">Vulkan GPU</div>
+                    <div class="startup-subtitle" id="startupSubtitleVulkan">Loading hardware status...</div>
+                </div>
+                <div class="startup-memory" id="startupMemoryVulkan"></div>
+            </div>
+            <div>
+                <label class="startup-field-label">Model</label>
+                <details class="picker" id="startupPickerVulkan">
+                    <summary id="startupSummaryVulkan"></summary>
+                    <div class="picker-menu" id="startupMenuVulkan"></div>
+                </details>
+            </div>
+            <div>
+                <label class="startup-field-label">Context Size <span class="context-slider-value" id="startupContextLabelVulkan">&mdash;</span></label>
+                <input type="range" class="context-slider" min="4096" max="131072" step="1024" value="4096" id="startupContextVulkan" oninput="updateContextLabel('vulkan')" onchange="refreshAcceleratorOptions('vulkan')" disabled>
+            </div>
+            <div class="runtime-opts-section" id="runtimeOptsSectionVulkan">
+                <button type="button" class="runtime-opts-toggle" onclick="toggleRuntimeOpts('vulkan')">Runtime Options <span id="runtimeOptsChevronVulkan">&#9654;</span></button>
+                <div class="runtime-opts-body" id="runtimeOptsBodyVulkan">
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">KV Cache (K)</span>
+                        <select class="runtime-opts-select" id="runtimeKvkVulkan">
+                            <option value="">Default</option>
+                            <option value="f16">f16</option>
+                            <option value="q8_0">q8_0</option>
+                            <option value="q4_0">q4_0</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">KV Cache (V)</span>
+                        <select class="runtime-opts-select" id="runtimeKvvVulkan">
+                            <option value="">Default</option>
+                            <option value="f16">f16</option>
+                            <option value="q8_0">q8_0</option>
+                            <option value="q4_0">q4_0</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">Flash Attention</span>
+                        <select class="runtime-opts-select" id="runtimeFlashAttnVulkan">
+                            <option value="">Default</option>
+                            <option value="true">Enabled</option>
+                            <option value="false">Disabled</option>
+                        </select>
+                    </div>
+                    <div class="runtime-opts-row">
+                        <span class="runtime-opts-label">No Host-Visible VRAM</span>
+                        <select class="runtime-opts-select" id="runtimeVkNoHostVisVulkan" title="GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM: Skip BAR-mapped heap, force device-local only. Useful for GPUs without Resizable BAR where the driver reports incorrect VRAM budgets.">
+                            <option value="">Default</option>
+                            <option value="true">Enabled</option>
+                            <option value="false">Disabled</option>
+                        </select>
+                    </div>
+                </div>
+            </div>
+            <div class="startup-note" id="startupNoteVulkan"></div>
+        </div>
+    </div>
+    </div><!-- end legacyAcceleratorSection -->
+
+    <div class="card" id="vramOverrideCard">
+        <h2>VRAM Overrides</h2>
+        <div class="hero-subtitle" style="margin-bottom:14px;">Override the reported VRAM for each GPU. Useful when the driver reports incorrect values or to simulate different hardware. Changes take effect immediately for fit calculations.</div>
+        <div id="vramOverrideList"><span style="color:#666;">Loading GPU info...</span></div>
+    </div>
+
+    <div class="card save-row">
+        <div class="settings-message" id="startupDefaultsMessage"></div>
+    </div>
+</div>
+
+<script>
+const STARTUP_ACCELERATORS=[ // One entry per legacy accelerator card: its key, display label, and the DOM ids of that card's controls (vkNoHostVisId is declared for Vulkan only).
+    {key: "cpu", label: "CPU", contextId: "startupContextCpu", contextLabelId: "startupContextLabelCpu", summaryId: "startupSummaryCpu", menuId: "startupMenuCpu", noteId: "startupNoteCpu", subtitleId: "startupSubtitleCpu", memoryId: "startupMemoryCpu", pickerId: "startupPickerCpu", kvkId: "runtimeKvkCpu", kvvId: "runtimeKvvCpu", flashAttnId: "runtimeFlashAttnCpu", runtimeBodyId: "runtimeOptsBodyCpu", runtimeChevronId: "runtimeOptsChevronCpu"},
+    {key: "cuda", label: "CUDA GPU", contextId: "startupContextCuda", contextLabelId: "startupContextLabelCuda", summaryId: "startupSummaryCuda", menuId: "startupMenuCuda", noteId: "startupNoteCuda", subtitleId: "startupSubtitleCuda", memoryId: "startupMemoryCuda", pickerId: "startupPickerCuda", kvkId: "runtimeKvkCuda", kvvId: "runtimeKvvCuda", flashAttnId: "runtimeFlashAttnCuda", runtimeBodyId: "runtimeOptsBodyCuda", runtimeChevronId: "runtimeOptsChevronCuda"},
+    {key: "vulkan", label: "Vulkan GPU", contextId: "startupContextVulkan", contextLabelId: "startupContextLabelVulkan", summaryId: "startupSummaryVulkan", menuId: "startupMenuVulkan", noteId: "startupNoteVulkan", subtitleId: "startupSubtitleVulkan", memoryId: "startupMemoryVulkan", pickerId: "startupPickerVulkan", kvkId: "runtimeKvkVulkan", kvvId: "runtimeKvvVulkan", flashAttnId: "runtimeFlashAttnVulkan", vkNoHostVisId: "runtimeVkNoHostVisVulkan", runtimeBodyId: "runtimeOptsBodyVulkan", runtimeChevronId: "runtimeOptsChevronVulkan"}
+];
+// Page-level state. NOTE(review): the code that fills serverConfigState, useStartupModels, startupModelsState, availableGpus and availableModelOptions is outside this excerpt - confirm their shapes there.
+let serverConfigState=null;
+let useStartupModels=false;
+let startupModelsState=[];
+let availableGpus=[];
+let availableModelOptions=[];
+const startupState={ // Per-accelerator UI state, keyed like STARTUP_ACCELERATORS; contextSize follows the context slider, availableRamMb/availableVramMb feed the slider gradient.
+    cpu: {selected: {model: "", variant: ""}, contextSize: 0, runtimeOptions: {}, options: [], detected: true, availableVramMb: 0, availableRamMb: 0},
+    cuda: {selected: {model: "", variant: ""}, contextSize: 0, runtimeOptions: {}, options: [], detected: false, availableVramMb: 0, availableRamMb: 0},
+    vulkan: {selected: {model: "", variant: ""}, contextSize: 0, runtimeOptions: {}, options: [], detected: false, availableVramMb: 0, availableRamMb: 0}
+};
+// Escape text for use as HTML element content by assigning it to a detached span's textContent and reading back innerHTML; falsy input yields "".
+function escapeHtml(text)
+{
+    const scratch=document.createElement("span");
+    scratch.textContent=text?text:"";
+    return scratch.innerHTML;
+}
+// Expand or collapse an accelerator card's Runtime Options panel and switch its chevron between the open and closed glyphs.
+function toggleRuntimeOpts(accelerator)
+{
+    const ids=getAcceleratorMeta(accelerator);
+    const panel=document.getElementById(ids.runtimeBodyId);
+    const arrow=document.getElementById(ids.runtimeChevronId);
+    const expanded=panel.classList.toggle("open");
+    arrow.innerHTML=expanded?"&#9660;":"&#9654;";
+}
+// Collect the runtime options chosen in an accelerator card; selects left on "Default" (empty value) are omitted from the returned object.
+function readRuntimeOptsFromUI(accelerator)
+{
+    const ids=getAcceleratorMeta(accelerator);
+    const cacheK=document.getElementById(ids.kvkId).value;
+    const cacheV=document.getElementById(ids.kvvId).value;
+    const flash=document.getElementById(ids.flashAttnId).value;
+    const picked={};
+    if(cacheK) picked.kv_cache_type_k=cacheK;
+    if(cacheV) picked.kv_cache_type_v=cacheV;
+    // Tri-state selects: only "true"/"false" are stored, converted to real booleans.
+    if(flash==="true"||flash==="false") picked.flash_attn=(flash==="true");
+    if(ids.vkNoHostVisId)
+    {
+        // Only the Vulkan entry in STARTUP_ACCELERATORS declares this select.
+        const noHostVis=document.getElementById(ids.vkNoHostVisId).value;
+        if(noHostVis==="true"||noHostVis==="false") picked.vulkan_no_host_visible_vram=(noHostVis==="true");
+    }
+    return picked;
+}
+// Write saved runtime options back into an accelerator card's selects; any option that is absent resets its select to "Default" ("").
+function applyRuntimeOptsToUI(accelerator, opts)
+{
+    const ids=getAcceleratorMeta(accelerator);
+    const saved=opts||{};
+    // Strict booleans map onto the tri-state select values; anything else means "Default".
+    const triState=(flag) => flag===true?"true":(flag===false?"false":"");
+    document.getElementById(ids.kvkId).value=saved.kv_cache_type_k||"";
+    document.getElementById(ids.kvvId).value=saved.kv_cache_type_v||"";
+    document.getElementById(ids.flashAttnId).value=triState(saved.flash_attn);
+    if(ids.vkNoHostVisId)
+    {
+        // Only the Vulkan entry in STARTUP_ACCELERATORS declares this select.
+        document.getElementById(ids.vkNoHostVisId).value=triState(saved.vulkan_no_host_visible_vram);
+    }
+}
+// GET a URL and return its parsed JSON body; returns null when the request fails, the status is not OK, or the body cannot be parsed.
+async function fetchJson(url)
+{
+    try
+    {
+        const reply=await fetch(url);
+        return reply.ok?await reply.json():null;
+    }
+    catch(err)
+    {
+        // Swallowed on purpose: callers treat null as "no data".
+        return null;
+    }
+}
+// Show a status line in #startupDefaultsMessage; state "success" or "error" adds the matching colour class, anything else stays neutral.
+function showMessage(text, state)
+{
+    const banner=document.getElementById("startupDefaultsMessage");
+    // Only the two known states become a modifier class.
+    const known=(state==="success"||state==="error");
+    banner.textContent=text||"";
+    banner.className=known?"settings-message "+state:"settings-message";
+}
+// Format a megabyte count for display: "-" when missing or non-positive, gigabytes with one decimal from 1024 MB up, whole megabytes below that.
+function formatMb(value)
+{
+    if(!value||value<=0) return "-";
+    const inGigabytes=value>=1024;
+    return inGigabytes?(value/1024).toFixed(1)+" GB":value.toFixed(0)+" MB";
+}
+// Format a context size for display: an em dash when missing or non-positive, a rounded "NK" from 1024 up, the plain number below that.
+function formatContextSize(value)
+{
+    if(!value||value<=0) return "\u2014";
+    const compact=value>=1024;
+    return compact?(value/1024).toFixed(0)+"K":value+"";
+}
+// Copy a context slider's current value into its label and into startupState for that accelerator.
+function updateContextLabel(accelerator)
+{
+    const ids=getAcceleratorMeta(accelerator);
+    const range=document.getElementById(ids.contextId);
+    const caption=document.getElementById(ids.contextLabelId);
+    const chosen=parseInt(range.value, 10);
+    caption.textContent=formatContextSize(chosen);
+    startupState[accelerator].contextSize=chosen;
+}
+// Paint a context slider's track in green / amber / red zones that show how large a context the selected model can use within the accelerator's memory.
+function updateSliderGradient(accelerator)
+{
+    const GREEN="rgba(76,175,80,0.35)";
+    const AMBER="rgba(240,192,64,0.35)";
+    const RED="rgba(255,96,96,0.35)";
+    const ids=getAcceleratorMeta(accelerator);
+    const slider=document.getElementById(ids.contextId);
+    const state=startupState[accelerator];
+    const option=findSelectedOption(accelerator);
+    // Fallback: the flat track colour used whenever no estimate can be drawn.
+    const paintNeutral=() => { slider.style.background="#32384b"; };
+
+    // No selection, or no per-1K-context memory figure: nothing to estimate from.
+    if(!option||!option.memory_per_1k_context_mb||option.memory_per_1k_context_mb<=0)
+    {
+        paintNeutral();
+        return;
+    }
+
+    const lo=parseInt(slider.min);
+    const hi=parseInt(slider.max);
+    const span=hi-lo;
+
+    if(span<=0)
+    {
+        paintNeutral();
+        return;
+    }
+
+    // The CPU card is budgeted against RAM, the other cards against VRAM.
+    const budgetMb=accelerator==="cpu"?state.availableRamMb:state.availableVramMb;
+
+    if(budgetMb<=0)
+    {
+        paintNeutral();
+        return;
+    }
+
+    const baseMb=option.base_memory_mb||0;
+    const baseCtx=option.base_context_size||0;
+    const perK=option.memory_per_1k_context_mb;
+    // Context size reachable with a given memory allowance, as a clamped percentage of the slider range.
+    const pctFor=(allowanceMb) => Math.max(0, Math.min(100, (((baseCtx+((allowanceMb-baseMb)/perK)*1024)-lo)/span)*100));
+    const likelyPct=pctFor(0.85*budgetMb);
+    const tightPct=pctFor(budgetMb);
+
+    // Green up to the 85%-of-memory mark, amber up to the 100% mark, red beyond it.
+    let track="linear-gradient(to right, "+GREEN+" 0%, "+GREEN+" "+likelyPct+"%, "+AMBER+" "+likelyPct+"%, "+AMBER+" "+tightPct+"%, "+RED+" "+tightPct+"%, "+RED+" 100%)";
+    if(likelyPct>=100) track="linear-gradient(to right, "+GREEN+" 0%, "+GREEN+" 100%)";
+    else if(tightPct<=0) track="linear-gradient(to right, "+RED+" 0%, "+RED+" 100%)";
+
+    slider.style.background=track;
+}
+// Fetch /api/hardware and render the VRAM override rows, or show a placeholder when the response is missing or carries no gpus field.
+async function loadGpuOverrides()
+{
+    const hardware=await fetchJson("/api/hardware");
+    const gpus=hardware?hardware.gpus:null;
+
+    if(gpus)
+    {
+        renderGpuOverrides(gpus);
+        return;
+    }
+    document.getElementById("vramOverrideList").innerHTML='<span style="color:#666;">No GPU data available.</span>';
+}
+// Rebuild #vramOverrideList: one row per GPU with its name, backend, total/free VRAM, an Override button and, while an override is active, a Clear button.
+function renderGpuOverrides(gpus)
+{
+    const list=document.getElementById("vramOverrideList");
+
+    if(!gpus||gpus.length===0)
+    {
+        list.innerHTML='<span style="color:#666;">No GPUs detected.</span>';
+        return;
+    }
+
+    const rows=[];
+
+    for(const gpu of gpus)
+    {
+        const totalMb=gpu.vram_total_mb;
+        // The badge and the Clear button only exist while an override is active for this GPU.
+        const badge=gpu.vram_overridden?' <span class="override-tag">\u2699 Overridden</span>':"";
+        const clearBtn=gpu.vram_overridden
+            ?`<button class="btn" style="font-size:12px;padding:6px 10px;border-color:#ff6060;color:#ff8d8d;" onclick="clearVramOverrideConfig(${gpu.index})">\u2715 Clear</button>`
+            :"";
+
+        // Name and backend go through escapeHtml before being placed in the markup.
+        const title=`<div style="color:#f1f4ff;font-size:14px;">${escapeHtml(gpu.name)} (${escapeHtml(gpu.backend)})${badge}</div>`;
+        const memory=`<div style="color:#8e96a9;font-size:12px;margin-top:4px;">VRAM: ${formatMb(totalMb)} total \u2022 Free: ${formatMb(gpu.vram_free_mb)}</div>`;
+        // The current total is handed to the prompt so it can be shown as the default value.
+        const overrideBtn=`<button class="btn" style="font-size:12px;padding:6px 10px;" onclick="promptVramOverrideConfig(${gpu.index}, ${totalMb})">\u270F Override</button>`;
+
+        rows.push(`<div class="vram-override-row"><div>${title}${memory}</div><div style="display:flex;gap:6px;">${overrideBtn}${clearBtn}</div></div>`);
+    }
+
+    list.innerHTML=rows.join("");
+
+}
+// Ask the user for a VRAM override (in MB) for one GPU, POST it to /api/hardware/vram-override, then reload the GPU list and the startup options.
+async function promptVramOverrideConfig(gpuIndex, currentMb)
+{
+    const input=prompt("Enter VRAM override in MB for GPU "+gpuIndex+" (current: "+currentMb+" MB):", currentMb);
+    if(input===null) return;
+
+    // parseInt alone would turn "8gb" into 8 and "12.5" into 12; require plain digits so the value sent is the value typed.
+    const typed=input.trim();
+    const vramMb=/^\d+$/.test(typed)?parseInt(typed, 10):NaN;
+
+    if(isNaN(vramMb)||vramMb<=0)
+    {
+        alert("Invalid value. Please enter a positive integer in MB.");
+        return;
+    }
+    try
+    {
+        const res=await fetch("/api/hardware/vram-override", {
+            method: "POST",
+            headers: {"Content-Type": "application/json"},
+            body: JSON.stringify({gpu_index: gpuIndex, vram_mb: vramMb})
+        });
+        if(!res.ok)
+        {
+            // An error reply may have an empty or non-JSON body; fall back to the generic message rather than throwing.
+            const err=await res.json().catch(() => null);
+            alert("Failed: "+((err&&err.error)||"Unknown error"));
+            return;
+        }
+        await loadGpuOverrides();
+        await refreshAllStartupOptions();
+    }
+    catch(e)
+    {
+        // Log for debugging, but also tell the user: a console-only message looks like a button that does nothing.
+        console.error("VRAM override failed:", e);
+        alert("VRAM override failed: "+((e&&e.message)||"see the browser console for details"));
+    }
+}
+// DELETE the VRAM override for one GPU via /api/hardware/vram-override/<index>, then reload the GPU list and the startup options.
+async function clearVramOverrideConfig(gpuIndex)
+{
+    try
+    {
+        const res=await fetch("/api/hardware/vram-override/"+gpuIndex, {method: "DELETE"});
+        if(!res.ok)
+        {
+            // An error reply may have an empty or non-JSON body; fall back to the generic message rather than throwing.
+            const err=await res.json().catch(() => null);
+            alert("Failed: "+((err&&err.error)||"Unknown error"));
+            return;
+        }
+
+        await loadGpuOverrides();
+        await refreshAllStartupOptions();
+    }
+    catch(e)
+    {
+        console.error("Clear VRAM override failed:", e);
+        alert("Clear VRAM override failed: "+((e&&e.message)||"see the browser console for details"));
+    }
+
+function formatStartupModelLabel(model, variant)
+{
+    return variant?model+" ("+variant+")":model;
+}
+
+function encodeStartupModelValue(model, variant)
+{
+    return encodeURIComponent(JSON.stringify({model: model||"", variant: variant||""}));
+}
+
+function decodeStartupModelValue(value)
+{
+    if(!value) return {model: "", variant: ""};
+
+    try
+    {
+        return JSON.parse(decodeURIComponent(value));
+    }
+    catch(e)
+    {
+        return {model: "", variant: ""};
+    }
+}
+
+function getAcceleratorMeta(accelerator)
+{
+    return STARTUP_ACCELERATORS.find((item) => item.key===accelerator);
+}
+
+function normalizeContextSize(value)
+{
+    const parsed=parseInt(value, 10);
+    if(Number.isNaN(parsed)||parsed<=0)
+    {
+        return 0;
+    }
+
+    return parsed;
+}
+
+function formatCompatibilityClass(compatibility)
+{
+    return "compat-"+(compatibility||"unlikely");
+}
+
+function formatOptionMemoryLabel(accelerator, option)
+{
+    if(option.compatibility==="cloud")
+    {
+        return "Cloud provider";
+    }
+
+    if(accelerator==="cpu")
+    {
+        return "Needs ~"+formatMb(option.required_ram_mb)+" RAM";
+    }
+
+    return "Needs ~"+formatMb(option.required_vram_mb)+" VRAM";
+}
+
+function getStartupEntry(accelerator)
+{
+    if(!serverConfigState||!serverConfigState.startup_defaults)
+    {
+        return {model: "", variant: "", context_size: 0, runtime_options: {}};
+    }
+
+    const entry=serverConfigState.startup_defaults[accelerator]||{};
+    return {
+        model: entry.model||"",
+        variant: entry.variant||"",
+        context_size: entry.context_size||0,
+        runtime_options: entry.runtime_options||{}
+    };
+}
+
+function findSelectedOption(accelerator)
+{
+    const state=startupState[accelerator];
+    return state.options.find((option) => option.model===state.selected.model&&option.variant===state.selected.variant)||null;
+}
+
+function buildSummaryHtml(accelerator)
+{
+    const state=startupState[accelerator];
+    const selected=findSelectedOption(accelerator);
+
+    if(!state.selected.model)
+    {
+        return '<div class="picker-summary-row"><div><div class="picker-summary-title">Do not auto-load</div><div class="picker-summary-meta">Nothing will be preloaded for this startup slot.</div></div><span class="picker-chevron">Open</span></div>';
+    }
+
+    if(!selected)
+    {
+        const title=escapeHtml(formatStartupModelLabel(state.selected.model, state.selected.variant));
+        return '<div class="picker-summary-row"><div><div class="picker-summary-title">'+title+'</div><div class="picker-summary-meta">Saved selection is missing from the current model catalog.</div></div><span class="compat-badge compat-outline compat-unlikely">Missing</span></div>';
+    }
+
+    const title=escapeHtml(formatStartupModelLabel(selected.model, selected.variant));
+    const meta=escapeHtml(formatOptionMemoryLabel(accelerator, selected)+" • "+selected.compatibility_reason);
+    const compatClass=formatCompatibilityClass(selected.compatibility);
+    return '<div class="picker-summary-row"><div><div class="picker-summary-title">'+title+'</div><div class="picker-summary-meta">'+meta+'</div></div><span class="compat-badge compat-outline '+compatClass+'">'+escapeHtml(selected.compatibility_label)+'</span></div>';
+}
+
+function renderOptionButton(accelerator, option)
+{
+    const state=startupState[accelerator];
+    const selected=state.selected.model===option.model&&state.selected.variant===option.variant;
+    const compatClass=formatCompatibilityClass(option.compatibility);
+    const value=encodeStartupModelValue(option.model, option.variant);
+    const title=escapeHtml(formatStartupModelLabel(option.model, option.variant));
+    const meta=escapeHtml(formatOptionMemoryLabel(accelerator, option)+" • max context "+(option.max_context_size||"-")+" • "+option.compatibility_reason);
+    return '<button type="button" class="picker-option '+(selected?'selected ':'')+compatClass+'" onclick="selectStartupOption(\''+accelerator+'\', \''+value+'\')">'
+        +'<div class="picker-option-top"><span class="picker-option-title">'+title+'</span><span class="compat-badge compat-outline '+compatClass+'">'+escapeHtml(option.compatibility_label)+'</span></div>'
+        +'<div class="picker-option-meta">'+meta+'</div>'
+        +'</button>';
+}
+
+function renderAccelerator(accelerator)
+{
+    const meta=getAcceleratorMeta(accelerator);
+    const state=startupState[accelerator];
+    const summaryEl=document.getElementById(meta.summaryId);
+    const menuEl=document.getElementById(meta.menuId);
+    const noteEl=document.getElementById(meta.noteId);
+    const subtitleEl=document.getElementById(meta.subtitleId);
+    const memoryEl=document.getElementById(meta.memoryId);
+    const selected=findSelectedOption(accelerator);
+
+    subtitleEl.textContent=state.detected
+        ? (accelerator==="cpu"?"Uses system RAM for CPU startup.":"Hardware detected and startup fit checked against total VRAM.")
+        : (accelerator==="cpu"?"CPU startup is always available if enough RAM remains.":"No compatible hardware is currently detected for this slot.");
+
+    memoryEl.textContent=accelerator==="cpu"
+        ? (state.availableRamMb?"Total RAM: "+formatMb(state.availableRamMb):"RAM unavailable")
+        : (state.detected?"Total VRAM: "+formatMb(state.availableVramMb):"Device not detected");
+
+    summaryEl.innerHTML=buildSummaryHtml(accelerator);
+
+    let html='<button type="button" class="picker-option '+(!state.selected.model?'selected ':'')+'" onclick="selectStartupOption(\''+accelerator+'\', \'\')">'
+        +'<div class="picker-option-top"><span class="picker-option-title">Do not auto-load</span><span class="compat-badge compat-outline compat-cloud">Disabled</span></div>'
+        +'<div class="picker-option-meta">Leave this startup slot empty.</div>'
+        +'</button>';
+
+    if(state.options.length===0)
+    {
+        html+='<div class="picker-empty">No startup options are available right now.</div>';
+    }
+    else
+    {
+        html+=state.options.map((option) => renderOptionButton(accelerator, option)).join('');
+    }
+
+    menuEl.innerHTML=html;
+
+    if(!state.selected.model)
+    {
+        noteEl.textContent='No model will be preloaded for '+meta.label+'.';
+        const slider=document.getElementById(meta.contextId);
+        const label=document.getElementById(meta.contextLabelId);
+        slider.disabled=true;
+        slider.style.background='#32384b';
+        label.textContent='\u2014';
+        return;
+    }
+
+    if(!selected)
+    {
+        noteEl.textContent='The saved selection is no longer present in the live model catalog. Save a replacement if you want this slot to preload on restart.';
+        const slider=document.getElementById(meta.contextId);
+        const label=document.getElementById(meta.contextLabelId);
+        slider.disabled=true;
+        slider.style.background='#32384b';
+        label.textContent='\u2014';
+        return;
+    }
+
+    const slider=document.getElementById(meta.contextId);
+    const label=document.getElementById(meta.contextLabelId);
+    const maxCtx=selected.max_context_size||131072;
+    slider.disabled=false;
+    slider.max=maxCtx;
+    slider.min=4096;
+
+    if(state.contextSize>0)
+    {
+        slider.value=Math.min(Math.max(state.contextSize, 4096), maxCtx);
+    }
+    else
+    {
+        slider.value=selected.effective_context_size||4096;
+    }
+
+    label.textContent=formatContextSize(parseInt(slider.value, 10));
+    updateSliderGradient(accelerator);
+
+    noteEl.textContent=selected.compatibility_reason+' Requested context: '+(selected.effective_context_size||'model default')+'.';
+}
+
+function renderEffectiveStartup()
+{
+    const el=document.getElementById('effectiveStartupCopy');
+    if(!serverConfigState)
+    {
+        el.textContent='Loading startup config...';
+        return;
+    }
+
+    // When using new startup_models mode, show summary from startupModelsState
+    if(useStartupModels)
+    {
+        const active=startupModelsState.filter(e=>e.model);
+        if(active.length===0)
+        {
+            el.innerHTML='Next restart: <strong>no startup models configured</strong>. Add a model above.';
+            return;
+        }
+
+        let html='Next restart will load: ';
+        const parts=active.map(e=>{
+            const label=formatStartupModelLabel(e.model, e.variant);
+            const ctx=e.context_size>0?formatContextSize(e.context_size):'default';
+            const devs=e.devices&&e.devices.length>0?'GPU '+e.devices.join(','):'auto';
+            return '<strong>'+escapeHtml(label)+'</strong> (ctx: '+escapeHtml(ctx)+', devices: '+escapeHtml(devs)+')';
+        });
+        el.innerHTML=html+parts.join('; ')+'.';
+        return;
+    }
+
+    // Legacy mode
+    const effective=serverConfigState.effective_startup_default||{};
+    if(!effective.model)
+    {
+        el.innerHTML='Next restart: <strong>no startup model configured</strong>.';
+        return;
+    }
+
+    const label=formatStartupModelLabel(effective.model, effective.variant||'');
+    const acceleratorLabel=(effective.accelerator||'legacy').toUpperCase();
+    const contextLabel=effective.context_size&&effective.context_size>0
+        ? 'context '+effective.context_size
+        : 'model default context';
+
+    let optsLabel='';
+    const ro=effective.runtime_options;
+    if(ro)
+    {
+        const parts=[];
+        if(ro.kv_cache_type_k) parts.push('KV-K: '+ro.kv_cache_type_k);
+        if(ro.kv_cache_type_v) parts.push('KV-V: '+ro.kv_cache_type_v);
+        if(ro.flash_attn===true) parts.push('flash_attn');
+        if(parts.length>0) optsLabel=' ('+parts.join(', ')+')';
+    }
+
+    el.innerHTML='Next restart: <strong>'+escapeHtml(acceleratorLabel)+'</strong> will try to load <strong>'+escapeHtml(label)+'</strong> using '+escapeHtml(contextLabel)+escapeHtml(optsLabel)+'.';
+}
+
+function applyServerConfigToState()
+{
+    for(const accelerator of STARTUP_ACCELERATORS)
+    {
+        const entry=getStartupEntry(accelerator.key);
+        startupState[accelerator.key].selected={model: entry.model, variant: entry.variant};
+        startupState[accelerator.key].contextSize=entry.context_size||0;
+        startupState[accelerator.key].runtimeOptions=entry.runtime_options||{};
+        const slider=document.getElementById(accelerator.contextId);
+
+        if(entry.context_size>0)
+        {
+            slider.value=entry.context_size;
+        }
+
+        applyRuntimeOptsToUI(accelerator.key, entry.runtime_options);
+    }
+}
+
+async function refreshAcceleratorOptions(accelerator)
+{
+    const meta=getAcceleratorMeta(accelerator);
+    const contextSize=normalizeContextSize(document.getElementById(meta.contextId).value);
+    startupState[accelerator].contextSize=contextSize;
+
+    const data=await fetchJson('/api/server/startup-options?accelerator='+encodeURIComponent(accelerator)+'&context_size='+encodeURIComponent(contextSize));
+    if(!data)
+    {
+        document.getElementById('statusDot').style.background='#ff4444';
+        document.getElementById('statusText').textContent='Disconnected';
+        return;
+    }
+
+    document.getElementById('statusDot').style.background='#4caf50';
+    document.getElementById('statusText').textContent='Connected';
+
+    startupState[accelerator].options=data.options||[];
+    startupState[accelerator].detected=!!data.detected;
+    startupState[accelerator].availableVramMb=data.available_vram_mb||0;
+    startupState[accelerator].availableRamMb=data.available_ram_mb||0;
+    renderAccelerator(accelerator);
+}
+
+async function refreshAllStartupOptions()
+{
+    await Promise.all(STARTUP_ACCELERATORS.map((accelerator) => refreshAcceleratorOptions(accelerator.key)));
+}
+
+function selectStartupOption(accelerator, encodedValue)
+{
+    const value=decodeStartupModelValue(encodedValue);
+    startupState[accelerator].selected={model: value.model||'', variant: value.variant||''};
+    startupState[accelerator].contextSize=0;
+    document.getElementById(getAcceleratorMeta(accelerator).pickerId).open=false;
+    renderAccelerator(accelerator);
+}
+
+async function saveStartupDefaults()
+{
+    const saveButton=document.getElementById('saveStartupDefaultsBtn');
+    const startupDefaults={};
+
+    for(const accelerator of STARTUP_ACCELERATORS)
+    {
+        startupDefaults[accelerator.key]={
+            model: startupState[accelerator.key].selected.model||'',
+            variant: startupState[accelerator.key].selected.variant||'',
+            context_size: normalizeContextSize(document.getElementById(accelerator.contextId).value),
+            runtime_options: readRuntimeOptsFromUI(accelerator.key)
+        };
+    }
+
+    saveButton.disabled=true;
+    showMessage('Saving startup defaults...', '');
+
+    try
+    {
+        const response=await fetch('/api/server/config', {
+            method: 'PUT',
+            headers: {'Content-Type': 'application/json'},
+            body: JSON.stringify({startup_defaults: startupDefaults})
+        });
+        const data=await response.json();
+
+        if(!response.ok)
+        {
+            showMessage((data.error&&data.error.message)||'Failed to save startup defaults.', 'error');
+            return;
+        }
+
+        serverConfigState=data;
+        applyServerConfigToState();
+        await refreshAllStartupOptions();
+        showMessage('Startup defaults saved. The next restart will auto-download missing startup models and load them when ready.', 'success');
+    }
+    catch(e)
+    {
+        console.error('Saving startup defaults failed:', e);
+        showMessage('Failed to save startup defaults.', 'error');
+    }
+    finally
+    {
+        saveButton.disabled=false;
+    }
+}
+
+// ── New Startup Models Functions ──────────────────────────────
+
+async function loadAvailableGpus()
+{
+    const data=await fetchJson('/api/hardware');
+    if(data&&data.gpus)
+    {
+        availableGpus=data.gpus;
+    }
+}
+
+async function loadAvailableModels()
+{
+    const data=await fetchJson('/api/server/startup-options?accelerator=vulkan');
+    if(data&&data.options)
+    {
+        availableModelOptions=data.options;
+    }
+
+    // Also try cuda
+    const cudaData=await fetchJson('/api/server/startup-options?accelerator=cuda');
+    if(cudaData&&cudaData.options)
+    {
+        for(const opt of cudaData.options)
+        {
+            if(!availableModelOptions.find(o=>o.model===opt.model&&o.variant===opt.variant))
+            {
+                availableModelOptions.push(opt);
+            }
+        }
+    }
+}
+
+function addStartupModel()
+{
+    startupModelsState.push({
+        model: '',
+        variant: '',
+        context_size: 0,
+        runtime_options: {},
+        devices: []
+    });
+    renderStartupModels();
+}
+
+async function removeStartupModel(index)
+{
+    const entry=startupModelsState[index];
+    const modelName=entry.model;
+    startupModelsState.splice(index, 1);
+    renderStartupModels();
+
+    // Unload the model if it was loaded
+    if(modelName)
+    {
+        try
+        {
+            await fetch('/api/models/'+encodeURIComponent(modelName)+'/unload', {method: 'POST'});
+        }
+        catch(e) {}
+    }
+
+    // Save config without this model
+    await saveAllStartupModels();
+}
+
+function updateStartupModelField(index, field, value)
+{
+    startupModelsState[index][field]=value;
+    if(field==='modelVariant')
+    {
+        const parts=value.split('||');
+        startupModelsState[index].model=parts[0]||'';
+        startupModelsState[index].variant=parts[1]||'';
+        delete startupModelsState[index].modelVariant;
+
+        // Set default context from model options
+        const opt=availableModelOptions.find(o=>o.model===parts[0]&&o.variant===(parts[1]||''));
+        if(opt&&opt.effective_context_size)
+        {
+            startupModelsState[index].context_size=opt.effective_context_size;
+        }
+        renderStartupModels();
+    }
+}
+
+function updateStartupModelDevice(index, gpuIndex, checked)
+{
+    const entry=startupModelsState[index];
+    if(checked)
+    {
+        if(!entry.devices.includes(gpuIndex))
+            entry.devices.push(gpuIndex);
+        entry.devices.sort();
+    }
+    else
+    {
+        entry.devices=entry.devices.filter(d=>d!==gpuIndex);
+    }
+}
+
+function updateStartupModelContext(index, value)
+{
+    startupModelsState[index].context_size=parseInt(value, 10)||0;
+    const label=document.getElementById('smCtxLabel_'+index);
+    if(label) label.textContent=formatContextSize(parseInt(value, 10));
+}
+
+function readStartupModelRuntimeOpts(index)
+{
+    const opts={};
+    const kvk=document.getElementById('smKvk_'+index);
+    const kvv=document.getElementById('smKvv_'+index);
+    const fa=document.getElementById('smFlashAttn_'+index);
+    const vkNhv=document.getElementById('smVkNoHostVis_'+index);
+    if(kvk&&kvk.value) opts.kv_cache_type_k=kvk.value;
+    if(kvv&&kvv.value) opts.kv_cache_type_v=kvv.value;
+    if(fa&&fa.value==='true') opts.flash_attn=true;
+    else if(fa&&fa.value==='false') opts.flash_attn=false;
+    if(vkNhv&&vkNhv.value==='true') opts.vulkan_no_host_visible_vram=true;
+    else if(vkNhv&&vkNhv.value==='false') opts.vulkan_no_host_visible_vram=false;
+    return opts;
+}
+
+function toggleStartupModelRuntime(index)
+{
+    const body=document.getElementById('smRuntimeBody_'+index);
+    const chevron=document.getElementById('smRuntimeChevron_'+index);
+    const isOpen=body.classList.toggle('open');
+    chevron.innerHTML=isOpen?'&#9660;':'&#9654;';
+}
+
+function renderStartupModels()
+{
+    const el=document.getElementById('startupModelsList');
+
+    if(startupModelsState.length===0)
+    {
+        el.innerHTML='<div style="color:#666;text-align:center;padding:16px;">No startup models configured. Click "+ Add Model" to add one.</div>';
+        return;
+    }
+
+    let html='';
+    for(let i=0; i<startupModelsState.length; i++)
+    {
+        const entry=startupModelsState[i];
+        const selectedValue=entry.model+'||'+entry.variant;
+
+        // Model selector
+        let modelOptions='<option value="">-- Select Model --</option>';
+        for(const opt of availableModelOptions)
+        {
+            const val=opt.model+'||'+(opt.variant||'');
+            const label=formatStartupModelLabel(opt.model, opt.variant);
+            const sel=val===selectedValue?' selected':'';
+            modelOptions+='<option value="'+escapeHtml(val)+'"'+sel+'>'+escapeHtml(label)+'</option>';
+        }
+
+        // Context slider
+        const modelOpt=availableModelOptions.find(o=>o.model===entry.model&&o.variant===entry.variant);
+        const maxCtx=modelOpt?modelOpt.max_context_size||131072:131072;
+        const ctxVal=entry.context_size>0?Math.min(entry.context_size, maxCtx):4096;
+        const ctxLabel=formatContextSize(ctxVal);
+
+        // Device checkboxes
+        let devicesHtml='';
+        for(const gpu of availableGpus)
+        {
+            const checked=entry.devices.includes(gpu.index)?'checked':'';
+            devicesHtml+='<label style="display:inline-flex;align-items:center;gap:4px;margin-right:12px;cursor:pointer;">'
+                +'<input type="checkbox" '+checked+' onchange="updateStartupModelDevice('+i+', '+gpu.index+', this.checked)">'
+                +'<span style="font-size:13px;">GPU'+gpu.index+': '+escapeHtml(gpu.name)+' ('+formatMb(gpu.vram_total_mb)+')</span>'
+                +'</label>';
+        }
+
+        // Runtime options
+        const ro=entry.runtime_options||{};
+
+        html+='<div style="border:1px solid #2a2d3a;border-radius:8px;padding:14px;margin-bottom:10px;background:#13151d;">'
+            +'<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:10px;">'
+            +'<span style="font-weight:600;color:#f1f4ff;">Model '+(i+1)+'</span>'
+            +'<div style="display:flex;gap:6px;">'
+            +'<button class="btn" style="font-size:12px;padding:4px 10px;" id="smSaveBtn_'+i+'" onclick="saveStartupModelEntry('+i+')">Save &amp; Load</button>'
+            +'<button class="btn btn-danger" style="font-size:12px;padding:4px 10px;" onclick="removeStartupModel('+i+')">Remove</button>'
+            +'</div>'
+            +'</div>'
+            +'<div class="settings-message" id="smStatus_'+i+'" style="margin-bottom:8px;"></div>'
+            +'<div style="margin-bottom:10px;">'
+            +'<label class="startup-field-label">Model &amp; Variant</label>'
+            +'<select style="width:100%;padding:8px;background:#1a1d27;color:#e0e0e0;border:1px solid #32384b;border-radius:6px;font-size:14px;" '
+            +'onchange="updateStartupModelField('+i+', \'modelVariant\', this.value)">'+modelOptions+'</select>'
+            +'</div>'
+            +'<div style="margin-bottom:10px;">'
+            +'<label class="startup-field-label">Context Size <span class="context-slider-value" id="smCtxLabel_'+i+'">'+ctxLabel+'</span></label>'
+            +'<input type="range" class="context-slider" min="4096" max="'+maxCtx+'" step="1024" value="'+ctxVal+'" '
+            +'oninput="updateStartupModelContext('+i+', this.value)">'
+            +'</div>'
+            +'<div style="margin-bottom:10px;">'
+            +'<label class="startup-field-label">Compute Devices</label>'
+            +'<div style="display:flex;flex-wrap:wrap;gap:4px;">'+devicesHtml+'</div>'
+            +(entry.devices.length===0?'<div style="color:#ff9800;font-size:12px;margin-top:4px;">No devices selected &mdash; auto-assignment will be used.</div>':'')
+            +'</div>'
+            +'<div class="runtime-opts-section">'
+            +'<button type="button" class="runtime-opts-toggle" onclick="toggleStartupModelRuntime('+i+')">Runtime Options <span id="smRuntimeChevron_'+i+'">&#9654;</span></button>'
+            +'<div class="runtime-opts-body" id="smRuntimeBody_'+i+'">'
+            +'<div class="runtime-opts-row"><span class="runtime-opts-label">KV Cache (K)</span>'
+            +'<select class="runtime-opts-select" id="smKvk_'+i+'"><option value="">Default</option><option value="f16"'+(ro.kv_cache_type_k==='f16'?' selected':'')+'>f16</option><option value="q8_0"'+(ro.kv_cache_type_k==='q8_0'?' selected':'')+'>q8_0</option><option value="q4_0"'+(ro.kv_cache_type_k==='q4_0'?' selected':'')+'>q4_0</option></select></div>'
+            +'<div class="runtime-opts-row"><span class="runtime-opts-label">KV Cache (V)</span>'
+            +'<select class="runtime-opts-select" id="smKvv_'+i+'"><option value="">Default</option><option value="f16"'+(ro.kv_cache_type_v==='f16'?' selected':'')+'>f16</option><option value="q8_0"'+(ro.kv_cache_type_v==='q8_0'?' selected':'')+'>q8_0</option><option value="q4_0"'+(ro.kv_cache_type_v==='q4_0'?' selected':'')+'>q4_0</option></select></div>'
+            +'<div class="runtime-opts-row"><span class="runtime-opts-label">Flash Attention</span>'
+            +'<select class="runtime-opts-select" id="smFlashAttn_'+i+'"><option value="">Default</option><option value="true"'+(ro.flash_attn===true?' selected':'')+'>Enabled</option><option value="false"'+(ro.flash_attn===false?' selected':'')+'>Disabled</option></select></div>'
+            +'<div class="runtime-opts-row"><span class="runtime-opts-label">No Host-Visible VRAM</span>'
+            +'<select class="runtime-opts-select" id="smVkNoHostVis_'+i+'"><option value="">Default</option><option value="true"'+(ro.vulkan_no_host_visible_vram===true?' selected':'')+'>Enabled</option><option value="false"'+(ro.vulkan_no_host_visible_vram===false?' selected':'')+'>Disabled</option></select></div>'
+            +'</div></div>'
+            +'</div>';
+    }
+
+    el.innerHTML=html;
+}
+
+async function saveAllStartupModels()
+{
+    // Collect runtime options from the UI before saving
+    for(let i=0; i<startupModelsState.length; i++)
+    {
+        startupModelsState[i].runtime_options=readStartupModelRuntimeOpts(i);
+    }
+
+    const models=startupModelsState.filter(e=>e.model).map(e=>({
+        model: e.model,
+        variant: e.variant||'',
+        context_size: e.context_size||0,
+        runtime_options: e.runtime_options||{},
+        devices: e.devices||[]
+    }));
+
+    try
+    {
+        const response=await fetch('/api/server/config', {
+            method: 'PUT',
+            headers: {'Content-Type': 'application/json'},
+            body: JSON.stringify({startup_models: models})
+        });
+        const data=await response.json();
+        if(response.ok) serverConfigState=data;
+        return response.ok;
+    }
+    catch(e)
+    {
+        return false;
+    }
+}
+
+function showModelStatus(index, text, state)
+{
+    const el=document.getElementById('smStatus_'+index);
+    if(!el) return;
+    el.textContent=text||'';
+    el.className='settings-message';
+    if(state==='success') el.classList.add('success');
+    if(state==='error') el.classList.add('error');
+}
+
+async function saveStartupModelEntry(index)
+{
+    const entry=startupModelsState[index];
+    if(!entry.model)
+    {
+        showModelStatus(index, 'Select a model first.', 'error');
+        return;
+    }
+
+    const btn=document.getElementById('smSaveBtn_'+index);
+    btn.disabled=true;
+
+    // Collect this entry's runtime opts from UI
+    entry.runtime_options=readStartupModelRuntimeOpts(index);
+
+    showModelStatus(index, 'Saving config...', '');
+
+    // Save all models to config
+    const saved=await saveAllStartupModels();
+    if(!saved)
+    {
+        showModelStatus(index, 'Failed to save config.', 'error');
+        btn.disabled=false;
+        return;
+    }
+
+    // Now load the model immediately
+    showModelStatus(index, 'Loading model...', '');
+
+    try
+    {
+        const body={
+            variant: entry.variant||'',
+            context_size: entry.context_size||0,
+            runtime_options: entry.runtime_options||{},
+            devices: entry.devices||[]
+        };
+
+        const resp=await fetch('/api/models/'+encodeURIComponent(entry.model)+'/load', {
+            method: 'POST',
+            headers: {'Content-Type': 'application/json'},
+            body: JSON.stringify(body)
+        });
+        const data=await resp.json();
+
+        if(resp.ok)
+        {
+            if(data.status==='downloading')
+            {
+                showModelStatus(index, 'Model is downloading. It will load automatically when ready.', 'success');
+            }
+            else
+            {
+                showModelStatus(index, 'Model loaded successfully.', 'success');
+            }
+        }
+        else
+        {
+            const errMsg=(data.error&&data.error.message)||'Load failed (HTTP '+resp.status+')';
+            showModelStatus(index, errMsg, 'error');
+        }
+    }
+    catch(e)
+    {
+        showModelStatus(index, 'Network error loading model.', 'error');
+    }
+    finally
+    {
+        btn.disabled=false;
+    }
+}
+
+// ── End New Startup Models Functions ──────────────────────────
+
+async function loadVersion()
+{
+    const data=await fetchJson('/api/version');
+    if(data)
+    {
+        document.getElementById('versionBadge').textContent='v'+data.version;
+    }
+}
+
+async function initializePage()
+{
+    await loadVersion();
+    loadGpuOverrides();
+    await loadAvailableGpus();
+
+    const config=await fetchJson('/api/server/config');
+    if(!config)
+    {
+        document.getElementById('statusDot').style.background='#ff4444';
+        document.getElementById('statusText').textContent='Disconnected';
+        showMessage('Failed to load server configuration.', 'error');
+        return;
+    }
+
+    document.getElementById('statusDot').style.background='#4caf50';
+    document.getElementById('statusText').textContent='Connected';
+
+    serverConfigState=config;
+
+    useStartupModels=true;
+    document.getElementById('startupModelsSection').style.display='';
+    document.getElementById('legacyAcceleratorSection').style.display='none';
+
+    await loadAvailableModels();
+
+    // Show exactly what's in the server config file
+    if(config.startup_models&&Array.isArray(config.startup_models))
+    {
+        startupModelsState=config.startup_models.map(e=>({
+            model: e.model||'',
+            variant: e.variant||'',
+            context_size: e.context_size||0,
+            runtime_options: e.runtime_options||{},
+            devices: e.devices||[]
+        }));
+    }
+    else
+    {
+        startupModelsState=[];
+    }
+
+    renderStartupModels();
+}
+
+initializePage();
+</script>
+</body>
+</html>)HTML";
+
+} // namespace server
+} // namespace arbiterAI
+
+#endif//_ARBITERAI_SERVER_DASHBOARDCONFIG_H_
\ No newline at end of file
diff --git a/src/server/main.cpp b/src/server/main.cpp
index 7fb4f67..4d91c27 100644
--- a/src/server/main.cpp
+++ b/src/server/main.cpp
@@ -3,6 +3,7 @@
 
 #include "arbiterAI/arbiterAI.h"
 #include "arbiterAI/hardwareDetector.h"
+#include "arbiterAI/modelManager.h"
 #include "arbiterAI/modelRuntime.h"
 #include "arbiterAI/storageManager.h"
 
@@ -15,12 +16,382 @@
 #include <iostream>
 #include <fstream>
 #include <string>
+#include <thread>
 #include <vector>
 #include <filesystem>
+#include <map>
 
 namespace
 {
 
+struct StartupDefaultSelection { // model choice from "startup_defaults" or the legacy default_model
+    std::string model; // empty means nothing is configured; selectStartupDefault skips such entries
+    std::string variant; // logged as "auto" when empty
+    int contextSize=0; // non-positive configured values are stored as 0
+    arbiterAI::RuntimeOptions runtimeOptions; // parsed from "runtime_options" when present
+};
+
+struct StartupModelEntry { // one element of the "startup_models" config array
+    std::string model; // entries with an empty model name are not kept by parseStartupModels
+    std::string variant; // logged as "auto" when empty
+    int contextSize=0; // non-positive configured values are stored as 0
+    arbiterAI::RuntimeOptions runtimeOptions; // parsed from "runtime_options" when present
+    std::vector<int> devices; // integer elements of "devices"; logged as "auto" when empty
+};
+
+// Builds RuntimeOptions from a startup entry's "runtime_options" JSON value. Keys that are
+// absent or hold an unexpected JSON type are skipped, leaving that option at its default.
+arbiterAI::RuntimeOptions parseStartupRuntimeOptions(const nlohmann::json &j)
+{
+    arbiterAI::RuntimeOptions opts;
+    if(!j.is_object()) return opts;
+
+    const auto field=[&j](const char *key) -> const nlohmann::json *
+    {
+        const auto it=j.find(key);
+        return it!=j.end()?&*it:nullptr;
+    };
+
+    if(const auto *v=field("flash_attn"); v&&v->is_boolean()) opts.flashAttn=v->get<bool>();
+    if(const auto *v=field("kv_cache_type_k"); v&&v->is_string()) opts.kvCacheTypeK=v->get<std::string>();
+    if(const auto *v=field("kv_cache_type_v"); v&&v->is_string()) opts.kvCacheTypeV=v->get<std::string>();
+    if(const auto *v=field("no_mmap"); v&&v->is_boolean()) opts.noMmap=v->get<bool>();
+    if(const auto *v=field("reasoning_budget"); v&&v->is_number_integer()) opts.reasoningBudget=v->get<int>();
+    if(const auto *v=field("swa_full"); v&&v->is_boolean()) opts.swaFull=v->get<bool>();
+    if(const auto *v=field("n_gpu_layers"); v&&v->is_number_integer()) opts.nGpuLayers=v->get<int>();
+    if(const auto *v=field("override_tensor"); v&&v->is_string()) opts.overrideTensor=v->get<std::string>();
+    if(const auto *v=field("vulkan_no_host_visible_vram"); v&&v->is_boolean()) opts.vulkanNoHostVisibleVram=v->get<bool>();
+    return opts;
+}
+
+int sanitizeContextSize(int contextSize) // negative sizes are clamped to 0
+{
+    return contextSize<0?0:contextSize;
+}
+
+// Lowercase copy of `value`; chars go through unsigned char so std::tolower never sees a negative.
+std::string toLowerCopy(const std::string &value)
+{
+    std::string lowered;
+    lowered.reserve(value.size());
+    for(const unsigned char ch:value)
+        lowered.push_back(static_cast<char>(std::tolower(ch)));
+    return lowered;
+}
+
+// Maps an accelerator name in any letter case to "cpu", "cuda" or "vulkan"; every other
+// input yields an empty string.
+std::string normalizeAcceleratorKey(const std::string &value)
+{
+    std::string lowered=toLowerCopy(value);
+    for(const char *known:{"cpu", "cuda", "vulkan"})
+    {
+        if(lowered==known) return lowered;
+    }
+    return "";
+}
+
+// Parses legacy "startup_defaults" into one selection per accelerator ("cpu", "cuda", "vulkan"
+// always present; keys match case-insensitively). Entries are a model-name string or an object;
+// wrongly-typed fields are ignored because json::value() would throw type_error on a mismatch.
+std::map<std::string, StartupDefaultSelection> parseStartupDefaults(const nlohmann::json &cfg)
+{
+    std::map<std::string, StartupDefaultSelection> startupDefaults={
+        {"cpu", StartupDefaultSelection{}},
+        {"cuda", StartupDefaultSelection{}},
+        {"vulkan", StartupDefaultSelection{}}
+    };
+
+    const auto defaultsIt=cfg.find("startup_defaults");
+    if(defaultsIt==cfg.end()||!defaultsIt->is_object()) return startupDefaults;
+
+    const auto stringField=[](const nlohmann::json &obj, const char *name) -> std::string
+    {
+        const auto field=obj.find(name);
+        return (field!=obj.end()&&field->is_string())?field->get<std::string>():std::string();
+    };
+
+    for(auto it=defaultsIt->begin(); it!=defaultsIt->end(); ++it)
+    {
+        const std::string key=normalizeAcceleratorKey(it.key());
+        if(key.empty()) continue;
+        const nlohmann::json &entry=it.value();
+        StartupDefaultSelection &selection=startupDefaults[key];
+        if(entry.is_string())
+        {
+            // Shorthand form, e.g. "cuda": "model-name".
+            selection.model=entry.get<std::string>();
+            selection.variant.clear();
+            selection.contextSize=0;
+            continue;
+        }
+        if(!entry.is_object()) continue;
+
+        selection.model=stringField(entry, "model");
+        selection.variant=stringField(entry, "variant");
+        const auto ctx=entry.find("context_size");
+        selection.contextSize=(ctx!=entry.end()&&ctx->is_number())?sanitizeContextSize(ctx->get<int>()):0;
+        if(entry.contains("runtime_options"))
+            selection.runtimeOptions=parseStartupRuntimeOptions(entry["runtime_options"]);
+    }
+
+    return startupDefaults;
+}
+
+// Parses the "startup_models" array: one entry per object element with a non-empty string
+// "model". Wrongly-typed fields are ignored (json::value() would throw type_error on them),
+// so a single malformed element cannot abort startup.
+std::vector<StartupModelEntry> parseStartupModels(const nlohmann::json &cfg)
+{
+    std::vector<StartupModelEntry> entries;
+
+    const auto listIt=cfg.find("startup_models");
+    if(listIt==cfg.end()||!listIt->is_array()) return entries;
+
+    for(const nlohmann::json &item:*listIt)
+    {
+        if(!item.is_object()) continue;
+
+        const auto modelIt=item.find("model");
+        if(modelIt==item.end()||!modelIt->is_string()) continue;
+
+        StartupModelEntry entry;
+        entry.model=modelIt->get<std::string>();
+        if(entry.model.empty()) continue;
+
+        if(const auto v=item.find("variant"); v!=item.end()&&v->is_string())
+            entry.variant=v->get<std::string>();
+        if(const auto c=item.find("context_size"); c!=item.end()&&c->is_number())
+            entry.contextSize=sanitizeContextSize(c->get<int>());
+        if(item.contains("runtime_options"))
+            entry.runtimeOptions=parseStartupRuntimeOptions(item["runtime_options"]);
+
+        // Optional explicit device indices; non-integer elements are dropped.
+        if(const auto d=item.find("devices"); d!=item.end()&&d->is_array())
+        {
+            for(const nlohmann::json &device:*d)
+            {
+                if(device.is_number_integer())
+                    entry.devices.push_back(device.get<int>());
+            }
+        }
+
+        entries.push_back(std::move(entry));
+    }
+
+    return entries;
+}
+
+// True when `accelerator` can be used on this machine: "cpu" always qualifies, "cuda" and
+// "vulkan" require at least one detected GPU with the matching backend, other names never do.
+bool hasAccelerator(const arbiterAI::SystemInfo &hw, const std::string &accelerator)
+{
+    if(accelerator=="cpu")
+    {
+        return true; // the CPU path needs no detected device
+    }
+
+    arbiterAI::GpuBackend wanted;
+    if(accelerator=="cuda") wanted=arbiterAI::GpuBackend::CUDA;
+    else if(accelerator=="vulkan") wanted=arbiterAI::GpuBackend::Vulkan;
+    else return false;
+
+    for(const arbiterAI::GpuInfo &gpu:hw.gpus)
+    {
+        if(gpu.backend==wanted) return true;
+    }
+
+    return false;
+}
+
+// Returns the accelerators to try at startup, most preferred first and without duplicates.
+// Available entries of `defaultBackendPriority` come first, followed by "cuda", "vulkan" and
+// "cpu" as fallbacks; accelerators that hasAccelerator() rejects are left out.
+std::vector<std::string> buildStartupAcceleratorOrder(
+    const arbiterAI::SystemInfo &hw,
+    const std::vector<std::string> &defaultBackendPriority)
+{
+    std::vector<std::string> candidates;
+    candidates.reserve(defaultBackendPriority.size()+3);
+    for(const std::string &backend:defaultBackendPriority)
+    {
+        candidates.push_back(normalizeAcceleratorKey(backend));
+    }
+    // Built-in fallbacks, considered after the configured priority list.
+    for(const char *fallback:{"cuda", "vulkan", "cpu"}) candidates.emplace_back(fallback);
+
+    std::vector<std::string> order;
+    for(const std::string &candidate:candidates)
+    {
+        const bool usable=!candidate.empty()&&hasAccelerator(hw, candidate);
+        const bool alreadyListed=std::find(order.begin(), order.end(), candidate)!=order.end();
+        if(usable&&!alreadyListed)
+        {
+            order.push_back(candidate);
+        }
+    }
+
+    return order;
+}
+
+// Returns the first "startup_defaults" entry, in accelerator order, that names a model and
+// reports its accelerator via `selectedAccelerator`; else clears it and returns the legacy pair.
+StartupDefaultSelection selectStartupDefault(
+    const arbiterAI::SystemInfo &hw,
+    const std::map<std::string, StartupDefaultSelection> &startupDefaults,
+    const std::vector<std::string> &defaultBackendPriority,
+    const std::string &legacyDefaultModel,
+    const std::string &legacyDefaultVariant,
+    std::string &selectedAccelerator)
+{
+    const std::vector<std::string> order=buildStartupAcceleratorOrder(hw, defaultBackendPriority);
+    for(const std::string &accelerator:order)
+    {
+        const auto match=startupDefaults.find(accelerator);
+        if(match==startupDefaults.end()||match->second.model.empty()) continue;
+        selectedAccelerator=accelerator;
+        return match->second;
+    }
+    selectedAccelerator.clear();
+    return StartupDefaultSelection{legacyDefaultModel, legacyDefaultVariant, 0};
+}
+
+// Starts a detached background thread that waits for the startup default model's download
+// and then loads it. The model state is polled every 2 seconds, at most 300 times; the thread
+// ends on success, on a load failure, when the model entry vanishes, when an unexpected state
+// is seen, or after the last poll. `accelerator` only labels log lines (empty = legacy startup).
+void scheduleStartupLoadAfterDownload(
+    const StartupDefaultSelection &selection,
+    const std::string &accelerator)
+{
+    // Captured by value because the detached thread may outlive this call.
+    std::thread worker([selection, accelerator]()
+    {
+        constexpr int maxPolls=300;
+        const auto pollInterval=std::chrono::seconds(2);
+        const std::string target=accelerator.empty()?"legacy startup":accelerator;
+        std::string variant=selection.variant;
+
+        for(int poll=0; poll<maxPolls; ++poll)
+        {
+            std::this_thread::sleep_for(pollInterval);
+
+            const std::optional<arbiterAI::LoadedModel> current=
+                arbiterAI::ModelRuntime::instance().getModelState(selection.model);
+            if(!current)
+            {
+                spdlog::warn("Startup default model '{}' disappeared while waiting for download to finish", selection.model);
+                return;
+            }
+
+            // Prefer the variant reported by the runtime over the configured one.
+            if(!current->variant.empty()) variant=current->variant;
+
+            const auto phase=current->state;
+            if(phase==arbiterAI::ModelState::Downloading) continue;
+
+            if(phase==arbiterAI::ModelState::Loaded||phase==arbiterAI::ModelState::Ready)
+            {
+                spdlog::info("Startup default model '{}' is ready after background download for {}",
+                    selection.model, target);
+                return;
+            }
+
+            // Only Unloaded is handled below; any other state ends the wait.
+            if(phase!=arbiterAI::ModelState::Unloaded)
+            {
+                spdlog::warn("Startup default model '{}' ended in unexpected state {} after download",
+                    selection.model, static_cast<int>(phase));
+                return;
+            }
+
+            spdlog::info("Startup default model '{}' finished downloading; loading now for {}",
+                selection.model, target);
+
+            // Pass a local copy: the captured selection is const inside this lambda.
+            arbiterAI::RuntimeOptions opts=selection.runtimeOptions;
+            const arbiterAI::ErrorCode loadErr=arbiterAI::ArbiterAI::instance().loadModel(
+                selection.model, variant, selection.contextSize, &opts);
+
+            // loadModel reports the download as still running: keep polling.
+            if(loadErr==arbiterAI::ErrorCode::ModelDownloading) continue;
+
+            if(loadErr==arbiterAI::ErrorCode::Success)
+            {
+                spdlog::info("Startup default model '{}' loaded successfully after download", selection.model);
+            }
+            else
+            {
+                spdlog::warn("Failed to load startup default model '{}' after download (error={})",
+                    selection.model, static_cast<int>(loadErr));
+            }
+            return;
+        }
+
+        // Every poll was used without reaching a final state.
+        spdlog::warn("Timed out waiting for startup default model '{}' to finish downloading", selection.model);
+    });
+    worker.detach();
+}
+
+// Detached waiter for one "startup_models" entry: polls the model state every 2 seconds (up to
+// 300 polls) and, once it is Unloaded, calls loadModel with the entry's options and device list.
+void scheduleStartupModelLoadAfterDownload(const StartupModelEntry &entry)
+{
+    std::thread worker([entry]()
+    {
+        constexpr int maxPolls=300;
+        std::string variant=entry.variant;
+
+        for(int poll=0; poll<maxPolls; ++poll)
+        {
+            std::this_thread::sleep_for(std::chrono::seconds(2));
+
+            const std::optional<arbiterAI::LoadedModel> current=
+                arbiterAI::ModelRuntime::instance().getModelState(entry.model);
+            if(!current)
+            {
+                spdlog::warn("Startup model '{}' disappeared while waiting for download", entry.model);
+                return;
+            }
+
+            if(!current->variant.empty()) variant=current->variant;
+
+            const auto phase=current->state;
+            if(phase==arbiterAI::ModelState::Downloading) continue;
+
+            const bool ready=phase==arbiterAI::ModelState::Loaded||phase==arbiterAI::ModelState::Ready;
+            if(ready)
+            {
+                spdlog::info("Startup model '{}' is ready after background download", entry.model);
+                return;
+            }
+            if(phase!=arbiterAI::ModelState::Unloaded)
+            {
+                spdlog::warn("Startup model '{}' ended in unexpected state {}", entry.model, static_cast<int>(phase));
+                return;
+            }
+
+            spdlog::info("Startup model '{}' finished downloading; loading now", entry.model);
+
+            arbiterAI::RuntimeOptions opts=entry.runtimeOptions;
+            const arbiterAI::ErrorCode loadErr=arbiterAI::ArbiterAI::instance().loadModel(
+                entry.model, variant, entry.contextSize, &opts, entry.devices);
+
+            if(loadErr==arbiterAI::ErrorCode::ModelDownloading) continue;
+
+            if(loadErr==arbiterAI::ErrorCode::Success)
+                spdlog::info("Startup model '{}' loaded successfully after download", entry.model);
+            else
+                spdlog::warn("Failed to load startup model '{}' after download (error={})", entry.model, static_cast<int>(loadErr));
+            return;
+        }
+
+        spdlog::warn("Timed out waiting for startup model '{}' to finish downloading", entry.model);
+    });
+    worker.detach();
+}
+
 int64_t parseStorageLimit(const std::string &str)
 {
     if(str.empty()||str=="0") return 0;
@@ -137,6 +508,7 @@ int main(int argc, char *argv[])
     std::string modelsDir=cfg.value("models_dir", "/models");
     std::string defaultModel=cfg.value("default_model", "");
     std::string defaultVariant=cfg.value("default_variant", "");
+    std::map<std::string, StartupDefaultSelection> startupDefaults=parseStartupDefaults(cfg);
     std::string overridePath=cfg.value("override_path", "");
     std::string injectedConfigDir=cfg.value("injected_config_dir", "");
     int ramBudget=cfg.value("ram_budget_mb", 0);
@@ -294,23 +666,96 @@ int main(int argc, char *argv[])
         spdlog::info("Max concurrent downloads set to {}", maxDownloads);
     }
 
-    // ── Load default model ───────────────────────────────────────
-    if(!defaultModel.empty())
-    {
-        spdlog::info("Loading default model: {} (variant: {})", defaultModel, defaultVariant.empty()?"auto":defaultVariant);
-        arbiterAI::ErrorCode loadErr=ai.loadModel(defaultModel, defaultVariant);
+    // ── Load startup models ─────────────────────────────────────
+    arbiterAI::HardwareDetector::instance().refresh();
+    arbiterAI::SystemInfo startupHardware=arbiterAI::HardwareDetector::instance().getSystemInfo();
 
-        if(loadErr==arbiterAI::ErrorCode::Success)
-        {
-            spdlog::info("Default model '{}' loaded successfully", defaultModel);
-        }
-        else if(loadErr==arbiterAI::ErrorCode::ModelDownloading)
+    // New format: startup_models array (preferred)
+    std::vector<StartupModelEntry> startupModels=parseStartupModels(cfg);
+
+    if(!startupModels.empty())
+    {
+        for(const StartupModelEntry &entry:startupModels)
         {
-            spdlog::info("Default model '{}' is downloading...", defaultModel);
+            std::string devicesStr;
+            if(!entry.devices.empty())
+            {
+                for(size_t i=0; i<entry.devices.size(); ++i)
+                {
+                    if(i>0) devicesStr+=", ";
+                    devicesStr+=std::to_string(entry.devices[i]);
+                }
+            }
+            else
+            {
+                devicesStr="auto";
+            }
+
+            spdlog::info("Loading startup model: {} (variant: {}, devices: [{}])",
+                entry.model, entry.variant.empty()?"auto":entry.variant, devicesStr);
+
+            arbiterAI::RuntimeOptions startupOpts=entry.runtimeOptions;
+            arbiterAI::ErrorCode loadErr=ai.loadModel(
+                entry.model, entry.variant, entry.contextSize, &startupOpts, entry.devices);
+
+            if(loadErr==arbiterAI::ErrorCode::Success)
+            {
+                spdlog::info("Startup model '{}' loaded successfully", entry.model);
+            }
+            else if(loadErr==arbiterAI::ErrorCode::ModelDownloading)
+            {
+                spdlog::info("Startup model '{}' is downloading...", entry.model);
+                scheduleStartupModelLoadAfterDownload(entry);
+            }
+            else
+            {
+                spdlog::warn("Failed to load startup model '{}' (error={})", entry.model, static_cast<int>(loadErr));
+            }
         }
-        else
+    }
+    else
+    {
+        // Legacy format: startup_defaults keyed by accelerator
+        std::string selectedAccelerator;
+        StartupDefaultSelection startupSelection=selectStartupDefault(
+            startupHardware,
+            startupDefaults,
+            defaultBackendPriority,
+            defaultModel,
+            defaultVariant,
+            selectedAccelerator);
+
+        if(!startupSelection.model.empty())
         {
-            spdlog::warn("Failed to load default model '{}' (error={})", defaultModel, static_cast<int>(loadErr));
+            if(selectedAccelerator.empty())
+            {
+                spdlog::info("Loading legacy default model: {} (variant: {})", startupSelection.model, startupSelection.variant.empty()?"auto":startupSelection.variant);
+            }
+            else
+            {
+                spdlog::info("Loading startup default model for {}: {} (variant: {})", selectedAccelerator, startupSelection.model, startupSelection.variant.empty()?"auto":startupSelection.variant);
+            }
+
+            arbiterAI::RuntimeOptions startupOpts=startupSelection.runtimeOptions;
+            arbiterAI::ErrorCode loadErr=ai.loadModel(
+                startupSelection.model,
+                startupSelection.variant,
+                startupSelection.contextSize,
+                &startupOpts);
+
+            if(loadErr==arbiterAI::ErrorCode::Success)
+            {
+                spdlog::info("Startup default model '{}' loaded successfully", startupSelection.model);
+            }
+            else if(loadErr==arbiterAI::ErrorCode::ModelDownloading)
+            {
+                spdlog::info("Startup default model '{}' is downloading...", startupSelection.model);
+                scheduleStartupLoadAfterDownload(startupSelection, selectedAccelerator);
+            }
+            else
+            {
+                spdlog::warn("Failed to load startup default model '{}' (error={})", startupSelection.model, static_cast<int>(loadErr));
+            }
         }
     }
 
@@ -318,6 +763,7 @@ int main(int argc, char *argv[])
     httplib::Server server;
 
     arbiterAI::server::registerRoutes(server);
+    arbiterAI::server::setServerConfigPath(configPath);
 
     if(!overridePath.empty())
     {
@@ -360,6 +806,7 @@ int main(int argc, char *argv[])
     spdlog::info("  POST /api/storage/cleanup/run     - Run cleanup");
     spdlog::info("  GET  /api/downloads          - Active downloads");
     spdlog::info("  GET  /dashboard              - Live dashboard");
+    spdlog::info("  GET  /dashboard/config       - Startup configuration");
 
     spdlog::info("Starting server on {}:{}", host, port);
     spdlog::info("Dashboard: http://{}:{}/dashboard", host=="0.0.0.0"?"localhost":host, port);
diff --git a/src/server/routes.cpp b/src/server/routes.cpp
index 843b88b..7fd8dc2 100644
--- a/src/server/routes.cpp
+++ b/src/server/routes.cpp
@@ -1,5 +1,6 @@
 #include "routes.h"
 #include "dashboard.h"
+#include "dashboardConfig.h"
 #include "logBuffer.h"
 
 #include "arbiterAI/arbiterAI.h"
@@ -12,10 +13,13 @@
 
 #include <nlohmann/json.hpp>
 #include <spdlog/spdlog.h>
+#include <algorithm>
 #include <ctime>
+#include <fstream>
 #include <random>
 #include <sstream>
 #include <iomanip>
+#include <mutex>
 
 namespace arbiterAI
 {
@@ -26,6 +30,643 @@ namespace
 {
 
 std::string g_overridePath;
+std::string g_serverConfigPath;
+std::mutex g_serverConfigMutex;
+constexpr const char *STARTUP_ACCELERATOR_CPU="cpu";
+constexpr const char *STARTUP_ACCELERATOR_CUDA="cuda";
+constexpr const char *STARTUP_ACCELERATOR_VULKAN="vulkan";
+
+int sanitizeContextSize(int contextSize) // non-positive sizes collapse to 0
+{
+    return contextSize<=0?0:contextSize;
+}
+
+// Returns `value` lower-cased with std::tolower, one char at a time (via unsigned char).
+std::string toLowerCopy(const std::string &value)
+{
+    std::string result(value);
+
+    for(char &ch:result)
+        ch=static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    return result;
+}
+
+// Canonicalises an accelerator name: returns the matching STARTUP_ACCELERATOR_* constant for
+// "cpu", "cuda" or "vulkan" in any letter case, and an empty string for every other input.
+std::string normalizeAcceleratorKey(const std::string &value)
+{
+    const std::string lowered=toLowerCopy(value);
+
+    const char *const known[]={
+        STARTUP_ACCELERATOR_CPU,
+        STARTUP_ACCELERATOR_CUDA,
+        STARTUP_ACCELERATOR_VULKAN
+    };
+
+    for(const char *candidate:known)
+    {
+        if(lowered==candidate) return candidate;
+    }
+
+    return "";
+}
+
+// Returns the "startup_defaults" skeleton: an empty selection for each of cpu, cuda and vulkan.
+nlohmann::json defaultStartupDefaultsJson()
+{
+    const nlohmann::json emptySelection={
+        {"model", ""}, {"variant", ""}, {"context_size", 0}, {"runtime_options", nlohmann::json::object()}};
+    return {{STARTUP_ACCELERATOR_CPU, emptySelection}, {STARTUP_ACCELERATOR_CUDA, emptySelection},
+        {STARTUP_ACCELERATOR_VULKAN, emptySelection}};
+}
+
+// Normalises "startup_defaults" to the shape of defaultStartupDefaultsJson(): the cpu, cuda and
+// vulkan keys, each with "model", "variant", "context_size" and "runtime_options". Unknown keys
+// are dropped; wrongly-typed fields fall back to defaults (json::value() would throw type_error).
+nlohmann::json sanitizeStartupDefaults(const nlohmann::json &startupDefaults)
+{
+    nlohmann::json sanitized=defaultStartupDefaultsJson();
+
+    if(!startupDefaults.is_object()) return sanitized;
+
+    const auto stringField=[](const nlohmann::json &entry, const char *name) -> std::string
+    {
+        const auto field=entry.find(name);
+        return (field!=entry.end()&&field->is_string())?field->get<std::string>():std::string();
+    };
+
+    for(auto it=startupDefaults.begin(); it!=startupDefaults.end(); ++it)
+    {
+        const std::string key=normalizeAcceleratorKey(it.key());
+        const nlohmann::json &entry=it.value();
+        if(key.empty()||(!entry.is_string()&&!entry.is_object())) continue;
+
+        // A bare string is shorthand for {"model": <string>} with every other field defaulted.
+        const bool shorthand=entry.is_string();
+        int contextSize=0;
+        nlohmann::json runtimeOptions=nlohmann::json::object();
+        if(!shorthand)
+        {
+            const auto ctx=entry.find("context_size");
+            if(ctx!=entry.end()&&ctx->is_number()) contextSize=sanitizeContextSize(ctx->get<int>());
+            const auto opts=entry.find("runtime_options");
+            if(opts!=entry.end()&&opts->is_object()) runtimeOptions=*opts;
+        }
+
+        sanitized[key]={
+            {"model", shorthand?entry.get<std::string>():stringField(entry, "model")},
+            {"variant", shorthand?std::string():stringField(entry, "variant")},
+            {"context_size", contextSize},
+            {"runtime_options", runtimeOptions}
+        };
+    }
+
+    return sanitized;
+}
+
+bool hasAccelerator(const SystemInfo &hw, const std::string &accelerator);
+
+// Context size to assume when none was requested: the model's context-scaling base context if
+// positive, otherwise its context window (or 0 when that is not positive either).
+int defaultStartupContextSize(const ModelInfo &model)
+{
+    const bool hasBase=model.contextScaling.has_value()&&model.contextScaling->baseContext>0;
+    if(hasBase) return model.contextScaling->baseContext;
+
+    return model.contextWindow>0?model.contextWindow:0;
+}
+
+// Resolves the context size used for the startup estimates. A positive request is returned
+// unchanged; zero or a negative request falls back to defaultStartupContextSize(model), which
+// itself may return 0.
+int effectiveStartupContextSize(const ModelInfo &model, int requestedContextSize)
+{
+    return requestedContextSize>0
+        ? requestedContextSize
+        : defaultStartupContextSize(model);
+}
+
+// Returns a copy of `hw` whose GPU list only keeps devices for `accelerator`: none for "cpu",
+// CUDA-backend GPUs for "cuda", Vulkan-backend GPUs for "vulkan". Unknown names keep no GPUs.
+SystemInfo filterSystemInfoForAccelerator(const SystemInfo &hw, const std::string &accelerator)
+{
+    SystemInfo filtered=hw;
+    filtered.gpus.clear();
+
+    const bool wantCuda=accelerator==STARTUP_ACCELERATOR_CUDA;
+    const bool wantVulkan=accelerator==STARTUP_ACCELERATOR_VULKAN;
+    // Covers "cpu" as well as any unrecognised accelerator name.
+    if(!wantCuda&&!wantVulkan)
+    {
+        return filtered;
+    }
+
+    for(const GpuInfo &gpu:hw.gpus)
+    {
+        const bool keep=(wantCuda&&gpu.backend==GpuBackend::CUDA)
+            ||(wantVulkan&&gpu.backend==GpuBackend::Vulkan);
+        if(keep) filtered.gpus.push_back(gpu);
+    }
+
+    return filtered;
+}
+
+// Adds up the free GPU memory across `hw.gpus`. A unified-memory GPU that reports a positive
+// GPU-accessible free RAM figure contributes that figure; every other GPU contributes its
+// free VRAM.
+int sumEffectiveFreeMemoryMb(const SystemInfo &hw)
+{
+    int freeMb=0;
+
+    for(const GpuInfo &gpu:hw.gpus)
+    {
+        const bool useAccessibleRam=gpu.unifiedMemory&&gpu.gpuAccessibleRamFreeMb>0;
+        if(useAccessibleRam)
+            freeMb+=gpu.gpuAccessibleRamFreeMb;
+        else
+            freeMb+=gpu.vramFreeMb;
+    }
+
+    return freeMb;
+}
+
+// Adds up the total GPU memory across `hw.gpus`. A unified-memory GPU that reports a positive
+// GPU-accessible RAM size contributes that size; every other GPU contributes its total
+// VRAM.
+int sumEffectiveTotalMemoryMb(const SystemInfo &hw)
+{
+    int totalMb=0;
+
+    for(const GpuInfo &gpu:hw.gpus)
+    {
+        if(!gpu.unifiedMemory||!(gpu.gpuAccessibleRamMb>0))
+        {
+            totalMb+=gpu.vramTotalMb;
+            continue;
+        }
+        totalMb+=gpu.gpuAccessibleRamMb;
+    }
+    return totalMb;
+}
+
+// Copy of `hw` with every free-memory figure replaced by the corresponding total: free RAM,
+// free VRAM and, for unified-memory GPUs with a positive GPU-accessible RAM size, the
+// GPU-accessible free RAM.
+SystemInfo asStartupSystemInfo(const SystemInfo &hw)
+{
+    SystemInfo result=hw;
+    result.freeRamMb=result.totalRamMb;
+
+    for(GpuInfo &gpu:result.gpus)
+    {
+        gpu.vramFreeMb=gpu.vramTotalMb;
+        const bool unifiedWithRam=gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0;
+        if(unifiedWithRam) gpu.gpuAccessibleRamFreeMb=gpu.gpuAccessibleRamMb;
+    }
+
+    return result;
+}
+
+// Estimates the VRAM needed at startup: the variant's minimum VRAM plus, when the model has
+// context scaling with a positive per-1k cost, that cost for every started 1024-token chunk
+// of context beyond the base context.
+int estimateStartupRequiredVramMb(
+    const ModelInfo &model,
+    const ModelVariant &variant,
+    int requestedContextSize)
+{
+    int requiredVramMb=variant.minVramMb;
+
+    if(!model.contextScaling.has_value()||!(model.contextScaling->vramPer1kContextMb>0))
+    {
+        return requiredVramMb;
+    }
+
+    const int baseContext=model.contextScaling->baseContext;
+    const int effectiveContext=std::max(effectiveStartupContextSize(model, requestedContextSize), baseContext);
+    const int extraContext=effectiveContext-baseContext;
+    if(extraContext>0)
+    {
+        const int extraChunks=(extraContext+1023)/1024;
+        requiredVramMb+=extraChunks*model.contextScaling->vramPer1kContextMb;
+    }
+
+    return requiredVramMb;
+}
+
+// Estimates the system RAM needed at startup: the larger of the model's minimum system RAM
+// requirement (0 when unspecified) and the variant's file size, plus the per-1k context cost
+// for every started 1024-token chunk of context beyond the base context.
+int estimateStartupRequiredRamMb(
+    const ModelInfo &model,
+    const ModelVariant &variant,
+    int requestedContextSize)
+{
+    int requiredRamMb=std::max(
+        model.hardwareRequirements.has_value()?model.hardwareRequirements->minSystemRamMb:0,
+        variant.fileSizeMb);
+
+    if(!model.contextScaling.has_value()||!(model.contextScaling->vramPer1kContextMb>0))
+    {
+        return requiredRamMb;
+    }
+
+    const int baseContext=model.contextScaling->baseContext;
+    const int effectiveContext=std::max(effectiveStartupContextSize(model, requestedContextSize), baseContext);
+    const int extraContext=effectiveContext-baseContext;
+    if(extraContext>0)
+    {
+        const int extraChunks=(extraContext+1023)/1024;
+        requiredRamMb+=extraChunks*model.contextScaling->vramPer1kContextMb;
+    }
+
+    return requiredRamMb;
+}
+
+// Human-readable label for a compatibility code. "likely", "tight", "cloud" and "undetected"
+// have dedicated labels; every other code, including "unlikely", is labelled "Unlikely".
+// Matching is exact and case-sensitive.
+std::string startupCompatibilityLabel(const std::string &compatibility)
+{
+    struct LabelEntry { const char *code; const char *label; };
+
+    static const LabelEntry labels[]={
+        {"likely", "Likely"},
+        {"tight", "Tight fit"},
+        {"cloud", "Cloud"},
+        {"undetected", "No device"}
+    };
+
+    for(const LabelEntry &entry:labels)
+    {
+        if(compatibility==entry.code) return entry.label;
+    }
+
+    return "Unlikely";
+}
+
+// Sort key for a compatibility code, lowest first: "likely" (0), "tight" (1), "cloud" (2),
+// "unlikely" (3). The rank of a known code is its position in the table below; matching is
+// exact and case-sensitive.
+int startupCompatibilitySortRank(const std::string &compatibility)
+{
+    static const char *const ranked[]={
+        "likely", "tight", "cloud", "unlikely"
+    };
+    constexpr int rankedCount=4;
+
+    for(int rank=0; rank<rankedCount; ++rank)
+    {
+        if(compatibility==ranked[rank])
+        {
+            return rank;
+        }
+    }
+
+    // Unknown codes, including "undetected", sort last.
+    return rankedCount;
+}
+
+// Builds the JSON option describing how `variantName` of `model` would run on `accelerator`
+// ("cpu", "cuda" or "vulkan") at the requested context size. Memory figures use total rather
+// than currently free memory (asStartupSystemInfo). A model with no variants keeps the "cloud"
+// defaults. "compatibility" is "likely" or "tight" only when "can_run" is true, which includes
+// rejecting a context-scaled VRAM estimate that exceeds the accelerator's total VRAM.
+nlohmann::json buildStartupOptionJson(
+    const std::string &accelerator,
+    const SystemInfo &hw,
+    const ModelInfo &model,
+    const std::string &variantName,
+    int requestedContextSize)
+{
+    SystemInfo startupHw=asStartupSystemInfo(hw);
+
+    nlohmann::json option={
+        {"model", model.model},
+        {"variant", variantName},
+        {"provider", model.provider},
+        {"requested_context_size", sanitizeContextSize(requestedContextSize)},
+        {"effective_context_size", effectiveStartupContextSize(model, requestedContextSize)},
+        {"max_context_size", 0},
+        {"required_vram_mb", 0},
+        {"required_ram_mb", 0},
+        {"available_vram_mb", 0},
+        {"available_ram_mb", startupHw.freeRamMb},
+        {"can_run", true},
+        {"compatibility", "cloud"},
+        {"compatibility_label", "Cloud"},
+        {"compatibility_reason", "Provider-managed model; no local download or VRAM requirement."},
+        {"sort_rank", startupCompatibilitySortRank("cloud")}
+    };
+
+    if(model.variants.empty()) return option; // no variants: keep the cloud defaults set above
+
+    const auto setVerdict=[&option](bool canRun, const std::string &compatibility, const std::string &reason, auto maxContextSize)
+    {
+        option["max_context_size"]=maxContextSize;
+        option["can_run"]=canRun;
+        option["compatibility"]=compatibility;
+        option["compatibility_label"]=startupCompatibilityLabel(compatibility);
+        option["compatibility_reason"]=reason;
+        option["sort_rank"]=startupCompatibilitySortRank(compatibility);
+    };
+
+    const ModelVariant *selectedVariant=nullptr;
+    for(const ModelVariant &candidate:model.variants)
+    {
+        if(candidate.quantization==variantName)
+        {
+            selectedVariant=&candidate;
+            break;
+        }
+    }
+
+    if(!selectedVariant)
+    {
+        setVerdict(false, "unlikely", "Variant metadata is missing from the live model catalog.", 0);
+        return option;
+    }
+
+    SystemInfo acceleratorHw=filterSystemInfoForAccelerator(startupHw, accelerator);
+    bool acceleratorDetected=hasAccelerator(hw, accelerator);
+    int availableVramMb=sumEffectiveTotalMemoryMb(acceleratorHw);
+    int requiredVramMb=estimateStartupRequiredVramMb(model, *selectedVariant, requestedContextSize);
+    int requiredRamMb=estimateStartupRequiredRamMb(model, *selectedVariant, requestedContextSize);
+    int desiredContextSize=effectiveStartupContextSize(model, requestedContextSize);
+
+    option["required_vram_mb"]=requiredVramMb;
+    option["required_ram_mb"]=requiredRamMb;
+    option["available_vram_mb"]=availableVramMb;
+    option["available_ram_mb"]=startupHw.freeRamMb;
+    option["base_memory_mb"]=static_cast<int>(selectedVariant->minVramMb);
+    option["base_context_size"]=model.contextScaling.has_value()?model.contextScaling->baseContext:0;
+    option["memory_per_1k_context_mb"]=model.contextScaling.has_value()?model.contextScaling->vramPer1kContextMb:0;
+
+    if(accelerator==STARTUP_ACCELERATOR_CPU)
+    {
+        int maxContextSize=model.contextScaling.has_value()
+            ? model.contextScaling->maxContext
+            : model.contextWindow;
+        bool contextFits=maxContextSize<=0||desiredContextSize<=0||desiredContextSize<=maxContextSize;
+        bool canRun=requiredRamMb<=startupHw.freeRamMb&&contextFits;
+        std::string compatibility=canRun
+            ? (requiredRamMb>=static_cast<int>(startupHw.freeRamMb*0.85f)?"tight":"likely")
+            : "unlikely";
+        std::string reason;
+
+        if(canRun)
+        {
+            reason="Fits in total system RAM for CPU startup.";
+        }
+        else if(!contextFits)
+        {
+            reason="Requested context exceeds the CPU startup limit.";
+        }
+        else
+        {
+            reason="Needs more system RAM than the device has for CPU startup.";
+        }
+
+        setVerdict(canRun, compatibility, reason, maxContextSize);
+        return option;
+    }
+
+    ModelFit fit=ModelFitCalculator::calculateModelFit(model, *selectedVariant, acceleratorHw);
+    bool contextFits=fit.maxContextSize<=0||desiredContextSize<=0||desiredContextSize<=fit.maxContextSize;
+    bool wouldFallbackToCpu=fit.canRun&&fit.gpuIndices.empty();
+    bool vramFits=requiredVramMb<=availableVramMb;
+    bool canRun=acceleratorDetected&&fit.canRun&&!wouldFallbackToCpu&&contextFits&&vramFits;
+    std::string compatibility;
+    std::string reason;
+
+    if(!acceleratorDetected)
+    {
+        compatibility="undetected";
+        reason="No compatible accelerator is currently detected for this startup slot.";
+    }
+    else if(wouldFallbackToCpu)
+    {
+        compatibility="unlikely";
+        reason="Total VRAM would force a CPU fallback instead of using this accelerator.";
+    }
+    else if(!fit.canRun)
+    {
+        compatibility="unlikely";
+        reason=fit.limitingFactor=="ram"
+            ? "Insufficient system RAM for this model on the device."
+            : "Insufficient total VRAM on the device.";
+    }
+    else if(!contextFits)
+    {
+        compatibility="unlikely";
+        reason="Requested context is higher than the model can sustain on the device.";
+    }
+    else if(!vramFits)
+    {
+        // can_run is already false in this case, so it must not be reported as a "tight" fit.
+        compatibility="unlikely";
+        reason="Requested context needs more VRAM than the device has.";
+    }
+    else if(requiredVramMb>=static_cast<int>(availableVramMb*0.85f))
+    {
+        compatibility="tight";
+        reason="Fits, but VRAM is tight for the requested context.";
+    }
+    else
+    {
+        compatibility="likely";
+        reason="Fits comfortably on the device for the requested context.";
+    }
+
+    setVerdict(canRun, compatibility, reason, fit.maxContextSize);
+    return option;
+}
+
+/// Extract `hardware.default_backend_priority` from the server config, normalized with toLowerCopy.
+/// Yields an empty list when the key is missing or not an array; non-string items are skipped.
+std::vector<std::string> parseDefaultBackendPriority(const nlohmann::json &cfg)
+{
+    std::vector<std::string> backends;
+    const nlohmann::json hardware=cfg.value("hardware", nlohmann::json::object());
+    if(!hardware.contains("default_backend_priority")||!hardware["default_backend_priority"].is_array())
+    {
+        return backends;
+    }
+    for(const nlohmann::json &item:hardware["default_backend_priority"])
+    {
+        if(!item.is_string())
+            continue;
+        backends.push_back(toLowerCopy(item.get<std::string>()));
+    }
+    return backends;
+}
+
+/// Report whether the detected hardware can serve the given startup accelerator key.
+/// The CPU key is always accepted; CUDA and Vulkan need at least one GPU with that backend.
+bool hasAccelerator(const SystemInfo &hw, const std::string &accelerator)
+{
+    if(accelerator==STARTUP_ACCELERATOR_CPU)
+    {
+        return true;
+    }
+
+    // The requested key is the same for every GPU, so classify it once before the scan.
+    const bool wantsCuda=(accelerator==STARTUP_ACCELERATOR_CUDA);
+    const bool wantsVulkan=(accelerator==STARTUP_ACCELERATOR_VULKAN);
+    for(const GpuInfo &device:hw.gpus)
+    {
+        const bool cudaMatch=wantsCuda&&device.backend==GpuBackend::CUDA;
+        const bool vulkanMatch=wantsVulkan&&device.backend==GpuBackend::Vulkan;
+        if(cudaMatch||vulkanMatch)
+            return true;
+    }
+    return false;
+}
+
+/// Produce the order in which startup accelerators are considered on this machine.
+/// Configured backends come first, then CUDA, Vulkan and CPU; unavailable or repeated keys are dropped.
+std::vector<std::string> buildStartupAcceleratorOrder(
+    const SystemInfo &hw,
+    const std::vector<std::string> &defaultBackendPriority)
+{
+    // Gather every candidate in preference order before filtering.
+    std::vector<std::string> candidates;
+    candidates.reserve(defaultBackendPriority.size()+3);
+    for(const std::string &configured:defaultBackendPriority)
+    {
+        candidates.push_back(normalizeAcceleratorKey(configured));
+    }
+    candidates.push_back(STARTUP_ACCELERATOR_CUDA);
+    candidates.push_back(STARTUP_ACCELERATOR_VULKAN);
+    candidates.push_back(STARTUP_ACCELERATOR_CPU);
+
+    std::vector<std::string> result;
+    for(const std::string &candidate:candidates)
+    {
+        const bool usable=!candidate.empty()&&hasAccelerator(hw, candidate);
+        const bool fresh=std::find(result.begin(), result.end(), candidate)==result.end();
+        if(usable&&fresh)
+        {
+            result.push_back(candidate);
+        }
+    }
+    return result;
+}
+
+/// Choose the startup default that applies to the detected hardware.
+/// The first accelerator in startup order whose entry names a model wins; when none does,
+/// the legacy `default_model`/`default_variant` pair is reported with accelerator "legacy".
+nlohmann::json resolveEffectiveStartupDefault(const nlohmann::json &cfg, const SystemInfo &hw)
+{
+    const nlohmann::json perAccelerator=sanitizeStartupDefaults(cfg.value("startup_defaults", nlohmann::json::object()));
+
+    for(const std::string &accelerator:buildStartupAcceleratorOrder(hw, parseDefaultBackendPriority(cfg)))
+    {
+        const nlohmann::json slot=perAccelerator.value(accelerator, nlohmann::json::object());
+        const std::string modelName=slot.value("model", "");
+        if(modelName.empty())
+            continue;
+        nlohmann::json chosen=nlohmann::json::object();
+        chosen["accelerator"]=accelerator;
+        chosen["model"]=modelName;
+        chosen["variant"]=slot.value("variant", "");
+        chosen["context_size"]=sanitizeContextSize(slot.value("context_size", 0));
+        chosen["runtime_options"]=slot.value("runtime_options", nlohmann::json::object());
+        return chosen;
+    }
+    nlohmann::json legacy=nlohmann::json::object();
+    legacy["accelerator"]="legacy";
+    legacy["model"]=cfg.value("default_model", "");
+    legacy["variant"]=cfg.value("default_variant", "");
+    legacy["context_size"]=0;
+    return legacy;
+}
+
+/// Load the server config file at `g_serverConfigPath` into `cfg`.
+/// Returns false and fills `error` when the path is unset, the file cannot be opened,
+/// parsing throws, or the parsed root is not a JSON object.
+bool loadServerConfigJson(nlohmann::json &cfg, std::string &error)
+{
+    // Record the failure message and signal failure in one step.
+    const auto fail=[&error](const std::string &message)
+    {
+        error=message;
+        return false;
+    };
+
+    if(g_serverConfigPath.empty())
+        return fail("Server config path is not set");
+
+    std::ifstream input(g_serverConfigPath);
+    if(!input.is_open())
+        return fail("Cannot open server config file");
+
+    try
+    {
+        // Per the nlohmann::json parse() signature: no callback, throw on error, ignore comments.
+        cfg=nlohmann::json::parse(input, nullptr, true, true);
+    }
+    catch(const std::exception &ex)
+    {
+        return fail(ex.what());
+    }
+
+    if(!cfg.is_object())
+        return fail("Server config root must be a JSON object");
+    return true;
+}
+
+/// Overwrite the file at `g_serverConfigPath` with `cfg`, pretty-printed with a 4-space indent.
+/// Returns false and sets `error` when the path is unset, the file cannot be opened, or the write fails.
+bool saveServerConfigJson(const nlohmann::json &cfg, std::string &error)
+{
+    if(g_serverConfigPath.empty())
+    {
+        error="Server config path is not set";
+        return false;
+    }
+
+    std::ofstream output(g_serverConfigPath, std::ios::trunc);
+    if(!output.is_open())
+    {
+        error="Cannot open server config file for writing";
+        return false;
+    }
+    // Flush before inspecting the stream so a failed write is reported here.
+    output<<cfg.dump(4)<<'\n';
+    output.flush();
+    const bool written=output.good();
+    if(!written)
+        error="Failed to write server config file";
+    return written;
+}
+
+/// Assemble the JSON payload shared by the server-config GET and PUT handlers.
+/// Refreshes hardware detection, then reports the legacy defaults, the sanitized per-accelerator
+/// defaults, the detected accelerator order, the effective default and any `startup_models` array.
+nlohmann::json buildServerConfigResponse(const nlohmann::json &cfg)
+{
+    HardwareDetector::instance().refresh();
+    const SystemInfo hw=HardwareDetector::instance().getSystemInfo();
+
+    nlohmann::json payload=nlohmann::json::object();
+    payload["default_model"]=cfg.value("default_model", "");
+    payload["default_variant"]=cfg.value("default_variant", "");
+    payload["startup_defaults"]=sanitizeStartupDefaults(cfg.value("startup_defaults", nlohmann::json::object()));
+    payload["effective_startup_default"]=resolveEffectiveStartupDefault(cfg, hw);
+
+    nlohmann::json &detected=payload["detected_accelerators"];
+    detected=nlohmann::json::array();
+    for(const std::string &name:buildStartupAcceleratorOrder(hw, parseDefaultBackendPriority(cfg)))
+    {
+        detected.push_back(name);
+    }
+
+    const auto startupModels=cfg.find("startup_models");
+    if(startupModels!=cfg.end()&&startupModels->is_array())
+        payload["startup_models"]=*startupModels;
+    return payload;
+}
 
 /// Generate a unique ID with the given prefix (e.g., "chatcmpl-").
 std::string generateId(const std::string &prefix="chatcmpl-")
@@ -172,6 +813,8 @@ nlohmann::json runtimeOptionsToJson(const RuntimeOptions &opts)
         j["n_gpu_layers"]=opts.nGpuLayers.value();
     if(opts.overrideTensor.has_value())
         j["override_tensor"]=opts.overrideTensor.value();
+    if(opts.vulkanNoHostVisibleVram.has_value())
+        j["vulkan_no_host_visible_vram"]=opts.vulkanNoHostVisibleVram.value();
 
     return j;
 }
@@ -196,6 +839,8 @@ RuntimeOptions parseRuntimeOptions(const nlohmann::json &j)
         opts.nGpuLayers=j["n_gpu_layers"].get<int>();
     if(j.contains("override_tensor")&&j["override_tensor"].is_string())
         opts.overrideTensor=j["override_tensor"].get<std::string>();
+    if(j.contains("vulkan_no_host_visible_vram")&&j["vulkan_no_host_visible_vram"].is_boolean())
+        opts.vulkanNoHostVisibleVram=j["vulkan_no_host_visible_vram"].get<bool>();
 
     return opts;
 }
@@ -218,9 +863,37 @@ nlohmann::json loadedModelToJson(const LoadedModel &m)
         {"context_size", m.contextSize},
         {"max_context_size", m.maxContextSize},
         {"gpu_indices", gpuIndices},
-        {"pinned", m.pinned}
+        {"pinned", m.pinned},
+        {"graph_splits", m.graphSplits},
+        {"cpu_mapped_buffer_mb", m.cpuMappedBufferMb}
     };
 
+    if(!m.perGpuVramMb.empty())
+    {
+        nlohmann::json perGpuJson=nlohmann::json::object();
+        for(const auto &pair:m.perGpuVramMb)
+        {
+            perGpuJson[std::to_string(pair.first)]=pair.second;
+        }
+        j["per_gpu_vram_mb"]=perGpuJson;
+    }
+
+    if(!m.deviceAllocations.empty())
+    {
+        nlohmann::json allocations=nlohmann::json::object();
+        for(const auto &pair:m.deviceAllocations)
+        {
+            allocations[pair.first]={
+                {"device_name", pair.second.deviceName},
+                {"model_buffer_mb", pair.second.modelBufferMb},
+                {"kv_cache_buffer_mb", pair.second.kvCacheBufferMb},
+                {"compute_buffer_mb", pair.second.computeBufferMb},
+                {"total_mb", pair.second.totalMb}
+            };
+        }
+        j["device_allocations"]=allocations;
+    }
+
     nlohmann::json activeOpts=runtimeOptionsToJson(m.activeOptions);
     if(!activeOpts.empty())
     {
@@ -329,6 +1002,12 @@ std::pair<std::string, std::string> parseModelVariant(const std::string &modelId
 
 // ========== Override Path ==========
 
+void setServerConfigPath(const std::string &path)
+{
+    std::lock_guard<std::mutex> lock(g_serverConfigMutex); // same mutex the config handlers hold while the path is read
+    g_serverConfigPath=path;
+}
+
 void setOverridePath(const std::string &path)
 {
     g_overridePath=path;
@@ -355,7 +1034,7 @@ void registerRoutes(httplib::Server &server)
     server.Options(R"(.*)", [](const httplib::Request &, httplib::Response &res)
     {
         res.set_header("Access-Control-Allow-Origin", "*");
-        res.set_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS, DELETE");
+        res.set_header("Access-Control-Allow-Methods", "GET, POST, PUT, OPTIONS, DELETE");
         res.set_header("Access-Control-Allow-Headers", "Content-Type, Authorization");
         res.set_header("Access-Control-Max-Age", "86400");
         res.status=204;
@@ -376,6 +1055,11 @@ void registerRoutes(httplib::Server &server)
     // Version
     server.Get("/api/version", handleGetVersion);
 
+    // Server config
+    server.Get("/api/server/config", handleGetServerConfig);
+    server.Put("/api/server/config", handleSetServerConfig);
+    server.Get("/api/server/startup-options", handleGetStartupOptions);
+
     // Chat completions (OpenAI-compatible)
     server.Post("/v1/chat/completions", handleChatCompletions);
     server.Get("/v1/models", handleListModelsV1);
@@ -432,6 +1116,7 @@ void registerRoutes(httplib::Server &server)
     server.Get("/api/downloads", handleGetActiveDownloads);
 
     // Dashboard
+    server.Get("/dashboard/config", handleDashboardConfig);
     server.Get("/dashboard/storage", handleDashboardStorage);
     server.Get("/dashboard", handleDashboard);
 
@@ -976,6 +1661,181 @@ void handleGetVersion(const httplib::Request &, httplib::Response &res)
     res.set_content(j.dump(), "application/json");
 }
 
+// ========== Server Config ==========
+
+/// GET /api/server/config handler: replies with the config summary, or HTTP 500 when the file cannot be loaded.
+void handleGetServerConfig(const httplib::Request &, httplib::Response &res)
+{
+    std::lock_guard<std::mutex> guard(g_serverConfigMutex);
+    nlohmann::json config;
+    std::string loadError;
+
+    if(loadServerConfigJson(config, loadError))
+    {
+        res.set_content(buildServerConfigResponse(config).dump(), "application/json");
+        return;
+    }
+    res.status=500;
+    res.set_content(errorJson("Failed to load server config: "+loadError).dump(), "application/json");
+}
+
+/// PUT /api/server/config handler: merge the supplied startup settings into the config file.
+/// Accepts `default_model`/`default_variant` (strings), `startup_defaults` (object, stored sanitized)
+/// and `startup_models` (array); absent keys stay unchanged. Replies 400 on bad input, 500 on file errors.
+void handleSetServerConfig(const httplib::Request &req, httplib::Response &res)
+{
+    nlohmann::json body;
+    try
+    {
+        body=nlohmann::json::parse(req.body);
+    }
+    catch(const std::exception &)
+    {
+        res.status=400;
+        res.set_content(errorJson("Invalid JSON body", "invalid_request_error", "", "parse_error").dump(), "application/json");
+        return;
+    }
+    // Arrays and scalars match none of the keys below; reject them rather than accept a silent no-op.
+    if(!body.is_object())
+    {
+        res.status=400;
+        res.set_content(errorJson("Request body must be a JSON object", "invalid_request_error", "", "invalid_type").dump(), "application/json");
+        return;
+    }
+    // Shared 400 reply for a key whose value has the wrong JSON type.
+    const auto rejectType=[&res](const char *key, const char *expected)
+    {
+        res.status=400;
+        res.set_content(errorJson(std::string("'")+key+"' must be "+expected, "invalid_request_error", key, "invalid_type").dump(), "application/json");
+    };
+
+    std::lock_guard<std::mutex> lock(g_serverConfigMutex);
+    nlohmann::json cfg;
+    std::string error;
+    if(!loadServerConfigJson(cfg, error))
+    {
+        res.status=500;
+        res.set_content(errorJson("Failed to load server config: "+error).dump(), "application/json");
+        return;
+    }
+
+    for(const char *key:{"default_model", "default_variant"})
+    {
+        if(!body.contains(key))
+            continue;
+        if(!body[key].is_string())
+        {
+            rejectType(key, "a string");
+            return;
+        }
+        cfg[key]=body[key].get<std::string>();
+    }
+
+    if(body.contains("startup_defaults"))
+    {
+        if(!body["startup_defaults"].is_object())
+        {
+            rejectType("startup_defaults", "an object");
+            return;
+        }
+        cfg["startup_defaults"]=sanitizeStartupDefaults(body["startup_defaults"]);
+    }
+
+    if(body.contains("startup_models"))
+    {
+        if(!body["startup_models"].is_array())
+        {
+            rejectType("startup_models", "an array");
+            return;
+        }
+        cfg["startup_models"]=body["startup_models"];
+    }
+
+    if(!saveServerConfigJson(cfg, error))
+    {
+        res.status=500;
+        res.set_content(errorJson("Failed to save server config: "+error).dump(), "application/json");
+        return;
+    }
+    res.set_content(buildServerConfigResponse(cfg).dump(), "application/json");
+}
+
+/// GET /api/server/startup-options handler: one option per model variant (or per model without variants).
+/// Requires the `accelerator` query parameter; `context_size` is optional and stays 0 when absent.
+/// Options are ordered by `sort_rank`, then by the toLowerCopy forms of model and variant.
+void handleGetStartupOptions(const httplib::Request &req, httplib::Response &res)
+{
+    std::string accelerator=normalizeAcceleratorKey(req.has_param("accelerator")
+        ? req.get_param_value("accelerator")
+        : "");
+    if(accelerator.empty())
+    {
+        res.status=400;
+        res.set_content(errorJson("Missing or invalid 'accelerator' query parameter", "invalid_request_error", "accelerator", "invalid_value").dump(), "application/json");
+        return;
+    }
+
+    int contextSize=0;
+    if(req.has_param("context_size"))
+    {
+        try
+        {
+            contextSize=sanitizeContextSize(std::stoi(req.get_param_value("context_size")));
+        }
+        catch(const std::exception &)
+        {
+            res.status=400;
+            res.set_content(errorJson("'context_size' must be an integer", "invalid_request_error", "context_size", "invalid_value").dump(), "application/json");
+            return;
+        }
+    }
+
+    HardwareDetector::instance().refresh();
+    SystemInfo hw=HardwareDetector::instance().getSystemInfo();
+
+    std::vector<ModelInfo> models=ModelManager::instance().getModelsByRanking();
+    std::vector<nlohmann::json> options;
+    options.reserve(models.size());
+
+    for(const ModelInfo &model:models)
+    {
+        if(model.variants.empty())
+        {
+            options.push_back(buildStartupOptionJson(accelerator, hw, model, "", contextSize));
+            continue;
+        }
+        for(const ModelVariant &variant:model.variants)
+        {
+            options.push_back(buildStartupOptionJson(accelerator, hw, model, variant.quantization, contextSize));
+        }
+    }
+
+    std::sort(options.begin(), options.end(), [](const nlohmann::json &left, const nlohmann::json &right)
+    {
+        int leftRank=left.value("sort_rank", 99);
+        int rightRank=right.value("sort_rank", 99);
+        if(leftRank!=rightRank)
+        {
+            return leftRank<rightRank;
+        }
+        // Compare model, then variant, as a pair; a "\0" separator appended via operator+ is an empty C string and would vanish.
+        const std::pair<std::string, std::string> leftKey(toLowerCopy(left.value("model", "")), toLowerCopy(left.value("variant", "")));
+        const std::pair<std::string, std::string> rightKey(toLowerCopy(right.value("model", "")), toLowerCopy(right.value("variant", "")));
+        return leftKey<rightKey;
+    });
+
+    SystemInfo startupHw=asStartupSystemInfo(hw);
+    nlohmann::json response={
+        {"accelerator", accelerator},
+        {"detected", hasAccelerator(hw, accelerator)},
+        {"context_size", contextSize},
+        {"available_vram_mb", sumEffectiveTotalMemoryMb(filterSystemInfoForAccelerator(startupHw, accelerator))},
+        {"available_ram_mb", startupHw.freeRamMb},
+        {"options", options}
+    };
+    res.set_content(response.dump(), "application/json");
+}
+
 // ========== Model Management ==========
 
 void handleGetModels(const httplib::Request &, httplib::Response &res)
@@ -1072,6 +1932,7 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
         std::string variant;
         int contextSize=0;
         RuntimeOptions optionsOverride;
+        std::vector<int> targetDevices;
 
         // Accept parameters from query string
         if(req.has_param("variant"))
@@ -1093,6 +1954,14 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
                     contextSize=body["context_size"].get<int>();
                 if(body.contains("runtime_options")&&body["runtime_options"].is_object())
                     optionsOverride=parseRuntimeOptions(body["runtime_options"]);
+                if(body.contains("devices")&&body["devices"].is_array())
+                {
+                    for(const auto &d:body["devices"])
+                    {
+                        if(d.is_number_integer())
+                            targetDevices.push_back(d.get<int>());
+                    }
+                }
             }
             catch(const nlohmann::json::parse_error &)
             {
@@ -1100,9 +1969,9 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
             }
         }
 
-        spdlog::info("Load request: model='{}' variant='{}' context={}", modelName, variant, contextSize);
+        spdlog::info("Load request: model='{}' variant='{}' context={} devices={}", modelName, variant, contextSize, targetDevices.size());
 
-        ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize, &optionsOverride);
+        ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize, &optionsOverride, targetDevices);
 
         if(err==ErrorCode::Success)
         {
@@ -2335,6 +3204,11 @@ void handleDashboard(const httplib::Request &, httplib::Response &res)
     res.set_content(DASHBOARD_HTML, "text/html");
 }
 
+void handleDashboardConfig(const httplib::Request &, httplib::Response &res)
+{
+    res.set_content(DASHBOARD_CONFIG_HTML, "text/html"); // page registered at GET /dashboard/config
+}
+
 void handleDashboardStorage(const httplib::Request &, httplib::Response &res)
 {
     res.set_content(DASHBOARD_STORAGE_HTML, "text/html");
diff --git a/src/server/routes.h b/src/server/routes.h
index 418daf4..6b20831 100644
--- a/src/server/routes.h
+++ b/src/server/routes.h
@@ -11,6 +11,9 @@ namespace server
 /// Register all route handlers on the given HTTP server.
 void registerRoutes(httplib::Server &server);
 
+/// Set the main server configuration JSON path for persisted dashboard updates.
+void setServerConfigPath(const std::string &path);
+
 /// Set the override path for persisting runtime model configs.
 void setOverridePath(const std::string &path);
 
@@ -32,6 +35,12 @@ void handleHealth(const httplib::Request &req, httplib::Response &res);
 
 void handleGetVersion(const httplib::Request &req, httplib::Response &res);
 
+// ========== Server Config ==========
+
+void handleGetServerConfig(const httplib::Request &req, httplib::Response &res);
+void handleSetServerConfig(const httplib::Request &req, httplib::Response &res);
+void handleGetStartupOptions(const httplib::Request &req, httplib::Response &res);
+
 // ========== Model Management ==========
 
 void handleGetModels(const httplib::Request &req, httplib::Response &res);
@@ -88,6 +97,7 @@ void handleGetActiveDownloads(const httplib::Request &req, httplib::Response &re
 // ========== Dashboard ==========
 
 void handleDashboard(const httplib::Request &req, httplib::Response &res);
+void handleDashboardConfig(const httplib::Request &req, httplib::Response &res);
 void handleDashboardStorage(const httplib::Request &req, httplib::Response &res);
 
 } // namespace server
diff --git a/tests/modelRuntimeTests.cpp b/tests/modelRuntimeTests.cpp
index 86ee084..b76ef3f 100644
--- a/tests/modelRuntimeTests.cpp
+++ b/tests/modelRuntimeTests.cpp
@@ -304,7 +304,7 @@ TEST_F(ModelRuntimeTest, EndInferenceDrainsSwapQueue)
     rt.swapModel("mock-model-2");
 
     // End inference — should drain the queue and execute swap
-    rt.endInference();
+    rt.endInference("mock-model");
 
     EXPECT_FALSE(rt.isInferenceActive());
 
@@ -330,7 +330,7 @@ TEST_F(ModelRuntimeTest, MultipleQueuedSwapsOnlyExecutesLatest)
     rt.swapModel("mock-model-2");
     rt.swapModel("mock-model"); // swap back to mock-model
 
-    rt.endInference();
+    rt.endInference("mock-model");
 
     // mock-model should be loaded (the latest swap target)
     auto state=rt.getModelState("mock-model");
@@ -350,7 +350,7 @@ TEST_F(ModelRuntimeTest, BeginEndInferenceTracksState)
     rt.beginInference("mock-model");
     EXPECT_TRUE(rt.isInferenceActive());
 
-    rt.endInference();
+    rt.endInference("mock-model");
     EXPECT_FALSE(rt.isInferenceActive());
 }
 
diff --git a/tests/telemetryCollectorTests.cpp b/tests/telemetryCollectorTests.cpp
index 35f4d00..ce95299 100644
--- a/tests/telemetryCollectorTests.cpp
+++ b/tests/telemetryCollectorTests.cpp
@@ -266,7 +266,7 @@ TEST_F(TelemetryCollectorTest, SnapshotActiveRequests)
     SystemSnapshot snapshot2=tc.getSnapshot();
     EXPECT_EQ(snapshot2.activeRequests, 1);
 
-    ModelRuntime::instance().endInference();
+    ModelRuntime::instance().endInference("tel-mock-1");
 
     SystemSnapshot snapshot3=tc.getSnapshot();
     EXPECT_EQ(snapshot3.activeRequests, 0);

From 6d44752aa05339ead2b9b8e426ad6c8355d24edd Mon Sep 17 00:00:00 2001
From: krazer <caseymcc@krazer.net>
Date: Sun, 10 May 2026 12:02:08 -0400
Subject: [PATCH 2/2] feat: context size auto-select and VRAM fit visualization

- Dashboard config page: context slider allows 0 (auto), shows VRAM
  fit gradient (green/yellow/red) based on selected compute devices,
  hard-caps slider at max VRAM capacity, info banner for auto mode
- Model runtime: context_size=0 now selects largest context fitting
  in available VRAM (min of hardware max and native training context)
- Add context stress test script
- Update .gitignore for ansible/ and tmp/
---
 .gitignore                     |   3 +-
 arbiterAI_config               |   2 +-
 scripts/context_stress_test.sh | 157 +++++++++++++++++++++++++++++++++
 src/arbiterAI/modelManager.cpp |  16 ++++
 src/arbiterAI/modelRuntime.cpp |  89 ++++++++++++++++---
 src/arbiterAI/modelRuntime.h   |   7 ++
 src/server/dashboardConfig.h   | 103 +++++++++++++++++++--
 src/server/main.cpp            |   3 +
 src/server/routes.cpp          |  36 ++++----
 9 files changed, 375 insertions(+), 41 deletions(-)
 create mode 100755 scripts/context_stress_test.sh

diff --git a/.gitignore b/.gitignore
index 99f7b81..e9d191a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,4 +45,5 @@ models/
 # local info
 push-server.sh
 docs/tasks/
-tmp/
\ No newline at end of file
+tmp/
+ansible/
diff --git a/arbiterAI_config b/arbiterAI_config
index cffe407..e6a4342 160000
--- a/arbiterAI_config
+++ b/arbiterAI_config
@@ -1 +1 @@
-Subproject commit cffe4077ba15986e26035d2e69ad8341f6dcc83d
+Subproject commit e6a4342141f6e84f229be0141ae1374b16194110
diff --git a/scripts/context_stress_test.sh b/scripts/context_stress_test.sh
new file mode 100755
index 0000000..49e13a3
--- /dev/null
+++ b/scripts/context_stress_test.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# context_stress_test.sh — Progressively fill context on ai-lab to find the real limit
+#
+# The model (Qwen3.5-27B:Q4_K_M) is loaded with 248832 context on the MI50 32GB.
+# This script sends increasingly large prompts and observes when llama.cpp errors.
+#
+# Strategy: step through a fixed ladder of token counts until the first failure,
+# then binary search between last-good and first-bad.
+# SERVER, MODEL and RESULTS_FILE may be overridden from the environment.
+set -euo pipefail
+
+SERVER="${SERVER:-http://192.168.2.101:8081}"
+MODEL="${MODEL:-Qwen3.5-27B}"
+RESULTS_FILE="${RESULTS_FILE:-/tmp/context_stress_results.txt}"
+
+echo "Context Stress Test - $(date)" | tee "$RESULTS_FILE"
+echo "Server: $SERVER" | tee -a "$RESULTS_FILE"
+echo "Model: $MODEL (Q4_K_M)" | tee -a "$RESULTS_FILE"
+echo "Configured context: 248832" | tee -a "$RESULTS_FILE"
+echo "========================================" | tee -a "$RESULTS_FILE"
+
+# Print a chat-completion request body of roughly target_tokens tokens to stdout.
+# Size is approximated as 4 characters per token of repeated English filler text.
+# Arguments: $1 - target token count. Uses the global MODEL.
+# Requires python3 on PATH.
+generate_payload() {
+    local target_tokens=$1
+    local char_count=$((target_tokens * 4))
+
+    # Hand the values to Python as arguments rather than splicing them into its
+    # source, so a model name containing quotes cannot break the generated script.
+    python3 - "$char_count" "$MODEL" <<'PY'
+import json, sys
+
+target_chars = int(sys.argv[1])
+model = sys.argv[2]
+block = 'The quick brown fox jumps over the lazy dog. This is a test of context window capacity. '
+
+# Repeat the filler until it covers target_chars, then trim to the exact length.
+repeats = target_chars // len(block) + 1
+text = (block * repeats)[:target_chars]
+
+# One system message asking for a one-word reply, then the filler as the user message.
+payload = {
+    'model': model,
+    'messages': [
+        {'role': 'system', 'content': 'You are a helpful assistant. Respond with exactly one word: OK'},
+        {'role': 'user', 'content': text}
+    ],
+    'max_tokens': 5,
+    'temperature': 0.0
+}
+
+json.dump(payload, sys.stdout)
+PY
+}
+
+# Send one chat-completion request of ~target_tokens tokens and log the outcome.
+# Arguments: $1 - target token count. Returns 0 on HTTP 200, 1 on any other status or curl failure.
+test_context() {
+    local target_tokens=$1
+    local start_time=$(date +%s%N)
+    local response http_code
+
+    echo -n "  Testing ~${target_tokens} tokens... " | tee -a "$RESULTS_FILE"
+
+    # Write payload to temp file to handle large sizes
+    generate_payload "$target_tokens" > /tmp/context_test_payload.json
+    local payload_size=$(wc -c < /tmp/context_test_payload.json)
+    echo -n "(payload: ${payload_size} bytes) " | tee -a "$RESULTS_FILE"
+
+    # Extended timeout (large context = slow). No -f: with it curl exits 22 on HTTP
+    # errors and drops the body, so the status/error report below could never run.
+    # -S keeps curl's own error text for transport failures.
+    response=$(curl -sS -w "\n%{http_code}" \
+        --max-time 300 \
+        -X POST "${SERVER}/v1/chat/completions" \
+        -H "Content-Type: application/json" \
+        -d @/tmp/context_test_payload.json 2>&1) || {
+        local exit_code=$?
+        echo "CURL_ERROR (exit=$exit_code)" | tee -a "$RESULTS_FILE"
+        echo "  Response: $(echo "$response" | tail -5)" | tee -a "$RESULTS_FILE"
+        return 1
+    }
+
+    http_code=$(echo "$response" | tail -1)
+    local body=$(echo "$response" | sed '$d')
+
+    local end_time=$(date +%s%N)
+    local elapsed_ms=$(( (end_time - start_time) / 1000000 ))
+
+    if [ "$http_code" = "200" ]; then
+        local prompt_tokens=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('usage',{}).get('prompt_tokens','?'))" 2>/dev/null || echo "?")
+        echo "OK (HTTP 200, prompt_tokens=${prompt_tokens}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE"
+        return 0
+    else
+        local error_msg=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',{}).get('message','unknown')[:200])" 2>/dev/null || echo "$body" | head -c 200)
+        echo "FAILED (HTTP ${http_code}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE"
+        echo "  Error: ${error_msg}" | tee -a "$RESULTS_FILE"
+        return 1
+    fi
+}
+
+# Phase 1: walk the fixed size ladder below until the first failing request, bracketing the limit
+echo "" | tee -a "$RESULTS_FILE"
+echo "Phase 1: Exponential probing" | tee -a "$RESULTS_FILE"
+echo "----------------------------------------" | tee -a "$RESULTS_FILE"
+
+# Probe sizes in tokens, ascending; the last entries approach the configured 248832 context
+TOKEN_SIZES=(1000 4000 8000 16000 32000 64000 96000 128000 160000 192000 224000 240000 248000)
+
+last_good=0
+first_bad=0
+
+for tokens in "${TOKEN_SIZES[@]}"; do
+    if test_context "$tokens"; then
+        last_good=$tokens
+    else
+        first_bad=$tokens
+        break
+    fi
+done
+
+if [ "$first_bad" -eq 0 ]; then
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "All tests passed! Model handled up to ~${last_good} tokens." | tee -a "$RESULTS_FILE"
+    echo "The full 248832 context appears usable." | tee -a "$RESULTS_FILE"
+else
+    # Phase 2: Binary search between last_good and first_bad until the gap is at most 2000 tokens
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "Phase 2: Binary search between ${last_good} and ${first_bad}" | tee -a "$RESULTS_FILE"
+    echo "----------------------------------------" | tee -a "$RESULTS_FILE"
+    
+    low=$last_good
+    high=$first_bad
+    
+    while [ $((high - low)) -gt 2000 ]; do
+        mid=$(( (low + high) / 2 ))
+        if test_context "$mid"; then
+            low=$mid
+        else
+            high=$mid
+        fi
+    done
+    
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "========================================" | tee -a "$RESULTS_FILE"
+    echo "RESULT: Maximum usable context is approximately ${low}-${high} tokens" | tee -a "$RESULTS_FILE"
+    echo "  Last successful: ~${low} tokens" | tee -a "$RESULTS_FILE"
+    echo "  First failure:   ~${high} tokens" | tee -a "$RESULTS_FILE"
+    echo "  Configured max:  248832 tokens" | tee -a "$RESULTS_FILE"
+    echo "  Utilization:     $(python3 -c "print(f'{${low}/248832*100:.1f}%')")" | tee -a "$RESULTS_FILE"
+fi
+
+echo "" | tee -a "$RESULTS_FILE"
+echo "Full results saved to: $RESULTS_FILE" | tee -a "$RESULTS_FILE"
+echo "Done - $(date)" | tee -a "$RESULTS_FILE"
diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp
index 5c314d4..11f629a 100644
--- a/src/arbiterAI/modelManager.cpp
+++ b/src/arbiterAI/modelManager.cpp
@@ -366,6 +366,22 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in
                     variant.files.push_back(vd);
                 }
             }
+
+            // Skip CLIP/mmproj variants (multimodal projection files, not standalone models);
+            // loading them as the main model makes llama.cpp fail with "CLIP cannot be used as main model".
+            // Lower-case via unsigned char: ::tolower is undefined for negative char values (non-ASCII bytes).
+            std::string primaryFile=variant.getPrimaryFilename();
+            std::string primaryLower=primaryFile;
+            std::transform(primaryLower.begin(), primaryLower.end(), primaryLower.begin(), [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
+            if(primaryLower.find("mmproj")!=std::string::npos||
+                primaryLower.find("clip-")!=std::string::npos||
+                primaryLower.find("vision-")!=std::string::npos) // NOTE(review): may also match real models named "...-Vision-..." — confirm
+            {
+                spdlog::debug("Skipping multimodal projection variant '{}' for model '{}' (file: {})",
+                    variant.quantization, info.model, primaryFile);
+                continue;
+            }
+
             info.variants.push_back(variant);
         }
     }
diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp
index 22774e4..ccd3d0c 100644
--- a/src/arbiterAI/modelRuntime.cpp
+++ b/src/arbiterAI/modelRuntime.cpp
@@ -279,6 +279,19 @@ LoadErrorDetail ModelRuntime::classifyLoadFailure(
         return detail;
     }
 
+    // CLIP / multimodal projection file loaded as main model: the explicit llama.cpp error, or a log naming both mmproj and clip
+    if(logLower.find("clip cannot be used as main model")!=std::string::npos||
+        (logLower.find("mmproj")!=std::string::npos&&logLower.find("clip")!=std::string::npos))
+    {
+        detail.reason=LoadFailureReason::UnsupportedArch;
+        detail.summary="File is a CLIP/mmproj multimodal projection, not a standalone model: "+filePath;
+        detail.suggestion="This file is a vision encoder projection used with --mmproj, not a model. "
+            "Remove this variant from the model config and use the correct GGUF model file instead.";
+        detail.action="fix_config";
+        detail.recoverable=false;
+        return detail;
+    }
+
     // Check for unsupported architecture
     if(logLower.find("unknown model architecture")!=std::string::npos||
         logLower.find("unsupported model")!=std::string::npos||
@@ -333,6 +346,23 @@ int ModelRuntime::getMaxConcurrentDownloads() const
     return m_maxConcurrentDownloads;
 }
 
+void ModelRuntime::setModelsDir(const std::string &dir)
+{
+    // Callers build paths as m_modelsDir+filename, so a non-empty directory
+    // is always stored ending in '/'. Normalise first, then publish under the lock.
+    std::string normalized=dir;
+    if(!normalized.empty()&&normalized.back()!='/')
+        normalized.push_back('/');
+    std::lock_guard<std::mutex> guard(m_mutex);
+    m_modelsDir.swap(normalized);
+}
+
+std::string ModelRuntime::getModelsDir() const
+{
+    const std::lock_guard<std::mutex> guard(m_mutex); // pairs with the lock in setModelsDir()
+    return m_modelsDir;
+}
+
 ErrorCode ModelRuntime::loadModel(
     const std::string &model,
     const std::string &variant,
@@ -481,7 +511,7 @@ ErrorCode ModelRuntime::loadModel(
                 bool anyMissing=false;
                 for(const VariantDownload &file:allFiles)
                 {
-                    std::string filePath="/models/"+file.filename;
+                    std::string filePath=m_modelsDir+file.filename;
                     if(!std::filesystem::exists(filePath)&&!file.url.empty())
                     {
                         anyMissing=true;
@@ -555,7 +585,7 @@ ErrorCode ModelRuntime::loadModel(
                 // Resolve backend priority: model config > architecture rule > server default
                 std::vector<std::string> effectiveBackendPriority=resolveBackendPriority(*modelInfo);
 
-                std::string filePath="/models/"+primaryFilename;
+                std::string filePath=m_modelsDir+primaryFilename;
                 ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices,
                     fit.maxContextSize, resolvedOptions, effectiveBackendPriority);
                 if(loadResult!=ErrorCode::Success)
@@ -660,7 +690,7 @@ ErrorCode ModelRuntime::downloadModel(
     bool anyMissing=false;
     for(const VariantDownload &file:allFiles)
     {
-        std::string filePath="/models/"+file.filename;
+        std::string filePath=m_modelsDir+file.filename;
         if(!std::filesystem::exists(filePath)&&!file.url.empty())
         {
             anyMissing=true;
@@ -752,7 +782,7 @@ void ModelRuntime::runBackgroundDownload(
     std::vector<const VariantDownload *> missingFiles;
     for(const VariantDownload &file:allFiles)
     {
-        std::string filePath="/models/"+file.filename;
+        std::string filePath=m_modelsDir+file.filename;
         if(!std::filesystem::exists(filePath)&&!file.url.empty())
         {
             missingFiles.push_back(&file);
@@ -773,7 +803,7 @@ void ModelRuntime::runBackgroundDownload(
     bool allDownloadsOk=true;
     for(const VariantDownload *file:missingFiles)
     {
-        std::string filePath="/models/"+file->filename;
+        std::string filePath=m_modelsDir+file->filename;
         bool downloadOk=downloadModelFile(
             file->url,
             filePath,
@@ -812,7 +842,7 @@ void ModelRuntime::runBackgroundDownload(
     std::vector<std::string> extraFiles;
     for(size_t i=0; i<allFiles.size(); ++i)
     {
-        std::string filePath="/models/"+allFiles[i].filename;
+        std::string filePath=m_modelsDir+allFiles[i].filename;
         int64_t actualSize=0;
         std::error_code ec;
         if(std::filesystem::exists(filePath, ec))
@@ -1784,11 +1814,20 @@ ErrorCode ModelRuntime::loadLlamaModel(
                 if(hwGpu->backend==GpuBackend::CUDA) expectedPrefix="CUDA";
                 else if(hwGpu->backend==GpuBackend::Vulkan) expectedPrefix="Vulkan";
 
-                for(const GgmlGpuDev &ggmlDev:ggmlGpus)
+                // Try matching with backend prefix first, then without (fallback).
+                // The HW detector may report a GPU as CUDA while ggml only has
+                // Vulkan backends available (or vice versa).
+                std::vector<std::string> prefixesToTry={expectedPrefix, ""};
+
+                for(const std::string &prefix:prefixesToTry)
                 {
-                    // Check backend match first
-                    if(!expectedPrefix.empty()&&ggmlDev.name.find(expectedPrefix)==std::string::npos)
-                        continue;
+                    if(bestMatch) break;
+
+                    for(const GgmlGpuDev &ggmlDev:ggmlGpus)
+                    {
+                        // Check backend match first
+                        if(!prefix.empty()&&ggmlDev.name.find(prefix)==std::string::npos)
+                            continue;
 
                     // Check if HW GPU name appears in ggml description
                     // HW name: "AMD Instinct MI50/MI60 (RADV VEGA20)"
@@ -1847,13 +1886,23 @@ ErrorCode ModelRuntime::loadLlamaModel(
                         bestMatchName=ggmlDev.name;
                         break;
                     }
+                    }
                 }
 
                 if(bestMatch)
                 {
                     targetDevices.push_back(bestMatch);
-                    spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}'",
-                        idx, hwGpu->name, bestMatchName, model);
+                    bool backendFallback=!expectedPrefix.empty()&&bestMatchName.find(expectedPrefix)==std::string::npos;
+                    if(backendFallback)
+                    {
+                        spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}' (backend fallback: {} not available)",
+                            idx, hwGpu->name, bestMatchName, model, expectedPrefix);
+                    }
+                    else
+                    {
+                        spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}'",
+                            idx, hwGpu->name, bestMatchName, model);
+                    }
                 }
                 else
                 {
@@ -1895,12 +1944,24 @@ ErrorCode ModelRuntime::loadLlamaModel(
 
         // Resolve actual context to allocate:
         //   contextSize > 0  → user/config requested explicit size
-        //   contextSize == 0 → use model's native training context
+        //   contextSize == 0 → auto-select the largest context that fits in
+        //                      available VRAM, capped by the model's native
+        //                      training context
         // In both cases, cap by the hardware-fit maximum.
         int actualContext=contextSize;
         if(actualContext<=0)
         {
-            actualContext=nativeContext;
+            // Auto-select: use hardware maximum, but don't exceed native context
+            if(maxHardwareContext>0)
+            {
+                actualContext=std::min(maxHardwareContext, nativeContext);
+                spdlog::info("Auto-selecting context size {} (hardware max={}, native={}) for model '{}'",
+                    actualContext, maxHardwareContext, nativeContext, model);
+            }
+            else
+            {
+                actualContext=nativeContext;
+            }
         }
         if(maxHardwareContext>0&&actualContext>maxHardwareContext)
         {
diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h
index d2abd14..b238851 100644
--- a/src/arbiterAI/modelRuntime.h
+++ b/src/arbiterAI/modelRuntime.h
@@ -119,6 +119,12 @@ class ModelRuntime {
         const std::string &model,
         const std::string &variant="");
 
+    /// Set the base directory for model files (default: "/models/"); a trailing '/' is appended if missing.
+    void setModelsDir(const std::string &dir);
+
+    /// Get the current models directory (ends in '/' unless it was set to an empty string).
+    std::string getModelsDir() const;
+
     /// Set the maximum number of concurrent model downloads (default: 2).
     void setMaxConcurrentDownloads(int max);
 
@@ -280,6 +286,7 @@ class ModelRuntime {
 
     std::map<std::string, LoadedModel> m_models;
     mutable std::mutex m_mutex;
+    std::string m_modelsDir="/models/";
     int m_readyRamBudgetMb=0;
     std::vector<std::string> m_defaultBackendPriority;
     std::set<std::string> m_activeInference; // models currently running inference
diff --git a/src/server/dashboardConfig.h b/src/server/dashboardConfig.h
index af4f079..6eb5dc8 100644
--- a/src/server/dashboardConfig.h
+++ b/src/server/dashboardConfig.h
@@ -1248,7 +1248,7 @@ function renderEffectiveStartup()
         let html='Next restart will load: ';
         const parts=active.map(e=>{
             const label=formatStartupModelLabel(e.model, e.variant);
-            const ctx=e.context_size>0?formatContextSize(e.context_size):'default';
+            const ctx=e.context_size>0?formatContextSize(e.context_size):'auto';
             const devs=e.devices&&e.devices.length>0?'GPU '+e.devices.join(','):'auto';
             return '<strong>'+escapeHtml(label)+'</strong> (ctx: '+escapeHtml(ctx)+', devices: '+escapeHtml(devs)+')';
         });
@@ -1489,13 +1489,94 @@ function updateStartupModelDevice(index, gpuIndex, checked)
     {
         entry.devices=entry.devices.filter(d=>d!==gpuIndex);
     }
+    updateStartupModelSliderGradient(index);
 }
 
 function updateStartupModelContext(index, value)
 {
-    startupModelsState[index].context_size=parseInt(value, 10)||0;
+    const parsed=parseInt(value, 10)||0; // 0 (or unparsable input) means "Auto"
+    startupModelsState[index].context_size=parsed;
     const label=document.getElementById('smCtxLabel_'+index);
-    if(label) label.textContent=formatContextSize(parseInt(value, 10));
+    if(label) label.textContent=parsed===0?'Auto':formatContextSize(parsed);
+    const autoInfo=document.getElementById('smCtxAutoInfo_'+index);
+    if(autoInfo) autoInfo.style.display=parsed===0?'block':'none'; // explanatory note is shown only in Auto mode
+    updateStartupModelSliderGradient(index); // repaint the VRAM gradient for this entry
+}
+
+function getSelectedDevicesVram(index)
+{
+    // Total VRAM (MB) of the GPUs assigned to startup entry `index`.
+    // An entry with no explicit device list counts every available GPU.
+    const entry=startupModelsState[index];
+    const vramOf=gpu=>gpu.vram_total_mb||0;
+    const hasSelection=entry&&entry.devices&&entry.devices.length>0;
+    if(!hasSelection)
+    {
+        return availableGpus.reduce((sum, gpu)=>sum+vramOf(gpu), 0);
+    }
+    return entry.devices.reduce((sum, gpuIdx)=>
+    {
+        // Indices that match no known GPU contribute nothing
+        const gpu=availableGpus.find(g=>g.index===gpuIdx);
+        return gpu?sum+vramOf(gpu):sum;
+    }, 0);
+}
+
+function updateStartupModelSliderGradient(index)
+{
+    // Clamp the context slider to the selected devices' VRAM and tint the track green (fits in 85%) / amber (tight) / red (too large).
+    const slider=document.getElementById('smCtxSlider_'+index);
+    if(!slider) return;
+    const entry=startupModelsState[index];
+    const modelOpt=availableModelOptions.find(o=>o.model===entry.model&&o.variant===entry.variant);
+    if(!modelOpt||!modelOpt.memory_per_1k_context_mb||modelOpt.memory_per_1k_context_mb<=0)
+    {
+        slider.style.background='#32384b';
+        return;
+    }
+
+    const baseMemory=modelOpt.base_memory_mb||0;
+    const baseContext=modelOpt.base_context_size||0;
+    const memPer1k=modelOpt.memory_per_1k_context_mb;
+    const availableMemory=getSelectedDevicesVram(index);
+    if(availableMemory<=0){ slider.style.background='#32384b'; return; }
+
+    const likelyCtx=baseContext+((0.85*availableMemory-baseMemory)/memPer1k)*1024;
+    const tightCtx=baseContext+((availableMemory-baseMemory)/memPer1k)*1024;
+
+    // Hard max: clamp to the tight max. Start from the model's own ceiling, not
+    // slider.max, so a clamp from a smaller device selection can grow back.
+    const min=parseInt(slider.min, 10);
+    const modelMax=modelOpt.max_context_size||131072;
+    const hardMaxCtx=Math.floor(tightCtx/1024)*1024;
+    const max=hardMaxCtx>0&&hardMaxCtx<modelMax?Math.max(hardMaxCtx, min):modelMax;
+    // Read the value before lowering max: the browser clamps slider.value itself.
+    const previousValue=parseInt(slider.value, 10);
+    slider.max=max;
+    if(previousValue>max)
+    {
+        slider.value=max;
+        updateStartupModelContext(index, slider.value);
+        return;
+    }
+
+    const range=max-min; // effective (post-clamp) range, so the tint lines up with the track
+    if(range<=0){ slider.style.background='#32384b'; return; }
+    const likelyPct=Math.max(0, Math.min(100, ((likelyCtx-min)/range)*100));
+    const tightPct=Math.max(0, Math.min(100, ((tightCtx-min)/range)*100));
+
+    if(likelyPct>=100)
+    {
+        slider.style.background='linear-gradient(to right, rgba(76,175,80,0.35) 0%, rgba(76,175,80,0.35) 100%)';
+    }
+    else if(tightPct<=0)
+    {
+        slider.style.background='linear-gradient(to right, rgba(255,96,96,0.35) 0%, rgba(255,96,96,0.35) 100%)';
+    }
+    else
+    {
+        slider.style.background='linear-gradient(to right, rgba(76,175,80,0.35) 0%, rgba(76,175,80,0.35) '+likelyPct+'%, rgba(240,192,64,0.35) '+likelyPct+'%, rgba(240,192,64,0.35) '+tightPct+'%, rgba(255,96,96,0.35) '+tightPct+'%, rgba(255,96,96,0.35) 100%)';
+    }
 }
 
 function readStartupModelRuntimeOpts(index)
@@ -1551,8 +1632,9 @@ function renderStartupModels()
         // Context slider
         const modelOpt=availableModelOptions.find(o=>o.model===entry.model&&o.variant===entry.variant);
         const maxCtx=modelOpt?modelOpt.max_context_size||131072:131072;
-        const ctxVal=entry.context_size>0?Math.min(entry.context_size, maxCtx):4096;
-        const ctxLabel=formatContextSize(ctxVal);
+        const ctxVal=entry.context_size>0?Math.min(entry.context_size, maxCtx):0;
+        const ctxLabel=ctxVal===0?'Auto':formatContextSize(ctxVal);
+        const isAutoCtx=ctxVal===0;
 
         // Device checkboxes
         let devicesHtml='';
@@ -1584,8 +1666,11 @@ function renderStartupModels()
             +'</div>'
             +'<div style="margin-bottom:10px;">'
             +'<label class="startup-field-label">Context Size <span class="context-slider-value" id="smCtxLabel_'+i+'">'+ctxLabel+'</span></label>'
-            +'<input type="range" class="context-slider" min="4096" max="'+maxCtx+'" step="1024" value="'+ctxVal+'" '
+            +'<input type="range" class="context-slider" id="smCtxSlider_'+i+'" min="0" max="'+maxCtx+'" step="1024" value="'+ctxVal+'" '
             +'oninput="updateStartupModelContext('+i+', this.value)">'
+            +'<div id="smCtxAutoInfo_'+i+'" style="display:'+(isAutoCtx?'block':'none')+';margin-top:6px;padding:8px 10px;background:rgba(124,138,255,0.08);border:1px solid rgba(124,138,255,0.2);border-radius:6px;font-size:12px;color:#9eb0ff;">'
+            +'\u2139\uFE0F <strong>Auto:</strong> The server will select the largest context size that fits in the available VRAM of the assigned device(s).'
+            +'</div>'
             +'</div>'
             +'<div style="margin-bottom:10px;">'
             +'<label class="startup-field-label">Compute Devices</label>'
@@ -1608,6 +1693,12 @@ function renderStartupModels()
     }
 
     el.innerHTML=html;
+
+    // Apply VRAM color gradients after DOM is updated
+    for(let i=0; i<startupModelsState.length; i++)
+    {
+        updateStartupModelSliderGradient(i);
+    }
 }
 
 async function saveAllStartupModels()
diff --git a/src/server/main.cpp b/src/server/main.cpp
index 4d91c27..3926c6f 100644
--- a/src/server/main.cpp
+++ b/src/server/main.cpp
@@ -632,6 +632,9 @@ int main(int argc, char *argv[])
     arbiterAI::StorageManager &storage=arbiterAI::StorageManager::instance();
     storage.initialize(modelsDir);
 
+    // ── Models directory for ModelRuntime ─────────────────────────
+    arbiterAI::ModelRuntime::instance().setModelsDir(modelsDir);
+
     if(storageLimitBytes>0)
     {
         storage.setStorageLimit(storageLimitBytes);
diff --git a/src/server/routes.cpp b/src/server/routes.cpp
index 7fd8dc2..f2e5880 100644
--- a/src/server/routes.cpp
+++ b/src/server/routes.cpp
@@ -1471,38 +1471,36 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
 
 void handleListModelsV1(const httplib::Request &, httplib::Response &res)
 {
-    std::vector<std::string> modelNames;
-    ArbiterAI::instance().getAvailableModels(modelNames);
+    // Return only currently loaded models (OpenAI-compatible: models ready for inference)
+    std::vector<LoadedModel> states=ModelRuntime::instance().getModelStates();
 
     auto created=static_cast<int64_t>(std::time(nullptr));
 
     nlohmann::json data=nlohmann::json::array();
-    for(const std::string &name:modelNames)
+    for(const LoadedModel &m:states)
     {
-        // Always emit the bare model name
+        if(m.state!=ModelState::Loaded)
+            continue;
+
+        // Emit bare model name
         data.push_back({
-            {"id", name},
+            {"id", m.modelName},
             {"object", "model"},
             {"created", created},
             {"owned_by", "arbiterai"},
             {"permission", nlohmann::json::array()}
         });
 
-        // For models with variants, also emit "model:variant" entries
-        ModelInfo info;
-        if(ArbiterAI::instance().getModelInfo(name, info)==ErrorCode::Success
-            &&!info.variants.empty())
+        // Also emit "model:variant" if a variant is loaded
+        if(!m.variant.empty())
         {
-            for(const ModelVariant &v:info.variants)
-            {
-                data.push_back({
-                    {"id", name+":"+v.quantization},
-                    {"object", "model"},
-                    {"created", created},
-                    {"owned_by", "arbiterai"},
-                    {"permission", nlohmann::json::array()}
-                });
-            }
+            data.push_back({
+                {"id", m.modelName+":"+m.variant},
+                {"object", "model"},
+                {"created", created},
+                {"owned_by", "arbiterai"},
+                {"permission", nlohmann::json::array()}
+            });
         }
     }