diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index ca54224..bffdcf5 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -21,11 +21,12 @@ A C++17 library providing a unified interface for multiple LLM providers.
 
 1. **All commands** must go through `./runDocker.sh ...`.
 2. **All development** (building, testing, running) must be done inside the Docker container. The host environment is not guaranteed to have the correct tools or dependencies.
-3. **Do not** use `python`, `pip`, `pytest` — the host may not have the correct Python version or dependencies.
-4. **Do not** create or use a virtualenv on the host. The container is the virtualenv.
-5. The project source is **bind-mounted** at `/app` inside the container. Edits to files on the host are immediately visible inside the container.
-6. If you change the `Dockerfile`, run `./runDocker.sh --rebuild`.
-7. Don't launch the server, ask the user to launch so that its not running in the agents terminal.
+3. **Do not** run terminal commands with `2>&1`, because the user then cannot verify whether the command is running.
+4. **Do not** use `python`, `pip`, `pytest` — the host may not have the correct Python version or dependencies.
+5. **Do not** create or use a virtualenv on the host. The container is the virtualenv.
+6. The project source is **bind-mounted** at `/app` inside the container. Edits to files on the host are immediately visible inside the container.
+7. If you change the `Dockerfile`, run `./runDocker.sh --rebuild`.
+8. Don't launch the server; ask the user to launch it so that it's not running in the agent's terminal.
 
 ## Active Tasks
 
diff --git a/.gitignore b/.gitignore
index 004da88..99f7b81 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,9 +35,6 @@
 build/
 vcpkg_installed/
 
-# Cloned config repository
-arbiterAI_config/
-
 # Generated at build time by CMake
 **/generated/
 
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..a60ce16
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "arbiterAI_config"]
+	path = arbiterAI_config
+	url = https://github.com/caseymcc/arbiterAI_config.git
\ No newline at end of file
diff --git a/arbiterAI_config b/arbiterAI_config
new file mode 160000
index 0000000..cffe407
--- /dev/null
+++ b/arbiterAI_config
@@ -0,0 +1 @@
+Subproject commit cffe4077ba15986e26035d2e69ad8341f6dcc83d
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1e72eaf..079cfad 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,11 +1,12 @@
 # syntax=docker/dockerfile:1
-ARG DOCKER_VERSION=1.2.0
+ARG DOCKER_VERSION=1.2.1
 FROM ubuntu:24.04
 
 # Install basic build tools, Python 3, and GPU libraries.
 # Vulkan headers + glslc are needed at build time for llama.cpp's Vulkan backend.
-# At runtime, GPU inference requires a Vulkan ICD on the host; without one
-# llama.cpp falls back to CPU-only.
+# ROCm HIP SDK is needed at build time for llama.cpp's HIP/ROCm backend.
+# At runtime, GPU inference requires a Vulkan ICD or ROCm driver on the host;
+# without one llama.cpp falls back to CPU-only.
 RUN apt-get update && apt-get install -y \
     build-essential \
     cmake \
@@ -34,8 +35,25 @@ RUN apt-get update && apt-get install -y \
     mesa-vulkan-drivers \
     glslc \
     glslang-tools \
+    wget \
     && rm -rf /var/lib/apt/lists/*
 
+# Install ROCm HIP SDK (build-time only — no kernel driver needed in container)
+# Use hiplibsdk usecase to get HIP development libraries and CMake configs
+# (the plain "hip" usecase only installs runtime, missing hip-lang-config.cmake)
+RUN wget -q https://repo.radeon.com/amdgpu-install/7.2.1/ubuntu/noble/amdgpu-install_7.2.1.70201-1_all.deb && \
+    apt-get update && \
+    apt-get install -y ./amdgpu-install_7.2.1.70201-1_all.deb && \
+    apt-get update && \
+    amdgpu-install -y --usecase=hiplibsdk --no-dkms && \
+    rm -f amdgpu-install_7.2.1.70201-1_all.deb && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV ROCM_PATH=/opt/rocm
+ENV HIP_PATH=/opt/rocm
+ENV PATH="${ROCM_PATH}/bin:${PATH}"
+ENV CMAKE_PREFIX_PATH="${ROCM_PATH}:${CMAKE_PREFIX_PATH}"
+
 RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
diff --git a/docs/server.md b/docs/server.md
index 84b655c..9c55fe6 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -77,7 +77,8 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
     "hardware": {
         "vram_overrides": {
             "0": 32000
-        }
+        },
+        "default_backend_priority": ["vulkan"]
     },
     "logging": {
         "level": "info",
@@ -116,6 +117,7 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `vram_overrides` | `object` | `{}` | GPU index → VRAM MB overrides (e.g., `{"0": 32000}`) |
+| `default_backend_priority` | `string[]` | `[]` | Default GPU backend preference for models without their own `backend_priority` (e.g., `["vulkan"]`). Empty = all backends. |
 
 **`logging` object:**
 
diff --git a/examples/server_config.json b/examples/server_config.json
index ef8c883..35ea631 100644
--- a/examples/server_config.json
+++ b/examples/server_config.json
@@ -24,7 +24,8 @@
     },
 
     "hardware": {
-        "vram_overrides": {}
+        "vram_overrides": {},
+        "default_backend_priority": []
     },
 
     "logging": {
diff --git a/schemas/model_config.schema.json b/schemas/model_config.schema.json
index d112f68..537777b 100644
--- a/schemas/model_config.schema.json
+++ b/schemas/model_config.schema.json
@@ -247,6 +247,58 @@
                 }
               }
             }
+          },
+          "runtime_options": {
+            "type": "object",
+            "description": "Runtime options for llama.cpp model loading and inference. Applied as defaults; can be overridden at load time via the API.",
+            "properties": {
+              "flash_attn": {
+                "type": "boolean",
+                "description": "Enable or disable flash attention (-fa)"
+              },
+              "kv_cache_type_k": {
+                "type": "string",
+                "description": "KV cache data type for keys (-ctk)",
+                "enum": ["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"]
+              },
+              "kv_cache_type_v": {
+                "type": "string",
+                "description": "KV cache data type for values (-ctv)",
+                "enum": ["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"]
+              },
+              "no_mmap": {
+                "type": "boolean",
+                "description": "Disable memory-mapped file I/O (--no-mmap)"
+              },
+              "reasoning_budget": {
+                "type": "integer",
+                "description": "Reasoning token budget (--reasoning-budget). 0 disables reasoning tokens.",
+                "minimum": 0
+              },
+              "swa_full": {
+                "type": "boolean",
+                "description": "Use full-size sliding window attention cache (--swa-full)"
+              },
+              "n_gpu_layers": {
+                "type": "integer",
+                "description": "Number of layers to offload to GPU (-ngl). 99 offloads all.",
+                "minimum": 0
+              },
+              "override_tensor": {
+                "type": "string",
+                "description": "Tensor override pattern (-ot) for routing tensors to CPU/GPU"
+              }
+            },
+            "additionalProperties": false
+          },
+          "backend_priority": {
+            "type": "array",
+            "description": "Ordered preference for GPU backends. First available backend is used.",
+            "items": {
+              "type": "string",
+              "enum": ["vulkan", "rocm", "cuda"]
+            },
+            "uniqueItems": true
           }
         }
       }
diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp
index 9d8b013..892b93c 100644
--- a/src/arbiterAI/arbiterAI.cpp
+++ b/src/arbiterAI/arbiterAI.cpp
@@ -532,9 +532,15 @@ ErrorCode ArbiterAI::getAvailableModels(std::vector<std::string>& models)
 
 // ========== Local Model Management ==========
 
-ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize)
+ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize,
+    const RuntimeOptions *optionsOverride)
 {
-    return ModelRuntime::instance().loadModel(model, variant, contextSize);
+    // nullptr means "no overrides": forward a default-constructed RuntimeOptions,
+    // otherwise forward a copy of the caller's options.
+    RuntimeOptions effective=optionsOverride
+        ? *optionsOverride
+        : RuntimeOptions{};
+    return ModelRuntime::instance().loadModel(model, variant, contextSize, effective);
 }
 
 ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant)
diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h
index 7f817ea..0f8d540 100644
--- a/src/arbiterAI/arbiterAI.h
+++ b/src/arbiterAI/arbiterAI.h
@@ -34,6 +34,7 @@ struct ModelFit;
 struct LoadedModel;
 struct SystemSnapshot;
 struct InferenceStats;
+struct RuntimeOptions;
 
 /**
  * @struct VersionInfo
@@ -241,6 +242,42 @@ inline void from_json(const nlohmann::json &j, ToolCall &t)
  * - "assistant": may include tool_calls when the model invokes tools
  * - "tool": includes tool_call_id linking the result back to a specific tool call
  */
+
+/// Extract text from an OpenAI `content` field.
+/// The spec allows content as either a plain string or an array of content
+/// parts (e.g. [{"type":"text","text":"..."},{"type":"image_url",...}]).
+/// Text parts are joined with single spaces; anything that is not well-formed
+/// text (image_url, a non-string "type" or "text", ...) is skipped, never thrown on.
+inline std::string contentToString(const nlohmann::json &contentJson)
+{
+    if(contentJson.is_string())
+        return contentJson.get<std::string>();
+
+    std::string result;
+    if(!contentJson.is_array())
+        return result; // null, number, object, ...: no text to extract
+
+    // Append one text fragment, separated from earlier text by a single space.
+    auto append=[&result](const nlohmann::json &text)
+    {
+        if(!result.empty()) result+=' ';
+        result+=text.get<std::string>();
+    };
+
+    for(const nlohmann::json &part:contentJson)
+    {
+        if(part.is_string())
+            append(part);
+        else if(part.is_object()
+            && part.contains("type") && part.at("type").is_string()
+            && part.at("type").get<std::string>()=="text"
+            && part.contains("text") && part.at("text").is_string())
+            append(part.at("text"));
+        // Everything else (image_url, malformed parts) is skipped.
+    }
+    return result;
+}
+
 struct Message
 {
     std::string role;
@@ -262,7 +299,7 @@ inline void from_json(const nlohmann::json &j, Message &m)
 {
     j.at("role").get_to(m.role);
     if(j.contains("content") && !j.at("content").is_null())
-        j.at("content").get_to(m.content);
+        m.content=contentToString(j.at("content"));
     if(j.contains("tool_call_id"))
         m.toolCallId=j.at("tool_call_id").get<std::string>();
     if(j.contains("tool_calls"))
@@ -605,9 +642,11 @@ class ArbiterAI
      * @param model Model name
      * @param variant Quantization variant (empty = auto-select)
      * @param contextSize Context size (0 = model default)
+     * @param optionsOverride Optional runtime options to merge on top of model config defaults (nullptr = use config defaults)
      * @return ErrorCode indicating success, ModelDownloading, or failure
      */
-    ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0);
+    ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0,
+        const RuntimeOptions *optionsOverride=nullptr);
 
     /**
      * @brief Download model files without loading into VRAM
diff --git a/src/arbiterAI/configDownloader.cpp b/src/arbiterAI/configDownloader.cpp
index 7571dac..d4d8198 100644
--- a/src/arbiterAI/configDownloader.cpp
+++ b/src/arbiterAI/configDownloader.cpp
@@ -195,20 +195,24 @@ ConfigDownloadStatus ConfigDownloader::checkoutVersion()
 
     spdlog::info("Checking out version: {}", m_version);
 
-    // Try to resolve the version as a direct ref, remote branch, or tag
+    // Try to resolve the version — prefer remote branch refs first so that
+    // a fetch+checkout always picks up the latest remote commit rather than
+    // a stale local branch ref that was never fast-forwarded.
     git_object *obj=nullptr;
-    error=git_revparse_single(&obj, repo, m_version.c_str());
+
+    // 1. Try as a remote-tracking branch (most common path after fetch)
+    std::string remoteBranch="refs/remotes/origin/"+m_version;
+    error=git_revparse_single(&obj, repo, remoteBranch.c_str());
 
     if(error!=0)
     {
-        // Try as a remote branch
-        std::string remoteBranch="refs/remotes/origin/"+m_version;
-        error=git_revparse_single(&obj, repo, remoteBranch.c_str());
+        // 2. Try as a direct ref / local branch / SHA
+        error=git_revparse_single(&obj, repo, m_version.c_str());
     }
 
     if(error!=0)
     {
-        // Try as a tag
+        // 3. Try as a tag
         std::string tag="refs/tags/"+m_version;
         error=git_revparse_single(&obj, repo, tag.c_str());
     }
diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp
index 1523cd2..d80cbe0 100644
--- a/src/arbiterAI/modelManager.cpp
+++ b/src/arbiterAI/modelManager.cpp
@@ -62,6 +62,18 @@ bool ModelVariant::isSplit() const
     return files.size()>1;
 }
 
+void RuntimeOptions::mergeFrom(const RuntimeOptions &other)
+{   // Overlay: a field set on `other` replaces ours; an unset field leaves ours untouched.
+    if(other.flashAttn) flashAttn=*other.flashAttn;
+    if(other.kvCacheTypeK) kvCacheTypeK=*other.kvCacheTypeK;
+    if(other.kvCacheTypeV) kvCacheTypeV=*other.kvCacheTypeV;
+    if(other.noMmap) noMmap=*other.noMmap;
+    if(other.reasoningBudget) reasoningBudget=*other.reasoningBudget;
+    if(other.swaFull) swaFull=*other.swaFull;
+    if(other.nGpuLayers) nGpuLayers=*other.nGpuLayers;
+    if(other.overrideTensor) overrideTensor=*other.overrideTensor;
+}
+
 ModelManager &ModelManager::instance()
 {
     static ModelManager instance;
@@ -104,6 +116,13 @@ bool ModelManager::initialize(const std::vector<std::filesystem::path> &configPa
                 }
             }
         }
+
+        // Load GPU backend rules from the config repo
+        auto backendsPath=m_configDownloader.getLocalPath()/"configs"/"defaults"/"backends"/"gpu_backends.json";
+        if(std::filesystem::exists(backendsPath))
+        {
+            loadGpuBackendRules(backendsPath);
+        }
     }
     else
     {
@@ -350,6 +369,53 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in
         }
     }
 
+    // Runtime options (llama.cpp model load/inference parameters)
+    if(modelJson.contains("runtime_options")&&modelJson["runtime_options"].is_object())
+    {
+        auto &ro=modelJson["runtime_options"];
+
+        if(ro.contains("flash_attn")&&ro["flash_attn"].is_boolean())
+            info.runtimeOptions.flashAttn=ro["flash_attn"].get<bool>();
+        if(ro.contains("kv_cache_type_k")&&ro["kv_cache_type_k"].is_string())
+            info.runtimeOptions.kvCacheTypeK=ro["kv_cache_type_k"].get<std::string>();
+        if(ro.contains("kv_cache_type_v")&&ro["kv_cache_type_v"].is_string())
+            info.runtimeOptions.kvCacheTypeV=ro["kv_cache_type_v"].get<std::string>();
+        if(ro.contains("no_mmap")&&ro["no_mmap"].is_boolean())
+            info.runtimeOptions.noMmap=ro["no_mmap"].get<bool>();
+        if(ro.contains("reasoning_budget")&&ro["reasoning_budget"].is_number_integer())
+            info.runtimeOptions.reasoningBudget=ro["reasoning_budget"].get<int>();
+        if(ro.contains("swa_full")&&ro["swa_full"].is_boolean())
+            info.runtimeOptions.swaFull=ro["swa_full"].get<bool>();
+        if(ro.contains("n_gpu_layers")&&ro["n_gpu_layers"].is_number_integer())
+            info.runtimeOptions.nGpuLayers=ro["n_gpu_layers"].get<int>();
+        if(ro.contains("override_tensor")&&ro["override_tensor"].is_string())
+            info.runtimeOptions.overrideTensor=ro["override_tensor"].get<std::string>();
+    }
+
+    // Backend priority (ordered preference for GPU compute backends)
+    if(modelJson.contains("backend_priority")&&modelJson["backend_priority"].is_array())
+    {
+        for(const auto &bp:modelJson["backend_priority"])
+        {
+            if(bp.is_string())
+            {
+                info.backendPriority.push_back(bp.get<std::string>());
+            }
+        }
+    }
+
+    // Disabled backends (model-level override to exclude specific backends)
+    if(modelJson.contains("disabled_backends")&&modelJson["disabled_backends"].is_array())
+    {
+        for(const auto &db:modelJson["disabled_backends"])
+        {
+            if(db.is_string())
+            {
+                info.disabledBackends.push_back(db.get<std::string>());
+            }
+        }
+    }
+
     return true;
 }
 
@@ -632,6 +698,7 @@ bool ModelManager::addModelFromJson(const nlohmann::json &modelJson, std::string
     m_models.push_back(info);
     m_modelProviderMap[info.model]=info.provider;
     m_runtimeModels.insert(info.model);
+    saveInjectedConfig(info.model);
     return true;
 }
 
@@ -683,6 +750,7 @@ bool ModelManager::updateModelFromJson(const nlohmann::json &modelJson, std::str
 
     m_modelProviderMap[info.model]=info.provider;
     m_runtimeModels.insert(info.model);
+    saveInjectedConfig(info.model);
     return true;
 }
 
@@ -696,10 +764,129 @@ bool ModelManager::removeModel(const std::string &modelName)
 
     m_models.erase(it);
     m_modelProviderMap.erase(modelName);
-    m_runtimeModels.erase(modelName);
+
+    if(m_runtimeModels.count(modelName))
+    {
+        m_runtimeModels.erase(modelName);
+        removeInjectedConfig(modelName);
+    }
+
     return true;
 }
 
+/// Load GPU backend rules from a JSON file (comments allowed) whose top-level
+/// "gpu_backends" array holds entries with "name", "match", "disabled_backends",
+/// "backend_priority" and "notes". Entries without any match pattern are dropped.
+/// m_gpuBackendRules is replaced only after the whole file has been processed, so
+/// a failure part-way through leaves the previously loaded rules untouched.
+/// @return true if the rules were replaced, false if the file was unusable.
+bool ModelManager::loadGpuBackendRules(const std::filesystem::path &filePath)
+{
+    spdlog::info("Loading GPU backend rules from: {}", filePath.string());
+
+    // Copy the string elements of entry[key] into `out`; a missing key, a
+    // non-array value and non-string elements are ignored (same leniency as
+    // the backend_priority parsing in parseModelInfo).
+    auto readStrings=[](const nlohmann::json &entry, const char *key, std::vector<std::string> &out)
+    {
+        if(!entry.contains(key)||!entry[key].is_array())
+            return;
+
+        for(const nlohmann::json &item:entry[key])
+        {
+            if(item.is_string())
+                out.push_back(item.get<std::string>());
+        }
+    };
+
+    // Render a list as "a, b, c" for the log line below.
+    auto join=[](const std::vector<std::string> &items)
+    {
+        std::string s;
+        for(const std::string &item:items)
+        {
+            if(!s.empty()) s+=", ";
+            s+=item;
+        }
+        return s;
+    };
+
+    try
+    {
+        std::ifstream file(filePath);
+
+        if(!file.is_open())
+        {
+            spdlog::warn("Cannot open GPU backend rules file: {}", filePath.string());
+            return false;
+        }
+
+        // Last argument is ignore_comments: the rules file may carry // and /* */ comments.
+        nlohmann::json j=nlohmann::json::parse(file, nullptr, true, true);
+
+        if(!j.contains("gpu_backends")||!j["gpu_backends"].is_array())
+        {
+            spdlog::warn("GPU backend rules file missing 'gpu_backends' array");
+            return false;
+        }
+
+        // Stage into a local vector; m_gpuBackendRules is only touched on success.
+        std::vector<GpuBackendRule> rules;
+
+        for(const nlohmann::json &entry:j["gpu_backends"])
+        {
+            if(!entry.is_object())
+                continue; // value() below would throw on a non-object entry
+
+            // A present-but-non-string "name"/"notes" still throws and aborts the load.
+            GpuBackendRule rule;
+            rule.name=entry.value("name", "");
+            rule.notes=entry.value("notes", "");
+            readStrings(entry, "match", rule.match);
+            readStrings(entry, "disabled_backends", rule.disabledBackends);
+            readStrings(entry, "backend_priority", rule.backendPriority);
+
+            if(rule.match.empty())
+                continue; // a rule without patterns can never match a GPU
+
+            spdlog::info("  GPU backend rule '{}': match=[{}], priority=[{}], disabled=[{}]",
+                rule.name, join(rule.match), join(rule.backendPriority), join(rule.disabledBackends));
+            rules.push_back(std::move(rule));
+        }
+
+        m_gpuBackendRules=std::move(rules);
+        spdlog::info("Loaded {} GPU backend rules", m_gpuBackendRules.size());
+        return true;
+    }
+    catch(const std::exception &e)
+    {
+        spdlog::warn("Failed to parse GPU backend rules: {}", e.what());
+        return false;
+    }
+}
+
+/// Return the first rule having a pattern that occurs, case-insensitively (per
+/// ::tolower), as a substring of gpuName; rules and patterns are tried in stored order.
+std::optional<GpuBackendRule> ModelManager::findGpuBackendRule(const std::string &gpuName) const
+{
+    // Lower-case through unsigned char: passing a negative char to tolower is UB.
+    auto toLower=[](std::string text)
+    {
+        std::transform(text.begin(), text.end(), text.begin(),
+            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+        return text;
+    };
+    const std::string gpuLower=toLower(gpuName);
+
+    for(const GpuBackendRule &rule:m_gpuBackendRules)
+    {
+        for(const std::string &pattern:rule.match)
+            if(gpuLower.find(toLower(pattern))!=std::string::npos)
+                return rule; // NOTE(review): an empty pattern matches every name — confirm intended
+    }
+    return std::nullopt;
+}
+
 nlohmann::json ModelManager::modelInfoToJson(const ModelInfo &info)
 {
     nlohmann::json j;
@@ -802,6 +989,41 @@ nlohmann::json ModelManager::modelInfoToJson(const ModelInfo &info)
         j["variants"]=variants;
     }
 
+    // Runtime options
+    {
+        nlohmann::json ro;
+        if(info.runtimeOptions.flashAttn.has_value())
+            ro["flash_attn"]=info.runtimeOptions.flashAttn.value();
+        if(info.runtimeOptions.kvCacheTypeK.has_value())
+            ro["kv_cache_type_k"]=info.runtimeOptions.kvCacheTypeK.value();
+        if(info.runtimeOptions.kvCacheTypeV.has_value())
+            ro["kv_cache_type_v"]=info.runtimeOptions.kvCacheTypeV.value();
+        if(info.runtimeOptions.noMmap.has_value())
+            ro["no_mmap"]=info.runtimeOptions.noMmap.value();
+        if(info.runtimeOptions.reasoningBudget.has_value())
+            ro["reasoning_budget"]=info.runtimeOptions.reasoningBudget.value();
+        if(info.runtimeOptions.swaFull.has_value())
+            ro["swa_full"]=info.runtimeOptions.swaFull.value();
+        if(info.runtimeOptions.nGpuLayers.has_value())
+            ro["n_gpu_layers"]=info.runtimeOptions.nGpuLayers.value();
+        if(info.runtimeOptions.overrideTensor.has_value())
+            ro["override_tensor"]=info.runtimeOptions.overrideTensor.value();
+        if(!ro.empty())
+            j["runtime_options"]=ro;
+    }
+
+    // Backend priority
+    if(!info.backendPriority.empty())
+    {
+        j["backend_priority"]=info.backendPriority;
+    }
+
+    // Disabled backends
+    if(!info.disabledBackends.empty())
+    {
+        j["disabled_backends"]=info.disabledBackends;
+    }
+
     return j;
 }
 
@@ -849,4 +1071,177 @@ bool ModelManager::saveOverrides(const std::filesystem::path &overridePath) cons
     return true;
 }
 
+/// Map a model name onto a filename-safe stem: alphanumerics (per std::isalnum),
+/// '-', '_' and '.' are kept; every other character is replaced by '_'.
+std::string ModelManager::sanitizeFilename(const std::string &name)
+{
+    std::string safe(name);
+
+    for(char &ch:safe)
+    {
+        const bool keep=std::isalnum(static_cast<unsigned char>(ch))
+            ||ch=='-'||ch=='_'||ch=='.';
+
+        if(!keep)
+        {
+            ch='_';
+        }
+    }
+
+    return safe;
+}
+
+/// Remember `dir` as the injected-config directory and, unless it is empty,
+/// create it (including parents). A creation failure is logged, not reported.
+void ModelManager::setInjectedConfigDir(const std::filesystem::path &dir)
+{
+    m_injectedConfigDir=dir;
+
+    if(dir.empty())
+        return; // nothing to create for an empty path
+
+    std::error_code createError;
+    std::filesystem::create_directories(dir, createError);
+
+    if(!createError)
+    {
+        spdlog::info("Injected model configs will be persisted to: {}", dir.string());
+        return;
+    }
+    spdlog::error("Failed to create injected config directory '{}': {}", dir.string(), createError.message());
+}
+
+/// Restore the "*.json" configs found in the injected config directory through
+/// addModelFromJson(); unreadable/invalid files and already-known models are skipped.
+int ModelManager::loadInjectedConfigs()
+{
+    // Snapshot the listing first: addModelFromJson() re-persists each model into this directory,
+    // and changes made while a directory_iterator is live may or may not be seen by it.
+    std::vector<std::filesystem::path> configFiles;
+    std::error_code ec;
+    if(!m_injectedConfigDir.empty()&&std::filesystem::exists(m_injectedConfigDir, ec))
+    {
+        std::filesystem::directory_iterator it(m_injectedConfigDir, ec);
+        for(; !ec&&it!=std::filesystem::directory_iterator(); it.increment(ec))
+        {
+            if(it->path().extension()==".json")
+                configFiles.push_back(it->path());
+        }
+        if(ec)
+            spdlog::warn("Failed to list injected configs in '{}': {}", m_injectedConfigDir.string(), ec.message());
+    }
+    std::sort(configFiles.begin(), configFiles.end()); // deterministic restore order
+
+    int loaded=0;
+    for(const std::filesystem::path &configFile:configFiles)
+    {
+        try
+        {
+            std::ifstream file(configFile);
+            if(!file.is_open())
+            {
+                spdlog::warn("Failed to open injected config: {}", configFile.string());
+                continue;
+            }
+            nlohmann::json j=nlohmann::json::parse(file);
+            if(!j.contains("model")||!j["model"].is_string())
+            {
+                spdlog::warn("Injected config missing 'model' field: {}", configFile.string());
+                continue;
+            }
+            std::string modelName=j["model"].get<std::string>();
+
+            // Skip if a model with this name already exists (repo configs take precedence)
+            auto existing=std::find_if(m_models.begin(), m_models.end(),
+                [&modelName](const ModelInfo &info) { return info.model==modelName; });
+            if(existing!=m_models.end())
+            {
+                spdlog::debug("Skipping injected config for '{}' — already loaded from config repo", modelName);
+                continue;
+            }
+            std::string error;
+            if(addModelFromJson(j, error))
+            {
+                spdlog::info("Restored injected model config: {}", modelName);
+                ++loaded;
+            }
+            else
+                spdlog::warn("Failed to restore injected config '{}': {}", modelName, error);
+        }
+        catch(const std::exception &e) // any json/IO exception skips just this file
+        {
+            spdlog::warn("Failed to parse injected config '{}': {}", configFile.string(), e.what());
+        }
+    }
+
+    if(loaded>0)
+        spdlog::info("Restored {} injected model config(s) from {}", loaded, m_injectedConfigDir.string());
+    return loaded;
+}
+
+/// Persist the named model as "<injected dir>/<sanitized name>.json". The JSON goes to a ".tmp"
+/// sibling and is renamed into place only after a clean write, so a failed write keeps the old file.
+bool ModelManager::saveInjectedConfig(const std::string &modelName) const
+{
+    if(m_injectedConfigDir.empty())
+        return true; // no persistence configured — not an error
+
+    auto it=std::find_if(m_models.begin(), m_models.end(),
+        [&modelName](const ModelInfo &info) { return info.model==modelName; });
+    if(it==m_models.end())
+        return false;
+
+    const std::string payload=modelInfoToJson(*it).dump(4); // serialize before touching the disk
+    const std::filesystem::path filePath=m_injectedConfigDir/(sanitizeFilename(modelName)+".json");
+    const std::filesystem::path tempPath=filePath.string()+".tmp";
+    std::error_code ec;
+
+    std::ofstream file(tempPath);
+    if(!file.is_open())
+    {
+        spdlog::error("Failed to write injected config for '{}': cannot open {}", modelName, tempPath.string());
+        return false;
+    }
+    file<<payload;
+    file.close(); // flush now so that write errors show up in the stream state
+    if(file.fail())
+    {
+        spdlog::error("Failed to write injected config for '{}': write error on {}", modelName, tempPath.string());
+        std::filesystem::remove(tempPath, ec);
+        return false;
+    }
+    std::filesystem::rename(tempPath, filePath, ec);
+    if(ec)
+    {
+        spdlog::error("Failed to persist injected config for '{}': {}", modelName, ec.message());
+        std::filesystem::remove(tempPath, ec);
+        return false;
+    }
+    spdlog::info("Persisted injected model config: {} -> {}", modelName, filePath.string());
+    return true;
+}
+
+/// Delete "<injected dir>/<sanitized name>.json" if present. Returns false only on a
+/// filesystem error; a missing file or an unconfigured directory counts as success.
+bool ModelManager::removeInjectedConfig(const std::string &modelName) const
+{
+    if(m_injectedConfigDir.empty())
+        return true;
+
+    const std::filesystem::path filePath=m_injectedConfigDir/(sanitizeFilename(modelName)+".json");
+
+    // remove() reports "nothing there" by returning false without an error, so no
+    // separate exists() probe is needed (it would also race with the removal).
+    std::error_code ec;
+    const bool removed=std::filesystem::remove(filePath, ec);
+    if(ec)
+    {
+        spdlog::error("Failed to remove injected config for '{}': {}", modelName, ec.message());
+        return false;
+    }
+    if(removed)
+        spdlog::info("Removed injected model config: {}", filePath.string());
+    return true;
+}
+
 } // namespace arbiterAI
diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h
index 703cb0f..f7ef48b 100644
--- a/src/arbiterAI/modelManager.h
+++ b/src/arbiterAI/modelManager.h
@@ -40,6 +40,22 @@ struct VariantDownload {
     std::string filename;
 };
 
+/// Optional llama.cpp load/inference settings; every field may be left unset.
+/// Values come from the per-model config and can be overridden at load time via the API.
+struct RuntimeOptions {
+    std::optional<bool> flashAttn;              // -fa: enable/disable flash attention
+    std::optional<std::string> kvCacheTypeK;    // -ctk: KV cache type for keys (e.g. "f16", "q8_0", "q4_0")
+    std::optional<std::string> kvCacheTypeV;    // -ctv: KV cache type for values
+    std::optional<bool> noMmap;                 // --no-mmap: disable memory mapping
+    std::optional<int> reasoningBudget;         // --reasoning-budget: reasoning token budget (0=disabled)
+    std::optional<bool> swaFull;                // --swa-full: full SWA (sliding window attention)
+    std::optional<int> nGpuLayers;              // -ngl: number of GPU layers (99=all)
+    std::optional<std::string> overrideTensor;  // -ot: tensor override pattern (e.g. "per_layer_token_embd.weight=CPU")
+
+    /// Overlay `other` onto this object: each field set in `other` replaces ours, unset fields are ignored.
+    void mergeFrom(const RuntimeOptions &other);
+};
+
 struct ModelVariant {
     std::string quantization;
     int fileSizeMb=0;
@@ -103,11 +119,24 @@ struct ModelInfo
     std::optional<HardwareRequirements> hardwareRequirements;
     std::optional<ContextScaling> contextScaling;
     std::vector<ModelVariant> variants;
+    RuntimeOptions runtimeOptions;              // Per-model llama.cpp runtime options
+    std::vector<std::string> backendPriority;   // Ordered preference: ["vulkan", "rocm", "cuda"]
+    std::vector<std::string> disabledBackends;  // Backends to exclude (model-level override)
 
     bool isCompatible(const std::string &clientVersion) const;
     bool isSchemaCompatible(const std::string &schemaVersion) const;
 };
 
+/// GPU architecture backend configuration entry, loaded from the "gpu_backends" rules file.
+/// A rule applies when any `match` pattern occurs in the GPU name; the first such rule wins.
+struct GpuBackendRule {
+    std::string name;                           // Human-readable name (e.g. "AMD RDNA 3.5 (Strix Point)")
+    std::vector<std::string> match;             // Case-insensitive substrings to match against GPU name
+    std::vector<std::string> disabledBackends;  // Backends to disable for this architecture
+    std::vector<std::string> backendPriority;   // Preferred backend order
+    std::string notes;                          // Human-readable notes
+};
+
 class ModelManager
 {
 public:
@@ -129,6 +158,24 @@ class ModelManager
     static nlohmann::json modelInfoToJson(const ModelInfo &info);
     bool saveOverrides(const std::filesystem::path &overridePath) const;
 
+    /// Set the directory where injected model configs are persisted as individual
+    /// JSON files. Each file is named after the model (sanitized). On add/update
+    /// the file is written; on delete the file is removed. Call loadInjectedConfigs()
+    /// after initialize() to restore previously injected models.
+    void setInjectedConfigDir(const std::filesystem::path &dir);
+
+    /// Load all previously persisted injected model configs from the injected
+    /// config directory. Models that already exist (from the config repo) are
+    /// skipped — injected configs never shadow repo configs.
+    int loadInjectedConfigs();
+
+    /// Find the first GpuBackendRule whose match patterns hit the given GPU name.
+    /// Returns nullopt if no rule matches.
+    std::optional<GpuBackendRule> findGpuBackendRule(const std::string &gpuName) const;
+
+    /// Get all loaded GPU backend rules (for diagnostics / API).
+    const std::vector<GpuBackendRule> &getGpuBackendRules() const { return m_gpuBackendRules; }
+
 public:
     static int compareVersions(const std::string &v1, const std::string &v2);
 
@@ -140,11 +187,24 @@ class ModelManager
     bool validateModelJson(const nlohmann::json &modelJson, std::string &error) const;
     void mergeModelInfo(ModelInfo &existing, const ModelInfo &source, const nlohmann::json &sourceJson) const;
 
+    /// Persist a single injected model config to the injected config directory.
+    bool saveInjectedConfig(const std::string &modelName) const;
+
+    /// Remove a single injected model config file from the injected config directory.
+    bool removeInjectedConfig(const std::string &modelName) const;
+
+    /// Sanitize a model name into a safe filename (alphanumeric, hyphens, underscores).
+    static std::string sanitizeFilename(const std::string &name);
+
     std::vector<ModelInfo> m_models;
     std::map<std::string, std::string> m_modelProviderMap;
     std::set<std::string> m_runtimeModels;
+    std::vector<GpuBackendRule> m_gpuBackendRules;
     ConfigDownloader m_configDownloader;
+    std::filesystem::path m_injectedConfigDir;
     bool m_initialized{ false };
+
+    bool loadGpuBackendRules(const std::filesystem::path &filePath);
 };
 
 } // namespace arbiterAI
diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp
index f4f80dd..4b5df20 100644
--- a/src/arbiterAI/modelRuntime.cpp
+++ b/src/arbiterAI/modelRuntime.cpp
@@ -1,10 +1,12 @@
 #include "arbiterAI/modelRuntime.h"
 #include "arbiterAI/hardwareDetector.h"
+#include "arbiterAI/modelManager.h"
 #include "arbiterAI/telemetryCollector.h"
 #include "arbiterAI/storageManager.h"
 
 #include <llama.h>
 #include <ggml.h>
+#include <ggml-backend.h>
 #include <spdlog/spdlog.h>
 #include <algorithm>
 #include <filesystem>
@@ -14,6 +16,23 @@
 namespace arbiterAI
 {
 
+/// Map a KV cache type name (e.g. "q8_0") to the corresponding ggml_type enum value.
+/// Logs a warning and returns GGML_TYPE_COUNT if the string is not recognized.
+static ggml_type parseGgmlType(const std::string &name)
+{
+    struct KnownType { const char *label; ggml_type type; };
+
+    static const KnownType knownTypes[]={
+        {"f32", GGML_TYPE_F32}, {"f16", GGML_TYPE_F16}, {"bf16", GGML_TYPE_BF16},
+        {"q8_0", GGML_TYPE_Q8_0}, {"q4_0", GGML_TYPE_Q4_0}, {"q4_1", GGML_TYPE_Q4_1},
+        {"q5_0", GGML_TYPE_Q5_0}, {"q5_1", GGML_TYPE_Q5_1}, {"iq4_nl", GGML_TYPE_IQ4_NL}};
+    for(const KnownType &known:knownTypes)
+        if(name==known.label) return known.type;
+
+    spdlog::warn("Unknown ggml type '{}', ignoring", name);
+    return GGML_TYPE_COUNT;
+}
+
 ModelRuntime &ModelRuntime::instance()
 {
     static ModelRuntime runtime;
@@ -151,6 +170,7 @@ const char *loadFailureReasonToString(LoadFailureReason reason)
         case LoadFailureReason::ContextTooLarge:   return "context_too_large";
         case LoadFailureReason::UnsupportedArch:   return "unsupported_arch";
         case LoadFailureReason::BackendError:      return "backend_error";
+        case LoadFailureReason::VulkanDeviceLost:  return "vulkan_device_lost";
         default:                                   return "unknown";
     }
 }
@@ -197,6 +217,23 @@ LoadErrorDetail ModelRuntime::classifyLoadFailure(
         return detail;
     }
 
+    // Check for Vulkan device lost (GPU driver reset or hung pipeline)
+    if(logLower.find("errordevicelost")!=std::string::npos||
+        logLower.find("error_device_lost")!=std::string::npos||
+        logLower.find("device lost")!=std::string::npos||
+        logLower.find("vk_error_device_lost")!=std::string::npos||
+        logLower.find("vk::queue::submit")!=std::string::npos&&logLower.find("lost")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::VulkanDeviceLost;
+        detail.summary="Vulkan device lost during model load — the GPU driver may have reset";
+        detail.suggestion="The Vulkan backend will be reinitialized automatically. "
+            "If this persists, restart the server. Possible causes: GPU overheating, "
+            "driver bug, or another process crashing the GPU.";
+        detail.action="reinit_backend";
+        detail.recoverable=true;
+        return detail;
+    }
+
     // Check for VRAM / GPU memory allocation failures
     if(logLower.find("out of memory")!=std::string::npos||
         logLower.find("cuda error")!=std::string::npos||
@@ -299,7 +336,8 @@ int ModelRuntime::getMaxConcurrentDownloads() const
 ErrorCode ModelRuntime::loadModel(
     const std::string &model,
     const std::string &variant,
-    int contextSize)
+    int contextSize,
+    const RuntimeOptions &optionsOverride)
 {
     std::lock_guard<std::mutex> lock(m_mutex);
 
@@ -471,9 +509,17 @@ ErrorCode ModelRuntime::loadModel(
             // Actually load llama.cpp model for local providers
             if(modelInfo->provider=="llama")
             {
+                // Resolve runtime options: model config defaults + API override
+                RuntimeOptions resolvedOptions=modelInfo->runtimeOptions;
+                resolvedOptions.mergeFrom(optionsOverride);
+                entry.activeOptions=resolvedOptions;
+
+                // Resolve backend priority: model config > architecture rule > server default
+                std::vector<std::string> effectiveBackendPriority=resolveBackendPriority(*modelInfo);
+
                 std::string filePath="/models/"+primaryFilename;
                 ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices,
-                    fit.maxContextSize);
+                    fit.maxContextSize, resolvedOptions, effectiveBackendPriority);
                 if(loadResult!=ErrorCode::Success)
                 {
                     m_models.erase(model);
@@ -831,7 +877,8 @@ ErrorCode ModelRuntime::unpinModel(const std::string &model)
 ErrorCode ModelRuntime::swapModel(
     const std::string &newModel,
     const std::string &variant,
-    int contextSize)
+    int contextSize,
+    const RuntimeOptions &optionsOverride)
 {
     if(m_inferenceActive)
     {
@@ -841,6 +888,7 @@ ErrorCode ModelRuntime::swapModel(
         req.model=newModel;
         req.variant=variant;
         req.contextSize=contextSize;
+        req.optionsOverride=optionsOverride;
         m_pendingSwaps.push(req);
         spdlog::info("Swap to '{}' queued (inference active)", newModel);
         return ErrorCode::ModelDownloading; // "queued" status
@@ -879,7 +927,7 @@ ErrorCode ModelRuntime::swapModel(
         }
     }
 
-    ErrorCode result=loadModel(newModel, variant, contextSize);
+    ErrorCode result=loadModel(newModel, variant, contextSize, optionsOverride);
 
     // Record swap telemetry
     std::chrono::steady_clock::time_point swapEnd=std::chrono::steady_clock::now();
@@ -943,6 +991,142 @@ int ModelRuntime::getReadyRamBudget() const
     return m_readyRamBudgetMb;
 }
 
+void ModelRuntime::setDefaultBackendPriority(const std::vector<std::string> &priority)
+{
+    // m_mutex is held for both the assignment and the log line below.
+    std::lock_guard<std::mutex> lock(m_mutex);
+    m_defaultBackendPriority=priority;
+
+    if(priority.empty())
+    {
+        spdlog::info("Default backend priority cleared (all backends)");
+        return;
+    }
+
+    // Render the list as "a, b, c" for the log.
+    std::string listing;
+    for(const std::string &backend:priority)
+    {
+        listing+=(listing.empty()?"":", ")+backend;
+    }
+    spdlog::info("Default backend priority set to [{}]", listing);
+}
+
+std::vector<std::string> ModelRuntime::getDefaultBackendPriority() const
+{
+    std::lock_guard<std::mutex> lock(m_mutex);  // serialize with setDefaultBackendPriority()
+    return m_defaultBackendPriority;            // returned by value: the caller gets its own copy
+}
+
+std::vector<std::string> ModelRuntime::resolveBackendPriority(const ModelInfo &model) const
+{
+    // Layered resolution:
+    //   1. Model config backend_priority (highest priority)
+    //   2. GPU architecture rule from config repo (matched by GPU name)
+    //   3. Server default_backend_priority (lowest priority)
+    //
+    // disabled_backends from the model config and the matched architecture rule
+    // are unioned and removed from the chosen list. Names are compared
+    // case-insensitively, with "rocm" and "hip" treated as the same backend.
+    // The result keeps each entry's original spelling; empty means no preference.
+    // NOTE(review): m_defaultBackendPriority is read without taking m_mutex;
+    // presumably the caller already holds it — confirm.
+
+    // Lower-case a backend name. Passing each byte as unsigned char keeps
+    // ::tolower well-defined for non-ASCII input (negative char values).
+    auto toLower=[](std::string text)
+    {
+        std::transform(text.begin(), text.end(), text.begin(),
+            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+        return text;
+    };
+
+    // True for either spelling of the ROCm/HIP backend (expects lower-case input).
+    auto isRocm=[](const std::string &name) { return name=="rocm"||name=="hip"; };
+
+    // Render a list as "a, b, c", or the placeholder when that comes out empty.
+    auto join=[](const std::vector<std::string> &items, const char *placeholder)
+    {
+        std::string s;
+        for(const std::string &item:items) { if(!s.empty()) s+=", "; s+=item; }
+        return s.empty()?std::string(placeholder):s;
+    };
+
+    std::vector<std::string> priority;
+    std::vector<std::string> disabled;
+
+    // Record a disabled backend (lower-cased) once, however many layers list it.
+    auto addDisabled=[&](const std::string &backend)
+    {
+        std::string lower=toLower(backend);
+        if(std::find(disabled.begin(), disabled.end(), lower)==disabled.end())
+            disabled.push_back(lower);
+    };
+
+    // Collect disabled backends from model config
+    for(const std::string &d:model.disabledBackends) addDisabled(d);
+
+    // Look up architecture rule based on detected GPU names (first match wins)
+    std::optional<GpuBackendRule> archRule;
+    std::vector<GpuInfo> gpus=HardwareDetector::instance().getGpus();
+
+    for(const GpuInfo &gpu:gpus)
+    {
+        archRule=ModelManager::instance().findGpuBackendRule(gpu.name);
+        if(archRule)
+        {
+            spdlog::debug("GPU '{}' matched architecture rule '{}'", gpu.name, archRule->name);
+            break;
+        }
+    }
+
+    // Collect disabled backends from architecture rule
+    if(archRule)
+    {
+        for(const std::string &d:archRule->disabledBackends) addDisabled(d);
+    }
+
+    // Determine priority: model config > architecture rule > server default.
+    // The source label is set with the list so the log names the layer actually used.
+    std::string source="server default";
+    if(!model.backendPriority.empty())
+    {
+        priority=model.backendPriority;
+        source="model config";
+    }
+    else if(archRule&&!archRule->backendPriority.empty())
+    {
+        priority=archRule->backendPriority;
+        source="architecture rule '"+archRule->name+"'";
+    }
+    else
+    {
+        priority=m_defaultBackendPriority;
+    }
+
+    // Remove disabled backends from the priority list (relative order is kept)
+    if(!disabled.empty()&&!priority.empty())
+    {
+        priority.erase(std::remove_if(priority.begin(), priority.end(),
+            [&](const std::string &p)
+            {
+                const std::string lower=toLower(p);
+                return std::any_of(disabled.begin(), disabled.end(),
+                    [&](const std::string &d) { return lower==d||(isRocm(lower)&&isRocm(d)); });
+            }), priority.end());
+        if(priority.empty())
+            spdlog::warn("All preferred backends for '{}' are disabled; no preference applied", model.model);
+    }
+
+    if(!priority.empty()||!disabled.empty())
+    {
+        spdlog::info("Backend resolution for '{}': priority=[{}], disabled=[{}], source={}",
+            model.model, join(priority, "(all)"), join(disabled, "(none)"), source);
+    }
+
+    return priority;
+}
+
 void ModelRuntime::evictIfNeeded(int requiredVramMb)
 {
     // Calculate current VRAM usage across all loaded models
@@ -1113,7 +1297,7 @@ void ModelRuntime::drainPendingSwaps()
 
     // Release lock before calling swapModel (it acquires its own lock)
     m_mutex.unlock();
-    swapModel(latest.model, latest.variant, latest.contextSize);
+    swapModel(latest.model, latest.variant, latest.contextSize, latest.optionsOverride);
     m_mutex.lock();
 }
 
@@ -1189,94 +1373,310 @@ void ModelRuntime::initLlamaBackend()
     }
 }
 
+void ModelRuntime::reinitLlamaBackend()
+{
+    // Teardown (only if currently initialized), settle pause, then a fresh init.
+    // NOTE(review): already-loaded models are not touched here — confirm they survive.
+    spdlog::warn("Reinitializing llama.cpp backend (Vulkan device recovery)");
+    if(m_llamaInitialized)
+    {
+        llama_backend_free();
+        m_llamaInitialized=false;
+    }
+
+    // Let the GPU driver settle after a device-lost event before re-initializing
+    const std::chrono::milliseconds settleDelay(500);
+    std::this_thread::sleep_for(settleDelay);
+    llama_backend_init();
+    m_llamaInitialized=true;
+    spdlog::info("llama.cpp backend reinitialized successfully");
+}
+
 ErrorCode ModelRuntime::loadLlamaModel(
     const std::string &model,
     const std::string &filePath,
     int contextSize,
     const std::vector<int> &gpuIndices,
-    int maxHardwareContext)
+    int maxHardwareContext,
+    const RuntimeOptions &options,
+    const std::vector<std::string> &backendPriority)
 {
     initLlamaBackend();
 
-    // Start capturing llama.cpp log output for diagnostics
-    beginLlamaLogCapture();
+    // Log available backend devices matching backendPriority for diagnostics.
+    // NOTE: We intentionally do NOT set mparams.devices — llama.cpp's default
+    // device selection (devices=NULL) produces much better tensor placement on
+    // UMA/iGPU systems. When an explicit device list is provided that includes
+    // both GPU and CPU, llama.cpp treats them as co-equal allocation targets and
+    // splits model tensors and KV cache across both, which dramatically hurts
+    // performance on unified-memory architectures. Leaving devices=NULL lets
+    // llama.cpp's internal logic keep everything on the GPU device.
+    if(!backendPriority.empty())
+    {
+        size_t devCount=ggml_backend_dev_count();
+        std::vector<std::string> matchedDevices;
 
-    llama_model_params mparams=llama_model_default_params();
-    mparams.n_gpu_layers=99; // offload all layers to GPU by default
+        for(const std::string &preferred:backendPriority)
+        {
+            std::string prefLower=preferred;
+            std::transform(prefLower.begin(), prefLower.end(), prefLower.begin(), ::tolower);
 
-    llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams);
-    if(!llamaModel)
-    {
-        std::string captured=m_llamaLogCapture.str();
-        endLlamaLogCapture();
+            for(size_t i=0; i<devCount; ++i)
+            {
+                ggml_backend_dev_t dev=ggml_backend_dev_get(i);
+                enum ggml_backend_dev_type devType=ggml_backend_dev_type(dev);
 
-        m_lastLoadError=classifyLoadFailure(captured, model, filePath, contextSize);
-        spdlog::error("Failed to load llama model from: {} — {}", filePath, m_lastLoadError.summary);
-        return ErrorCode::ModelLoadError;
-    }
+                if(devType==GGML_BACKEND_DEVICE_TYPE_CPU)
+                    continue;
 
-    // Query native training context from GGUF metadata
-    int nativeContext=llama_model_n_ctx_train(llamaModel);
+                ggml_backend_reg_t reg=ggml_backend_dev_backend_reg(dev);
+                if(!reg)
+                    continue;
 
-    // Resolve actual context to allocate:
-    //   contextSize > 0  → user/config requested explicit size
-    //   contextSize == 0 → use model's native training context
-    // In both cases, cap by the hardware-fit maximum.
-    int actualContext=contextSize;
-    if(actualContext<=0)
-    {
-        actualContext=nativeContext;
-    }
-    if(maxHardwareContext>0&&actualContext>maxHardwareContext)
-    {
-        spdlog::info("Capping context from {} to {} (hardware limit) for model '{}'",
-            actualContext, maxHardwareContext, model);
-        actualContext=maxHardwareContext;
+                std::string regName=ggml_backend_reg_name(reg);
+                std::string regLower=regName;
+                std::transform(regLower.begin(), regLower.end(), regLower.begin(), ::tolower);
+
+                bool match=(regLower==prefLower);
+                if(!match&&(prefLower=="rocm"||prefLower=="hip"))
+                    match=(regLower=="rocm"||regLower=="hip");
+                if(!match&&prefLower=="cuda")
+                    match=(regLower=="cuda");
+
+                if(match)
+                {
+                    std::string desc=std::string(ggml_backend_dev_name(dev))
+                        +" ("+ggml_backend_dev_description(dev)+") ["+regName+"]";
+                    matchedDevices.push_back(desc);
+                }
+            }
+        }
+
+        if(!matchedDevices.empty())
+        {
+            spdlog::info("Backend priority for '{}': [{}] — {} GPU device(s) available",
+                model,
+                [&]()
+                {
+                    std::string s;
+                    for(const std::string &p:backendPriority)
+                    {
+                        if(!s.empty()) s+=", ";
+                        s+=p;
+                    }
+                    return s;
+                }(),
+                matchedDevices.size());
+
+            for(size_t i=0; i<matchedDevices.size(); ++i)
+            {
+                spdlog::info("  device[{}]: {}", i, matchedDevices[i]);
+            }
+        }
+        else
+        {
+            spdlog::warn("Backend priority for '{}': no GPU devices matched [{}]",
+                model,
+                [&]()
+                {
+                    std::string s;
+                    for(const std::string &p:backendPriority)
+                    {
+                        if(!s.empty()) s+=", ";
+                        s+=p;
+                    }
+                    return s;
+                }());
+        }
     }
 
-    llama_context_params cparams=llama_context_default_params();
-    cparams.n_ctx=static_cast<uint32_t>(actualContext);
-    cparams.n_threads=std::thread::hardware_concurrency();
-    cparams.n_threads_batch=std::thread::hardware_concurrency();
+    int maxAttempts=2; // 1 normal + 1 retry after backend reinit
 
-    llama_context *llamaCtx=llama_init_from_model(llamaModel, cparams);
-    if(!llamaCtx)
+    for(int attempt=0; attempt<maxAttempts; ++attempt)
     {
-        std::string captured=m_llamaLogCapture.str();
-        endLlamaLogCapture();
+        if(attempt>0)
+        {
+            spdlog::warn("Retrying model load for '{}' (attempt {}/{})", model, attempt+1, maxAttempts);
+        }
+
+        // Start capturing llama.cpp log output for diagnostics
+        beginLlamaLogCapture();
 
-        m_lastLoadError=classifyLoadFailure(captured, model, filePath, actualContext);
+        llama_model_params mparams=llama_model_default_params();
+        mparams.n_gpu_layers=options.nGpuLayers.value_or(99);
 
-        // If classification didn't catch a specific VRAM/context issue,
-        // context creation failure is almost always a memory issue
-        if(m_lastLoadError.reason==LoadFailureReason::Unknown||
-            m_lastLoadError.reason==LoadFailureReason::BackendError)
+        if(options.noMmap.has_value()&&options.noMmap.value())
         {
-            m_lastLoadError.reason=LoadFailureReason::InsufficientVram;
-            m_lastLoadError.summary="Failed to create context (size="+std::to_string(actualContext)+
-                ") — likely insufficient GPU memory";
-            m_lastLoadError.suggestion="Try a smaller context size or use a smaller quantization variant. "
-                "You can also unload other models to free VRAM.";
-            m_lastLoadError.action="reduce_context";
-            m_lastLoadError.recoverable=true;
+            mparams.use_mmap=false;
         }
 
-        spdlog::error("Failed to create llama context for model: {} — {}", model, m_lastLoadError.summary);
-        llama_model_free(llamaModel);
-        return ErrorCode::ModelLoadError;
-    }
+        // On UMA/iGPU systems (e.g. AMD APUs), mmap causes model tensors to be
+        // imported as host-visible "CPU_Mapped" buffers via VK_EXT_external_memory_host
+        // instead of being allocated as device-local memory. This bypasses the Vulkan
+        // backend's normal allocation path and results in dramatically slower GPU access.
+        // Auto-disable mmap when any active GPU is a unified-memory device, unless
+        // the user explicitly set no_mmap=false.
+        if(!options.noMmap.has_value()&&mparams.use_mmap)
+        {
+            std::vector<GpuInfo> gpus=HardwareDetector::instance().getGpus();
+
+            for(const GpuInfo &gpu:gpus)
+            {
+                if(gpu.unifiedMemory)
+                {
+                    spdlog::info("UMA device detected ({}), disabling mmap for model '{}' "
+                        "to ensure device-local memory allocation",
+                        gpu.name, model);
+                    mparams.use_mmap=false;
+                    break;
+                }
+            }
+        }
+
+        // NOTE: mparams.devices is intentionally left as NULL (default).
+        // See comment above about why explicit device lists hurt UMA performance.
 
-    endLlamaLogCapture();
+        llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams);
+        if(!llamaModel)
+        {
+            std::string captured=m_llamaLogCapture.str();
+            endLlamaLogCapture();
 
-    LoadedModel &entry=m_models[model];
-    entry.llamaModel=llamaModel;
-    entry.llamaCtx=llamaCtx;
-    entry.maxContextSize=nativeContext;
-    entry.contextSize=static_cast<int>(llama_n_ctx(llamaCtx));
+            m_lastLoadError=classifyLoadFailure(captured, model, filePath, contextSize);
+            spdlog::error("Failed to load llama model from: {} — {}", filePath, m_lastLoadError.summary);
 
-    spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={})",
-        model, entry.contextSize, entry.maxContextSize);
-    return ErrorCode::Success;
+            // If Vulkan device lost and we haven't retried yet, reinit and try again
+            if(m_lastLoadError.reason==LoadFailureReason::VulkanDeviceLost&&attempt+1<maxAttempts)
+            {
+                spdlog::warn("Vulkan device lost detected during model load — "
+                    "reinitializing backend and retrying");
+                reinitLlamaBackend();
+                continue;
+            }
+
+            return ErrorCode::ModelLoadError;
+        }
+
+        // Query native training context from GGUF metadata
+        int nativeContext=llama_model_n_ctx_train(llamaModel);
+
+        // Resolve actual context to allocate:
+        //   contextSize > 0  → user/config requested explicit size
+        //   contextSize == 0 → use model's native training context
+        // In both cases, cap by the hardware-fit maximum.
+        int actualContext=contextSize;
+        if(actualContext<=0)
+        {
+            actualContext=nativeContext;
+        }
+        if(maxHardwareContext>0&&actualContext>maxHardwareContext)
+        {
+            spdlog::info("Capping context from {} to {} (hardware limit) for model '{}'",
+                actualContext, maxHardwareContext, model);
+            actualContext=maxHardwareContext;
+        }
+
+        llama_context_params cparams=llama_context_default_params();
+        cparams.n_ctx=static_cast<uint32_t>(actualContext);
+        cparams.n_threads=std::thread::hardware_concurrency();
+        cparams.n_threads_batch=std::thread::hardware_concurrency();
+
+        // Apply runtime options to context params
+        if(options.flashAttn.has_value())
+        {
+            cparams.flash_attn_type=options.flashAttn.value()
+                ?LLAMA_FLASH_ATTN_TYPE_ENABLED
+                :LLAMA_FLASH_ATTN_TYPE_DISABLED;
+        }
+
+        if(options.kvCacheTypeK.has_value())
+        {
+            ggml_type kType=parseGgmlType(options.kvCacheTypeK.value());
+            if(kType!=GGML_TYPE_COUNT)
+            {
+                cparams.type_k=kType;
+            }
+        }
+
+        if(options.kvCacheTypeV.has_value())
+        {
+            ggml_type vType=parseGgmlType(options.kvCacheTypeV.value());
+            if(vType!=GGML_TYPE_COUNT)
+            {
+                cparams.type_v=vType;
+            }
+        }
+
+        if(options.swaFull.has_value())
+        {
+            cparams.swa_full=options.swaFull.value();
+        }
+
+        llama_context *llamaCtx=llama_init_from_model(llamaModel, cparams);
+        if(!llamaCtx)
+        {
+            std::string captured=m_llamaLogCapture.str();
+            endLlamaLogCapture();
+
+            m_lastLoadError=classifyLoadFailure(captured, model, filePath, actualContext);
+
+            // If Vulkan device lost and we haven't retried yet, reinit and try again
+            if(m_lastLoadError.reason==LoadFailureReason::VulkanDeviceLost&&attempt+1<maxAttempts)
+            {
+                spdlog::warn("Vulkan device lost detected during context creation — "
+                    "reinitializing backend and retrying");
+                llama_model_free(llamaModel);
+                reinitLlamaBackend();
+                continue;
+            }
+
+            // If classification didn't catch a specific VRAM/context issue,
+            // context creation failure is almost always a memory issue
+            if(m_lastLoadError.reason==LoadFailureReason::Unknown||
+                m_lastLoadError.reason==LoadFailureReason::BackendError)
+            {
+                m_lastLoadError.reason=LoadFailureReason::InsufficientVram;
+                m_lastLoadError.summary="Failed to create context (size="+std::to_string(actualContext)+
+                    ") — likely insufficient GPU memory";
+                m_lastLoadError.suggestion="Try a smaller context size or use a smaller quantization variant. "
+                    "You can also unload other models to free VRAM.";
+                m_lastLoadError.action="reduce_context";
+                m_lastLoadError.recoverable=true;
+            }
+
+            spdlog::error("Failed to create llama context for model: {} — {}", model, m_lastLoadError.summary);
+            llama_model_free(llamaModel);
+            return ErrorCode::ModelLoadError;
+        }
+
+        endLlamaLogCapture();
+
+        LoadedModel &entry=m_models[model];
+        entry.llamaModel=llamaModel;
+        entry.llamaCtx=llamaCtx;
+        entry.maxContextSize=nativeContext;
+        entry.contextSize=static_cast<int>(llama_n_ctx(llamaCtx));
+
+        spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={}, ngl={}, flash_attn={}, mmap={}, backend_filter={})",
+            model, entry.contextSize, entry.maxContextSize,
+            options.nGpuLayers.value_or(99),
+            options.flashAttn.has_value()?(options.flashAttn.value()?"enabled":"disabled"):"auto",
+            mparams.use_mmap?"on":"off",
+            backendPriority.empty()?"all":[&]()
+            {
+                std::string s;
+                for(const std::string &p:backendPriority)
+                {
+                    if(!s.empty()) s+=",";
+                    s+=p;
+                }
+                return s;
+            }());
+        return ErrorCode::Success;
+    }
+
+    // Should not reach here, but just in case
+    return ErrorCode::ModelLoadError;
 }
 
 void ModelRuntime::freeLlamaModel(LoadedModel &entry)
diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h
index d8c07e2..79a7bda 100644
--- a/src/arbiterAI/modelRuntime.h
+++ b/src/arbiterAI/modelRuntime.h
@@ -42,7 +42,8 @@ enum class LoadFailureReason {
     InsufficientRam,    // not enough system RAM
     ContextTooLarge,    // requested context exceeds model or hardware limits
     UnsupportedArch,    // model architecture not supported by this llama.cpp build
-    BackendError        // llama.cpp internal error
+    BackendError,       // llama.cpp internal error
+    VulkanDeviceLost    // Vulkan device lost — GPU driver reset or hung pipeline
 };
 
 /// Convert a LoadFailureReason to a stable, snake_case string for API responses.
@@ -72,6 +73,7 @@ struct LoadedModel {
     bool pinned=false;
     llama_model *llamaModel=nullptr;
     llama_context *llamaCtx=nullptr;
+    RuntimeOptions activeOptions; // llama.cpp options active for this loaded model
 };
 
 class ModelRuntime {
@@ -86,11 +88,13 @@ class ModelRuntime {
     /// @param model     Model name from ModelManager.
     /// @param variant   Quantization variant (empty = auto-select best fitting).
     /// @param contextSize  Context size (0 = use model default).
+    /// @param optionsOverride  Optional runtime options to merge on top of model config defaults.
     /// @return ErrorCode::Success, ModelDownloading, ModelNotFound, ModelLoadError.
     ErrorCode loadModel(
         const std::string &model,
         const std::string &variant="",
-        int contextSize=0);
+        int contextSize=0,
+        const RuntimeOptions &optionsOverride=RuntimeOptions{});
 
     /// Download model files without loading into VRAM.
     /// Launches an async background download that respects the concurrent
@@ -121,7 +125,8 @@ class ModelRuntime {
     ErrorCode swapModel(
         const std::string &newModel,
         const std::string &variant="",
-        int contextSize=0);
+        int contextSize=0,
+        const RuntimeOptions &optionsOverride=RuntimeOptions{});
 
     /// Get the state of all tracked models.
     std::vector<LoadedModel> getModelStates() const;
@@ -145,6 +150,14 @@ class ModelRuntime {
     /// Get the current RAM budget for "Ready" tier models.
     int getReadyRamBudget() const;
 
+    /// Set the default backend priority used when a model config does not
+    /// specify its own backend_priority.  E.g. ["vulkan"] to force Vulkan
+    /// on systems where ROCm is unstable.
+    void setDefaultBackendPriority(const std::vector<std::string> &priority);
+
+    /// Get the current default backend priority.
+    std::vector<std::string> getDefaultBackendPriority() const;
+
     /// Evict least-recently-used non-pinned models to free VRAM.
     void evictIfNeeded(int requiredVramMb);
 
@@ -197,15 +210,30 @@ class ModelRuntime {
     /// Initialize the llama.cpp backend (called once on first local model load).
     void initLlamaBackend();
 
+    /// Resolve the effective backend priority for a model, applying layered
+    /// rules: (1) model config, (2) architecture rule from config repo matched
+    /// against GPU name, (3) server default.  Disabled backends from all layers
+    /// are removed from the result.
+    std::vector<std::string> resolveBackendPriority(const ModelInfo &model) const;
+
+    /// Tear down and reinitialize the llama.cpp backend.
+    /// Used to recover from Vulkan device-lost errors.
+    void reinitLlamaBackend();
+
     /// Load a GGUF file into llama.cpp.
     /// @param contextSize      Requested context (0 = use model's native training context).
     /// @param maxHardwareContext  Hardware-fit limit (0 = no limit).
+    /// @param options           Resolved runtime options to apply.
+    /// @param backendPriority   Ordered backend preference (e.g. ["vulkan","rocm"]).
+    ///                          Empty = use all available backends (default).
     ErrorCode loadLlamaModel(
         const std::string &model,
         const std::string &filePath,
         int contextSize,
         const std::vector<int> &gpuIndices,
-        int maxHardwareContext=0);
+        int maxHardwareContext=0,
+        const RuntimeOptions &options=RuntimeOptions{},
+        const std::vector<std::string> &backendPriority={});
 
     /// Free llama.cpp resources for a model.
     void freeLlamaModel(LoadedModel &entry);
@@ -222,6 +250,7 @@ class ModelRuntime {
     std::map<std::string, LoadedModel> m_models;
     mutable std::mutex m_mutex;
     int m_readyRamBudgetMb=0;
+    std::vector<std::string> m_defaultBackendPriority;
     std::atomic<bool> m_inferenceActive{false};
     std::string m_inferenceModel;
     bool m_llamaInitialized=false;
@@ -230,6 +259,7 @@ class ModelRuntime {
         std::string model;
         std::string variant;
         int contextSize=0;
+        RuntimeOptions optionsOverride;
     };
     std::queue<SwapRequest> m_pendingSwaps;
 
diff --git a/src/server/dashboard.h b/src/server/dashboard.h
index d68ed32..90d123c 100644
--- a/src/server/dashboard.h
+++ b/src/server/dashboard.h
@@ -554,7 +554,7 @@ td
         </div>
     </div>
     <div class="card" style="margin-bottom:20px;">
-        <h2>Loaded Models</h2>
+        <h2>Loaded Models <a href="/dashboard/storage" style="float:right;font-size:12px;color:#7c8aff;text-decoration:none;font-weight:normal;text-transform:none;letter-spacing:normal;">Downloaded Models &rarr;</a></h2>
         <table>
             <thead>
                 <tr>
@@ -575,41 +575,27 @@ td
         </table>
     </div>
     <div class="card" style="margin-bottom:20px;">
-        <h2>Downloaded Models</h2>
-        <div id="storageBarSection">
-            <div class="storage-info">
-                <span id="storageUsedLabel">Used: -</span>
-                <span id="storageLimitLabel">Limit: -</span>
-            </div>
-            <div class="storage-bar-outer">
-                <div class="storage-bar-fill" id="storageBarFill" style="width:0%"></div>
-                <div class="storage-bar-text" id="storageBarText">-</div>
-            </div>
-            <div class="storage-info">
-                <span id="storageCleanupLabel">Auto-cleanup: -</span>
-                <span id="storageCandidatesLabel"></span>
-            </div>
-        </div>
+        <h2>Active Requests</h2>
         <div id="downloadProgressSection" style="margin:8px 0;"></div>
-        <table>
-            <thead>
-                <tr>
-                    <th>Model</th>
-                    <th>Variant</th>
-                    <th>Size</th>
-                    <th>Downloaded</th>
-                    <th>Last Used</th>
-                    <th>Uses</th>
-                    <th>State</th>
-                    <th>Hot Ready</th>
-                    <th>Protected</th>
-                    <th>Actions</th>
-                </tr>
-            </thead>
-            <tbody id="downloadedModelTable">
-                <tr><td colspan="10" style="color:#666;text-align:center;">No downloaded models</td></tr>
-            </tbody>
-        </table>
+        <div style="max-height:360px;overflow-y:auto;">
+            <table>
+                <thead>
+                    <tr>
+                        <th>Model</th>
+                        <th>Status</th>
+                        <th>Input Tokens</th>
+                        <th>Output Tokens</th>
+                        <th>Prompt t/s</th>
+                        <th>Gen t/s</th>
+                        <th>Latency</th>
+                        <th>Total Time</th>
+                    </tr>
+                </thead>
+                <tbody id="activeRequestTable">
+                    <tr><td colspan="8" style="color:#666;text-align:center;">No recent requests</td></tr>
+                </tbody>
+            </table>
+        </div>
     </div>
     <div class="grid">
         <div class="card">
@@ -1136,7 +1122,6 @@ async function toggleHotReady(name, variant, currentlyHotReady)
     const method=currentlyHotReady?"DELETE":"POST";
     const url="/api/models/"+encodeURIComponent(name)+"/variants/"+encodeURIComponent(variant)+"/hot-ready";
     await fetch(url, {method});
-    await refreshStorage();
 }
 
 async function toggleProtected(name, variant, currentlyProtected)
@@ -1144,37 +1129,6 @@ async function toggleProtected(name, variant, currentlyProtected)
     const method=currentlyProtected?"DELETE":"POST";
     const url="/api/models/"+encodeURIComponent(name)+"/variants/"+encodeURIComponent(variant)+"/protected";
     await fetch(url, {method});
-    await refreshStorage();
-}
-
-async function deleteModelFile(name, variant)
-{
-    if(!confirm("Delete "+name+" "+variant+"? This cannot be undone.")) return;
-    const url="/api/models/"+encodeURIComponent(name)+"/files"+(variant?"?variant="+encodeURIComponent(variant):"");
-    const resp=await fetch(url, {method:"DELETE"});
-    if(resp.status===409)
-    {
-        const data=await resp.json();
-        alert(data.error?.message||"Cannot delete: variant is guarded");
-    }
-    await refreshStorage();
-}
-
-function renderStorageBar(storage)
-{
-    if(!storage) return;
-
-    const used=storage.used_by_models_bytes||0;
-    const limit=storage.storage_limit_bytes;
-    const free=storage.free_disk_bytes||0;
-    const total=limit>0?limit:(used+free);
-    const pct=total>0?(used/total*100):0;
-
-    document.getElementById("storageUsedLabel").textContent="Used: "+formatBytesJs(used);
-    document.getElementById("storageLimitLabel").textContent=limit>0?"Limit: "+formatBytesJs(limit):"Limit: All free space";
-    document.getElementById("storageBarFill").style.width=pct.toFixed(1)+"%";
-    document.getElementById("storageBarText").textContent=formatBytesJs(used)+" / "+formatBytesJs(total)+" ("+pct.toFixed(1)+"%)";
-    document.getElementById("storageCleanupLabel").textContent="Auto-cleanup: "+(storage.cleanup_enabled?"ON":"OFF");
 }
 
 function renderDownloadProgress(downloads)
@@ -1213,64 +1167,46 @@ function renderDownloadProgress(downloads)
     el.innerHTML=html;
 }
 
-function renderDownloadedModels(models)
+function renderActiveRequests(history)
 {
-    const el=document.getElementById("downloadedModelTable");
+    const el=document.getElementById("activeRequestTable");
 
-    if(!models||models.length===0)
+    if(!history||history.length===0)
     {
-        el.innerHTML='<tr><td colspan="10" style="color:#666;text-align:center;">No downloaded models</td></tr>';
+        el.innerHTML='<tr><td colspan="8" style="color:#666;text-align:center;">No recent requests</td></tr>';
         return;
     }
 
+    const recent=history.slice(-20).reverse();
     let html="";
-    for(const m of models)
+    for(const s of recent)
     {
-        const ageClass=rowAgeClass(m.last_used_at);
-        const guarded=m.hot_ready||m.protected;
-        const hrClass=m.hot_ready?"btn-toggle active":"btn-toggle";
-        const prClass=m.protected?"btn-toggle active":"btn-toggle";
-        const deleteDisabled=guarded?"btn-disabled":"";
-        const deleteTitle=guarded?"Clear hot_ready and protected first":"Delete model file";
+        const promptTps=s.prompt_tokens_per_second||0;
+        const genTps=s.generation_tokens_per_second||0;
+        const totalMs=s.total_time_ms||0;
+        const latencyMs=s.latency_ms||0;
+        const isActive=(totalMs===0&&latencyMs===0);
 
-        html+=`<tr class="${ageClass}">
-            <td>${m.model}</td>
-            <td>${m.variant||"-"}</td>
-            <td>${m.file_size_display||formatBytesJs(m.file_size_bytes)}</td>
-            <td>${formatDate(m.downloaded_at)}</td>
-            <td>${formatDate(m.last_used_at)}</td>
-            <td>${m.usage_count||0}</td>
-            <td><span class="badge ${stateClass(m.runtime_state)}">${m.runtime_state||"Unloaded"}</span></td>
-            <td><button class="btn ${hrClass}" onclick="toggleHotReady('${m.model}','${m.variant}',${m.hot_ready})">${m.hot_ready?"ON":"OFF"}</button></td>
-            <td><button class="btn ${prClass}" onclick="toggleProtected('${m.model}','${m.variant}',${m.protected})">${m.protected?"ON":"OFF"}</button></td>
-            <td><button class="btn btn-danger ${deleteDisabled}" title="${deleteTitle}" onclick="${guarded?"":`deleteModelFile('${m.model}','${m.variant}')`}" ${guarded?"disabled":""}>Delete</button></td>
+        html+=`<tr>
+            <td>${s.model}</td>
+            <td><span class="badge ${isActive?"badge-downloading":"badge-loaded"}">${isActive?"Running":"Done"}</span></td>
+            <td>${s.prompt_tokens.toLocaleString()}</td>
+            <td>${s.completion_tokens.toLocaleString()}</td>
+            <td>${promptTps.toFixed(1)}</td>
+            <td>${genTps.toFixed(1)}</td>
+            <td>${latencyMs.toFixed(0)} ms</td>
+            <td>${totalMs.toFixed(0)} ms</td>
         </tr>`;
     }
     el.innerHTML=html;
 }
 
-async function refreshStorage()
+async function refreshDownloads() // Poll /api/downloads and redraw the download progress section.
 {
-    const [storage, storageModels, downloads, cleanupPreview]=await Promise.all([
-        fetchJson("/api/storage"),
-        fetchJson("/api/storage/models"),
-        fetchJson("/api/downloads"),
-        fetchJson("/api/storage/cleanup/preview")
-    ]);
-
-    renderStorageBar(storage);
+    const downloads=await fetchJson("/api/downloads"); // a missing list is rendered below as "no downloads"
 
     if(downloads&&downloads.downloads) renderDownloadProgress(downloads.downloads);
     else renderDownloadProgress([]);
-
-    if(storageModels&&storageModels.models) renderDownloadedModels(storageModels.models);
-    else renderDownloadedModels([]);
-
-    if(cleanupPreview)
-    {
-        const count=cleanupPreview.candidate_count||0;
-        document.getElementById("storageCandidatesLabel").textContent=count>0?count+" cleanup candidate"+(count>1?"s":""):"";
-    }
 }
 
 async function refresh()
@@ -1332,11 +1268,14 @@ async function refresh()
     // Inference history
     if(history) renderInferences(history);
 
+    // Active requests summary
+    if(history) renderActiveRequests(history);
+
     // Swaps
     if(swaps) renderSwaps(swaps);
 
-    // Storage (runs in parallel)
-    refreshStorage();
+    // Active downloads
+    refreshDownloads();
 }
 
 async function loadVersion()
@@ -1361,6 +1300,532 @@ setInterval(refreshLogs, POLL_INTERVAL);
 </body>
 </html>)HTML";
 
+const std::string DASHBOARD_STORAGE_HTML=R"HTML(<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>ArbiterAI — Downloaded Models</title>
+<style>
+*
+{
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+body
+{
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+    background: #0f1117;
+    color: #e0e0e0;
+    line-height: 1.6;
+}
+.header
+{
+    background: #1a1d27;
+    border-bottom: 1px solid #2a2d3a;
+    padding: 16px 24px;
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+}
+.header h1
+{
+    font-size: 20px;
+    color: #7c8aff;
+}
+.header .status
+{
+    font-size: 13px;
+    color: #888;
+}
+.header .status .dot
+{
+    display: inline-block;
+    width: 8px;
+    height: 8px;
+    border-radius: 50%;
+    background: #4caf50;
+    margin-right: 6px;
+    vertical-align: middle;
+}
+.version-badge
+{
+    font-size: 12px;
+    color: #888;
+    background: #2a2d3a;
+    padding: 2px 8px;
+    border-radius: 4px;
+    margin-left: 8px;
+    font-weight: normal;
+    vertical-align: middle;
+}
+.back-link
+{
+    color: #7c8aff;
+    text-decoration: none;
+    font-size: 13px;
+    margin-right: 16px;
+}
+.back-link:hover
+{
+    text-decoration: underline;
+}
+.container
+{
+    max-width: 1400px;
+    margin: 0 auto;
+    padding: 20px 24px;
+}
+.card
+{
+    background: #1a1d27;
+    border: 1px solid #2a2d3a;
+    border-radius: 8px;
+    padding: 16px;
+    margin-bottom: 20px;
+}
+.card h2
+{
+    font-size: 14px;
+    color: #888;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+    margin-bottom: 12px;
+}
+table
+{
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 13px;
+}
+th
+{
+    text-align: left;
+    padding: 8px 10px;
+    color: #888;
+    font-weight: 500;
+    border-bottom: 1px solid #2a2d3a;
+}
+td
+{
+    padding: 8px 10px;
+    border-bottom: 1px solid #1f2230;
+}
+.badge
+{
+    display: inline-block;
+    padding: 2px 8px;
+    border-radius: 4px;
+    font-size: 11px;
+    font-weight: 600;
+    text-transform: uppercase;
+}
+.badge-loaded
+{
+    background: #1b3a2a;
+    color: #4caf50;
+}
+.badge-ready
+{
+    background: #2a3040;
+    color: #7c8aff;
+}
+.badge-unloaded
+{
+    background: #2a2020;
+    color: #888;
+}
+.badge-downloading
+{
+    background: #2a2a10;
+    color: #f0c040;
+}
+.btn
+{
+    padding: 4px 12px;
+    border: 1px solid #2a2d3a;
+    background: #1a1d27;
+    color: #ccc;
+    border-radius: 4px;
+    cursor: pointer;
+    font-size: 12px;
+    margin-right: 4px;
+}
+.btn:hover
+{
+    background: #252838;
+    color: #fff;
+}
+.btn-danger
+{
+    border-color: #5a2020;
+}
+.btn-danger:hover
+{
+    background: #3a1515;
+    color: #ff6060;
+}
+.btn-disabled
+{
+    opacity: 0.4;
+    cursor: not-allowed;
+}
+.btn-toggle
+{
+    padding: 2px 8px;
+    font-size: 11px;
+}
+.btn-toggle.active
+{
+    background: #1b3a2a;
+    border-color: #4caf50;
+    color: #4caf50;
+}
+.storage-bar-outer
+{
+    background: #1f2230;
+    border-radius: 4px;
+    height: 24px;
+    margin: 8px 0;
+    overflow: hidden;
+    position: relative;
+}
+.storage-bar-fill
+{
+    height: 100%;
+    border-radius: 4px;
+    background: linear-gradient(90deg, #4a6cf7, #7c8aff);
+    transition: width 0.5s ease;
+}
+.storage-bar-text
+{
+    position: absolute;
+    top: 0;
+    left: 0;
+    right: 0;
+    height: 100%;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: 12px;
+    color: #e0e0e0;
+    font-weight: 500;
+}
+.storage-info
+{
+    display: flex;
+    justify-content: space-between;
+    font-size: 12px;
+    color: #888;
+    margin-bottom: 4px;
+}
+.row-fresh
+{
+    border-left: 3px solid #4caf50;
+}
+.row-stale
+{
+    border-left: 3px solid #f0c040;
+}
+.row-old
+{
+    border-left: 3px solid #ff4444;
+}
+.progress-inline
+{
+    display: inline-block;
+    width: 120px;
+    height: 12px;
+    background: #1f2230;
+    border-radius: 3px;
+    overflow: hidden;
+    vertical-align: middle;
+    margin-right: 6px;
+}
+.progress-inline-fill
+{
+    height: 100%;
+    background: linear-gradient(90deg, #4a6cf7, #7c8aff);
+    transition: width 0.3s ease;
+}
+</style>
+</head>
+<body>
+<div class="header">
+    <div>
+        <a href="/dashboard" class="back-link">&larr; Dashboard</a>
+        <span style="font-size:20px;color:#7c8aff;font-weight:600;">Downloaded Models</span>
+        <span id="versionBadge" class="version-badge"></span>
+    </div>
+    <div class="status"><span class="dot" id="statusDot"></span><span id="statusText">Connected</span></div>
+</div>
+<div class="container">
+    <div class="card">
+        <h2>Storage</h2>
+        <div id="storageBarSection">
+            <div class="storage-info">
+                <span id="storageUsedLabel">Used: -</span>
+                <span id="storageLimitLabel">Limit: -</span>
+            </div>
+            <div class="storage-bar-outer">
+                <div class="storage-bar-fill" id="storageBarFill" style="width:0%"></div>
+                <div class="storage-bar-text" id="storageBarText">-</div>
+            </div>
+            <div class="storage-info">
+                <span id="storageCleanupLabel">Auto-cleanup: -</span>
+                <span id="storageCandidatesLabel"></span>
+            </div>
+        </div>
+    </div>
+    <div id="downloadProgressCard"></div>
+    <div class="card">
+        <h2>Model Files</h2>
+        <table>
+            <thead>
+                <tr>
+                    <th>Model</th>
+                    <th>Variant</th>
+                    <th>Size</th>
+                    <th>Downloaded</th>
+                    <th>Last Used</th>
+                    <th>Uses</th>
+                    <th>State</th>
+                    <th>Hot Ready</th>
+                    <th>Protected</th>
+                    <th>Actions</th>
+                </tr>
+            </thead>
+            <tbody id="downloadedModelTable">
+                <tr><td colspan="10" style="color:#666;text-align:center;">No downloaded models</td></tr>
+            </tbody>
+        </table>
+    </div>
+</div>
+<script>
+const POLL_INTERVAL=3000;
+
+function formatBytesJs(bytes)
+{
+    if(bytes>=1073741824) return (bytes/1073741824).toFixed(1)+" GB";
+    if(bytes>=1048576) return (bytes/1048576).toFixed(1)+" MB";
+    return bytes+" B";
+}
+
+function formatDate(isoStr)
+{
+    if(!isoStr) return "-";
+    const d=new Date(isoStr);
+    return d.toLocaleDateString();
+}
+
+function daysSince(isoStr)
+{
+    if(!isoStr) return 999;
+    const d=new Date(isoStr);
+    const now=new Date();
+    return Math.floor((now-d)/(1000*60*60*24));
+}
+
+function rowAgeClass(lastUsed)
+{
+    const days=daysSince(lastUsed);
+    if(days>30) return "row-old";
+    if(days>14) return "row-stale";
+    return "row-fresh";
+}
+
+function stateClass(state)
+{
+    const map={"Loaded":"loaded", "Ready":"ready", "Unloaded":"unloaded", "Downloading":"downloading", "Unloading":"unloaded"};
+    return "badge-"+(map[state]||"unloaded");
+}
+
+async function fetchJson(url)
+{
+    try
+    {
+        const resp=await fetch(url);
+        if(!resp.ok) return null;
+        return await resp.json();
+    }
+    catch(e)
+    {
+        return null;
+    }
+}
+
+async function toggleHotReady(name, variant, currentlyHotReady)
+{
+    const method=currentlyHotReady?"DELETE":"POST";
+    const url="/api/models/"+encodeURIComponent(name)+"/variants/"+encodeURIComponent(variant)+"/hot-ready";
+    await fetch(url, {method});
+    await refreshStorage();
+}
+
+async function toggleProtected(name, variant, currentlyProtected)
+{
+    const method=currentlyProtected?"DELETE":"POST";
+    const url="/api/models/"+encodeURIComponent(name)+"/variants/"+encodeURIComponent(variant)+"/protected";
+    await fetch(url, {method});
+    await refreshStorage();
+}
+
+async function deleteModelFile(name, variant)
+{
+    if(!confirm("Delete "+name+" "+variant+"? This cannot be undone.")) return;
+    const url="/api/models/"+encodeURIComponent(name)+"/files"+(variant?"?variant="+encodeURIComponent(variant):"");
+    const resp=await fetch(url, {method:"DELETE"});
+    if(resp.status===409)
+    {
+        const data=await resp.json();
+        alert(data.error?.message||"Cannot delete: variant is guarded");
+    }
+    await refreshStorage();
+}
+
+function renderStorageBar(storage)
+{
+    if(!storage) return;
+
+    const used=storage.used_by_models_bytes||0;
+    const limit=storage.storage_limit_bytes;
+    const free=storage.free_disk_bytes||0;
+    const total=limit>0?limit:(used+free);
+    const pct=total>0?(used/total*100):0;
+
+    document.getElementById("storageUsedLabel").textContent="Used: "+formatBytesJs(used);
+    document.getElementById("storageLimitLabel").textContent=limit>0?"Limit: "+formatBytesJs(limit):"Limit: All free space";
+    document.getElementById("storageBarFill").style.width=pct.toFixed(1)+"%";
+    document.getElementById("storageBarText").textContent=formatBytesJs(used)+" / "+formatBytesJs(total)+" ("+pct.toFixed(1)+"%)";
+    document.getElementById("storageCleanupLabel").textContent="Auto-cleanup: "+(storage.cleanup_enabled?"ON":"OFF");
+}
+
+function renderDownloadProgress(downloads) // Build the "Active Downloads" card, one entry per download.
+{
+    const el=document.getElementById("downloadProgressCard");
+    if(!downloads||downloads.length===0)
+    {
+        el.innerHTML=""; // nothing downloading: the card is removed entirely
+        return;
+    }
+
+    let html='<div class="card"><h2>Active Downloads</h2>';
+    for(const dl of downloads)
+    {
+        const pct=dl.percent_complete||0; // missing numeric fields default to 0
+        const downloaded=dl.bytes_downloaded||0;
+        const total=dl.total_bytes||0;
+        const speed=dl.speed_mbps||0;
+        const eta=dl.eta_seconds||0;
+
+        html+=`<div style="padding:6px 0;border-bottom:1px solid #1f2230;">
+            <span style="font-weight:500;">${dl.model}</span>
+            <span style="color:#888;margin-left:4px;">${dl.variant||""}</span>
+            <span class="badge badge-downloading" style="margin-left:8px;">Downloading</span>
+            <div style="margin-top:4px;">
+                <div class="progress-inline"><div class="progress-inline-fill" style="width:${pct.toFixed(1)}%"></div></div>
+                <span style="font-size:12px;color:#ccc;">${pct.toFixed(1)}%</span>
+                ${total>0?`<span style="font-size:12px;color:#888;margin-left:8px;">${formatBytesJs(downloaded)} / ${formatBytesJs(total)}</span>`:""}
+                ${speed>0?`<span style="font-size:12px;color:#888;margin-left:8px;">${speed.toFixed(1)} MB/s</span>`:""}
+                ${eta>0?`<span style="font-size:12px;color:#888;margin-left:8px;">~${eta}s left</span>`:""}
+            </div>
+        </div>`;
+    }
+    html+="</div>"; // closes the card opened before the loop
+    el.innerHTML=html;
+}
+
+function renderDownloadedModels(models) // Rebuild the Model Files table body, one row per downloaded variant.
+{
+    const el=document.getElementById("downloadedModelTable");
+
+    if(!models||models.length===0)
+    {
+        el.innerHTML='<tr><td colspan="10" style="color:#666;text-align:center;">No downloaded models</td></tr>';
+        return;
+    }
+
+    let html="";
+    for(const m of models)
+    {
+        const ageClass=rowAgeClass(m.last_used_at);
+        const guarded=m.hot_ready||m.protected; // either flag disables the Delete button below
+        const hrClass=m.hot_ready?"btn-toggle active":"btn-toggle";
+        const prClass=m.protected?"btn-toggle active":"btn-toggle";
+        const deleteDisabled=guarded?"btn-disabled":"";
+        const deleteTitle=guarded?"Clear hot_ready and protected first":"Delete model file";
+        // NOTE(review): model/variant strings are interpolated unescaped into HTML and inline onclick handlers; confirm names cannot contain quotes or markup.
+        html+=`<tr class="${ageClass}">
+            <td>${m.model}</td>
+            <td>${m.variant||"-"}</td>
+            <td>${m.file_size_display||formatBytesJs(m.file_size_bytes)}</td>
+            <td>${formatDate(m.downloaded_at)}</td>
+            <td>${formatDate(m.last_used_at)}</td>
+            <td>${m.usage_count||0}</td>
+            <td><span class="badge ${stateClass(m.runtime_state)}">${m.runtime_state||"Unloaded"}</span></td>
+            <td><button class="btn ${hrClass}" onclick="toggleHotReady('${m.model}','${m.variant}',${m.hot_ready})">${m.hot_ready?"ON":"OFF"}</button></td>
+            <td><button class="btn ${prClass}" onclick="toggleProtected('${m.model}','${m.variant}',${m.protected})">${m.protected?"ON":"OFF"}</button></td>
+            <td><button class="btn btn-danger ${deleteDisabled}" title="${deleteTitle}" onclick="${guarded?"":`deleteModelFile('${m.model}','${m.variant}')`}" ${guarded?"disabled":""}>Delete</button></td>
+        </tr>`;
+    }
+    el.innerHTML=html;
+}
+
+async function refreshStorage() // Reload everything shown on the storage page; also run on every poll tick.
+{
+    const [storage, storageModels, downloads, cleanupPreview]=await Promise.all([
+        fetchJson("/api/storage"),
+        fetchJson("/api/storage/models"),
+        fetchJson("/api/downloads"),
+        fetchJson("/api/storage/cleanup/preview")
+    ]);
+    // fetchJson yields null on failure, so the /api/storage result doubles as the connectivity indicator.
+    const dot=document.getElementById("statusDot");
+    const statusText=document.getElementById("statusText");
+
+    if(!storage)
+    {
+        dot.style.background="#ff4444";
+        statusText.textContent="Disconnected";
+    }
+    else
+    {
+        dot.style.background="#4caf50";
+        statusText.textContent="Connected";
+    }
+
+    renderStorageBar(storage); // returns early when storage is null, leaving the previous values on screen
+
+    if(downloads&&downloads.downloads) renderDownloadProgress(downloads.downloads);
+    else renderDownloadProgress([]);
+
+    if(storageModels&&storageModels.models) renderDownloadedModels(storageModels.models);
+    else renderDownloadedModels([]);
+
+    if(cleanupPreview)
+    {
+        const count=cleanupPreview.candidate_count||0;
+        document.getElementById("storageCandidatesLabel").textContent=count>0?count+" cleanup candidate"+(count>1?"s":""):""; // label is blank when there are no candidates
+    }
+}
+
+async function loadVersion()
+{
+    const data=await fetchJson("/api/version");
+    if(data)
+    {
+        document.getElementById("versionBadge").textContent="v"+data.version;
+    }
+}
+
+loadVersion();
+refreshStorage();
+setInterval(refreshStorage, POLL_INTERVAL);
+</script>
+</body>
+</html>)HTML";
+
 } // namespace server
 } // namespace arbiterAI
 
diff --git a/src/server/main.cpp b/src/server/main.cpp
index 5e156fc..7fb4f67 100644
--- a/src/server/main.cpp
+++ b/src/server/main.cpp
@@ -138,6 +138,7 @@ int main(int argc, char *argv[])
     std::string defaultModel=cfg.value("default_model", "");
     std::string defaultVariant=cfg.value("default_variant", "");
     std::string overridePath=cfg.value("override_path", "");
+    std::string injectedConfigDir=cfg.value("injected_config_dir", "");
     int ramBudget=cfg.value("ram_budget_mb", 0);
     int maxDownloads=cfg.value("max_concurrent_downloads", 2);
 
@@ -152,6 +153,15 @@ int main(int argc, char *argv[])
     nlohmann::json hwCfg=cfg.value("hardware", nlohmann::json::object());
     nlohmann::json vramOverrides=hwCfg.value("vram_overrides", nlohmann::json::object());
 
+    std::vector<std::string> defaultBackendPriority; // from hardware.default_backend_priority; empty = not configured
+    if(hwCfg.contains("default_backend_priority")&&hwCfg["default_backend_priority"].is_array())
+    {
+        for(const nlohmann::json &bp:hwCfg["default_backend_priority"])
+        {
+            if(bp.is_string()) defaultBackendPriority.push_back(bp.get<std::string>()); // get<std::string>() throws on non-string entries
+        }
+    }
+
     // Logging
     nlohmann::json logCfg=cfg.value("logging", nlohmann::json::object());
     std::string logLevel=logCfg.value("level", "info");
@@ -220,6 +230,17 @@ int main(int argc, char *argv[])
 
     spdlog::info("ArbiterAI initialized successfully");
 
+    // ── Restore injected model configs ───────────────────────────
+    if(!injectedConfigDir.empty())
+    {
+        arbiterAI::ModelManager::instance().setInjectedConfigDir(injectedConfigDir);
+        int restored=arbiterAI::ModelManager::instance().loadInjectedConfigs();
+        if(restored>0)
+        {
+            spdlog::info("Restored {} injected model config(s)", restored);
+        }
+    }
+
     // ── Apply VRAM overrides ─────────────────────────────────────
     for(auto it=vramOverrides.begin(); it!=vramOverrides.end(); ++it)
     {
@@ -260,6 +281,12 @@ int main(int argc, char *argv[])
         spdlog::info("Ready model RAM budget set to {} MB", ramBudget);
     }
 
+    // ── Default backend priority ─────────────────────────────────
+    if(!defaultBackendPriority.empty())
+    {
+        arbiterAI::ModelRuntime::instance().setDefaultBackendPriority(defaultBackendPriority);
+    }
+
     // ── Concurrent download limit ────────────────────────────────
     if(maxDownloads>0)
     {
diff --git a/src/server/routes.cpp b/src/server/routes.cpp
index ef178e6..843b88b 100644
--- a/src/server/routes.cpp
+++ b/src/server/routes.cpp
@@ -111,7 +111,23 @@ nlohmann::json systemInfoToJson(const SystemInfo &hw)
     nlohmann::json gpus=nlohmann::json::array();
     for(const GpuInfo &gpu:hw.gpus)
     {
-        gpus.push_back(gpuInfoToJson(gpu));
+        nlohmann::json gpuJson=gpuInfoToJson(gpu);
+
+        // Attach matched architecture rule (if any)
+        std::optional<GpuBackendRule> rule=ModelManager::instance().findGpuBackendRule(gpu.name);
+        if(rule)
+        {
+            nlohmann::json ruleJson={
+                {"name", rule->name}
+            };
+            if(!rule->disabledBackends.empty())
+                ruleJson["disabled_backends"]=rule->disabledBackends;
+            if(!rule->backendPriority.empty())
+                ruleJson["backend_priority"]=rule->backendPriority;
+            gpuJson["architecture_rule"]=ruleJson;
+        }
+
+        gpus.push_back(gpuJson);
     }
 
     return {
@@ -136,6 +152,54 @@ std::string modelStateToString(ModelState state)
     }
 }
 
+nlohmann::json runtimeOptionsToJson(const RuntimeOptions &opts)
+{
+    nlohmann::json j=nlohmann::json::object();
+    // Only options that hold a value are emitted, so a struct with nothing set serializes to {}.
+    if(opts.flashAttn.has_value())
+        j["flash_attn"]=opts.flashAttn.value();
+    if(opts.kvCacheTypeK.has_value())
+        j["kv_cache_type_k"]=opts.kvCacheTypeK.value();
+    if(opts.kvCacheTypeV.has_value())
+        j["kv_cache_type_v"]=opts.kvCacheTypeV.value();
+    if(opts.noMmap.has_value())
+        j["no_mmap"]=opts.noMmap.value();
+    if(opts.reasoningBudget.has_value())
+        j["reasoning_budget"]=opts.reasoningBudget.value();
+    if(opts.swaFull.has_value())
+        j["swa_full"]=opts.swaFull.value();
+    if(opts.nGpuLayers.has_value())
+        j["n_gpu_layers"]=opts.nGpuLayers.value();
+    if(opts.overrideTensor.has_value())
+        j["override_tensor"]=opts.overrideTensor.value();
+
+    return j;
+}
+
+RuntimeOptions parseRuntimeOptions(const nlohmann::json &j)
+{
+    RuntimeOptions parsed;
+    // Each recognised key is copied only when present with the expected JSON type; anything else is ignored.
+    if(auto it=j.find("flash_attn"); it!=j.end()&&it->is_boolean())
+        parsed.flashAttn=it->get<bool>();
+    if(auto it=j.find("kv_cache_type_k"); it!=j.end()&&it->is_string())
+        parsed.kvCacheTypeK=it->get<std::string>();
+    if(auto it=j.find("kv_cache_type_v"); it!=j.end()&&it->is_string())
+        parsed.kvCacheTypeV=it->get<std::string>();
+    if(auto it=j.find("no_mmap"); it!=j.end()&&it->is_boolean())
+        parsed.noMmap=it->get<bool>();
+    if(auto it=j.find("reasoning_budget"); it!=j.end()&&it->is_number_integer())
+        parsed.reasoningBudget=it->get<int>();
+    if(auto it=j.find("swa_full"); it!=j.end()&&it->is_boolean())
+        parsed.swaFull=it->get<bool>();
+    if(auto it=j.find("n_gpu_layers"); it!=j.end()&&it->is_number_integer())
+        parsed.nGpuLayers=it->get<int>();
+    if(auto it=j.find("override_tensor"); it!=j.end()&&it->is_string())
+        parsed.overrideTensor=it->get<std::string>();
+
+    return parsed;
+}
+
 nlohmann::json loadedModelToJson(const LoadedModel &m)
 {
     nlohmann::json gpuIndices=nlohmann::json::array();
@@ -144,7 +208,7 @@ nlohmann::json loadedModelToJson(const LoadedModel &m)
         gpuIndices.push_back(idx);
     }
 
-    return {
+    nlohmann::json j={
         {"model", m.modelName},
         {"variant", m.variant},
         {"state", modelStateToString(m.state)},
@@ -156,6 +220,14 @@ nlohmann::json loadedModelToJson(const LoadedModel &m)
         {"gpu_indices", gpuIndices},
         {"pinned", m.pinned}
     };
+
+    nlohmann::json activeOpts=runtimeOptionsToJson(m.activeOptions);
+    if(!activeOpts.empty())
+    {
+        j["runtime_options"]=activeOpts;
+    }
+
+    return j;
 }
 
 nlohmann::json inferenceStatsToJson(const InferenceStats &s)
@@ -226,6 +298,33 @@ std::string errorCodeToString(ErrorCode code)
     }
 }
 
+/// Parse a model identifier that may contain a ":variant" suffix.
+/// Examples:
+///   "Qwen3.5-27B:Q4_K_M" → ("Qwen3.5-27B", "Q4_K_M")
+///   "gpt-4"               → ("gpt-4", "")
+///   ":Q8_0"               → (":Q8_0", "")   (never yields an empty model name)
+/// Splits at the last ':' so model names that themselves contain colons survive.
+std::pair<std::string, std::string> parseModelVariant(const std::string &modelId)
+{
+    // Only split on ':' if the suffix looks like a quantization variant
+    // (starts with Q, F, IQ, or BF — e.g. Q4_K_M, F16, IQ4_XS, BF16).
+    // This avoids breaking model names that contain colons for other reasons.
+    const size_t pos=modelId.rfind(':');
+    // pos>0: a leading ':' would otherwise produce an empty base model name.
+    if(pos!=std::string::npos&&pos>0&&pos+1<modelId.size())
+    {
+        const std::string suffix=modelId.substr(pos+1);
+        const char first=suffix[0];
+        const std::string lead=suffix.substr(0, 2); // shorter than 2 chars never equals a 2-char prefix
+        if(first=='Q'||first=='q'||first=='F'||first=='f'
+            ||lead=="IQ"||lead=="iq"||lead=="BF"||lead=="bf")
+        {
+            return {modelId.substr(0, pos), suffix};
+        }
+    }
+    return {modelId, ""};
+}
+
 } // anonymous namespace
 
 // ========== Override Path ==========
@@ -312,6 +411,9 @@ void registerRoutes(httplib::Server &server)
     // Logs
     server.Get("/api/logs", handleGetLogs);
 
+    // Runtime options
+    server.Get("/api/runtime-options", handleGetRuntimeOptions);
+
     // Storage management
     server.Get("/api/storage", handleGetStorage);
     server.Get("/api/storage/models", handleGetStorageModels);
@@ -330,6 +432,7 @@ void registerRoutes(httplib::Server &server)
     server.Get("/api/downloads", handleGetActiveDownloads);
 
     // Dashboard
+    server.Get("/dashboard/storage", handleDashboardStorage);
     server.Get("/dashboard", handleDashboard);
 
     spdlog::info("Registered all HTTP routes");
@@ -358,6 +461,31 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
     {
         arbiterRequest.model=requestJson.at("model");
 
+        // Parse "model:variant" syntax (e.g. "Qwen3.5-27B:Q4_K_M")
+        // Strip the variant from the model name so the core API gets the bare
+        // model name.  If a variant was specified, pre-load it so the llama
+        // provider uses the right quantization.
+        auto [baseName, requestedVariant]=parseModelVariant(arbiterRequest.model);
+        arbiterRequest.model=baseName;
+
+        if(!requestedVariant.empty())
+        {
+            ErrorCode loadErr=ArbiterAI::instance().loadModel(baseName, requestedVariant);
+            if(loadErr==ErrorCode::ModelDownloading)
+            {
+                res.status=503;
+                res.set_content(errorJson("Model '"+baseName+"' variant '"+requestedVariant
+                    +"' is still downloading", "server_error", "model", "model_downloading").dump(),
+                    "application/json");
+                return;
+            }
+            if(loadErr!=ErrorCode::Success)
+            {
+                spdlog::warn("Failed to pre-load model '{}' variant '{}' (error={})",
+                    baseName, requestedVariant, errorCodeToString(loadErr));
+            }
+        }
+
         // Parse messages with full OpenAI message format support
         for(const nlohmann::json &msg:requestJson.at("messages"))
         {
@@ -365,8 +493,9 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
             m.role=msg.at("role").get<std::string>();
 
             // content can be null for assistant messages with tool_calls
+            // content can be a string or an array of content parts (OpenAI spec)
             if(msg.contains("content") && !msg.at("content").is_null())
-                m.content=msg.at("content").get<std::string>();
+                m.content=contentToString(msg.at("content"));
 
             // tool_call_id for role="tool" messages
             if(msg.contains("tool_call_id"))
@@ -466,6 +595,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
     bool stream=requestJson.value("stream", false);
     std::string requestId=generateId("chatcmpl-");
     auto created=std::time(nullptr);
+    std::string responseModelId=requestJson.at("model").get<std::string>();
 
     // Check for stream_options.include_usage
     bool includeUsage=false;
@@ -478,14 +608,14 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
     {
         res.set_chunked_content_provider(
             "text/event-stream",
-            [arbiterRequest, requestId, created, includeUsage](size_t, httplib::DataSink &sink)
+            [arbiterRequest, requestId, created, includeUsage, responseModelId](size_t, httplib::DataSink &sink)
             {
                 // Send initial chunk with role
                 nlohmann::json roleChunk={
                     {"id", requestId},
                     {"object", "chat.completion.chunk"},
                     {"created", created},
-                    {"model", arbiterRequest.model},
+                    {"model", responseModelId},
                     {"system_fingerprint", nullptr},
                     {"choices", {{
                         {"index", 0},
@@ -503,7 +633,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
                         {"id", requestId},
                         {"object", "chat.completion.chunk"},
                         {"created", created},
-                        {"model", arbiterRequest.model},
+                        {"model", responseModelId},
                         {"system_fingerprint", nullptr},
                         {"choices", {{
                             {"index", 0},
@@ -529,7 +659,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
                     {"id", requestId},
                     {"object", "chat.completion.chunk"},
                     {"created", created},
-                    {"model", arbiterRequest.model},
+                    {"model", responseModelId},
                     {"system_fingerprint", nullptr},
                     {"choices", {{
                         {"index", 0},
@@ -547,7 +677,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
                         {"id", requestId},
                         {"object", "chat.completion.chunk"},
                         {"created", created},
-                        {"model", arbiterRequest.model},
+                        {"model", responseModelId},
                         {"system_fingerprint", nullptr},
                         {"choices", nlohmann::json::array()},
                         {"usage", {
@@ -636,7 +766,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res)
             {"id", requestId},
             {"object", "chat.completion"},
             {"created", created},
-            {"model", arbiterResponse.model},
+            {"model", responseModelId},
             {"system_fingerprint", nullptr},
             {"choices", {{
                 {"index", 0},
@@ -664,6 +794,7 @@ void handleListModelsV1(const httplib::Request &, httplib::Response &res)
     nlohmann::json data=nlohmann::json::array();
     for(const std::string &name:modelNames)
     {
+        // Always emit the bare model name
         data.push_back({
             {"id", name},
             {"object", "model"},
@@ -671,6 +802,23 @@ void handleListModelsV1(const httplib::Request &, httplib::Response &res)
             {"owned_by", "arbiterai"},
             {"permission", nlohmann::json::array()}
         });
+
+        // For models with variants, also emit "model:variant" entries
+        ModelInfo info;
+        if(ArbiterAI::instance().getModelInfo(name, info)==ErrorCode::Success
+            &&!info.variants.empty())
+        {
+            for(const ModelVariant &v:info.variants)
+            {
+                data.push_back({
+                    {"id", name+":"+v.quantization},
+                    {"object", "model"},
+                    {"created", created},
+                    {"owned_by", "arbiterai"},
+                    {"permission", nlohmann::json::array()}
+                });
+            }
+        }
     }
 
     nlohmann::json response={
@@ -684,18 +832,29 @@ void handleListModelsV1(const httplib::Request &, httplib::Response &res)
 void handleGetModelV1(const httplib::Request &req, httplib::Response &res)
 {
     std::string modelId=req.matches[1];
+    auto [baseName, variantName]=parseModelVariant(modelId);
 
-    std::vector<std::string> modelNames;
-    ArbiterAI::instance().getAvailableModels(modelNames);
+    ModelInfo info;
+    bool found=(ArbiterAI::instance().getModelInfo(baseName, info)==ErrorCode::Success);
 
-    bool found=false;
-    for(const std::string &name:modelNames)
+    // If a variant was specified, verify it exists on this model
+    if(found&&!variantName.empty()&&!info.variants.empty())
     {
-        if(name==modelId)
+        bool variantFound=false;
+        for(const ModelVariant &v:info.variants)
         {
-            found=true;
-            break;
+            if(v.quantization==variantName)
+            {
+                variantFound=true;
+                break;
+            }
         }
+        if(!variantFound) found=false;
+    }
+    else if(!variantName.empty()&&info.variants.empty())
+    {
+        // Variant requested but model has no variants
+        found=false;
     }
 
     if(!found)
@@ -830,7 +989,24 @@ void handleGetModels(const httplib::Request &, httplib::Response &res)
     // Add models with hardware fit info
     for(const ModelFit &f:fits)
     {
-        models.push_back(modelFitToJson(f));
+        nlohmann::json modelJson=modelFitToJson(f);
+
+        // Include runtime_options and backend_priority from model config
+        ModelInfo info;
+        if(ArbiterAI::instance().getModelInfo(f.model, info)==ErrorCode::Success)
+        {
+            nlohmann::json opts=runtimeOptionsToJson(info.runtimeOptions);
+            if(!opts.empty())
+            {
+                modelJson["runtime_options"]=opts;
+            }
+            if(!info.backendPriority.empty())
+            {
+                modelJson["backend_priority"]=info.backendPriority;
+            }
+        }
+
+        models.push_back(modelJson);
     }
 
     // Add cloud models (no fit data)
@@ -844,7 +1020,7 @@ void handleGetModels(const httplib::Request &, httplib::Response &res)
     {
         if(fitModels.find(name)==fitModels.end())
         {
-            models.push_back({
+            nlohmann::json modelJson={
                 {"model", name},
                 {"variant", ""},
                 {"can_run", true},
@@ -852,7 +1028,23 @@ void handleGetModels(const httplib::Request &, httplib::Response &res)
                 {"limiting_factor", ""},
                 {"estimated_vram_mb", 0},
                 {"gpu_indices", nlohmann::json::array()}
-            });
+            };
+
+            ModelInfo info;
+            if(ArbiterAI::instance().getModelInfo(name, info)==ErrorCode::Success)
+            {
+                nlohmann::json opts=runtimeOptionsToJson(info.runtimeOptions);
+                if(!opts.empty())
+                {
+                    modelJson["runtime_options"]=opts;
+                }
+                if(!info.backendPriority.empty())
+                {
+                    modelJson["backend_priority"]=info.backendPriority;
+                }
+            }
+
+            models.push_back(modelJson);
         }
     }
 
@@ -879,6 +1071,7 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
         std::string modelName=req.matches[1];
         std::string variant;
         int contextSize=0;
+        RuntimeOptions optionsOverride;
 
         // Accept parameters from query string
         if(req.has_param("variant"))
@@ -898,6 +1091,8 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
                     contextSize=body["context"].get<int>();
                 if(body.contains("context_size")&&body["context_size"].is_number_integer())
                     contextSize=body["context_size"].get<int>();
+                if(body.contains("runtime_options")&&body["runtime_options"].is_object())
+                    optionsOverride=parseRuntimeOptions(body["runtime_options"]);
             }
             catch(const nlohmann::json::parse_error &)
             {
@@ -907,7 +1102,7 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
 
         spdlog::info("Load request: model='{}' variant='{}' context={}", modelName, variant, contextSize);
 
-        ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize);
+        ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize, &optionsOverride);
 
         if(err==ErrorCode::Success)
         {
@@ -919,6 +1114,12 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
             {
                 response["context_size"]=state->contextSize;
                 response["max_context_size"]=state->maxContextSize;
+
+                nlohmann::json activeOpts=runtimeOptionsToJson(state->activeOptions);
+                if(!activeOpts.empty())
+                {
+                    response["runtime_options"]=activeOpts;
+                }
             }
 
             res.set_content(response.dump(), "application/json");
@@ -1247,7 +1448,7 @@ void handleAddModelConfig(const httplib::Request &req, httplib::Response &res)
         added.push_back(modelJson["model"].get<std::string>());
     }
 
-    // Persist if override path is set
+    // Legacy: persist to single override file if configured
     if(!g_overridePath.empty())
     {
         mm.saveOverrides(g_overridePath);
@@ -1315,7 +1516,7 @@ void handleUpdateModelConfig(const httplib::Request &req, httplib::Response &res
             created.push_back(modelName);
     }
 
-    // Persist if override path is set
+    // Legacy: persist to single override file if configured
     if(!g_overridePath.empty())
     {
         mm.saveOverrides(g_overridePath);
@@ -1358,7 +1559,7 @@ void handleDeleteModelConfig(const httplib::Request &req, httplib::Response &res
         return;
     }
 
-    // Persist if override path is set
+    // Legacy: persist to single override file if configured
     if(!g_overridePath.empty())
     {
         mm.saveOverrides(g_overridePath);
@@ -1430,7 +1631,15 @@ void handleGetHardware(const httplib::Request &, httplib::Response &res)
     HardwareDetector::instance().refresh();
     SystemInfo hw=HardwareDetector::instance().getSystemInfo();
 
-    res.set_content(systemInfoToJson(hw).dump(), "application/json");
+    nlohmann::json j=systemInfoToJson(hw);
+
+    std::vector<std::string> defaultBP=ModelRuntime::instance().getDefaultBackendPriority();
+    if(!defaultBP.empty())
+    {
+        j["default_backend_priority"]=defaultBP;
+    }
+
+    res.set_content(j.dump(), "application/json");
 }
 
 void handleSetVramOverride(const httplib::Request &req, httplib::Response &res)
@@ -1989,6 +2198,78 @@ void handleGetActiveDownloads(const httplib::Request &, httplib::Response &res)
     res.set_content(nlohmann::json{{"downloads", downloads}}.dump(), "application/json");
 }
 
+// ========== Runtime Options ==========
+
+void handleGetRuntimeOptions(const httplib::Request &, httplib::Response &res)
+{
+    // Self-describing schema for clients: each entry carries an option's name,
+    // JSON type, description and default, plus valid values where constrained.
+    const nlohmann::json kvCacheTypes=nlohmann::json::array({"f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"});
+
+    const nlohmann::json optionSchemas=nlohmann::json::array({
+        {
+            {"name", "flash_attn"},
+            {"type", "boolean"},
+            {"description", "Enable or disable flash attention (-fa). Some models crash with it enabled."},
+            {"default", nullptr}
+        },
+        {
+            {"name", "kv_cache_type_k"},
+            {"type", "string"},
+            {"description", "KV cache data type for keys (-ctk). Lower precision uses less VRAM."},
+            {"valid_values", kvCacheTypes},
+            {"default", "f16"}
+        },
+        {
+            {"name", "kv_cache_type_v"},
+            {"type", "string"},
+            {"description", "KV cache data type for values (-ctv). Lower precision uses less VRAM."},
+            {"valid_values", kvCacheTypes},
+            {"default", "f16"}
+        },
+        {
+            {"name", "no_mmap"},
+            {"type", "boolean"},
+            {"description", "Disable memory-mapped file I/O (--no-mmap). Required for some models/systems."},
+            {"default", false}
+        },
+        {
+            {"name", "reasoning_budget"},
+            {"type", "integer"},
+            {"description", "Reasoning token budget (--reasoning-budget). 0 disables reasoning/thinking tokens."},
+            {"default", nullptr}
+        },
+        {
+            {"name", "swa_full"},
+            {"type", "boolean"},
+            {"description", "Use full-size sliding window attention cache (--swa-full)."},
+            {"default", nullptr}
+        },
+        {
+            {"name", "n_gpu_layers"},
+            {"type", "integer"},
+            {"description", "Number of layers to offload to GPU (-ngl). 99 offloads all layers."},
+            {"default", 99}
+        },
+        {
+            {"name", "override_tensor"},
+            {"type", "string"},
+            {"description", "Tensor override pattern (-ot). Advanced: route specific tensors to CPU/GPU."},
+            {"default", nullptr}
+        }
+    });
+
+    const nlohmann::json backendPrioritySchema={
+        {"name", "backend_priority"},
+        {"type", "array of strings"},
+        {"description", "Ordered preference for GPU backends. First available backend is used."},
+        {"valid_values", {"vulkan", "rocm", "cuda"}}
+    };
+
+    const nlohmann::json payload={{"runtime_options", optionSchemas}, {"backend_priority", backendPrioritySchema}};
+    res.set_content(payload.dump(), "application/json");
+}
+
 // ========== Logs ==========
 
 void handleGetLogs(const httplib::Request &req, httplib::Response &res)
@@ -2054,5 +2335,10 @@ void handleDashboard(const httplib::Request &, httplib::Response &res)
     res.set_content(DASHBOARD_HTML, "text/html");
 }
 
+void handleDashboardStorage(const httplib::Request &, httplib::Response &res)
+{
+    res.set_content(DASHBOARD_STORAGE_HTML, "text/html"); // body for GET /dashboard/storage
+}
+
 } // namespace server
 } // namespace arbiterAI
diff --git a/src/server/routes.h b/src/server/routes.h
index ed79d13..418daf4 100644
--- a/src/server/routes.h
+++ b/src/server/routes.h
@@ -59,6 +59,10 @@ void handleGetHardware(const httplib::Request &req, httplib::Response &res);
 void handleSetVramOverride(const httplib::Request &req, httplib::Response &res);
 void handleClearVramOverride(const httplib::Request &req, httplib::Response &res);
 
+// ========== Runtime Options ==========
+
+void handleGetRuntimeOptions(const httplib::Request &req, httplib::Response &res);
+
 // ========== Logs ==========
 
 void handleGetLogs(const httplib::Request &req, httplib::Response &res);
@@ -84,6 +88,7 @@ void handleGetActiveDownloads(const httplib::Request &req, httplib::Response &re
 // ========== Dashboard ==========
 
 void handleDashboard(const httplib::Request &req, httplib::Response &res);
+void handleDashboardStorage(const httplib::Request &req, httplib::Response &res);
 
 } // namespace server
 } // namespace arbiterAI
diff --git a/vcpkg/custom_ports/llama-cpp/portfile.cmake b/vcpkg/custom_ports/llama-cpp/portfile.cmake
index 7007666..d734586 100644
--- a/vcpkg/custom_ports/llama-cpp/portfile.cmake
+++ b/vcpkg/custom_ports/llama-cpp/portfile.cmake
@@ -1,16 +1,39 @@
-vcpkg_from_github(
-    OUT_SOURCE_PATH SOURCE_PATH
-    REPO ggml-org/llama.cpp
-    REF b${VERSION}
-    SHA512 b05f130a2052d3c2cec483c3b098f71585fe7d00fa1971786c0a646717f82320211801780625b9aabc9fc1e1797f8995381e40661f3e8a115c72710f147083cd
-    HEAD_REF master
-)
+if("head" IN_LIST FEATURES)
+    # "head" feature: build the tip of master from a shallow clone. vcpkg_from_github's
+    # HEAD_REF is honoured only with `--head`, so manifest mode needs a manual clone.
+    vcpkg_find_acquire_program(GIT)
+    set(SOURCE_PATH "${CURRENT_BUILDTREES_DIR}/src/llama-cpp-master")
+    if(NOT EXISTS "${SOURCE_PATH}/.git")
+        file(REMOVE_RECURSE "${SOURCE_PATH}")
+        vcpkg_execute_required_process(
+            COMMAND "${GIT}" clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${SOURCE_PATH}"
+            WORKING_DIRECTORY "${CURRENT_BUILDTREES_DIR}" # src/ may not exist yet on a fresh tree
+            LOGNAME clone-llama-cpp
+        )
+    else()
+        vcpkg_execute_required_process(
+            COMMAND "${GIT}" pull --ff-only
+            WORKING_DIRECTORY "${SOURCE_PATH}"
+            LOGNAME pull-llama-cpp
+        )
+    endif()
+else()
+    # Build from a pinned release tag
+    vcpkg_from_github(
+        OUT_SOURCE_PATH SOURCE_PATH
+        REPO ggml-org/llama.cpp
+        REF b${VERSION}
+        SHA512 6be3482ef58872ee4a386ba831175e53ce0d93c6992e4389ffd97f9af3cc7becdd1356fda575702681f55261e7fe81bc1baa12edd0d5f809aa80684f5c890bac
+        HEAD_REF master
+    )
+endif()
 
 vcpkg_cmake_configure(
     SOURCE_PATH "${SOURCE_PATH}"
     OPTIONS
       -DGGML_CCACHE=OFF
       -DGGML_VULKAN=ON
+      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
       -DLLAMA_BUILD_TESTS=OFF
       -DLLAMA_BUILD_EXAMPLES=OFF
       -DLLAMA_BUILD_TOOLS=OFF
diff --git a/vcpkg/custom_ports/llama-cpp/vcpkg.json b/vcpkg/custom_ports/llama-cpp/vcpkg.json
index f6565d7..cc672ae 100644
--- a/vcpkg/custom_ports/llama-cpp/vcpkg.json
+++ b/vcpkg/custom_ports/llama-cpp/vcpkg.json
@@ -1,7 +1,7 @@
 {
     "name": "llama-cpp",
-    "version": "8573",
-    "port-version": 1,
+    "version": "8748",
+    "port-version": 0,
     "description": "LLM inference in C/C++",
     "homepage": "https://github.com/ggml-org/llama.cpp",
     "license": "MIT",
@@ -14,5 +14,10 @@
         "name": "vcpkg-cmake-config",
         "host": true
       }
-    ]
+    ],
+    "features": {
+      "head": {
+        "description": "Build from latest master HEAD instead of a pinned release tag"
+      }
+    }
   }
\ No newline at end of file