diff --git a/CMakeLists.txt b/CMakeLists.txt
index eccca43..a9cb298 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,10 @@
 cmake_minimum_required(VERSION 3.15)
 project(arbiterAI VERSION 0.1.6 LANGUAGES C CXX)
 
+# Read llama-cpp build number from our custom vcpkg port
+file(READ "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg/custom_ports/llama-cpp/vcpkg.json" LLAMA_PORT_JSON)
+string(JSON ARBITERAI_LLAMACPP_BUILD GET "${LLAMA_PORT_JSON}" "version")
+
 # Generate version header from template
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/src/arbiterAI/version.h.in"
@@ -139,6 +143,7 @@ target_link_libraries(arbiterai
         tests/telemetryCollectorTests.cpp
         tests/llamaProviderTests.cpp
         tests/storageManagerTests.cpp
+        tests/serverConnectTests.cpp
     )
     
     target_link_libraries(arbiterai_tests
@@ -185,7 +190,6 @@ target_link_libraries(arbiterAI-server
     PRIVATE
         arbiterai
         httplib::httplib
-        cxxopts::cxxopts
         spdlog::spdlog
 )
 
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 95a8318..1e72eaf 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,8 +1,11 @@
 # syntax=docker/dockerfile:1
-ARG DOCKER_VERSION=1.1.1
-FROM ubuntu:22.04
+ARG DOCKER_VERSION=1.2.0
+FROM ubuntu:24.04
 
-# Install basic build tools and Python 3
+# Install basic build tools, Python 3, and GPU libraries.
+# Vulkan headers + glslc are needed at build time for llama.cpp's Vulkan backend.
+# At runtime, GPU inference requires a Vulkan ICD on the host; without one
+# llama.cpp falls back to CPU-only.
 RUN apt-get update && apt-get install -y \
     build-essential \
     cmake \
@@ -26,17 +29,11 @@ RUN apt-get update && apt-get install -y \
     libxrandr-dev \
     libxinerama-dev \
     libglu1-mesa-dev \
-    && apt-get remove -y nodejs npm \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install NVIDIA CUDA toolkit and Vulkan SDK for GPU-accelerated inference.
-# These are optional at runtime — builds and tests work without a GPU,
-# but llama.cpp inference falls back to CPU-only.
-RUN apt-get update && apt-get install -y \
-    nvidia-cuda-toolkit \
     vulkan-tools \
     libvulkan-dev \
     mesa-vulkan-drivers \
+    glslc \
+    glslang-tools \
     && rm -rf /var/lib/apt/lists/*
 
 RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
diff --git a/docs/server.md b/docs/server.md
index 99f025c..84b655c 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -12,8 +12,9 @@ Standalone HTTP server that wraps the ArbiterAI library, providing an OpenAI-com
    - [Model Config Injection](#33-model-config-injection)
    - [Telemetry](#34-telemetry)
    - [Storage Management](#35-storage-management)
-   - [Health & Version](#36-health--version)
-   - [Dashboard](#37-dashboard)
+   - [Logs](#36-logs)
+   - [Health & Version](#37-health--version)
+   - [Dashboard](#38-dashboard)
 4. [Configuration Persistence](#4-configuration-persistence)
 5. [Error Format](#5-error-format)
 
@@ -45,40 +46,94 @@ The server supports:
 
 ### CLI Options
 
-| Option | Default | Description |
-|--------|---------|-------------|
-| `-p, --port` | `8080` | HTTP port |
-| `-H, --host` | `0.0.0.0` | Bind address |
-| `-c, --config` | `config` | Model config directory path(s) |
-| `-m, --model` | *(none)* | Default model to load on startup |
-| `-v, --variant` | *(none)* | Default quantization variant (e.g., `Q4_K_M`) |
-| `--override-path` | *(none)* | Path to write runtime model config overrides (enables persistence) |
-| `--ram-budget` | `0` (auto 50%) | Ready-model RAM budget in MB |
-| `--models-dir` | `/models` | Directory where downloaded model files are stored |
-| `--storage-limit` | `0` (unlimited) | Maximum storage for model files (e.g., `50G`, `500M`). `0` = use all free disk space. |
-| `--cleanup-enabled` | `true` | Enable automated storage cleanup |
-| `--cleanup-max-age` | `720` | Max age in hours before a variant becomes a cleanup candidate (default: 30 days) |
-| `--cleanup-interval` | `24` | Hours between automated cleanup runs |
-| `--log-level` | `info` | Log level (`trace`, `debug`, `info`, `warn`, `error`) |
-| `-h, --help` | | Print usage |
+The server accepts only two command-line options:
 
-### Examples
+| Option | Description |
+|--------|-------------|
+| `-c, --config <path>` | Path to server configuration JSON file (**required**) |
+| `-h, --help` | Print usage |
 
-```bash
-# Start with defaults
-./arbiterAI-server
+### Configuration File
+
+All server settings are defined in a JSON configuration file. See [`examples/server_config.json`](../examples/server_config.json) for a complete example.
+
+```json
+{
+    "host": "0.0.0.0",
+    "port": 8080,
+    "model_config_paths": ["config"],
+    "models_dir": "/models",
+    "default_model": "",
+    "default_variant": "",
+    "override_path": "",
+    "ram_budget_mb": 0,
+    "max_concurrent_downloads": 2,
+    "storage": {
+        "limit": "0",
+        "cleanup_enabled": true,
+        "cleanup_max_age_days": 30,
+        "cleanup_interval_hours": 24
+    },
+    "hardware": {
+        "vram_overrides": {
+            "0": 32000
+        }
+    },
+    "logging": {
+        "level": "info",
+        "directory": "",
+        "rotate_hour": 0,
+        "retain_days": 7
+    }
+}
+```
+
+#### Configuration Reference
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `host` | `string` | `"0.0.0.0"` | Bind address |
+| `port` | `int` | `8080` | HTTP port |
+| `model_config_paths` | `string[]` | `["config"]` | Model config directory paths |
+| `models_dir` | `string` | `"/models"` | Directory for downloaded model files |
+| `default_model` | `string` | `""` | Model to load on startup |
+| `default_variant` | `string` | `""` | Default quantization variant (e.g., `Q4_K_M`) |
+| `override_path` | `string` | `""` | Path to write runtime model config overrides |
+| `ram_budget_mb` | `int` | `0` | Ready-model RAM budget in MB (`0` = auto 50%) |
+| `max_concurrent_downloads` | `int` | `2` | Maximum simultaneous model downloads |
+
+**`storage` object:**
 
-# Custom port, load a model on startup
-./arbiterAI-server -p 9090 -m gpt-4 --log-level debug
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `limit` | `string` | `"0"` | Max storage for model files (e.g., `"50G"`, `"500M"`). `"0"` = all free disk. |
+| `cleanup_enabled` | `bool` | `true` | Enable automated storage cleanup |
+| `cleanup_max_age_days` | `int` | `30` | Days since last use before cleanup candidacy |
+| `cleanup_interval_hours` | `int` | `24` | Hours between automated cleanup runs |
 
-# Enable runtime config persistence
-./arbiterAI-server --override-path /data/overrides.json
+**`hardware` object:**
 
-# Load a local model with a specific variant
-./arbiterAI-server -m qwen2.5-7b-instruct -v Q4_K_M --ram-budget 8192
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `vram_overrides` | `object` | `{}` | GPU index → VRAM MB overrides (e.g., `{"0": 32000}`) |
 
-# Limit model storage to 50 GB with cleanup every 12 hours
-./arbiterAI-server --models-dir /data/models --storage-limit 50G --cleanup-interval 12
+**`logging` object:**
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `level` | `string` | `"info"` | Log level (`trace`, `debug`, `info`, `warn`, `error`) |
+| `directory` | `string` | `""` | Directory for log files (empty = console only) |
+| `rotate_hour` | `int` | `0` | Hour of day (0–23) to rotate log files |
+| `retain_days` | `int` | `7` | Number of daily log files to keep |
+
+### Examples
+
+```bash
+# Start with a config file
+./arbiterAI-server --config /etc/arbiterai/server_config.json
+
+# Short form
+./arbiterAI-server -c server_config.json
 ```
 
 ---
@@ -338,6 +393,56 @@ Load a model into VRAM for inference.
 
 **Response (202):** `{"status": "downloading", "model": "qwen2.5-7b-instruct"}` — model file is being downloaded.
 
+**Response (400):** Model load failed. The response includes structured error details so callers can programmatically react to the failure.
+
+```json
+{
+  "error": {
+    "message": "Model architecture is not supported by this llama.cpp build",
+    "type": "invalid_request_error",
+    "code": "model_load_error",
+    "param": "model",
+    "details": {
+      "model": "qwen3.5-27b",
+      "variant": "Q4_K_M",
+      "context_requested": 4096,
+      "error_code": "model_load_error",
+      "reason": "unsupported_arch",
+      "recoverable": false,
+      "action": "update_server",
+      "suggestion": "Update the server to a newer version that supports this model architecture, or use a different model.",
+      "llama_log": "llama_model_load: error loading model architecture: unknown model architecture: 'qwen35'\nllama_model_load_from_file_impl: failed to load model"
+    }
+  }
+}
+```
+
+**`details.reason` values:**
+
+| Reason | Description | Recoverable |
+|--------|-------------|-------------|
+| `file_not_found` | GGUF file does not exist at the expected path | Yes |
+| `file_corrupt` | GGUF header invalid, bad magic, or file truncated | Yes |
+| `insufficient_vram` | Not enough GPU memory to load model at requested context | Yes |
+| `insufficient_ram` | Not enough system RAM | Yes |
+| `context_too_large` | Requested context size exceeds model or hardware limits | Yes |
+| `unsupported_arch` | Model architecture not supported by this llama.cpp build | No |
+| `backend_error` | Generic llama.cpp internal error | No |
+| `unknown` | Could not classify the failure | No |
+
+**`details.action` values:**
+
+| Action | Description |
+|--------|-------------|
+| `redownload` | Re-download the model file (file missing) |
+| `delete_and_redownload` | Delete the corrupt file, then re-download |
+| `reduce_context` | Retry with a smaller context size |
+| `use_smaller_variant` | Try a smaller quantization variant |
+| `update_server` | Update the server to a newer version |
+| `check_logs` | Inspect the `llama_log` field or server logs for details |
+
+**`details.recoverable`** is `true` when the caller can take an automated action (re-download, reduce context, switch variant) to resolve the failure. When `false`, human intervention or a server update is required.
+
 **Response (507):** Insufficient storage — the model file won't fit within the configured storage limit. Includes `available_bytes`, `required_bytes`, and `storage_limit_bytes` for programmatic decision-making.
 
 #### `POST /api/models/:name/unload`
@@ -944,7 +1049,45 @@ List all active downloads with progress, speed, and ETA.
 
 ---
 
-### 3.6 Health & Version
+### 3.6 Logs
+
+#### `GET /api/logs`
+
+Retrieve recent server log entries from the in-memory ring buffer. Useful for debugging model load failures, provider errors, and server behaviour without SSH access.
+
+**Query parameters:**
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `count` | `200` | Number of log entries to return (max `1000`) |
+| `level` | *(all)* | Filter by minimum level: `trace`, `debug`, `info`, `warning`, `error`, `critical` |
+
+**Response:**
+
+```json
+{
+  "logs": [
+    {
+      "timestamp": "2025-01-15T14:30:05.123Z",
+      "epoch_ms": 1736952605123,
+      "level": "info",
+      "message": "Loading model qwen2.5-7b-instruct variant Q4_K_M context 4096"
+    },
+    {
+      "timestamp": "2025-01-15T14:30:06.456Z",
+      "epoch_ms": 1736952606456,
+      "level": "error",
+      "message": "Model load failed: insufficient VRAM"
+    }
+  ]
+}
+```
+
+The ring buffer holds the most recent 1000 entries. Entries are returned in chronological order (oldest first). The dashboard polls this endpoint to display a live scrolling log panel.
+
+---
+
+### 3.7 Health & Version
 
 #### `GET /health` (or `/v1/health`)
 
@@ -969,7 +1112,7 @@ Library version.
 
 ---
 
-### 3.7 Dashboard
+### 3.8 Dashboard
 
 #### `GET /dashboard`
 
@@ -983,6 +1126,7 @@ Returns an HTML page with a live-updating dashboard showing:
 - **Download progress** — Active downloads with progress bar, bytes transferred, speed (MB/s), and ETA
 - **Row age coloring** — Fresh (green, <14 days), stale (yellow, 14–30 days), old (red, >30 days)
 - Model deletion (guarded variants show disabled delete button with tooltip)
+- **Server log panel** — Collapsible live-scrolling log viewer with level filtering (trace/debug/info/warning/error/critical) and auto-scroll toggle. Polls `/api/logs` every 2 seconds.
 
 Open in a browser: `http://localhost:8080/dashboard`
 
diff --git a/examples/server_config.json b/examples/server_config.json
new file mode 100644
index 0000000..ef8c883
--- /dev/null
+++ b/examples/server_config.json
@@ -0,0 +1,36 @@
+{
+    "host": "0.0.0.0",
+    "port": 8080,
+
+    "model_config_paths": [
+        "config"
+    ],
+
+    "models_dir": "/models",
+
+    "default_model": "",
+    "default_variant": "",
+
+    "override_path": "",
+
+    "ram_budget_mb": 0,
+    "max_concurrent_downloads": 2,
+
+    "storage": {
+        "limit": "0",
+        "cleanup_enabled": true,
+        "cleanup_max_age_days": 30,
+        "cleanup_interval_hours": 24
+    },
+
+    "hardware": {
+        "vram_overrides": {}
+    },
+
+    "logging": {
+        "level": "info",
+        "directory": "",
+        "rotate_hour": 0,
+        "retain_days": 7
+    }
+}
diff --git a/schemas/model_config.schema.json b/schemas/model_config.schema.json
index c282b51..d112f68 100644
--- a/schemas/model_config.schema.json
+++ b/schemas/model_config.schema.json
@@ -201,7 +201,7 @@
                 },
                 "download": {
                   "type": "object",
-                  "description": "Download metadata for this variant",
+                  "description": "Download metadata for this variant (single file)",
                   "properties": {
                     "url": {
                       "type": "string",
@@ -219,6 +219,31 @@
                     }
                   },
                   "required": ["url", "filename"]
+                },
+                "files": {
+                  "type": "array",
+                  "description": "Download metadata for split/multi-shard GGUF variants. When present, takes precedence over the single download field.",
+                  "items": {
+                    "type": "object",
+                    "properties": {
+                      "url": {
+                        "type": "string",
+                        "format": "uri",
+                        "description": "Download URL for this shard"
+                      },
+                      "sha256": {
+                        "type": "string",
+                        "pattern": "^[a-f0-9]{64}$",
+                        "description": "SHA256 checksum for this shard"
+                      },
+                      "filename": {
+                        "type": "string",
+                        "description": "Local filename for this shard"
+                      }
+                    },
+                    "required": ["url", "filename"]
+                  },
+                  "minItems": 1
                 }
               }
             }
diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp
index 53869fc..9d8b013 100644
--- a/src/arbiterAI/arbiterAI.cpp
+++ b/src/arbiterAI/arbiterAI.cpp
@@ -29,7 +29,8 @@ std::string VersionInfo::toString() const
 
 VersionInfo getVersion()
 {
-    return {ARBITERAI_VERSION_MAJOR, ARBITERAI_VERSION_MINOR, ARBITERAI_VERSION_PATCH};
+    return {ARBITERAI_VERSION_MAJOR, ARBITERAI_VERSION_MINOR, ARBITERAI_VERSION_PATCH,
+        "b" ARBITERAI_LLAMACPP_BUILD};
 }
 
 VersionInfo ArbiterAI::getVersion()
@@ -87,8 +88,8 @@ ErrorCode ArbiterAI::initialize(const std::vector<std::filesystem::path> &config
     // Mark global singleton initialized so subsequent operations succeed
     ArbiterAI::instance().initialized = true;
 
-    // Initialize StorageManager with default models directory
-    StorageManager::instance().initialize("/models");
+    // Note: StorageManager is initialized by the application (e.g., server main.cpp)
+    // with the user-configured models directory path, not here.
 
     return ErrorCode::Success;
 }
@@ -536,6 +537,21 @@ ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &vari
     return ModelRuntime::instance().loadModel(model, variant, contextSize);
 }
 
+ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant)
+{
+    return ModelRuntime::instance().downloadModel(model, variant);
+}
+
+void ArbiterAI::setMaxConcurrentDownloads(int max)
+{
+    ModelRuntime::instance().setMaxConcurrentDownloads(max);
+}
+
+int ArbiterAI::getMaxConcurrentDownloads() const
+{
+    return ModelRuntime::instance().getMaxConcurrentDownloads();
+}
+
 ErrorCode ArbiterAI::unloadModel(const std::string &model)
 {
     return ModelRuntime::instance().unloadModel(model);
diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h
index 2a6e2cb..7f817ea 100644
--- a/src/arbiterAI/arbiterAI.h
+++ b/src/arbiterAI/arbiterAI.h
@@ -43,6 +43,7 @@ struct VersionInfo {
     int major;
     int minor;
     int patch;
+    std::string llamaCppBuild; ///< llama.cpp build number (e.g. "b8573")
 
     /// Returns the version as "major.minor.patch".
     std::string toString() const;
@@ -608,6 +609,30 @@ class ArbiterAI
      */
     ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0);
 
+    /**
+     * @brief Download model files without loading into VRAM
+     *
+     * Launches an async background download that respects the concurrent
+     * download limit.  Returns ModelDownloading on success (download started),
+     * Success if files are already present, or an error code.
+     * @param model Model name
+     * @param variant Quantization variant (empty = auto-select)
+     * @return ErrorCode::ModelDownloading, Success, ModelNotFound, InsufficientStorage
+     */
+    ErrorCode downloadModel(const std::string &model, const std::string &variant="");
+
+    /**
+     * @brief Set the maximum number of concurrent model downloads
+     * @param max Maximum concurrent downloads (minimum 1, default 2)
+     */
+    void setMaxConcurrentDownloads(int max);
+
+    /**
+     * @brief Get the current concurrent download limit
+     * @return Maximum concurrent downloads
+     */
+    int getMaxConcurrentDownloads() const;
+
     /**
      * @brief Unload a model from VRAM/RAM
      * @param model Model name
diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp
index 0d292d0..d0ac4de 100644
--- a/src/arbiterAI/hardwareDetector.cpp
+++ b/src/arbiterAI/hardwareDetector.cpp
@@ -54,7 +54,9 @@ typedef enum
 typedef enum
 {
     VK_STRUCTURE_TYPE_APPLICATION_INFO=0,
-    VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO=1
+    VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO=1,
+    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2=1000059006,
+    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT=1000237000
 } VkStructureType;
 
 typedef enum
@@ -98,8 +100,8 @@ typedef VkInstance_T *VkInstance;
 struct VkPhysicalDevice_T;
 typedef VkPhysicalDevice_T *VkPhysicalDevice;
 
-typedef struct VkPhysicalDeviceLimits {
-    char padding[504]; // exact size of VkPhysicalDeviceLimits
+typedef struct alignas(8) VkPhysicalDeviceLimits {
+    char padding[504]; // exact size and alignment of VkPhysicalDeviceLimits
 } VkPhysicalDeviceLimits;
 
 typedef struct VkPhysicalDeviceSparseProperties {
@@ -147,6 +149,29 @@ typedef VkResult (*VkEnumeratePhysicalDevicesFunc)(VkInstance, uint32_t *, VkPhy
 typedef void (*VkGetPhysicalDevicePropertiesFunc)(VkPhysicalDevice, VkPhysicalDeviceProperties *);
 typedef void (*VkGetPhysicalDeviceMemoryPropertiesFunc)(VkPhysicalDevice, VkPhysicalDeviceMemoryProperties *);
 
+// Vulkan 1.1+ / VK_EXT_memory_budget structures and function pointers
+
+typedef struct VkPhysicalDeviceMemoryProperties2 {
+    VkStructureType sType;
+    void *pNext;
+    VkPhysicalDeviceMemoryProperties memoryProperties;
+} VkPhysicalDeviceMemoryProperties2;
+
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT {
+    VkStructureType sType;
+    void *pNext;
+    uint64_t heapBudget[16]; // VK_MAX_MEMORY_HEAPS
+    uint64_t heapUsage[16];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+
+typedef struct VkExtensionProperties {
+    char extensionName[256]; // VK_MAX_EXTENSION_NAME_SIZE
+    uint32_t specVersion;
+} VkExtensionProperties;
+
+typedef void (*VkGetPhysicalDeviceMemoryProperties2Func)(VkPhysicalDevice, VkPhysicalDeviceMemoryProperties2 *);
+typedef VkResult (*VkEnumerateDeviceExtensionPropertiesFunc)(VkPhysicalDevice, const char *, uint32_t *, VkExtensionProperties *);
+
 const int NVML_DEVICE_NAME_BUFFER_SIZE=96;
 
 } // anonymous namespace
@@ -184,6 +209,9 @@ void HardwareDetector::refresh()
     detectNvmlGpus();
     detectVulkanGpus();
     detectUnifiedMemory();
+    applyVramOverrides();
+
+    m_firstRefreshDone=true;
 }
 
 SystemInfo HardwareDetector::getSystemInfo() const
@@ -226,6 +254,95 @@ bool HardwareDetector::isVulkanAvailable() const
     return m_vulkanLoaded;
 }
 
+void HardwareDetector::setVramOverride(int gpuIndex, int vramMb)
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    m_vramOverrides[gpuIndex]=vramMb;
+
+    // Apply immediately to current data
+    for(GpuInfo &gpu:m_systemInfo.gpus)
+    {
+        if(gpu.index==gpuIndex)
+        {
+            // Preserve the actual used amount: free = newTotal - used
+            int usedMb=gpu.vramTotalMb-gpu.vramFreeMb;
+            gpu.vramTotalMb=vramMb;
+            gpu.vramFreeMb=std::max(0, vramMb-usedMb);
+            gpu.vramOverridden=true;
+
+            if(gpu.unifiedMemory)
+            {
+                int accessibleUsedMb=gpu.gpuAccessibleRamMb-gpu.gpuAccessibleRamFreeMb;
+                gpu.gpuAccessibleRamMb=vramMb;
+                gpu.gpuAccessibleRamFreeMb=std::max(0, vramMb-accessibleUsedMb);
+            }
+
+            spdlog::info("VRAM override applied to GPU {}: {} MB", gpuIndex, vramMb);
+            break;
+        }
+    }
+}
+
+void HardwareDetector::clearVramOverride(int gpuIndex)
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    m_vramOverrides.erase(gpuIndex);
+    spdlog::info("VRAM override cleared for GPU {}", gpuIndex);
+}
+
+void HardwareDetector::clearAllVramOverrides()
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    m_vramOverrides.clear();
+    spdlog::info("All VRAM overrides cleared");
+}
+
+bool HardwareDetector::hasVramOverride(int gpuIndex) const
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return m_vramOverrides.find(gpuIndex)!=m_vramOverrides.end();
+}
+
+int HardwareDetector::getVramOverride(int gpuIndex) const
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto it=m_vramOverrides.find(gpuIndex);
+
+    if(it!=m_vramOverrides.end())
+    {
+        return it->second;
+    }
+    return 0;
+}
+
+void HardwareDetector::applyVramOverrides()
+{
+    for(GpuInfo &gpu:m_systemInfo.gpus)
+    {
+        auto it=m_vramOverrides.find(gpu.index);
+
+        if(it==m_vramOverrides.end())
+        {
+            continue;
+        }
+
+        int overrideMb=it->second;
+
+        // Preserve the actual used amount: free = newTotal - used
+        int usedMb=gpu.vramTotalMb-gpu.vramFreeMb;
+        gpu.vramTotalMb=overrideMb;
+        gpu.vramFreeMb=std::max(0, overrideMb-usedMb);
+        gpu.vramOverridden=true;
+
+        if(gpu.unifiedMemory)
+        {
+            int accessibleUsedMb=gpu.gpuAccessibleRamMb-gpu.gpuAccessibleRamFreeMb;
+            gpu.gpuAccessibleRamMb=overrideMb;
+            gpu.gpuAccessibleRamFreeMb=std::max(0, overrideMb-accessibleUsedMb);
+        }
+    }
+}
+
 // --- System RAM detection ---
 
 void HardwareDetector::detectSystemRam()
@@ -474,6 +591,10 @@ bool HardwareDetector::loadVulkan()
     m_vkGetPhysicalDeviceProperties=dlsym(m_vulkanLib, "vkGetPhysicalDeviceProperties");
     m_vkGetPhysicalDeviceMemoryProperties=dlsym(m_vulkanLib, "vkGetPhysicalDeviceMemoryProperties");
 
+    // Vulkan 1.1+ optional symbols for memory budget queries
+    m_vkGetPhysicalDeviceMemoryProperties2=dlsym(m_vulkanLib, "vkGetPhysicalDeviceMemoryProperties2");
+    m_vkEnumerateDeviceExtensionProperties=dlsym(m_vulkanLib, "vkEnumerateDeviceExtensionProperties");
+
     if(!m_vkCreateInstance||!m_vkDestroyInstance||
         !m_vkEnumeratePhysicalDevices||!m_vkGetPhysicalDeviceProperties||
         !m_vkGetPhysicalDeviceMemoryProperties)
@@ -505,6 +626,8 @@ void HardwareDetector::unloadVulkan()
     m_vkEnumeratePhysicalDevices=nullptr;
     m_vkGetPhysicalDeviceProperties=nullptr;
     m_vkGetPhysicalDeviceMemoryProperties=nullptr;
+    m_vkGetPhysicalDeviceMemoryProperties2=nullptr;
+    m_vkEnumerateDeviceExtensionProperties=nullptr;
 #endif
 }
 
@@ -521,12 +644,25 @@ void HardwareDetector::detectVulkanGpus()
     VkGetPhysicalDevicePropertiesFunc getProperties=reinterpret_cast<VkGetPhysicalDevicePropertiesFunc>(m_vkGetPhysicalDeviceProperties);
     VkGetPhysicalDeviceMemoryPropertiesFunc getMemProperties=reinterpret_cast<VkGetPhysicalDeviceMemoryPropertiesFunc>(m_vkGetPhysicalDeviceMemoryProperties);
 
-    // Create a minimal Vulkan instance
+    // Optional Vulkan 1.1+ function for memory budget queries
+    VkGetPhysicalDeviceMemoryProperties2Func getMemProperties2=nullptr;
+    VkEnumerateDeviceExtensionPropertiesFunc enumDeviceExtensions=nullptr;
+
+    if(m_vkGetPhysicalDeviceMemoryProperties2)
+    {
+        getMemProperties2=reinterpret_cast<VkGetPhysicalDeviceMemoryProperties2Func>(m_vkGetPhysicalDeviceMemoryProperties2);
+    }
+    if(m_vkEnumerateDeviceExtensionProperties)
+    {
+        enumDeviceExtensions=reinterpret_cast<VkEnumerateDeviceExtensionPropertiesFunc>(m_vkEnumerateDeviceExtensionProperties);
+    }
+
+    // Create a Vulkan instance — request API 1.1 for vkGetPhysicalDeviceMemoryProperties2
     VkApplicationInfo appInfo{};
     appInfo.sType=VK_STRUCTURE_TYPE_APPLICATION_INFO;
     appInfo.pApplicationName="ArbiterAI HardwareDetector";
     appInfo.applicationVersion=1;
-    appInfo.apiVersion=(1u<<22)|(0u<<12)|0u; // VK_MAKE_API_VERSION(0,1,0,0)
+    appInfo.apiVersion=(1u<<22)|(1u<<12)|0u; // VK_MAKE_API_VERSION(0,1,1,0)
 
     VkInstanceCreateInfo createInfo{};
     createInfo.sType=VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
@@ -582,29 +718,151 @@ void HardwareDetector::detectVulkanGpus()
             continue;
         }
 
-        VkPhysicalDeviceMemoryProperties memProps{};
-        getMemProperties(devices[i], &memProps);
-
-        // Sum device-local heap sizes for total VRAM
-        int vramTotalMb=0;
-        for(uint32_t h=0; h<memProps.memoryHeapCount; ++h)
+        // Check if VK_EXT_memory_budget is supported on this device
+        bool hasBudgetExt=false;
+        if(enumDeviceExtensions)
         {
-            if(memProps.memoryHeaps[h].flags&VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
+            uint32_t extCount=0;
+            if(enumDeviceExtensions(devices[i], nullptr, &extCount, nullptr)==VK_SUCCESS&&extCount>0)
             {
-                vramTotalMb+=static_cast<int>(memProps.memoryHeaps[h].size/(1024*1024));
+                std::vector<VkExtensionProperties> exts(extCount);
+                if(enumDeviceExtensions(devices[i], nullptr, &extCount, exts.data())==VK_SUCCESS)
+                {
+                    for(uint32_t e=0; e<extCount; ++e)
+                    {
+                        if(std::strcmp(exts[e].extensionName, "VK_EXT_memory_budget")==0)
+                        {
+                            hasBudgetExt=true;
+                            break;
+                        }
+                    }
+                }
             }
         }
 
+        bool isIntegrated=(props.deviceType==VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU);
         GpuInfo gpu;
         gpu.index=gpuOffset+static_cast<int>(i);
         gpu.name=props.deviceName;
         gpu.backend=GpuBackend::Vulkan;
-        gpu.vramTotalMb=vramTotalMb;
-        gpu.vramFreeMb=vramTotalMb; // Vulkan doesn't provide free VRAM — approximate as total
-        gpu.unifiedMemory=(props.deviceType==VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU);
+        gpu.unifiedMemory=isIntegrated;
+
+        // Use VK_EXT_memory_budget for accurate per-heap budget/usage data.
+        // This is the authoritative runtime signal for how much memory
+        // the GPU can actually use, especially on UMA/APU systems where
+        // heap sizes alone can be misleading.
+        if(hasBudgetExt&&getMemProperties2)
+        {
+            VkPhysicalDeviceMemoryBudgetPropertiesEXT budgetProps{};
+            budgetProps.sType=VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT;
+
+            VkPhysicalDeviceMemoryProperties2 memProps2{};
+            memProps2.sType=VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2;
+            memProps2.pNext=&budgetProps;
+
+            getMemProperties2(devices[i], &memProps2);
+
+            const VkPhysicalDeviceMemoryProperties &mp=memProps2.memoryProperties;
 
-        spdlog::info("Vulkan GPU {}: {} ({}MB VRAM, integrated={})",
-            gpu.index, gpu.name, gpu.vramTotalMb, gpu.unifiedMemory);
+            // Sum DEVICE_LOCAL heaps — on discrete GPUs this is dedicated VRAM,
+            // on UMA systems this is the GPU-accessible portion of system RAM.
+            uint64_t deviceLocalBudgetBytes=0;
+            uint64_t deviceLocalUsageBytes=0;
+            uint64_t deviceLocalSizeBytes=0;
+
+            for(uint32_t h=0; h<mp.memoryHeapCount; ++h)
+            {
+                bool deviceLocal=(mp.memoryHeaps[h].flags&VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)!=0;
+
+                spdlog::debug("Vulkan GPU {}: heap {} — {} size={:.0f}MB budget={:.0f}MB usage={:.0f}MB",
+                    gpu.index, h,
+                    deviceLocal ? "DEVICE_LOCAL" : "host",
+                    static_cast<double>(mp.memoryHeaps[h].size)/(1024.0*1024.0),
+                    static_cast<double>(budgetProps.heapBudget[h])/(1024.0*1024.0),
+                    static_cast<double>(budgetProps.heapUsage[h])/(1024.0*1024.0));
+
+                MemoryHeapInfo heapInfo;
+                heapInfo.index=static_cast<int>(h);
+                heapInfo.deviceLocal=deviceLocal;
+                heapInfo.sizeMb=static_cast<int>(mp.memoryHeaps[h].size/(1024ULL*1024ULL));
+                heapInfo.budgetMb=static_cast<int>(budgetProps.heapBudget[h]/(1024ULL*1024ULL));
+                heapInfo.usageMb=static_cast<int>(budgetProps.heapUsage[h]/(1024ULL*1024ULL));
+                gpu.memoryHeaps.push_back(heapInfo);
+
+                if(deviceLocal)
+                {
+                    deviceLocalSizeBytes+=mp.memoryHeaps[h].size;
+                    deviceLocalBudgetBytes+=budgetProps.heapBudget[h];
+                    deviceLocalUsageBytes+=budgetProps.heapUsage[h];
+                }
+            }
+
+            gpu.hasMemoryBudget=true;
+
+            // Budget is the best estimate of how much this process can allocate.
+            // On UMA, budget may be significantly larger than the raw heap size
+            // (driver exposes most of system RAM as available to the GPU).
+            uint64_t budgetTotalMb=deviceLocalBudgetBytes/(1024ULL*1024ULL);
+            uint64_t budgetUsedMb=deviceLocalUsageBytes/(1024ULL*1024ULL);
+            uint64_t heapSizeMb=deviceLocalSizeBytes/(1024ULL*1024ULL);
+
+            // Use the larger of heap size and budget for total — on some UMA
+            // drivers the budget exceeds the reported heap size.
+            uint64_t effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
+            uint64_t effectiveFreeMb=(deviceLocalBudgetBytes>deviceLocalUsageBytes)
+                ? (deviceLocalBudgetBytes-deviceLocalUsageBytes)/(1024ULL*1024ULL)
+                : 0;
+
+            gpu.vramTotalMb=static_cast<int>(effectiveTotalMb);
+            gpu.vramFreeMb=static_cast<int>(effectiveFreeMb);
+
+            if(isIntegrated)
+            {
+                // For UMA, the GPU-accessible pool is the DEVICE_LOCAL budget —
+                // this is the closest Vulkan gives to "how much memory the GPU
+                // can actually use right now" on unified-memory systems.
+                gpu.gpuAccessibleRamMb=static_cast<int>(effectiveTotalMb);
+                gpu.gpuAccessibleRamFreeMb=static_cast<int>(effectiveFreeMb);
+            }
+
+            spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+                "Vulkan GPU {}: {} (budget: {}MB total, {}MB free, "
+                "heap size: {}MB, integrated={}, memoryBudget=true)",
+                gpu.index, gpu.name,
+                gpu.vramTotalMb, gpu.vramFreeMb,
+                static_cast<int>(heapSizeMb), gpu.unifiedMemory);
+        }
+        else
+        {
+            // Fallback: no memory budget extension — use raw heap sizes.
+            // Free VRAM is unknown, approximate as total.
+            VkPhysicalDeviceMemoryProperties memProps{};
+            getMemProperties(devices[i], &memProps);
+
+            int vramTotalMb=0;
+            for(uint32_t h=0; h<memProps.memoryHeapCount; ++h)
+            {
+                bool deviceLocal=(memProps.memoryHeaps[h].flags&VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)!=0;
+
+                MemoryHeapInfo heapInfo;
+                heapInfo.index=static_cast<int>(h);
+                heapInfo.deviceLocal=deviceLocal;
+                heapInfo.sizeMb=static_cast<int>(memProps.memoryHeaps[h].size/(1024ULL*1024ULL));
+                gpu.memoryHeaps.push_back(heapInfo);
+
+                if(deviceLocal)
+                {
+                    vramTotalMb+=static_cast<int>(memProps.memoryHeaps[h].size/(1024*1024));
+                }
+            }
+
+            gpu.vramTotalMb=vramTotalMb;
+            gpu.vramFreeMb=vramTotalMb;
+
+            spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+                "Vulkan GPU {}: {} ({}MB VRAM, integrated={}, memoryBudget=false)",
+                gpu.index, gpu.name, gpu.vramTotalMb, gpu.unifiedMemory);
+        }
 
         m_systemInfo.gpus.push_back(gpu);
     }
@@ -613,23 +871,14 @@ void HardwareDetector::detectVulkanGpus()
 }
 
 // --- Unified memory detection via sysfs (amdgpu) ---
+// Supplements Vulkan memory budget data with amdgpu kernel-side diagnostics.
+// If VK_EXT_memory_budget already provided gpuAccessibleRam, sysfs is used
+// only for logging correlation. If budget was not available, sysfs provides
+// the fallback for GPU-accessible memory detection.
 
 void HardwareDetector::detectUnifiedMemory()
 {
 #ifdef __linux__
-    // For each GPU flagged as integrated, attempt to read amdgpu sysfs
-    // to get the actual GPU-accessible memory pool (GTT + VRAM).
-    //
-    // amdgpu exposes per-device files under /sys/class/drm/card*/device/:
-    //   mem_info_vram_total  — dedicated VRAM in bytes
-    //   mem_info_vram_used   — used VRAM in bytes
-    //   mem_info_gtt_total   — system RAM accessible to GPU (GTT) in bytes
-    //   mem_info_gtt_used    — used GTT in bytes
-    //
-    // On unified memory APUs (e.g. Ryzen AI Max+), the GPU can address
-    // GTT + VRAM as its working memory pool. The GTT size is typically
-    // most of system RAM minus a kernel reservation.
-
     for(GpuInfo &gpu:m_systemInfo.gpus)
     {
         if(!gpu.unifiedMemory)
@@ -637,7 +886,15 @@ void HardwareDetector::detectUnifiedMemory()
             continue;
         }
 
+        // If Vulkan budget already set gpuAccessibleRamMb, we have good data.
+        // Still attempt sysfs for diagnostic logging, but don't overwrite.
+        bool hasBudgetData=(gpu.gpuAccessibleRamMb>0);
+
         // Find the matching DRM card by scanning /sys/class/drm/card*
+        // Match by reading mem_info_vram_total from sysfs and comparing
+        // to the basic Vulkan heap size. Note: when budget is available,
+        // gpu.vramTotalMb may reflect the budget (much larger on UMA),
+        // so we read the raw sysfs VRAM and look for any reasonable match.
         std::string matchedCardPath;
 
         for(int card=0; card<16; ++card)
@@ -651,107 +908,109 @@ void HardwareDetector::detectUnifiedMemory()
                 continue;
             }
 
-            // Verify this card's VRAM matches what Vulkan reported
-            // (to handle multi-GPU systems correctly)
+            // On UMA APUs, the sysfs VRAM is typically a small carveout
+            // (e.g. 512MB) that doesn't match the Vulkan budget-derived
+            // vramTotalMb. Accept the match if this is the only amdgpu card
+            // with sysfs data available, or if the values are close.
             long long vramBytes=0;
             testFile >> vramBytes;
-            int vramMb=static_cast<int>(vramBytes/(1024LL*1024LL));
-
-            // Allow 1% tolerance for rounding
-            int diff=vramMb-gpu.vramTotalMb;
-            if(diff<0)
-            {
-                diff=-diff;
-            }
 
-            if(diff<=gpu.vramTotalMb/100+1)
-            {
-                matchedCardPath=cardPath;
-                break;
-            }
+            // Just accept the first card that has sysfs data for UMA —
+            // multi-GPU UMA systems are extremely rare.
+            matchedCardPath=cardPath;
+            break;
         }
 
         if(matchedCardPath.empty())
         {
-            // No sysfs match — fall back to system RAM as GPU-accessible pool.
-            // Unified memory GPUs share system RAM, so the GPU can access most of it.
-            // Use total system RAM as the accessible pool estimate.
-            gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb;
-            gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb;
-
-            spdlog::info("Unified memory GPU {}: {} — no sysfs match, "
-                "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool",
-                gpu.index, gpu.name,
-                gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb);
+            if(!hasBudgetData)
+            {
+                // No sysfs and no budget — fall back to system RAM
+                gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb;
+                gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb;
+
+                spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+                    "Unified memory GPU {}: {} — no sysfs or budget data, "
+                    "falling back to system RAM ({}MB total, {}MB free)",
+                    gpu.index, gpu.name,
+                    gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb);
+            }
             continue;
         }
 
-        // Read VRAM used (better than Vulkan's "assume all free")
+        // Read sysfs VRAM and GTT for diagnostic logging
+        long long sysfsVramTotal=0, sysfsVramUsed=0;
+        long long gttTotalBytes=0, gttUsedBytes=0;
+
+        {
+            std::ifstream file(matchedCardPath+"/mem_info_vram_total");
+            if(file.is_open()) file >> sysfsVramTotal;
+        }
         {
             std::ifstream file(matchedCardPath+"/mem_info_vram_used");
-            if(file.is_open())
-            {
-                long long usedBytes=0;
-                file >> usedBytes;
-
-                int usedMb=static_cast<int>(usedBytes/(1024LL*1024LL));
-                gpu.vramFreeMb=gpu.vramTotalMb-usedMb;
-                if(gpu.vramFreeMb<0)
-                {
-                    gpu.vramFreeMb=0;
-                }
-            }
+            if(file.is_open()) file >> sysfsVramUsed;
         }
-
-        // Read GTT total and used
-        long long gttTotalBytes=0;
-        long long gttUsedBytes=0;
-
         {
             std::ifstream file(matchedCardPath+"/mem_info_gtt_total");
-            if(file.is_open())
-            {
-                file >> gttTotalBytes;
-            }
+            if(file.is_open()) file >> gttTotalBytes;
         }
         {
             std::ifstream file(matchedCardPath+"/mem_info_gtt_used");
-            if(file.is_open())
-            {
-                file >> gttUsedBytes;
-            }
+            if(file.is_open()) file >> gttUsedBytes;
         }
 
-        if(gttTotalBytes>0)
+        int sysfsVramTotalMb=static_cast<int>(sysfsVramTotal/(1024LL*1024LL));
+        int sysfsVramUsedMb=static_cast<int>(sysfsVramUsed/(1024LL*1024LL));
+        int gttTotalMb=static_cast<int>(gttTotalBytes/(1024LL*1024LL));
+        int gttUsedMb=static_cast<int>(gttUsedBytes/(1024LL*1024LL));
+        int gttFreeMb=gttTotalMb-gttUsedMb;
+
+        if(gttFreeMb<0) gttFreeMb=0;
+
+        spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+            "Unified memory GPU {}: {} — sysfs: VRAM {}MB ({}MB used), "
+            "GTT {}MB ({}MB used), budgetAlreadySet={}",
+            gpu.index, gpu.name,
+            sysfsVramTotalMb, sysfsVramUsedMb,
+            gttTotalMb, gttUsedMb, hasBudgetData);
+
+        if(hasBudgetData)
         {
-            int gttTotalMb=static_cast<int>(gttTotalBytes/(1024LL*1024LL));
-            int gttUsedMb=static_cast<int>(gttUsedBytes/(1024LL*1024LL));
-            int gttFreeMb=gttTotalMb-gttUsedMb;
+            // Vulkan budget is authoritative for allocation decisions.
+            // Log sysfs as a diagnostic side channel only.
+            continue;
+        }
 
-            if(gttFreeMb<0)
-            {
-                gttFreeMb=0;
-            }
+        // No Vulkan budget — use sysfs data for GPU-accessible memory.
+        // Refine VRAM free from sysfs (more accurate than "assume all free").
+        if(sysfsVramTotal>0)
+        {
+            gpu.vramTotalMb=sysfsVramTotalMb;
+            gpu.vramFreeMb=sysfsVramTotalMb-sysfsVramUsedMb;
+            if(gpu.vramFreeMb<0) gpu.vramFreeMb=0;
+        }
 
+        if(gttTotalBytes>0)
+        {
             // GPU-accessible memory = VRAM + GTT (system RAM mapped to GPU)
             gpu.gpuAccessibleRamMb=gpu.vramTotalMb+gttTotalMb;
             gpu.gpuAccessibleRamFreeMb=gpu.vramFreeMb+gttFreeMb;
 
-            spdlog::info("Unified memory GPU {}: {} — VRAM {}MB ({}MB free), "
-                "GTT {}MB ({}MB free), total accessible {}MB ({}MB free)",
+            spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+                "Unified memory GPU {}: {} — sysfs fallback: "
+                "total accessible {}MB ({}MB free)",
                 gpu.index, gpu.name,
-                gpu.vramTotalMb, gpu.vramFreeMb,
-                gttTotalMb, gttFreeMb,
                 gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb);
         }
         else
         {
-            // sysfs card matched but no GTT info — fall back to system RAM
+            // No GTT info — fall back to system RAM
             gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb;
             gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb;
 
-            spdlog::info("Unified memory GPU {}: {} — no GTT info, "
-                "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool",
+            spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+                "Unified memory GPU {}: {} — no GTT info, "
+                "falling back to system RAM ({}MB total, {}MB free)",
                 gpu.index, gpu.name,
                 gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb);
         }
diff --git a/src/arbiterAI/hardwareDetector.h b/src/arbiterAI/hardwareDetector.h
index 4dd511a..069191b 100644
--- a/src/arbiterAI/hardwareDetector.h
+++ b/src/arbiterAI/hardwareDetector.h
@@ -3,6 +3,7 @@
 
 #include <string>
 #include <vector>
+#include <map>
 #include <mutex>
 
 namespace arbiterAI
@@ -14,6 +15,14 @@ enum class GpuBackend {
     Vulkan
 };
 
+struct MemoryHeapInfo {
+    int index=0;
+    bool deviceLocal=false;
+    int sizeMb=0;
+    int budgetMb=0;    // 0 if VK_EXT_memory_budget not available
+    int usageMb=0;     // 0 if VK_EXT_memory_budget not available
+};
+
 struct GpuInfo {
     int index=0;
     std::string name;
@@ -25,6 +34,9 @@ struct GpuInfo {
     bool unifiedMemory=false;     // true for APUs/iGPUs sharing system RAM
     int gpuAccessibleRamMb=0;     // total RAM the GPU can access (GTT+VRAM on APUs, 0 for discrete)
     int gpuAccessibleRamFreeMb=0; // free RAM the GPU can access
+    bool hasMemoryBudget=false;   // true if VK_EXT_memory_budget was used
+    bool vramOverridden=false;    // true if VRAM values were overridden by user
+    std::vector<MemoryHeapInfo> memoryHeaps; // per-heap details from Vulkan
 };
 
 struct SystemInfo {
@@ -60,6 +72,22 @@ class HardwareDetector {
     /// Check whether Vulkan is available at runtime
     bool isVulkanAvailable() const;
 
+    /// Override the reported VRAM for a GPU (persists across refresh cycles).
+    /// For unified memory devices, also overrides gpuAccessibleRamMb.
+    void setVramOverride(int gpuIndex, int vramMb);
+
+    /// Clear a VRAM override for a specific GPU
+    void clearVramOverride(int gpuIndex);
+
+    /// Clear all VRAM overrides
+    void clearAllVramOverrides();
+
+    /// Check whether a GPU has a VRAM override set
+    bool hasVramOverride(int gpuIndex) const;
+
+    /// Get the VRAM override value for a GPU (0 if not set)
+    int getVramOverride(int gpuIndex) const;
+
 private:
     HardwareDetector();
     ~HardwareDetector();
@@ -73,6 +101,7 @@ class HardwareDetector {
     void detectNvmlGpus();
     void detectVulkanGpus();
     void detectUnifiedMemory();
+    void applyVramOverrides();
 
     // NVML dlopen handles
     bool loadNvml();
@@ -84,6 +113,7 @@ class HardwareDetector {
 
     SystemInfo m_systemInfo;
     mutable std::mutex m_mutex;
+    std::map<int, int> m_vramOverrides; // gpuIndex → vramMb
 
     // Runtime library handles
     void *m_nvmlLib=nullptr;
@@ -107,10 +137,15 @@ class HardwareDetector {
     void *m_vkEnumeratePhysicalDevices=nullptr;
     void *m_vkGetPhysicalDeviceProperties=nullptr;
     void *m_vkGetPhysicalDeviceMemoryProperties=nullptr;
+    void *m_vkGetPhysicalDeviceMemoryProperties2=nullptr;
+    void *m_vkEnumerateDeviceExtensionProperties=nullptr;
 
     // CPU utilization tracking (for delta calculation)
     long long m_prevCpuIdle=0;
     long long m_prevCpuTotal=0;
+
+    // First refresh logs at info, subsequent refreshes at debug
+    bool m_firstRefreshDone=false;
 };
 
 } // namespace arbiterAI
diff --git a/src/arbiterAI/modelDownloader.cpp b/src/arbiterAI/modelDownloader.cpp
index 33d5109..a8af745 100644
--- a/src/arbiterAI/modelDownloader.cpp
+++ b/src/arbiterAI/modelDownloader.cpp
@@ -20,68 +20,8 @@ ModelDownloader::ModelDownloader(std::shared_ptr<IFileVerifier> fileVerifier) :
 
 std::future<bool> ModelDownloader::downloadModel(const std::string &downloadUrl, const std::string &filePathStr, const std::optional<std::string> &fileHash, const std::optional<std::string> &minVersion, const std::optional<std::string> &maxVersion)
 {
-    return std::async(std::launch::async, [this, downloadUrl, filePathStr, fileHash, minVersion, maxVersion]()
-        {
-            // Check version compatibility first
-            if(minVersion||maxVersion)
-            {
-                std::string clientVersion="1.0.0"; // TODO: Get from build config
-                if ((minVersion && ModelManager::compareVersions(clientVersion, *minVersion) < 0) ||
-                    (maxVersion && ModelManager::compareVersions(clientVersion, *maxVersion) > 0))
-                {
-                    spdlog::error("Version mismatch: client {} not compatible with model requirements (min: {}, max: {})",
-                        clientVersion, minVersion?*minVersion:"none", maxVersion?*maxVersion:"none");
-                    return false;
-                }
-            }
-
-            std::filesystem::path filePath(filePathStr);
-            if(std::filesystem::exists(filePath))
-            {
-                if(fileHash&&m_fileVerifier->verifyFile(filePath.string(), *fileHash))
-                {
-                    spdlog::info("Model already exists and is verified: {}", filePath.string());
-                    return true;
-                }
-            }
-
-            spdlog::info("Downloading model from {} to {}", downloadUrl, filePath.string());
-            cpr::Response r=cpr::Get(cpr::Url{ downloadUrl });
-            if(r.status_code!=200)
-            {
-                spdlog::error("Failed to download model. Status code: {}", r.status_code);
-                return false;
-            }
-
-            try
-            {
-                std::filesystem::create_directories(filePath.parent_path());
-                std::ofstream outFile(filePath, std::ios::binary);
-                outFile.write(r.text.c_str(), r.text.length());
-                outFile.close();
-            }
-            catch(const std::filesystem::filesystem_error &e)
-            {
-                spdlog::error("Failed to write model to file: {}. Error: {}", filePath.string(), e.what());
-                return false;
-            }
-
-            if(fileHash)
-            {
-                if(m_fileVerifier->verifyFile(filePath.string(), *fileHash))
-                {
-                    spdlog::info("Model downloaded and verified successfully: {}", filePath.string());
-                    return true;
-                }
-                else
-                {
-                    spdlog::error("SHA256 verification failed for: {}", filePath.string());
-                    return false;
-                }
-            }
-
-            return true;
-        });
+    // Delegate to downloadModelWithProgress with no callback or tracking name
+    return downloadModelWithProgress(downloadUrl, filePathStr, fileHash, nullptr, "", "");
 }
 
 
@@ -188,31 +128,31 @@ std::future<bool> ModelDownloader::downloadModelWithProgress(
     const std::string &variant)
 {
     // Create tracking state
-    auto downloadState = std::make_shared<ActiveDownload>();
-    downloadState->modelName = modelName.empty() ? filePathStr : modelName;
-    downloadState->variant = variant;
-    downloadState->status = DownloadStatus::Pending;
-    downloadState->startTime = std::chrono::steady_clock::now();
+    auto downloadState=std::make_shared<ActiveDownload>();
+    downloadState->modelName=modelName.empty()?filePathStr:modelName;
+    downloadState->variant=variant;
+    downloadState->status=DownloadStatus::Pending;
+    downloadState->startTime=std::chrono::steady_clock::now();
 
     {
         std::lock_guard<std::mutex> lock(m_downloadsMutex);
-        m_activeDownloads[downloadState->modelName] = downloadState;
+        m_activeDownloads[downloadState->modelName]=downloadState;
     }
 
     return std::async(std::launch::async, [this, downloadUrl, filePathStr, fileHash, progressCallback, downloadState]()
     {
-        downloadState->status = DownloadStatus::InProgress;
+        downloadState->status=DownloadStatus::InProgress;
         std::filesystem::path filePath(filePathStr);
 
         // Check if file already exists and is valid
-        if (std::filesystem::exists(filePath))
+        if(std::filesystem::exists(filePath))
         {
-            if (fileHash && m_fileVerifier->verifyFile(filePath.string(), *fileHash))
+            if(fileHash&&m_fileVerifier->verifyFile(filePath.string(), *fileHash))
             {
                 spdlog::info("Model already exists and is verified: {}", filePath.string());
-                downloadState->status = DownloadStatus::Completed;
-                downloadState->percentComplete = 100.0f;
-                if (progressCallback)
+                downloadState->status=DownloadStatus::Completed;
+                downloadState->percentComplete=100.0f;
+                if(progressCallback)
                 {
                     progressCallback(0, 0, 100.0f);
                 }
@@ -220,120 +160,155 @@ std::future<bool> ModelDownloader::downloadModelWithProgress(
             }
         }
 
-        // Check for partial download (resume support)
-        std::string partialPath = filePathStr + ".partial";
-        int64_t existingBytes = 0;
-        if (std::filesystem::exists(partialPath))
-        {
-            existingBytes = std::filesystem::file_size(partialPath);
-            spdlog::info("Found partial download with {} bytes", existingBytes);
-        }
-
         spdlog::info("Downloading model from {} to {}", downloadUrl, filePath.string());
 
         try
         {
             std::filesystem::create_directories(filePath.parent_path());
         }
-        catch (const std::filesystem::filesystem_error &e)
+        catch(const std::filesystem::filesystem_error &e)
         {
             spdlog::error("Failed to create directory: {}", e.what());
-            downloadState->status = DownloadStatus::Failed;
-            downloadState->error = e.what();
+            downloadState->status=DownloadStatus::Failed;
+            downloadState->error=e.what();
             return false;
         }
 
-        // Use CPR with progress callback
-        cpr::Response r = cpr::Get(
+        // Stream directly to a .partial file on disk to avoid buffering the
+        // entire response body in RAM.  A 20 GB model download would otherwise
+        // require 20+ GB of heap, which caused OOM / heap corruption (SEGV).
+        std::string partialPath=filePathStr+".partial";
+
+        // Remove stale partial file so we start fresh
+        {
+            std::error_code ec;
+            std::filesystem::remove(partialPath, ec);
+        }
+
+        std::ofstream outFile(partialPath, std::ios::binary|std::ios::trunc);
+        if(!outFile.is_open())
+        {
+            spdlog::error("Failed to open partial file for writing: {}", partialPath);
+            downloadState->status=DownloadStatus::Failed;
+            downloadState->error="Failed to open "+partialPath;
+            return false;
+        }
+
+        bool writeError=false;
+
+        cpr::Response r=cpr::Get(
             cpr::Url{downloadUrl},
-            cpr::ProgressCallback([&downloadState, &progressCallback](cpr::cpr_off_t downloadTotal, 
-                                                                        cpr::cpr_off_t downloadNow,
-                                                                        cpr::cpr_off_t uploadTotal,
-                                                                        cpr::cpr_off_t uploadNow,
-                                                                        intptr_t userdata) -> bool
+            cpr::WriteCallback([&outFile, &writeError](const std::string_view &data, intptr_t) -> bool
             {
-                downloadState->bytesDownloaded = downloadNow;
-                downloadState->totalBytes = downloadTotal;
-                
-                float percent = 0.0f;
-                if (downloadTotal > 0)
+                outFile.write(data.data(), static_cast<std::streamsize>(data.size()));
+                if(!outFile.good())
+                {
+                    writeError=true;
+                    return false; // abort transfer
+                }
+                return true;
+            }),
+            cpr::ProgressCallback([&downloadState, &progressCallback](cpr::cpr_off_t downloadTotal,
+                cpr::cpr_off_t downloadNow,
+                cpr::cpr_off_t uploadTotal,
+                cpr::cpr_off_t uploadNow,
+                intptr_t userdata) -> bool
+            {
+                (void)uploadTotal;
+                (void)uploadNow;
+                (void)userdata;
+
+                downloadState->bytesDownloaded=downloadNow;
+                downloadState->totalBytes=downloadTotal;
+
+                float percent=0.0f;
+                if(downloadTotal>0)
                 {
-                    percent = (static_cast<float>(downloadNow) / downloadTotal) * 100.0f;
+                    percent=(static_cast<float>(downloadNow)/downloadTotal)*100.0f;
                 }
-                downloadState->percentComplete = percent;
+                downloadState->percentComplete=percent;
 
                 // Record speed sample
                 {
                     std::lock_guard<std::mutex> lock(downloadState->speedMutex);
-                    auto now = std::chrono::steady_clock::now();
+                    std::chrono::steady_clock::time_point now=std::chrono::steady_clock::now();
 
                     downloadState->speedSamples.push_back({now, downloadNow});
 
                     // Keep only last 10 seconds of samples
-                    auto cutoff = now - std::chrono::seconds(10);
-                    while(!downloadState->speedSamples.empty() && downloadState->speedSamples.front().first < cutoff)
+                    std::chrono::steady_clock::time_point cutoff=now-std::chrono::seconds(10);
+                    while(!downloadState->speedSamples.empty()&&downloadState->speedSamples.front().first<cutoff)
                     {
                         downloadState->speedSamples.pop_front();
                     }
                 }
-                
-                if (progressCallback)
+
+                if(progressCallback)
                 {
                     progressCallback(downloadNow, downloadTotal, percent);
                 }
-                
-                return true;  // Continue downloading
+
+                return true;
             })
         );
 
-        if (r.status_code != 200)
+        outFile.close();
+
+        if(writeError)
+        {
+            spdlog::error("Write error during download to {}", partialPath);
+            std::error_code ec;
+            std::filesystem::remove(partialPath, ec);
+            downloadState->status=DownloadStatus::Failed;
+            downloadState->error="Disk write error";
+            return false;
+        }
+
+        if(r.status_code!=200)
         {
             spdlog::error("Failed to download model. Status code: {}", r.status_code);
-            downloadState->status = DownloadStatus::Failed;
-            downloadState->error = "HTTP error: " + std::to_string(r.status_code);
+            std::error_code ec;
+            std::filesystem::remove(partialPath, ec);
+            downloadState->status=DownloadStatus::Failed;
+            downloadState->error="HTTP error: "+std::to_string(r.status_code);
             return false;
         }
 
-        try
+        // Rename .partial -> final path atomically
         {
-            std::ofstream outFile(filePath, std::ios::binary);
-            outFile.write(r.text.c_str(), r.text.length());
-            outFile.close();
-            
-            // Remove partial file if it exists
-            if (std::filesystem::exists(partialPath))
+            std::error_code ec;
+            std::filesystem::rename(partialPath, filePath, ec);
+            if(ec)
             {
-                std::filesystem::remove(partialPath);
+                spdlog::error("Failed to rename {} -> {}: {}", partialPath, filePath.string(), ec.message());
+                std::filesystem::remove(partialPath, ec);
+                downloadState->status=DownloadStatus::Failed;
+                downloadState->error="Failed to rename partial file";
+                return false;
             }
         }
-        catch (const std::filesystem::filesystem_error &e)
-        {
-            spdlog::error("Failed to write model to file: {}. Error: {}", filePath.string(), e.what());
-            downloadState->status = DownloadStatus::Failed;
-            downloadState->error = e.what();
-            return false;
-        }
 
-        if (fileHash)
+        if(fileHash)
         {
-            if (m_fileVerifier->verifyFile(filePath.string(), *fileHash))
+            if(m_fileVerifier->verifyFile(filePath.string(), *fileHash))
             {
                 spdlog::info("Model downloaded and verified successfully: {}", filePath.string());
-                downloadState->status = DownloadStatus::Completed;
-                downloadState->percentComplete = 100.0f;
+                downloadState->status=DownloadStatus::Completed;
+                downloadState->percentComplete=100.0f;
                 return true;
             }
             else
             {
                 spdlog::error("SHA256 verification failed for: {}", filePath.string());
-                downloadState->status = DownloadStatus::Failed;
-                downloadState->error = "SHA256 verification failed";
+                downloadState->status=DownloadStatus::Failed;
+                downloadState->error="SHA256 verification failed";
                 return false;
             }
         }
 
-        downloadState->status = DownloadStatus::Completed;
-        downloadState->percentComplete = 100.0f;
+        spdlog::info("Model downloaded successfully: {}", filePath.string());
+        downloadState->status=DownloadStatus::Completed;
+        downloadState->percentComplete=100.0f;
         return true;
     });
 }
diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp
index c02d529..1523cd2 100644
--- a/src/arbiterAI/modelManager.cpp
+++ b/src/arbiterAI/modelManager.cpp
@@ -35,6 +35,33 @@ bool ModelInfo::isSchemaCompatible(const std::string &schemaVersion) const
            ModelManager::compareVersions(schemaVersion, configVersion) <= 0;
 }
 
+std::vector<VariantDownload> ModelVariant::getAllFiles() const
+{
+    if(!files.empty())
+    {
+        return files;
+    }
+    if(!download.filename.empty())
+    {
+        return {download};
+    }
+    return {};
+}
+
+std::string ModelVariant::getPrimaryFilename() const
+{
+    if(!files.empty())
+    {
+        return files.front().filename;
+    }
+    return download.filename;
+}
+
+bool ModelVariant::isSplit() const
+{
+    return files.size()>1;
+}
+
 ModelManager &ModelManager::instance()
 {
     static ModelManager instance;
@@ -299,6 +326,26 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in
                     variant.download.filename=dl["filename"].get<std::string>();
                 }
             }
+            if(variantJson.contains("files")&&variantJson["files"].is_array())
+            {
+                for(const auto &fileJson:variantJson["files"])
+                {
+                    VariantDownload vd;
+                    if(fileJson.contains("url"))
+                    {
+                        vd.url=fileJson["url"].get<std::string>();
+                    }
+                    if(fileJson.contains("sha256"))
+                    {
+                        vd.sha256=fileJson["sha256"].get<std::string>();
+                    }
+                    if(fileJson.contains("filename"))
+                    {
+                        vd.filename=fileJson["filename"].get<std::string>();
+                    }
+                    variant.files.push_back(vd);
+                }
+            }
             info.variants.push_back(variant);
         }
     }
@@ -737,6 +784,19 @@ nlohmann::json ModelManager::modelInfoToJson(const ModelInfo &info)
                     {"filename", v.download.filename}
                 };
             }
+            if(!v.files.empty())
+            {
+                nlohmann::json filesArr=nlohmann::json::array();
+                for(const VariantDownload &f:v.files)
+                {
+                    nlohmann::json fj;
+                    fj["url"]=f.url;
+                    fj["sha256"]=f.sha256;
+                    fj["filename"]=f.filename;
+                    filesArr.push_back(fj);
+                }
+                vj["files"]=filesArr;
+            }
             variants.push_back(vj);
         }
         j["variants"]=variants;
diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h
index 5e5648d..703cb0f 100644
--- a/src/arbiterAI/modelManager.h
+++ b/src/arbiterAI/modelManager.h
@@ -45,7 +45,19 @@ struct ModelVariant {
     int fileSizeMb=0;
     int minVramMb=0;
     int recommendedVramMb=0;
-    VariantDownload download;
+    VariantDownload download; // Primary / single-file download (backward compat)
+    std::vector<VariantDownload> files; // All shard files (split GGUF). Empty = use download field.
+
+    /// Get the complete list of files to download for this variant.
+    /// Returns files if non-empty, otherwise a 1-element vector from download (if non-empty).
+    std::vector<VariantDownload> getAllFiles() const;
+
+    /// Get the primary filename (first shard / single file) used as the llama.cpp load path.
+    /// Returns empty string if no download info is configured.
+    std::string getPrimaryFilename() const;
+
+    /// Check whether this variant is a split (multi-file) GGUF.
+    bool isSplit() const;
 };
 
 struct Pricing
diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp
index e915a9f..f4f80dd 100644
--- a/src/arbiterAI/modelRuntime.cpp
+++ b/src/arbiterAI/modelRuntime.cpp
@@ -4,10 +4,12 @@
 #include "arbiterAI/storageManager.h"
 
 #include <llama.h>
+#include <ggml.h>
 #include <spdlog/spdlog.h>
 #include <algorithm>
 #include <filesystem>
 #include <thread>
+#include <regex>
 
 namespace arbiterAI
 {
@@ -21,8 +23,29 @@ ModelRuntime &ModelRuntime::instance()
 void ModelRuntime::reset()
 {
     ModelRuntime &rt=instance();
+
+    // Signal all background download threads to stop waiting
+    {
+        std::lock_guard<std::mutex> lock(rt.m_mutex);
+        rt.m_shuttingDown=true;
+        rt.m_downloadCv.notify_all();
+    }
+
+    for(std::thread &t:rt.m_downloadThreads)
+    {
+        if(t.joinable())
+        {
+            t.join();
+        }
+    }
+
     std::lock_guard<std::mutex> lock(rt.m_mutex);
 
+    rt.m_downloadThreads.clear();
+    rt.m_activeDownloadCount=0;
+    rt.m_maxConcurrentDownloads=2;
+    rt.m_shuttingDown=false;
+
     // Free all llama.cpp resources
     for(auto &pair:rt.m_models)
     {
@@ -49,6 +72,230 @@ ModelRuntime::ModelRuntime()
     m_readyRamBudgetMb=hw.totalRamMb/2;
 }
 
+// ---- llama.cpp log capture ------------------------------------------------
+
+// Thread-local pointer to the active ModelRuntime capturing logs.
+// Only one load happens at a time (under m_mutex), so this is safe.
+static ModelRuntime *s_capturingInstance=nullptr;
+
+static void llamaLogCallback(enum ggml_log_level level, const char *text, void *userData)
+{
+    (void)userData;
+    ModelRuntime *rt=s_capturingInstance;
+    if(rt)
+    {
+        rt->appendLlamaLog(text);
+    }
+
+    // Also forward to spdlog so logs still appear in the server log
+    if(text)
+    {
+        std::string msg(text);
+        // Strip trailing newline for spdlog
+        while(!msg.empty()&&(msg.back()=='\n'||msg.back()=='\r'))
+        {
+            msg.pop_back();
+        }
+        if(msg.empty()) return;
+
+        switch(level)
+        {
+            case GGML_LOG_LEVEL_ERROR:
+                spdlog::error("[llama] {}", msg);
+                break;
+            case GGML_LOG_LEVEL_WARN:
+                spdlog::warn("[llama] {}", msg);
+                break;
+            case GGML_LOG_LEVEL_DEBUG:
+                spdlog::debug("[llama] {}", msg);
+                break;
+            default:
+                spdlog::info("[llama] {}", msg);
+                break;
+        }
+    }
+}
+
+void ModelRuntime::appendLlamaLog(const char *text)
+{
+    if(m_capturingLlamaLog&&text)
+    {
+        m_llamaLogCapture<<text;
+    }
+}
+
+void ModelRuntime::beginLlamaLogCapture()
+{
+    m_llamaLogCapture.str("");
+    m_llamaLogCapture.clear();
+    m_capturingLlamaLog=true;
+    s_capturingInstance=this;
+    llama_log_set(llamaLogCallback, nullptr);
+}
+
+void ModelRuntime::endLlamaLogCapture()
+{
+    m_capturingLlamaLog=false;
+    s_capturingInstance=nullptr;
+    llama_log_set(llamaLogCallback, nullptr); // keep forwarding to spdlog
+}
+
+const char *loadFailureReasonToString(LoadFailureReason reason)
+{
+    switch(reason)
+    {
+        case LoadFailureReason::FileNotFound:     return "file_not_found";
+        case LoadFailureReason::FileCorrupt:       return "file_corrupt";
+        case LoadFailureReason::InsufficientVram:  return "insufficient_vram";
+        case LoadFailureReason::InsufficientRam:   return "insufficient_ram";
+        case LoadFailureReason::ContextTooLarge:   return "context_too_large";
+        case LoadFailureReason::UnsupportedArch:   return "unsupported_arch";
+        case LoadFailureReason::BackendError:      return "backend_error";
+        default:                                   return "unknown";
+    }
+}
+
+LoadErrorDetail ModelRuntime::classifyLoadFailure(
+    const std::string &llamaLog,
+    const std::string &model,
+    const std::string &filePath,
+    int contextSize) const
+{
+    LoadErrorDetail detail;
+    detail.llamaLog=llamaLog;
+
+    std::string logLower=llamaLog;
+    std::transform(logLower.begin(), logLower.end(), logLower.begin(), ::tolower);
+
+    // Check for file-not-found / failed-to-open
+    if(logLower.find("failed to open")!=std::string::npos||
+        logLower.find("no such file")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::FileNotFound;
+        detail.summary="Model file not found: "+filePath;
+        detail.suggestion="The GGUF file does not exist at the expected path. "
+            "Re-download the model or verify the file path in the model configuration.";
+        detail.action="redownload";
+        detail.recoverable=true;
+        return detail;
+    }
+
+    // Check for corrupt / truncated GGUF
+    if(logLower.find("invalid magic")!=std::string::npos||
+        logLower.find("gguf_init")!=std::string::npos&&logLower.find("fail")!=std::string::npos||
+        logLower.find("unexpected end")!=std::string::npos||
+        logLower.find("truncated")!=std::string::npos||
+        logLower.find("invalid header")!=std::string::npos||
+        logLower.find("bad magic")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::FileCorrupt;
+        detail.summary="Model file appears corrupt or truncated: "+filePath;
+        detail.suggestion="Delete the file and re-download it. "
+            "The download may have been interrupted or the file was partially written.";
+        detail.action="delete_and_redownload";
+        detail.recoverable=true;
+        return detail;
+    }
+
+    // Check for VRAM / GPU memory allocation failures
+    if(logLower.find("out of memory")!=std::string::npos||
+        logLower.find("cuda error")!=std::string::npos||
+        logLower.find("cudamalloc")!=std::string::npos||
+        logLower.find("not enough vram")!=std::string::npos||
+        logLower.find("vk_error_out_of_device_memory")!=std::string::npos||
+        logLower.find("failed to allocate")!=std::string::npos&&logLower.find("buffer")!=std::string::npos||
+        logLower.find("ggml_backend_cuda")!=std::string::npos&&logLower.find("error")!=std::string::npos||
+        logLower.find("ggml_cuda_error")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::InsufficientVram;
+        detail.summary="Insufficient GPU memory to load model with context size "+std::to_string(contextSize);
+        detail.suggestion="Try reducing the context size, use a smaller quantization variant, "
+            "or unload other models to free VRAM.";
+        detail.action="reduce_context";
+        detail.recoverable=true;
+        return detail;
+    }
+
+    // Check for RAM allocation failures
+    if(logLower.find("bad_alloc")!=std::string::npos||
+        logLower.find("cannot allocate memory")!=std::string::npos||
+        logLower.find("enomem")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::InsufficientRam;
+        detail.summary="Insufficient system RAM to load model";
+        detail.suggestion="Close other applications, unload other models, or use a smaller quantization variant.";
+        detail.action="reduce_context";
+        detail.recoverable=true;
+        return detail;
+    }
+
+    // Check for context size issues
+    if(logLower.find("n_ctx")!=std::string::npos&&
+        (logLower.find("too large")!=std::string::npos||logLower.find("exceeds")!=std::string::npos))
+    {
+        detail.reason=LoadFailureReason::ContextTooLarge;
+        detail.summary="Requested context size "+std::to_string(contextSize)+" exceeds model or hardware limits";
+        detail.suggestion="Use a smaller context size. The model may support a maximum context window smaller "
+            "than what was requested.";
+        detail.action="reduce_context";
+        detail.recoverable=true;
+        return detail;
+    }
+
+    // Check for unsupported architecture
+    if(logLower.find("unknown model architecture")!=std::string::npos||
+        logLower.find("unsupported model")!=std::string::npos||
+        logLower.find("unknown arch")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::UnsupportedArch;
+        detail.summary="Model architecture is not supported by this llama.cpp build";
+        detail.suggestion="Update the server to a newer version that supports this model architecture, "
+            "or use a different model.";
+        detail.action="update_server";
+        detail.recoverable=false;
+        return detail;
+    }
+
+    // Generic model load failure — still include the log for debugging
+    if(logLower.find("failed to load model")!=std::string::npos)
+    {
+        detail.reason=LoadFailureReason::BackendError;
+        detail.summary="llama.cpp failed to load the model from: "+filePath;
+        detail.suggestion="Check the llama.cpp log output above for specific error details. "
+            "Common causes include corrupt files, insufficient memory, or unsupported model formats.";
+        detail.action="check_logs";
+        detail.recoverable=false;
+        return detail;
+    }
+
+    // Fallback: generic context creation failure
+    detail.reason=LoadFailureReason::Unknown;
+    detail.summary="Model load failed for '"+model+"'";
+    detail.suggestion="Check the server log for llama.cpp error output.";
+    detail.action="check_logs";
+    detail.recoverable=false;
+    return detail;
+}
+
+LoadErrorDetail ModelRuntime::getLastLoadError() const
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return m_lastLoadError;
+}
+
+void ModelRuntime::setMaxConcurrentDownloads(int max)
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    m_maxConcurrentDownloads=std::max(1, max);
+    m_downloadCv.notify_all();
+}
+
+int ModelRuntime::getMaxConcurrentDownloads() const
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return m_maxConcurrentDownloads;
+}
+
 ErrorCode ModelRuntime::loadModel(
     const std::string &model,
     const std::string &variant,
@@ -56,6 +303,9 @@ ErrorCode ModelRuntime::loadModel(
 {
     std::lock_guard<std::mutex> lock(m_mutex);
 
+    // Clear previous load error
+    m_lastLoadError=LoadErrorDetail{};
+
     // Check if already loaded
     auto it=m_models.find(model);
     if(it!=m_models.end())
@@ -110,8 +360,13 @@ ErrorCode ModelRuntime::loadModel(
     }
 
     // Determine context size
+    // For llama provider models, contextSize=0 means "use model's native
+    // training context from GGUF metadata" — resolved in loadLlamaModel after
+    // loading model weights.  For cloud models, fall back to the config value.
     int resolvedContext=contextSize;
-    if(resolvedContext<=0)
+    bool useNativeContext=(resolvedContext<=0&&modelInfo->provider=="llama");
+
+    if(resolvedContext<=0&&!useNativeContext)
     {
         resolvedContext=modelInfo->contextWindow;
     }
@@ -137,6 +392,16 @@ ErrorCode ModelRuntime::loadModel(
             ModelFit fit=ModelFitCalculator::calculateModelFit(modelInfo.value(), *selectedVar, hw);
             if(!fit.canRun)
             {
+                m_lastLoadError.reason=(fit.limitingFactor=="ram")
+                    ?LoadFailureReason::InsufficientRam
+                    :LoadFailureReason::InsufficientVram;
+                m_lastLoadError.summary="Model '"+model+"' variant '"+selectedVariant+
+                    "' cannot run on this hardware: "+fit.limitingFactor;
+                m_lastLoadError.suggestion=(fit.limitingFactor=="ram")
+                    ?"Insufficient system RAM. Try a smaller quantization variant or close other applications."
+                    :"Insufficient VRAM. Try a smaller quantization variant, reduce context size, or unload other models.";
+                m_lastLoadError.action="use_smaller_variant";
+                m_lastLoadError.recoverable=true;
                 spdlog::error("Model '{}' variant '{}' cannot run: {}", model, selectedVariant, fit.limitingFactor);
                 return ErrorCode::ModelLoadError;
             }
@@ -144,19 +409,31 @@ ErrorCode ModelRuntime::loadModel(
             // Evict if needed to make room
             evictIfNeeded(selectedVar->minVramMb);
 
-            // Check if model file exists, initiate download if needed
-            if(!selectedVar->download.filename.empty())
+            // Check if all model files exist, initiate async download for any missing ones
+            std::vector<VariantDownload> allFiles=selectedVar->getAllFiles();
+            std::string primaryFilename=selectedVar->getPrimaryFilename();
+
+            if(!allFiles.empty())
             {
-                std::string filePath="/models/"+selectedVar->download.filename;
-                if(!std::filesystem::exists(filePath)&&!selectedVar->download.url.empty())
+                bool anyMissing=false;
+                for(const VariantDownload &file:allFiles)
+                {
+                    std::string filePath="/models/"+file.filename;
+                    if(!std::filesystem::exists(filePath)&&!file.url.empty())
+                    {
+                        anyMissing=true;
+                        break;
+                    }
+                }
+
+                if(anyMissing)
                 {
-                    // Check storage quota before downloading
-                    int64_t fileSizeBytes=static_cast<int64_t>(selectedVar->fileSizeMb)*1024*1024;
-                    if(!StorageManager::instance().canDownload(fileSizeBytes))
+                    // Check storage quota for total missing file size
+                    int64_t totalDownloadBytes=static_cast<int64_t>(selectedVar->fileSizeMb)*1024*1024;
+                    if(!StorageManager::instance().canDownload(totalDownloadBytes))
                     {
-                        // Try cleanup first
                         StorageManager::instance().runCleanup();
-                        if(!StorageManager::instance().canDownload(fileSizeBytes))
+                        if(!StorageManager::instance().canDownload(totalDownloadBytes))
                         {
                             spdlog::error("Insufficient storage to download '{}' variant '{}' ({} MB)",
                                 model, selectedVariant, selectedVar->fileSizeMb);
@@ -164,47 +441,29 @@ ErrorCode ModelRuntime::loadModel(
                         }
                     }
 
-                    // Mark as downloading
+                    // Mark as downloading and launch async background download
                     LoadedModel &dlEntry=m_models[model];
                     dlEntry.modelName=model;
                     dlEntry.variant=selectedVariant;
                     dlEntry.state=ModelState::Downloading;
                     dlEntry.lastUsed=std::chrono::steady_clock::now();
-                    spdlog::info("Downloading model '{}' variant '{}'", model, selectedVariant);
-
-                    // Release the lock during download (can be very large)
-                    m_mutex.unlock();
-                    bool downloadOk=downloadModelFile(
-                        selectedVar->download.url,
-                        filePath,
-                        selectedVar->download.sha256,
-                        model,
-                        selectedVariant);
-                    m_mutex.lock();
-
-                    if(!downloadOk)
-                    {
-                        m_models.erase(model);
-                        return ErrorCode::ModelDownloadFailed;
-                    }
 
-                    // Register the download with StorageManager
-                    int64_t actualSize=0;
-                    std::error_code ec;
-                    if(std::filesystem::exists(filePath, ec))
-                    {
-                        actualSize=static_cast<int64_t>(std::filesystem::file_size(filePath, ec));
-                    }
-                    StorageManager::instance().registerDownload(
-                        model, selectedVariant, selectedVar->download.filename, actualSize);
+                    spdlog::info("Model '{}' variant '{}' needs download — launching async download",
+                        model, selectedVariant);
+
+                    m_downloadThreads.emplace_back(
+                        &ModelRuntime::runBackgroundDownload, this,
+                        model, selectedVariant, modelInfo.value());
+
+                    return ErrorCode::ModelDownloading;
                 }
             }
 
-            // Create/update loaded model entry
+            // All files present — create/update loaded model entry
             LoadedModel &entry=m_models[model];
             entry.modelName=model;
             entry.variant=selectedVariant;
-            entry.contextSize=std::min(resolvedContext, fit.maxContextSize);
+            entry.contextSize=useNativeContext?0:std::min(resolvedContext, fit.maxContextSize);
             entry.estimatedVramUsageMb=fit.estimatedVramUsageMb;
             entry.gpuIndices=fit.gpuIndices;
             entry.lastUsed=std::chrono::steady_clock::now();
@@ -212,8 +471,9 @@ ErrorCode ModelRuntime::loadModel(
             // Actually load llama.cpp model for local providers
             if(modelInfo->provider=="llama")
             {
-                std::string filePath="/models/"+selectedVar->download.filename;
-                ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices);
+                std::string filePath="/models/"+primaryFilename;
+                ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices,
+                    fit.maxContextSize);
                 if(loadResult!=ErrorCode::Success)
                 {
                     m_models.erase(model);
@@ -253,6 +513,246 @@ ErrorCode ModelRuntime::loadModel(
     return ErrorCode::Success;
 }
 
+ErrorCode ModelRuntime::downloadModel(
+    const std::string &model,
+    const std::string &variant)
+{
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // Check if already downloading
+    auto it=m_models.find(model);
+    if(it!=m_models.end())
+    {
+        if(it->second.state==ModelState::Downloading)
+        {
+            return ErrorCode::ModelDownloading;
+        }
+        if(it->second.state==ModelState::Loaded||it->second.state==ModelState::Ready)
+        {
+            return ErrorCode::Success;
+        }
+    }
+
+    // Look up model info from ModelManager
+    std::optional<ModelInfo> modelInfo=ModelManager::instance().getModelInfo(model);
+    if(!modelInfo.has_value())
+    {
+        spdlog::error("downloadModel: model '{}' not found in ModelManager", model);
+        return ErrorCode::ModelNotFound;
+    }
+
+    if(modelInfo->variants.empty())
+    {
+        // Cloud model — nothing to download
+        return ErrorCode::Success;
+    }
+
+    // Determine which variant to use
+    std::string selectedVariant=variant;
+    if(selectedVariant.empty())
+    {
+        selectedVariant=selectBestVariant(modelInfo.value());
+    }
+
+    // Find the selected variant
+    const ModelVariant *selectedVar=nullptr;
+    for(const ModelVariant &v:modelInfo->variants)
+    {
+        if(v.quantization==selectedVariant)
+        {
+            selectedVar=&v;
+            break;
+        }
+    }
+
+    if(!selectedVar)
+    {
+        spdlog::error("downloadModel: variant '{}' not found for model '{}'", selectedVariant, model);
+        return ErrorCode::ModelNotFound;
+    }
+
+    // Check if all files are already present
+    std::vector<VariantDownload> allFiles=selectedVar->getAllFiles();
+    bool anyMissing=false;
+    for(const VariantDownload &file:allFiles)
+    {
+        std::string filePath="/models/"+file.filename;
+        if(!std::filesystem::exists(filePath)&&!file.url.empty())
+        {
+            anyMissing=true;
+            break;
+        }
+    }
+
+    if(!anyMissing)
+    {
+        return ErrorCode::Success;
+    }
+
+    // Check storage quota
+    int64_t totalDownloadBytes=static_cast<int64_t>(selectedVar->fileSizeMb)*1024*1024;
+    if(!StorageManager::instance().canDownload(totalDownloadBytes))
+    {
+        StorageManager::instance().runCleanup();
+        if(!StorageManager::instance().canDownload(totalDownloadBytes))
+        {
+            spdlog::error("Insufficient storage to download '{}' variant '{}' ({} MB)",
+                model, selectedVariant, selectedVar->fileSizeMb);
+            return ErrorCode::InsufficientStorage;
+        }
+    }
+
+    // Mark as downloading and launch async background download
+    LoadedModel &dlEntry=m_models[model];
+    dlEntry.modelName=model;
+    dlEntry.variant=selectedVariant;
+    dlEntry.state=ModelState::Downloading;
+    dlEntry.lastUsed=std::chrono::steady_clock::now();
+
+    spdlog::info("downloadModel: launching async download for '{}' variant '{}'",
+        model, selectedVariant);
+
+    m_downloadThreads.emplace_back(
+        &ModelRuntime::runBackgroundDownload, this,
+        model, selectedVariant, modelInfo.value());
+
+    return ErrorCode::ModelDownloading;
+}
+
+void ModelRuntime::runBackgroundDownload(
+    const std::string &model,
+    const std::string &variant,
+    const ModelInfo &info)
+{
+    // Wait for a download slot (respects concurrent download limit)
+    {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        m_downloadCv.wait(lock, [this]()
+        {
+            return m_shuttingDown||m_activeDownloadCount<m_maxConcurrentDownloads;
+        });
+
+        if(m_shuttingDown)
+        {
+            return;
+        }
+
+        ++m_activeDownloadCount;
+    }
+
+    // Find the selected variant from the ModelInfo
+    const ModelVariant *selectedVar=nullptr;
+    for(const ModelVariant &v:info.variants)
+    {
+        if(v.quantization==variant)
+        {
+            selectedVar=&v;
+            break;
+        }
+    }
+
+    if(!selectedVar)
+    {
+        spdlog::error("runBackgroundDownload: variant '{}' not found for model '{}'", variant, model);
+        std::lock_guard<std::mutex> lock(m_mutex);
+        m_models.erase(model);
+        --m_activeDownloadCount;
+        m_downloadCv.notify_one();
+        return;
+    }
+
+    std::vector<VariantDownload> allFiles=selectedVar->getAllFiles();
+    std::string primaryFilename=selectedVar->getPrimaryFilename();
+
+    // Collect files that need downloading
+    std::vector<const VariantDownload *> missingFiles;
+    for(const VariantDownload &file:allFiles)
+    {
+        std::string filePath="/models/"+file.filename;
+        if(!std::filesystem::exists(filePath)&&!file.url.empty())
+        {
+            missingFiles.push_back(&file);
+        }
+    }
+
+    if(missingFiles.size()>1||allFiles.size()>1)
+    {
+        spdlog::info("Downloading model '{}' variant '{}' ({} files, {} missing)",
+            model, variant, allFiles.size(), missingFiles.size());
+    }
+    else
+    {
+        spdlog::info("Downloading model '{}' variant '{}'", model, variant);
+    }
+
+    // Download each missing file (no mutex held)
+    bool allDownloadsOk=true;
+    for(const VariantDownload *file:missingFiles)
+    {
+        std::string filePath="/models/"+file->filename;
+        bool downloadOk=downloadModelFile(
+            file->url,
+            filePath,
+            file->sha256,
+            model,
+            variant);
+
+        if(!downloadOk)
+        {
+            allDownloadsOk=false;
+            spdlog::error("Failed to download shard '{}' for model '{}' variant '{}'",
+                file->filename, model, variant);
+            break;
+        }
+    }
+
+    // Release the download slot
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        --m_activeDownloadCount;
+        m_downloadCv.notify_one();
+    }
+
+    // Update model state under lock
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    if(!allDownloadsOk)
+    {
+        m_models.erase(model);
+        spdlog::error("Download failed for model '{}' variant '{}'", model, variant);
+        return;
+    }
+
+    // Register all files with StorageManager
+    int64_t totalActualSize=0;
+    std::vector<std::string> extraFiles;
+    for(size_t i=0; i<allFiles.size(); ++i)
+    {
+        std::string filePath="/models/"+allFiles[i].filename;
+        int64_t actualSize=0;
+        std::error_code ec;
+        if(std::filesystem::exists(filePath, ec))
+        {
+            actualSize=static_cast<int64_t>(std::filesystem::file_size(filePath, ec));
+        }
+        totalActualSize+=actualSize;
+        if(i>0)
+        {
+            extraFiles.push_back(allFiles[i].filename);
+        }
+    }
+    StorageManager::instance().registerDownload(
+        model, variant, primaryFilename, totalActualSize, extraFiles);
+
+    // Transition to Unloaded (downloaded, not yet loaded into VRAM)
+    auto it=m_models.find(model);
+    if(it!=m_models.end()&&it->second.state==ModelState::Downloading)
+    {
+        it->second.state=ModelState::Unloaded;
+        spdlog::info("Download complete for model '{}' variant '{}' — state is now Unloaded", model, variant);
+    }
+}
+
 ErrorCode ModelRuntime::unloadModel(const std::string &model)
 {
     std::lock_guard<std::mutex> lock(m_mutex);
@@ -693,38 +1193,89 @@ ErrorCode ModelRuntime::loadLlamaModel(
     const std::string &model,
     const std::string &filePath,
     int contextSize,
-    const std::vector<int> &gpuIndices)
+    const std::vector<int> &gpuIndices,
+    int maxHardwareContext)
 {
     initLlamaBackend();
 
+    // Start capturing llama.cpp log output for diagnostics
+    beginLlamaLogCapture();
+
     llama_model_params mparams=llama_model_default_params();
     mparams.n_gpu_layers=99; // offload all layers to GPU by default
 
     llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams);
     if(!llamaModel)
     {
-        spdlog::error("Failed to load llama model from: {}", filePath);
+        std::string captured=m_llamaLogCapture.str();
+        endLlamaLogCapture();
+
+        m_lastLoadError=classifyLoadFailure(captured, model, filePath, contextSize);
+        spdlog::error("Failed to load llama model from: {} — {}", filePath, m_lastLoadError.summary);
         return ErrorCode::ModelLoadError;
     }
 
+    // Query native training context from GGUF metadata
+    int nativeContext=llama_model_n_ctx_train(llamaModel);
+
+    // Resolve actual context to allocate:
+    //   contextSize > 0  → user/config requested explicit size
+    //   contextSize == 0 → use model's native training context
+    // In both cases, cap by the hardware-fit maximum.
+    int actualContext=contextSize;
+    if(actualContext<=0)
+    {
+        actualContext=nativeContext;
+    }
+    if(maxHardwareContext>0&&actualContext>maxHardwareContext)
+    {
+        spdlog::info("Capping context from {} to {} (hardware limit) for model '{}'",
+            actualContext, maxHardwareContext, model);
+        actualContext=maxHardwareContext;
+    }
+
     llama_context_params cparams=llama_context_default_params();
-    cparams.n_ctx=contextSize;
+    cparams.n_ctx=static_cast<uint32_t>(actualContext);
     cparams.n_threads=std::thread::hardware_concurrency();
     cparams.n_threads_batch=std::thread::hardware_concurrency();
 
     llama_context *llamaCtx=llama_init_from_model(llamaModel, cparams);
     if(!llamaCtx)
     {
-        spdlog::error("Failed to create llama context for model: {}", model);
+        std::string captured=m_llamaLogCapture.str();
+        endLlamaLogCapture();
+
+        m_lastLoadError=classifyLoadFailure(captured, model, filePath, actualContext);
+
+        // If classification didn't catch a specific VRAM/context issue,
+        // context creation failure is almost always a memory issue
+        if(m_lastLoadError.reason==LoadFailureReason::Unknown||
+            m_lastLoadError.reason==LoadFailureReason::BackendError)
+        {
+            m_lastLoadError.reason=LoadFailureReason::InsufficientVram;
+            m_lastLoadError.summary="Failed to create context (size="+std::to_string(actualContext)+
+                ") — likely insufficient GPU memory";
+            m_lastLoadError.suggestion="Try a smaller context size or use a smaller quantization variant. "
+                "You can also unload other models to free VRAM.";
+            m_lastLoadError.action="reduce_context";
+            m_lastLoadError.recoverable=true;
+        }
+
+        spdlog::error("Failed to create llama context for model: {} — {}", model, m_lastLoadError.summary);
         llama_model_free(llamaModel);
         return ErrorCode::ModelLoadError;
     }
 
+    endLlamaLogCapture();
+
     LoadedModel &entry=m_models[model];
     entry.llamaModel=llamaModel;
     entry.llamaCtx=llamaCtx;
+    entry.maxContextSize=nativeContext;
+    entry.contextSize=static_cast<int>(llama_n_ctx(llamaCtx));
 
-    spdlog::info("llama.cpp model loaded: {} (context={})", model, contextSize);
+    spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={})",
+        model, entry.contextSize, entry.maxContextSize);
     return ErrorCode::Success;
 }
 
diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h
index 288ceac..d8c07e2 100644
--- a/src/arbiterAI/modelRuntime.h
+++ b/src/arbiterAI/modelRuntime.h
@@ -14,6 +14,9 @@
 #include <queue>
 #include <functional>
 #include <chrono>
+#include <sstream>
+#include <thread>
+#include <condition_variable>
 
 // Forward declarations for llama.cpp types
 struct llama_model;
@@ -30,6 +33,31 @@ enum class ModelState {
     Unloading
 };
 
+/// Classification of why a model load failed, with actionable guidance.
+enum class LoadFailureReason {
+    Unknown,            // could not determine
+    FileNotFound,       // GGUF file does not exist at expected path
+    FileCorrupt,        // GGUF header invalid or file truncated
+    InsufficientVram,   // not enough VRAM / failed to allocate GPU buffers
+    InsufficientRam,    // not enough system RAM
+    ContextTooLarge,    // requested context exceeds model or hardware limits
+    UnsupportedArch,    // model architecture not supported by this llama.cpp build
+    BackendError        // llama.cpp internal error
+};
+
+/// Convert a LoadFailureReason to a stable, snake_case string for API responses.
+const char *loadFailureReasonToString(LoadFailureReason reason);
+
+/// Detailed information about the last model load failure.
+struct LoadErrorDetail {
+    LoadFailureReason reason=LoadFailureReason::Unknown;
+    std::string summary;        // one-line human-readable summary
+    std::string suggestion;     // actionable fix for a human operator
+    std::string action;         // machine-readable recovery action (e.g. "redownload", "reduce_context")
+    bool recoverable=false;     // true if the caller can take an automated action to fix the problem
+    std::string llamaLog;       // raw llama.cpp log output captured during the load attempt
+};
+
 struct LoadedModel {
     std::string modelName;
     std::string variant;
@@ -38,6 +66,7 @@ struct LoadedModel {
     int ramUsageMb=0;
     int estimatedVramUsageMb=0;
     int contextSize=0;
+    int maxContextSize=0; // model's native/training context from GGUF metadata
     std::vector<int> gpuIndices;
     std::chrono::steady_clock::time_point lastUsed;
     bool pinned=false;
@@ -51,6 +80,9 @@ class ModelRuntime {
     static void reset(); // For testing
 
     /// Load a model into VRAM for inference.
+    /// If files are not yet downloaded, triggers an async download and returns
+    /// ModelDownloading immediately.  If a download is already in progress for
+    /// this model the call also returns ModelDownloading.
     /// @param model     Model name from ModelManager.
     /// @param variant   Quantization variant (empty = auto-select best fitting).
     /// @param contextSize  Context size (0 = use model default).
@@ -60,6 +92,20 @@ class ModelRuntime {
         const std::string &variant="",
         int contextSize=0);
 
+    /// Download model files without loading into VRAM.
+    /// Launches an async background download that respects the concurrent
+    /// download limit.  Returns ModelDownloading on success, Success if
+    /// files are already present, or an error code.
+    ErrorCode downloadModel(
+        const std::string &model,
+        const std::string &variant="");
+
+    /// Set the maximum number of concurrent model downloads (default: 2).
+    void setMaxConcurrentDownloads(int max);
+
+    /// Get the current concurrent download limit.
+    int getMaxConcurrentDownloads() const;
+
     /// Unload a model. Pinned models move to Ready; others to Unloaded.
     ErrorCode unloadModel(const std::string &model);
 
@@ -122,6 +168,14 @@ class ModelRuntime {
     /// Get the ModelInfo for a loaded model.
     std::optional<ModelInfo> getLoadedModelInfo(const std::string &model) const;
 
+    /// Get detailed information about the most recent model load failure.
+    /// Cleared at the start of each loadModel() call.
+    LoadErrorDetail getLastLoadError() const;
+
+    /// Called from the llama.cpp log callback to append captured text.
+    /// Public so the C-style callback can reach it; not intended for external use.
+    void appendLlamaLog(const char *text);
+
 private:
     ModelRuntime();
 
@@ -144,11 +198,14 @@ class ModelRuntime {
     void initLlamaBackend();
 
     /// Load a GGUF file into llama.cpp.
+    /// @param contextSize      Requested context (0 = use model's native training context).
+    /// @param maxHardwareContext  Hardware-fit limit (0 = no limit).
     ErrorCode loadLlamaModel(
         const std::string &model,
         const std::string &filePath,
         int contextSize,
-        const std::vector<int> &gpuIndices);
+        const std::vector<int> &gpuIndices,
+        int maxHardwareContext=0);
 
     /// Free llama.cpp resources for a model.
     void freeLlamaModel(LoadedModel &entry);
@@ -177,6 +234,39 @@ class ModelRuntime {
     std::queue<SwapRequest> m_pendingSwaps;
 
     ModelDownloader m_downloader;
+
+    /// Background download concurrency management.
+    int m_maxConcurrentDownloads=2;
+    int m_activeDownloadCount=0;
+    bool m_shuttingDown=false;
+    std::condition_variable m_downloadCv;
+    std::vector<std::thread> m_downloadThreads;
+
+    /// Internal: run a background download for a model.  Respects the
+    /// concurrent download semaphore and registers files with StorageManager
+    /// on success.  Called on a detached background thread.
+    void runBackgroundDownload(
+        const std::string &model,
+        const std::string &variant,
+        const ModelInfo &info);
+
+    /// Last load error detail (set in loadLlamaModel, cleared in loadModel).
+    LoadErrorDetail m_lastLoadError;
+
+    /// Buffer for capturing llama.cpp log output during model load.
+    std::ostringstream m_llamaLogCapture;
+    bool m_capturingLlamaLog=false;
+
+    /// Install/remove the llama.cpp log callback that routes to m_llamaLogCapture.
+    void beginLlamaLogCapture();
+    void endLlamaLogCapture();
+
+    /// Analyze captured llama.cpp log to classify the failure reason.
+    LoadErrorDetail classifyLoadFailure(
+        const std::string &llamaLog,
+        const std::string &model,
+        const std::string &filePath,
+        int contextSize) const;
 };
 
 } // namespace arbiterAI
diff --git a/src/arbiterAI/providers/llama.cpp b/src/arbiterAI/providers/llama.cpp
index 163219b..c3b1187 100644
--- a/src/arbiterAI/providers/llama.cpp
+++ b/src/arbiterAI/providers/llama.cpp
@@ -51,9 +51,11 @@ ErrorCode Llama::completion(const CompletionRequest &request,
     std::string resultText;
     int promptTokens=0;
     int completionTokens=0;
+    double promptTimeMs=0.0;
+    double generationTimeMs=0.0;
 
     ErrorCode code=runInference(llamaModel, llamaCtx, request, model,
-        resultText, promptTokens, completionTokens, nullptr);
+        resultText, promptTokens, completionTokens, promptTimeMs, generationTimeMs, nullptr);
 
     std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now();
     double totalTimeMs=std::chrono::duration<double, std::milli>(endTime-startTime).count();
@@ -79,7 +81,11 @@ ErrorCode Llama::completion(const CompletionRequest &request,
         stats.promptTokens=promptTokens;
         stats.completionTokens=completionTokens;
         stats.totalTimeMs=totalTimeMs;
+        stats.promptTimeMs=promptTimeMs;
+        stats.generationTimeMs=generationTimeMs;
         stats.tokensPerSecond=totalTimeMs>0.0?(completionTokens/(totalTimeMs/1000.0)):0.0;
+        stats.promptTokensPerSecond=promptTimeMs>0.0?(promptTokens/(promptTimeMs/1000.0)):0.0;
+        stats.generationTokensPerSecond=generationTimeMs>0.0?(completionTokens/(generationTimeMs/1000.0)):0.0;
         stats.timestamp=std::chrono::system_clock::now();
         TelemetryCollector::instance().recordInference(stats);
     }
@@ -120,9 +126,11 @@ ErrorCode Llama::streamingCompletion(const CompletionRequest &request,
     std::string resultText;
     int promptTokens=0;
     int completionTokens=0;
+    double promptTimeMs=0.0;
+    double generationTimeMs=0.0;
 
     ErrorCode code=runInference(llamaModel, llamaCtx, request, *modelInfo,
-        resultText, promptTokens, completionTokens, callback);
+        resultText, promptTokens, completionTokens, promptTimeMs, generationTimeMs, callback);
 
     std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now();
     double totalTimeMs=std::chrono::duration<double, std::milli>(endTime-startTime).count();
@@ -139,7 +147,11 @@ ErrorCode Llama::streamingCompletion(const CompletionRequest &request,
         stats.promptTokens=promptTokens;
         stats.completionTokens=completionTokens;
         stats.totalTimeMs=totalTimeMs;
+        stats.promptTimeMs=promptTimeMs;
+        stats.generationTimeMs=generationTimeMs;
         stats.tokensPerSecond=totalTimeMs>0.0?(completionTokens/(totalTimeMs/1000.0)):0.0;
+        stats.promptTokensPerSecond=promptTimeMs>0.0?(promptTokens/(promptTimeMs/1000.0)):0.0;
+        stats.generationTokensPerSecond=generationTimeMs>0.0?(completionTokens/(generationTimeMs/1000.0)):0.0;
         stats.timestamp=std::chrono::system_clock::now();
         TelemetryCollector::instance().recordInference(stats);
     }
@@ -331,6 +343,7 @@ std::string Llama::applyTemplate(llama_model *model,
 ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
     const CompletionRequest &request, const ModelInfo &modelInfo,
     std::string &result, int &promptTokens, int &completionTokens,
+    double &promptTimeMs, double &generationTimeMs,
     std::function<void(const std::string &)> streamCallback)
 {
     const llama_vocab *vocab=llama_model_get_vocab(model);
@@ -358,7 +371,7 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
     promptTokens=nTokens;
 
     // Clear KV cache for fresh inference
-    llama_kv_cache_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
 
     llama_batch batch=llama_batch_init(std::max(nTokens, 512), 0, 1);
 
@@ -374,7 +387,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
     }
     batch.logits[batch.n_tokens-1]=1;
 
-    // Process prompt
+    // Process prompt (timed)
+    std::chrono::steady_clock::time_point promptStart=std::chrono::steady_clock::now();
+
     if(llama_decode(ctx, batch)!=0)
     {
         spdlog::error("llama_decode failed during prompt processing");
@@ -382,6 +397,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
         return ErrorCode::GenerationError;
     }
 
+    std::chrono::steady_clock::time_point promptEnd=std::chrono::steady_clock::now();
+    promptTimeMs=std::chrono::duration<double, std::milli>(promptEnd-promptStart).count();
+
     int maxOutputTokens=request.max_tokens.value_or(modelInfo.maxOutputTokens);
     int nCur=nTokens;
     completionTokens=0;
@@ -412,7 +430,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
         llama_sampler_accept(samplerChain, token);
     }
 
-    // Generation loop
+    // Generation loop (timed)
+    std::chrono::steady_clock::time_point genStart=std::chrono::steady_clock::now();
+
     for(int i=0; i<maxOutputTokens; ++i)
     {
         llama_token nextToken=llama_sampler_sample(samplerChain, ctx, -1);
@@ -478,6 +498,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx,
         }
     }
 
+    std::chrono::steady_clock::time_point genEnd=std::chrono::steady_clock::now();
+    generationTimeMs=std::chrono::duration<double, std::milli>(genEnd-genStart).count();
+
     llama_sampler_free(samplerChain);
     llama_batch_free(batch);
 
diff --git a/src/arbiterAI/providers/llama.h b/src/arbiterAI/providers/llama.h
index c683ba1..6366271 100644
--- a/src/arbiterAI/providers/llama.h
+++ b/src/arbiterAI/providers/llama.h
@@ -43,6 +43,7 @@ class Llama : public BaseProvider {
     ErrorCode runInference(llama_model *model, llama_context *ctx,
         const CompletionRequest &request, const ModelInfo &modelInfo,
         std::string &result, int &promptTokens, int &completionTokens,
+        double &promptTimeMs, double &generationTimeMs,
         std::function<void(const std::string &)> streamCallback);
 };
 
diff --git a/src/arbiterAI/storageManager.cpp b/src/arbiterAI/storageManager.cpp
index 5d64b8a..fe9f5e0 100644
--- a/src/arbiterAI/storageManager.cpp
+++ b/src/arbiterAI/storageManager.cpp
@@ -77,6 +77,10 @@ void StorageManager::initialize(const std::filesystem::path &modelsDir)
 {
     std::lock_guard<std::mutex> lock(m_mutex);
 
+    // Clear any existing state from a previous initialize() call
+    m_entries.clear();
+    m_dirty=false;
+
     m_modelsDir=modelsDir;
 
     if(!std::filesystem::exists(m_modelsDir))
@@ -187,7 +191,8 @@ std::vector<DownloadedModelFile> StorageManager::getDownloadedModels() const
 void StorageManager::registerDownload(const std::string &modelName,
     const std::string &variant,
     const std::string &filename,
-    int64_t fileSizeBytes)
+    int64_t fileSizeBytes,
+    const std::vector<std::string> &additionalFiles)
 {
     std::lock_guard<std::mutex> lock(m_mutex);
 
@@ -196,6 +201,7 @@ void StorageManager::registerDownload(const std::string &modelName,
     if(existing)
     {
         existing->filename=filename;
+        existing->additionalFiles=additionalFiles;
         existing->fileSizeBytes=fileSizeBytes;
         existing->downloadedAt=std::chrono::system_clock::now();
         m_dirty=true;
@@ -206,6 +212,7 @@ void StorageManager::registerDownload(const std::string &modelName,
     entry.modelName=modelName;
     entry.variant=variant;
     entry.filename=filename;
+    entry.additionalFiles=additionalFiles;
     entry.fileSizeBytes=fileSizeBytes;
     entry.downloadedAt=std::chrono::system_clock::now();
     entry.lastUsedAt=std::chrono::system_clock::now();
@@ -216,8 +223,16 @@ void StorageManager::registerDownload(const std::string &modelName,
     m_entries.push_back(entry);
     m_dirty=true;
 
-    spdlog::info("StorageManager: registered download {} variant {} ({})",
-        modelName, variant, formatBytes(fileSizeBytes));
+    if(additionalFiles.empty())
+    {
+        spdlog::info("StorageManager: registered download {} variant {} ({})",
+            modelName, variant, formatBytes(fileSizeBytes));
+    }
+    else
+    {
+        spdlog::info("StorageManager: registered download {} variant {} ({}, {} files)",
+            modelName, variant, formatBytes(fileSizeBytes), 1+additionalFiles.size());
+    }
 }
 
 void StorageManager::recordUsage(const std::string &modelName, const std::string &variant)
@@ -267,7 +282,10 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st
         // Delete files and remove entries (in reverse order to preserve indices)
         for(auto it=toRemove.rbegin(); it!=toRemove.rend(); ++it)
         {
-            std::filesystem::path filePath=m_modelsDir/m_entries[*it].filename;
+            ModelFileEntry &entry=m_entries[*it];
+
+            // Delete primary file
+            std::filesystem::path filePath=m_modelsDir/entry.filename;
 
             std::error_code ec;
             if(std::filesystem::exists(filePath, ec))
@@ -280,9 +298,23 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st
                 }
             }
 
-            freedBytes+=m_entries[*it].fileSizeBytes;
+            // Delete additional shard files
+            for(const std::string &extraFile:entry.additionalFiles)
+            {
+                std::filesystem::path extraPath=m_modelsDir/extraFile;
+                if(std::filesystem::exists(extraPath, ec))
+                {
+                    std::filesystem::remove(extraPath, ec);
+                    if(ec)
+                    {
+                        spdlog::warn("StorageManager: failed to delete shard {}: {}", extraPath.string(), ec.message());
+                    }
+                }
+            }
+
+            freedBytes+=entry.fileSizeBytes;
             spdlog::info("StorageManager: deleted {} variant {} ({})",
-                m_entries[*it].modelName, m_entries[*it].variant, formatBytes(m_entries[*it].fileSizeBytes));
+                entry.modelName, entry.variant, formatBytes(entry.fileSizeBytes));
             m_entries.erase(m_entries.begin()+static_cast<ptrdiff_t>(*it));
         }
     }
@@ -299,6 +331,7 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st
                     return false;
                 }
 
+                // Delete primary file
                 std::filesystem::path filePath=m_modelsDir/it->filename;
 
                 std::error_code ec;
@@ -312,6 +345,20 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st
                     }
                 }
 
+                // Delete additional shard files
+                for(const std::string &extraFile:it->additionalFiles)
+                {
+                    std::filesystem::path extraPath=m_modelsDir/extraFile;
+                    if(std::filesystem::exists(extraPath, ec))
+                    {
+                        std::filesystem::remove(extraPath, ec);
+                        if(ec)
+                        {
+                            spdlog::warn("StorageManager: failed to delete shard {}: {}", extraPath.string(), ec.message());
+                        }
+                    }
+                }
+
                 freedBytes=it->fileSizeBytes;
                 spdlog::info("StorageManager: deleted {} variant {} ({})",
                     modelName, variant, formatBytes(it->fileSizeBytes));
@@ -439,7 +486,7 @@ void StorageManager::scanModelsDirectory()
             continue;
         }
 
-        // Check if already tracked
+        // Check if already tracked (as primary or additional file)
         bool found=false;
         for(const ModelFileEntry &entry:m_entries)
         {
@@ -448,6 +495,15 @@ void StorageManager::scanModelsDirectory()
                 found=true;
                 break;
             }
+            for(const std::string &extra:entry.additionalFiles)
+            {
+                if(extra==filename)
+                {
+                    found=true;
+                    break;
+                }
+            }
+            if(found) break;
         }
 
         if(!found)
@@ -653,6 +709,10 @@ void StorageManager::loadUsageData()
                 entry.modelName=m.value("model", "");
                 entry.variant=m.value("variant", "");
                 entry.filename=m.value("filename", "");
+                if(m.contains("additional_files")&&m["additional_files"].is_array())
+                {
+                    entry.additionalFiles=m["additional_files"].get<std::vector<std::string>>();
+                }
                 entry.fileSizeBytes=m.value("file_size_bytes", int64_t(0));
                 entry.downloadedAt=isoToTimePoint(m.value("downloaded_at", ""));
                 entry.lastUsedAt=isoToTimePoint(m.value("last_used_at", ""));
@@ -660,10 +720,29 @@ void StorageManager::loadUsageData()
                 entry.hotReady=m.value("hot_ready", false);
                 entry.isProtected=m.value("protected", false);
 
-                if(!entry.filename.empty())
+                if(entry.filename.empty())
                 {
-                    m_entries.push_back(entry);
+                    continue;
                 }
+
+                // Deduplicate: skip if we already have an entry with the same filename
+                bool duplicate=false;
+                for(const ModelFileEntry &existing:m_entries)
+                {
+                    if(existing.filename==entry.filename)
+                    {
+                        duplicate=true;
+                        break;
+                    }
+                }
+
+                if(duplicate)
+                {
+                    spdlog::warn("StorageManager: skipping duplicate entry for file: {}", entry.filename);
+                    continue;
+                }
+
+                m_entries.push_back(entry);
             }
         }
 
@@ -704,6 +783,10 @@ void StorageManager::saveUsageData() const
         m["model"]=entry.modelName;
         m["variant"]=entry.variant;
         m["filename"]=entry.filename;
+        if(!entry.additionalFiles.empty())
+        {
+            m["additional_files"]=entry.additionalFiles;
+        }
         m["file_size_bytes"]=entry.fileSizeBytes;
         m["downloaded_at"]=timePointToIso(entry.downloadedAt);
         m["last_used_at"]=timePointToIso(entry.lastUsedAt);
@@ -740,13 +823,13 @@ std::vector<CleanupCandidate> StorageManager::collectCleanupCandidates() const
         if(m_cleanupPolicy.respectHotReady&&entry.hotReady) continue;
         if(m_cleanupPolicy.respectProtected&&entry.isProtected) continue;
 
-        // Skip entries that are currently Loaded or Ready in ModelRuntime
+        // Skip entries that are currently Loaded, Ready, or Downloading in ModelRuntime
         // Note: we don't hold ModelRuntime's lock here, so this is a best-effort check
         std::optional<LoadedModel> runtimeState=ModelRuntime::instance().getModelState(entry.modelName);
         if(runtimeState.has_value())
         {
             ModelState state=runtimeState->state;
-            if(state==ModelState::Loaded||state==ModelState::Ready)
+            if(state==ModelState::Loaded||state==ModelState::Ready||state==ModelState::Downloading)
             {
                 continue;
             }
@@ -785,6 +868,7 @@ DownloadedModelFile StorageManager::entryToPublic(const ModelFileEntry &entry) c
     f.modelName=entry.modelName;
     f.variant=entry.variant;
     f.filename=entry.filename;
+    f.additionalFiles=entry.additionalFiles;
     f.filePath=m_modelsDir/entry.filename;
     f.fileSizeBytes=entry.fileSizeBytes;
     f.downloadedAt=entry.downloadedAt;
diff --git a/src/arbiterAI/storageManager.h b/src/arbiterAI/storageManager.h
index dcb9fea..6718549 100644
--- a/src/arbiterAI/storageManager.h
+++ b/src/arbiterAI/storageManager.h
@@ -27,7 +27,8 @@ struct StorageInfo {
 struct DownloadedModelFile {
     std::string modelName;
     std::string variant;             // quantization (e.g., "Q4_K_M")
-    std::string filename;
+    std::string filename;            // primary filename (first shard / single file)
+    std::vector<std::string> additionalFiles; // extra shard filenames for split GGUF models
     std::filesystem::path filePath;
     int64_t fileSizeBytes=0;
     std::chrono::system_clock::time_point downloadedAt;
@@ -84,10 +85,12 @@ class StorageManager {
     std::vector<DownloadedModelFile> getDownloadedModels() const;
 
     /// Register a completed download (updates inventory).
+    /// @param additionalFiles Extra shard filenames for split GGUF models (empty for single-file).
     void registerDownload(const std::string &modelName,
         const std::string &variant,
         const std::string &filename,
-        int64_t fileSizeBytes);
+        int64_t fileSizeBytes,
+        const std::vector<std::string> &additionalFiles={});
 
     /// Record a model usage event (inference served).
     void recordUsage(const std::string &modelName, const std::string &variant);
@@ -152,6 +155,7 @@ class StorageManager {
         std::string modelName;
         std::string variant;
         std::string filename;
+        std::vector<std::string> additionalFiles; // extra shard filenames for split GGUF
         int64_t fileSizeBytes=0;
         std::chrono::system_clock::time_point downloadedAt;
         std::chrono::system_clock::time_point lastUsedAt;
diff --git a/src/arbiterAI/telemetryCollector.cpp b/src/arbiterAI/telemetryCollector.cpp
index 989cddd..cd5266e 100644
--- a/src/arbiterAI/telemetryCollector.cpp
+++ b/src/arbiterAI/telemetryCollector.cpp
@@ -70,6 +70,32 @@ SystemSnapshot TelemetryCollector::getSnapshot() const
     snapshot.avgTokensPerSecond=getAvgTokensPerSecond();
     snapshot.activeRequests=ModelRuntime::instance().isInferenceActive()?1:0;
 
+    // Calculate average prompt/generation speeds over last 5 minutes
+    std::chrono::system_clock::time_point cutoff=
+        std::chrono::system_clock::now()-std::chrono::minutes(5);
+    double promptSum=0.0, genSum=0.0;
+    int promptCount=0, genCount=0;
+
+    for(const InferenceStats &stat:m_inferenceHistory)
+    {
+        if(stat.timestamp>=cutoff)
+        {
+            if(stat.promptTokensPerSecond>0.0)
+            {
+                promptSum+=stat.promptTokensPerSecond;
+                promptCount++;
+            }
+            if(stat.generationTokensPerSecond>0.0)
+            {
+                genSum+=stat.generationTokensPerSecond;
+                genCount++;
+            }
+        }
+    }
+
+    snapshot.avgPromptTokensPerSecond=promptCount>0?(promptSum/promptCount):0.0;
+    snapshot.avgGenerationTokensPerSecond=genCount>0?(genSum/genCount):0.0;
+
     return snapshot;
 }
 
diff --git a/src/arbiterAI/telemetryCollector.h b/src/arbiterAI/telemetryCollector.h
index 350bf17..625edcd 100644
--- a/src/arbiterAI/telemetryCollector.h
+++ b/src/arbiterAI/telemetryCollector.h
@@ -17,10 +17,14 @@ struct InferenceStats {
     std::string model;
     std::string variant;
     double tokensPerSecond=0.0;
+    double promptTokensPerSecond=0.0;     // prompt processing speed (tokens in / sec)
+    double generationTokensPerSecond=0.0; // generation speed (tokens out / sec)
     int promptTokens=0;
     int completionTokens=0;
     double latencyMs=0.0;      // time to first token
     double totalTimeMs=0.0;    // total request time
+    double promptTimeMs=0.0;   // time spent processing prompt
+    double generationTimeMs=0.0; // time spent generating tokens
     std::chrono::system_clock::time_point timestamp;
 };
 
@@ -35,6 +39,8 @@ struct SystemSnapshot {
     SystemInfo hardware;
     std::vector<LoadedModel> models;
     double avgTokensPerSecond=0.0;
+    double avgPromptTokensPerSecond=0.0;
+    double avgGenerationTokensPerSecond=0.0;
     int activeRequests=0;
 };
 
diff --git a/src/arbiterAI/version.h.in b/src/arbiterAI/version.h.in
index 88ec987..3887fc9 100644
--- a/src/arbiterAI/version.h.in
+++ b/src/arbiterAI/version.h.in
@@ -15,4 +15,6 @@
 #define ARBITERAI_VERSION_PATCH @PROJECT_VERSION_PATCH@
 #define ARBITERAI_VERSION_STRING "@PROJECT_VERSION@"
 
+#define ARBITERAI_LLAMACPP_BUILD "@ARBITERAI_LLAMACPP_BUILD@"
+
 #endif//_arbiterAI_version_h_
diff --git a/src/server/dashboard.h b/src/server/dashboard.h
index fc9b3e6..d68ed32 100644
--- a/src/server/dashboard.h
+++ b/src/server/dashboard.h
@@ -314,10 +314,42 @@ td
 {
     background: linear-gradient(90deg, #4caf50, #66cc66);
 }
+.cpu-bar-fill
+{
+    background: linear-gradient(90deg, #f0a030, #f0c040);
+}
 .gpu-row
 {
     margin-bottom: 8px;
 }
+.heap-tooltip-wrap
+{
+    position: relative;
+    display: inline;
+    cursor: help;
+    border-bottom: 1px dotted #666;
+}
+.heap-tooltip-wrap .heap-tooltip
+{
+    display: none;
+    position: absolute;
+    bottom: 120%;
+    right: 0;
+    background: #1a1d2e;
+    border: 1px solid #2a2d3e;
+    border-radius: 6px;
+    padding: 8px 10px;
+    white-space: nowrap;
+    font-size: 11px;
+    color: #ccc;
+    z-index: 100;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.4);
+    line-height: 1.6;
+}
+.heap-tooltip-wrap:hover .heap-tooltip
+{
+    display: block;
+}
 .gpu-label
 {
     display: flex;
@@ -332,6 +364,18 @@ td
     fill: none;
     stroke-width: 2;
 }
+#promptChart
+{
+    stroke: #4caf50;
+    fill: none;
+    stroke-width: 2;
+}
+#genChart
+{
+    stroke: #f0c040;
+    fill: none;
+    stroke-width: 2;
+}
 .chart-grid line
 {
     stroke: #1f2230;
@@ -342,19 +386,138 @@ td
     fill: #666;
     font-size: 10px;
 }
+.chart-legend
+{
+    display: flex;
+    gap: 16px;
+    margin-top: 6px;
+    font-size: 11px;
+    color: #999;
+}
+.chart-legend-dot
+{
+    display: inline-block;
+    width: 10px;
+    height: 10px;
+    border-radius: 2px;
+    margin-right: 4px;
+    vertical-align: middle;
+}
+.log-panel
+{
+    background: #0d0f14;
+    border: 1px solid #2a2d3a;
+    border-radius: 6px;
+    font-family: 'Menlo', 'Consolas', 'Courier New', monospace;
+    font-size: 12px;
+    line-height: 1.5;
+    max-height: 400px;
+    overflow-y: auto;
+    padding: 8px 0;
+}
+.log-panel.collapsed
+{
+    max-height: 0;
+    padding: 0;
+    border: none;
+    overflow: hidden;
+}
+.log-entry
+{
+    padding: 1px 12px;
+    white-space: pre-wrap;
+    word-break: break-all;
+}
+.log-entry:hover
+{
+    background: #161922;
+}
+.log-ts
+{
+    color: #555;
+    margin-right: 8px;
+}
+.log-level
+{
+    display: inline-block;
+    width: 52px;
+    font-weight: 600;
+    margin-right: 6px;
+}
+.log-level-trace { color: #666; }
+.log-level-debug { color: #888; }
+.log-level-info  { color: #4caf50; }
+.log-level-warning { color: #f0c040; }
+.log-level-error { color: #ff4444; }
+.log-level-critical { color: #ff2222; font-weight: 800; }
+.log-msg
+{
+    color: #ccc;
+}
+.log-toolbar
+{
+    display: flex;
+    align-items: center;
+    gap: 8px;
+    margin-bottom: 8px;
+}
+.log-toolbar select
+{
+    background: #1a1d27;
+    color: #ccc;
+    border: 1px solid #2a2d3a;
+    border-radius: 4px;
+    padding: 4px 8px;
+    font-size: 12px;
+}
+.log-toolbar label
+{
+    color: #888;
+    font-size: 12px;
+    display: flex;
+    align-items: center;
+    gap: 4px;
+}
+.log-toolbar input[type="checkbox"]
+{
+    accent-color: #7c8aff;
+}
+.card-header-row
+{
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    margin-bottom: 12px;
+    cursor: pointer;
+}
+.card-header-row h2
+{
+    margin-bottom: 0;
+}
+.collapse-icon
+{
+    color: #888;
+    font-size: 16px;
+    transition: transform 0.2s;
+    user-select: none;
+}
+.collapse-icon.open
+{
+    transform: rotate(180deg);
+}
 </style>
 </head>
 <body>
 <div class="header">
-    <h1>ArbiterAI Dashboard <span id="versionBadge" class="version-badge"></span></h1>
+    <h1>ArbiterAI Dashboard <span id="versionBadge" class="version-badge"></span><span id="llamaBadge" class="version-badge"></span></h1>
     <div class="status"><span class="dot" id="statusDot"></span><span id="statusText">Connected</span></div>
 </div>
 <div class="container">
     <div class="grid">
         <div class="card">
             <h2>Performance</h2>
-            <div class="stat-big" id="avgTps">0.0</div>
-            <div style="color:#888;font-size:12px;margin-bottom:8px;">tokens/sec (avg)</div>
+            <div class="stat-row"><span class="stat-label">Prompt Speed</span><span class="stat-value" id="avgPromptTps">0.0 t/s</span></div>
+            <div class="stat-row"><span class="stat-label">Generation Speed</span><span class="stat-value" id="avgGenTps">0.0 t/s</span></div>
             <div class="stat-row"><span class="stat-label">Active Requests</span><span class="stat-value" id="activeReqs">0</span></div>
             <div class="stat-row"><span class="stat-label">Total Inferences</span><span class="stat-value" id="totalInferences">0</span></div>
             <div class="stat-row"><span class="stat-label">Model Swaps</span><span class="stat-value" id="totalSwaps">0</span></div>
@@ -364,16 +527,24 @@ td
             <div class="stat-row"><span class="stat-label">Total RAM</span><span class="stat-value" id="totalRam">-</span></div>
             <div class="stat-row"><span class="stat-label">Free RAM</span><span class="stat-value" id="freeRam">-</span></div>
             <div class="stat-row"><span class="stat-label">CPU Cores</span><span class="stat-value" id="cpuCores">-</span></div>
-            <div class="stat-row"><span class="stat-label">CPU Usage</span><span class="stat-value" id="cpuUsage">-</span></div>
+            <div class="gpu-label" style="margin-top:6px;"><span>CPU Usage</span><span id="cpuUsage">-</span></div>
+            <div class="gpu-bar"><div class="gpu-bar-fill cpu-bar-fill" id="cpuBar" style="width:0%"></div></div>
         </div>
         <div class="card">
-            <h2>Tokens/sec Over Time</h2>
+            <h2>Processing Speed Over Time</h2>
             <div class="chart-container">
-                <svg class="chart-svg" id="tpsChartSvg" viewBox="0 0 400 120" preserveAspectRatio="none">
+                <svg class="chart-svg" id="tpsChartSvg" viewBox="0 0 430 120" preserveAspectRatio="none">
                     <g class="chart-grid" id="chartGrid"></g>
-                    <polyline id="tpsChart" points=""></polyline>
+                    <g id="chartYLabelsLeft"></g>
+                    <g id="chartYLabelsRight"></g>
+                    <polyline id="promptChart" points=""></polyline>
+                    <polyline id="genChart" points=""></polyline>
                 </svg>
             </div>
+            <div class="chart-legend">
+                <span><span class="chart-legend-dot" style="background:#4caf50;"></span>Prompt (tokens in/s)</span>
+                <span><span class="chart-legend-dot" style="background:#f0c040;"></span>Generation (tokens out/s)</span>
+            </div>
         </div>
     </div>
     <div class="grid">
@@ -391,6 +562,7 @@ td
                     <th>Variant</th>
                     <th>State</th>
                     <th>Context</th>
+                    <th>Max Context</th>
                     <th>VRAM (MB)</th>
                     <th>RAM (MB)</th>
                     <th>Pinned</th>
@@ -443,9 +615,9 @@ td
         <div class="card">
             <h2>Recent Inferences</h2>
             <table>
-                <thead><tr><th>Model</th><th>TPS</th><th>Prompt</th><th>Completion</th><th>Latency</th></tr></thead>
+                <thead><tr><th>Model</th><th>Prompt t/s</th><th>Gen t/s</th><th>Prompt</th><th>Completion</th><th>Latency</th></tr></thead>
                 <tbody id="inferenceTable">
-                    <tr><td colspan="5" style="color:#666;text-align:center;">No recent inferences</td></tr>
+                    <tr><td colspan="6" style="color:#666;text-align:center;">No recent inferences</td></tr>
                 </tbody>
             </table>
         </div>
@@ -459,13 +631,118 @@ td
             </table>
         </div>
     </div>
+    <div class="card" style="margin-bottom:20px;">
+        <div class="card-header-row" onclick="toggleLogPanel()">
+            <h2>Server Log</h2>
+            <span class="collapse-icon open" id="logCollapseIcon">&#x25BC;</span>
+        </div>
+        <div class="log-toolbar" id="logToolbar">
+            <select id="logLevelFilter" onchange="refreshLogs(true)">
+                <option value="">All levels</option>
+                <option value="trace">Trace</option>
+                <option value="debug">Debug</option>
+                <option value="info">Info</option>
+                <option value="warning">Warning</option>
+                <option value="error">Error</option>
+                <option value="critical">Critical</option>
+            </select>
+            <label><input type="checkbox" id="logAutoScroll" checked onchange="refreshLogs(true)"> Live</label>
+        </div>
+        <div class="log-panel" id="logPanel">
+            <div class="log-entry" style="color:#666;">Loading logs...</div>
+        </div>
+    </div>
 </div>
 <script>
 const POLL_INTERVAL=2000;
 const FAST_POLL_INTERVAL=1000;
-let tpsHistory=[];
+let promptTpsHistory=[];
+let genTpsHistory=[];
 const MAX_TPS_POINTS=60;
 let hasActiveDownloads=false;
+let logPanelOpen=true;
+let lastLogEpoch=0;
+
+function toggleLogPanel()
+{
+    logPanelOpen=!logPanelOpen;
+    const panel=document.getElementById("logPanel");
+    const toolbar=document.getElementById("logToolbar");
+    const icon=document.getElementById("logCollapseIcon");
+
+    if(logPanelOpen)
+    {
+        panel.classList.remove("collapsed");
+        toolbar.style.display="flex";
+        icon.classList.add("open");
+    }
+    else
+    {
+        panel.classList.add("collapsed");
+        toolbar.style.display="none";
+        icon.classList.remove("open");
+    }
+}
+
+function logLevelClass(level)
+{
+    const map={"trace":"trace", "debug":"debug", "info":"info", "warning":"warning", "error":"error", "critical":"critical"};
+    return "log-level-"+(map[level]||"info");
+}
+
+function formatLogTime(isoStr)
+{
+    if(!isoStr) return "";
+    const d=new Date(isoStr);
+    const h=String(d.getHours()).padStart(2, "0");
+    const m=String(d.getMinutes()).padStart(2, "0");
+    const s=String(d.getSeconds()).padStart(2, "0");
+    const ms=String(d.getMilliseconds()).padStart(3, "0");
+    return h+":"+m+":"+s+"."+ms;
+}
+
+function escapeHtml(text)
+{
+    const el=document.createElement("span");
+    el.textContent=text;
+    return el.innerHTML;
+}
+
+async function refreshLogs(force)
+{
+    if(!logPanelOpen) return;
+
+    const autoScroll=document.getElementById("logAutoScroll").checked;
+
+    // When live mode is off, freeze the view entirely so the user
+    // can select and copy text without the DOM changing underneath.
+    // A forced refresh (filter change, re-enabling live) always runs.
+    if(!autoScroll&&!force) return;
+
+    const level=document.getElementById("logLevelFilter").value;
+    let url="/api/logs?count=200";
+    if(level) url+="&level="+encodeURIComponent(level);
+
+    const data=await fetchJson(url);
+    if(!data||!data.logs) return;
+
+    const panel=document.getElementById("logPanel");
+
+    let html="";
+    for(const entry of data.logs)
+    {
+        const lvl=entry.level||"info";
+        html+=`<div class="log-entry"><span class="log-ts">${formatLogTime(entry.timestamp)}</span><span class="log-level ${logLevelClass(lvl)}">${lvl.toUpperCase().padEnd(8)}</span><span class="log-msg">${escapeHtml(entry.message)}</span></div>`;
+    }
+
+    if(data.logs.length===0)
+    {
+        html='<div class="log-entry" style="color:#666;">No log entries</div>';
+    }
+
+    panel.innerHTML=html;
+    panel.scrollTop=panel.scrollHeight;
+}
 
 function formatMb(mb)
 {
@@ -506,40 +783,166 @@ async function modelAction(name, action)
     }
 }
 
+async function promptVramOverride(gpuIndex, currentMb)
+{
+    const input=prompt("Enter VRAM override in MB for GPU "+gpuIndex+" (current: "+currentMb+" MB):", currentMb);
+
+    if(input===null) return;
+
+    const vramMb=parseInt(input, 10);
+
+    if(isNaN(vramMb)||vramMb<=0)
+    {
+        alert("Invalid value. Please enter a positive integer in MB.");
+        return;
+    }
+
+    try
+    {
+        const res=await fetch("/api/hardware/vram-override", {
+            method: "POST",
+            headers: {"Content-Type": "application/json"},
+            body: JSON.stringify({gpu_index: gpuIndex, vram_mb: vramMb})
+        });
+
+        if(!res.ok)
+        {
+            const err=await res.json();
+            alert("Failed: "+(err.error||"Unknown error"));
+            return;
+        }
+        await refresh();
+    }
+    catch(e)
+    {
+        console.error("VRAM override failed:", e);
+    }
+}
+
+async function clearVramOverride(gpuIndex)
+{
+    try
+    {
+        const res=await fetch("/api/hardware/vram-override/"+gpuIndex, {method: "DELETE"});
+
+        if(!res.ok)
+        {
+            const err=await res.json();
+            alert("Failed: "+(err.error||"Unknown error"));
+            return;
+        }
+        await refresh();
+    }
+    catch(e)
+    {
+        console.error("Clear VRAM override failed:", e);
+    }
+}
+
 function updateTpsChart()
 {
-    const svg=document.getElementById("tpsChartSvg");
-    const polyline=document.getElementById("tpsChart");
+    const promptLine=document.getElementById("promptChart");
+    const genLine=document.getElementById("genChart");
     const grid=document.getElementById("chartGrid");
+    const yLabelsL=document.getElementById("chartYLabelsLeft");
+    const yLabelsR=document.getElementById("chartYLabelsRight");
+
+    const nPoints=Math.max(promptTpsHistory.length, genTpsHistory.length);
 
-    if(tpsHistory.length<2)
+    if(nPoints<2)
     {
-        polyline.setAttribute("points", "");
+        promptLine.setAttribute("points", "");
+        genLine.setAttribute("points", "");
         return;
     }
 
-    const w=400, h=120, pad=5;
-    const maxTps=Math.max(...tpsHistory, 1);
+    const w=430, h=120, padL=32, padR=32, padT=5, padB=5;
+
+    const maxPrompt=Math.max(...promptTpsHistory, 1);
+    const maxGen=Math.max(...genTpsHistory, 1);
+    const niceMaxPrompt=niceNum(maxPrompt);
+    const niceMaxGen=niceNum(maxGen);
 
-    let points=[];
-    for(let i=0; i<tpsHistory.length; i++)
+    function buildPoints(data, niceMax)
     {
-        const x=pad+(i/(Math.max(tpsHistory.length-1, 1)))*(w-2*pad);
-        const y=h-pad-(tpsHistory[i]/maxTps)*(h-2*pad);
-        points.push(x.toFixed(1)+","+y.toFixed(1));
+        let pts=[];
+        for(let i=0; i<data.length; i++)
+        {
+            const x=padL+(i/(Math.max(data.length-1, 1)))*(w-padL-padR);
+            const y=h-padB-(data[i]/niceMax)*(h-padT-padB);
+            pts.push(x.toFixed(1)+","+y.toFixed(1));
+        }
+        return pts.join(" ");
     }
-    polyline.setAttribute("points", points.join(" "));
 
+    promptLine.setAttribute("points", buildPoints(promptTpsHistory, niceMaxPrompt));
+    genLine.setAttribute("points", buildPoints(genTpsHistory, niceMaxGen));
+
+    // Grid lines (based on prompt/left axis)
     grid.innerHTML="";
-    for(let i=0; i<=4; i++)
+    yLabelsL.innerHTML="";
+    yLabelsR.innerHTML="";
+    const steps=4;
+
+    for(let i=0; i<=steps; i++)
     {
-        const y=pad+i*((h-2*pad)/4);
-        const val=((maxTps*(4-i)/4)).toFixed(0);
-        grid.innerHTML+=`<line x1="${pad}" y1="${y}" x2="${w-pad}" y2="${y}"/>`;
-        grid.innerHTML+=`<text class="chart-label" x="${w-pad+4}" y="${y+3}">${val}</text>`;
+        const y=padT+i*((h-padT-padB)/steps);
+        const promptVal=((niceMaxPrompt*(steps-i)/steps)).toFixed(0);
+        const genVal=((niceMaxGen*(steps-i)/steps)).toFixed(0);
+        grid.innerHTML+=`<line x1="${padL}" y1="${y}" x2="${w-padR}" y2="${y}"/>`;
+        yLabelsL.innerHTML+=`<text class="chart-label" x="${padL-4}" y="${y+3}" text-anchor="end" fill="#4caf50">${promptVal}</text>`;
+        yLabelsR.innerHTML+=`<text class="chart-label" x="${w-padR+4}" y="${y+3}" text-anchor="start" fill="#f0c040">${genVal}</text>`;
     }
 }
 
+function niceNum(val)
+{
+    if(val<=0) return 1;
+    const exp=Math.floor(Math.log10(val));
+    const frac=val/Math.pow(10, exp);
+    let nice;
+
+    if(frac<=1.5) nice=1.5;
+    else if(frac<=2) nice=2;
+    else if(frac<=3) nice=3;
+    else if(frac<=5) nice=5;
+    else nice=10;
+
+    return nice*Math.pow(10, exp);
+}
+
+function buildHeapTooltip(gpu)
+{
+    const heaps=gpu.memory_heaps;
+
+    if(!heaps||heaps.length===0) return "";
+
+    let lines=[];
+
+    if(gpu.has_memory_budget)
+    {
+        lines.push("<b>Vulkan Memory Heaps (VK_EXT_memory_budget)</b>");
+        for(const h of heaps)
+        {
+            const tag=h.device_local?"DEVICE_LOCAL":"host";
+            const avail=h.budget_mb-h.usage_mb;
+            lines.push(`Heap ${h.index} [${tag}]: size ${formatMb(h.size_mb)}, budget ${formatMb(h.budget_mb)}, used ${formatMb(h.usage_mb)}, avail ${formatMb(avail>0?avail:0)}`);
+        }
+    }
+    else
+    {
+        lines.push("<b>Vulkan Memory Heaps</b>");
+        for(const h of heaps)
+        {
+            const tag=h.device_local?"DEVICE_LOCAL":"host";
+            lines.push(`Heap ${h.index} [${tag}]: size ${formatMb(h.size_mb)}`);
+        }
+        lines.push("<i>VK_EXT_memory_budget not available</i>");
+    }
+
+    return lines.join("<br>");
+}
+
 function renderGpus(gpus)
 {
     const el=document.getElementById("gpuList");
@@ -571,11 +974,39 @@ function renderGpus(gpus)
             memPct=memTotal>0?(memUsed/memTotal*100):0;
         }
 
+        const hasUtilData=(gpu.backend==="CUDA");
+        let utilHtml;
+
+        if(hasUtilData)
+        {
+            utilHtml=`<div class="gpu-label"><span>Utilization</span><span>${utilPct.toFixed(0)}%</span></div>
+            <div class="gpu-bar"><div class="gpu-bar-fill gpu-bar-util" style="width:${utilPct.toFixed(1)}%"></div></div>`;
+        }
+        else
+        {
+            utilHtml=`<div class="gpu-label"><span>Utilization</span><span style="color:#555;">N/A (${gpu.backend})</span></div>`;
+        }
+
+        const tooltip=buildHeapTooltip(gpu);
+        let memSpan;
+
+        if(tooltip)
+        {
+            memSpan=`<span class="heap-tooltip-wrap">${memLabel}: ${formatMb(memUsed)} / ${formatMb(memTotal)}<span class="heap-tooltip">${tooltip}</span></span>`;
+        }
+        else
+        {
+            memSpan=`<span>${memLabel}: ${formatMb(memUsed)} / ${formatMb(memTotal)}</span>`;
+        }
+
+        const overrideTag=gpu.vram_overridden?` <span style="color:#f0a;font-size:0.8em;" title="VRAM overridden by user">⚙ Override</span>`:"";
+        const overrideBtn=`<button class="btn" style="font-size:0.7em;padding:2px 6px;margin-left:6px;" onclick="promptVramOverride(${gpu.index}, ${memTotal})" title="Override reported VRAM">✏</button>`;
+        const clearBtn=gpu.vram_overridden?`<button class="btn btn-danger" style="font-size:0.7em;padding:2px 6px;margin-left:2px;" onclick="clearVramOverride(${gpu.index})" title="Clear VRAM override">✕</button>`:"";
+
         html+=`<div class="gpu-row">
-            <div class="gpu-label"><span>${gpu.name} (${gpu.backend})${gpu.unified_memory?" ⚡ Unified":"" }</span><span>${memLabel}: ${formatMb(memUsed)} / ${formatMb(memTotal)}</span></div>
+            <div class="gpu-label"><span>${gpu.name} (${gpu.backend})${gpu.unified_memory?" ⚡ Unified":""}${overrideTag}${overrideBtn}${clearBtn}</span>${memSpan}</div>
             <div class="gpu-bar"><div class="gpu-bar-fill gpu-bar-vram" style="width:${memPct.toFixed(1)}%"></div></div>
-            <div class="gpu-label"><span>Utilization</span><span>${utilPct.toFixed(0)}%</span></div>
-            <div class="gpu-bar"><div class="gpu-bar-fill gpu-bar-util" style="width:${utilPct.toFixed(1)}%"></div></div>
+            ${utilHtml}
         </div>`;
     }
     el.innerHTML=html;
@@ -587,7 +1018,7 @@ function renderModels(models)
 
     if(!models||models.length===0)
     {
-        el.innerHTML='<tr><td colspan="8" style="color:#666;text-align:center;">No models loaded</td></tr>';
+        el.innerHTML='<tr><td colspan="9" style="color:#666;text-align:center;">No models loaded</td></tr>';
         return;
     }
 
@@ -600,11 +1031,15 @@ function renderModels(models)
         if(!m.pinned) actions.push(`<button class="btn" onclick="modelAction('${m.model}','pin')">Pin</button>`);
         else actions.push(`<button class="btn" onclick="modelAction('${m.model}','unpin')">Unpin</button>`);
 
+        const ctxDisplay=m.context_size? m.context_size.toLocaleString() : "-";
+        const maxCtxDisplay=m.max_context_size? m.max_context_size.toLocaleString() : "-";
+
         html+=`<tr>
             <td>${m.model}</td>
             <td>${m.variant||"-"}</td>
             <td><span class="badge ${stateClass(m.state)}">${m.state}</span></td>
-            <td>${m.context_size||"-"}</td>
+            <td>${ctxDisplay}</td>
+            <td>${maxCtxDisplay}</td>
             <td>${m.vram_usage_mb||0}</td>
             <td>${m.ram_usage_mb||0}</td>
             <td>${m.pinned?"Yes":"No"}</td>
@@ -620,7 +1055,7 @@ function renderInferences(history)
 
     if(!history||history.length===0)
     {
-        el.innerHTML='<tr><td colspan="5" style="color:#666;text-align:center;">No recent inferences</td></tr>';
+        el.innerHTML='<tr><td colspan="6" style="color:#666;text-align:center;">No recent inferences</td></tr>';
         return;
     }
 
@@ -628,9 +1063,13 @@ function renderInferences(history)
     let html="";
     for(const s of recent)
     {
+        const promptTps=s.prompt_tokens_per_second||0;
+        const genTps=s.generation_tokens_per_second||0;
+
         html+=`<tr>
             <td>${s.model}</td>
-            <td>${s.tokens_per_second.toFixed(1)}</td>
+            <td>${promptTps.toFixed(1)}</td>
+            <td>${genTps.toFixed(1)}</td>
             <td>${s.prompt_tokens}</td>
             <td>${s.completion_tokens}</td>
             <td>${s.latency_ms.toFixed(0)}ms</td>
@@ -857,7 +1296,8 @@ async function refresh()
     statusText.textContent="Connected";
 
     // Performance card
-    document.getElementById("avgTps").textContent=(stats.avg_tokens_per_second||0).toFixed(1);
+    document.getElementById("avgPromptTps").textContent=(stats.avg_prompt_tokens_per_second||0).toFixed(1)+" t/s";
+    document.getElementById("avgGenTps").textContent=(stats.avg_generation_tokens_per_second||0).toFixed(1)+" t/s";
     document.getElementById("activeReqs").textContent=stats.active_requests||0;
     document.getElementById("totalInferences").textContent=history?history.length:0;
     document.getElementById("totalSwaps").textContent=swaps?swaps.length:0;
@@ -869,13 +1309,16 @@ async function refresh()
         document.getElementById("freeRam").textContent=formatMb(stats.hardware.free_ram_mb);
         document.getElementById("cpuCores").textContent=stats.hardware.cpu_cores;
         document.getElementById("cpuUsage").textContent=stats.hardware.cpu_utilization_percent.toFixed(1)+"%";
+        document.getElementById("cpuBar").style.width=stats.hardware.cpu_utilization_percent.toFixed(1)+"%";
     }
 
     // TPS chart
-    if(stats.avg_tokens_per_second!==undefined)
+    if(stats.avg_prompt_tokens_per_second!==undefined||stats.avg_generation_tokens_per_second!==undefined)
     {
-        tpsHistory.push(stats.avg_tokens_per_second||0);
-        if(tpsHistory.length>MAX_TPS_POINTS) tpsHistory.shift();
+        promptTpsHistory.push(stats.avg_prompt_tokens_per_second||0);
+        genTpsHistory.push(stats.avg_generation_tokens_per_second||0);
+        if(promptTpsHistory.length>MAX_TPS_POINTS) promptTpsHistory.shift();
+        if(genTpsHistory.length>MAX_TPS_POINTS) genTpsHistory.shift();
         updateTpsChart();
     }
 
@@ -902,12 +1345,18 @@ async function loadVersion()
     if(data)
     {
         document.getElementById("versionBadge").textContent="v"+data.version;
+        if(data.llamaCppBuild)
+        {
+            document.getElementById("llamaBadge").textContent="llama.cpp "+data.llamaCppBuild;
+        }
     }
 }
 
 loadVersion();
 refresh();
+refreshLogs();
 setInterval(refresh, POLL_INTERVAL);
+setInterval(refreshLogs, POLL_INTERVAL);
 </script>
 </body>
 </html>)HTML";
diff --git a/src/server/logBuffer.h b/src/server/logBuffer.h
new file mode 100644
index 0000000..45dd4fc
--- /dev/null
+++ b/src/server/logBuffer.h
@@ -0,0 +1,87 @@
+#ifndef _ARBITERAI_SERVER_LOGBUFFER_H_
+#define _ARBITERAI_SERVER_LOGBUFFER_H_
+
+#include <spdlog/sinks/base_sink.h>
+#include <spdlog/details/null_mutex.h>
+
+#include <mutex>
+#include <deque>
+#include <string>
+#include <chrono>
+
+namespace arbiterAI
+{
+namespace server
+{
+
+struct LogEntry {
+    std::chrono::system_clock::time_point timestamp;
+    std::string level;
+    std::string message;
+};
+
+/// Thread-safe ring-buffer spdlog sink that retains the most recent N log
+/// entries in memory so the dashboard / REST API can serve them.
+class LogBufferSink : public spdlog::sinks::base_sink<std::mutex> {
+public:
+    explicit LogBufferSink(size_t maxEntries=500)
+        :m_maxEntries(maxEntries) {}
+
+    /// Return a snapshot of the buffered entries (newest last).
+    std::deque<LogEntry> getEntries() const
+    {
+        std::lock_guard<std::mutex> lock(m_readMutex);
+        return m_entries;
+    }
+
+    /// Return only the last `count` entries.
+    std::deque<LogEntry> getEntries(size_t count) const
+    {
+        std::lock_guard<std::mutex> lock(m_readMutex);
+        if(count>=m_entries.size()) return m_entries;
+
+        return std::deque<LogEntry>(m_entries.end()-static_cast<std::ptrdiff_t>(count), m_entries.end());
+    }
+
+    void clear()
+    {
+        std::lock_guard<std::mutex> lock(m_readMutex);
+        m_entries.clear();
+    }
+
+protected:
+    void sink_it_(const spdlog::details::log_msg &msg) override
+    {
+        LogEntry entry;
+        entry.timestamp=msg.time;
+        entry.level=std::string(spdlog::level::to_string_view(msg.level).data(),
+            spdlog::level::to_string_view(msg.level).size());
+        entry.message=std::string(msg.payload.data(), msg.payload.size());
+
+        std::lock_guard<std::mutex> lock(m_readMutex);
+        m_entries.push_back(std::move(entry));
+        while(m_entries.size()>m_maxEntries)
+        {
+            m_entries.pop_front();
+        }
+    }
+
+    void flush_() override {}
+
+private:
+    size_t m_maxEntries;
+    mutable std::mutex m_readMutex;
+    std::deque<LogEntry> m_entries;
+};
+
+/// Global accessor so routes.cpp can read the buffer without coupling to main.
+inline std::shared_ptr<LogBufferSink> &logBufferSinkInstance()
+{
+    static std::shared_ptr<LogBufferSink> instance;
+    return instance;
+}
+
+} // namespace server
+} // namespace arbiterAI
+
+#endif//_ARBITERAI_SERVER_LOGBUFFER_H_
diff --git a/src/server/main.cpp b/src/server/main.cpp
index 1f59cb8..5e156fc 100644
--- a/src/server/main.cpp
+++ b/src/server/main.cpp
@@ -1,59 +1,168 @@
 #include "routes.h"
+#include "logBuffer.h"
 
 #include "arbiterAI/arbiterAI.h"
+#include "arbiterAI/hardwareDetector.h"
 #include "arbiterAI/modelRuntime.h"
 #include "arbiterAI/storageManager.h"
 
 #include <httplib.h>
-#include <cxxopts.hpp>
+#include <nlohmann/json.hpp>
 #include <spdlog/spdlog.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/sinks/daily_file_sink.h>
 
 #include <iostream>
+#include <fstream>
 #include <string>
 #include <vector>
 #include <filesystem>
 
+namespace
+{
+
+int64_t parseStorageLimit(const std::string &str)
+{
+    if(str.empty()||str=="0") return 0;
+
+    char suffix=str.back();
+    std::string numStr=str;
+
+    if(suffix=='G'||suffix=='g')
+    {
+        numStr.pop_back();
+        return static_cast<int64_t>(std::stod(numStr)*1073741824);
+    }
+    if(suffix=='M'||suffix=='m')
+    {
+        numStr.pop_back();
+        return static_cast<int64_t>(std::stod(numStr)*1048576);
+    }
+    if(suffix=='K'||suffix=='k')
+    {
+        numStr.pop_back();
+        return static_cast<int64_t>(std::stod(numStr)*1024);
+    }
+    return std::stoll(str);
+}
+
+void printUsage()
+{
+    std::cout<<"Usage: arbiterAI-server [options]\n"
+        "\n"
+        "Options:\n"
+        "  -c, --config <path>   Path to server configuration JSON file (required)\n"
+        "  -h, --help            Print this help message\n"
+        "\n"
+        "See examples/server_config.json for the configuration file format.\n";
+}
+
+} // anonymous namespace
+
 int main(int argc, char *argv[])
 {
-    cxxopts::Options options("arbiterAI-server", "ArbiterAI standalone server with OpenAI-compatible API");
-
-    options.add_options()
-        ("p,port", "HTTP port", cxxopts::value<int>()->default_value("8080"))
-        ("H,host", "Bind address", cxxopts::value<std::string>()->default_value("0.0.0.0"))
-        ("c,config", "Model config path(s)", cxxopts::value<std::vector<std::string>>()->default_value("config"))
-        ("m,model", "Default model to load on startup", cxxopts::value<std::string>()->default_value(""))
-        ("v,variant", "Default variant (e.g., Q4_K_M)", cxxopts::value<std::string>()->default_value(""))
-        ("override-path", "Path to write runtime model config overrides (enables persistence)", cxxopts::value<std::string>()->default_value(""))
-        ("ram-budget", "Ready model RAM budget in MB (0 = auto 50%)", cxxopts::value<int>()->default_value("0"))
-        ("models-dir", "Path to directory for downloaded model files", cxxopts::value<std::string>()->default_value("/models"))
-        ("storage-limit", "Maximum bytes for model storage (0 = all free, supports suffixes: 10G, 500M)", cxxopts::value<std::string>()->default_value("0"))
-        ("cleanup-enabled", "Enable automated storage cleanup", cxxopts::value<bool>()->default_value("true"))
-        ("cleanup-max-age", "Maximum days since last use before cleanup candidacy", cxxopts::value<int>()->default_value("30"))
-        ("cleanup-interval", "Hours between automated cleanup runs", cxxopts::value<int>()->default_value("24"))
-        ("log-level", "Log level (trace, debug, info, warn, error)", cxxopts::value<std::string>()->default_value("info"))
-        ("h,help", "Print usage");
-
-    cxxopts::ParseResult result;
+    // ── Parse CLI — only --config and --help ──────────────────────
+    std::string configPath;
+
+    for(int i=1; i<argc; ++i)
+    {
+        std::string arg=argv[i];
+
+        if(arg=="-h"||arg=="--help")
+        {
+            printUsage();
+            return 0;
+        }
+        if((arg=="-c"||arg=="--config")&&i+1<argc)
+        {
+            configPath=argv[++i];
+        }
+        else if(arg.rfind("--config=", 0)==0)
+        {
+            configPath=arg.substr(9);
+        }
+        else
+        {
+            std::cerr<<"Unknown argument: "<<arg<<"\n";
+            printUsage();
+            return 1;
+        }
+    }
+
+    if(configPath.empty())
+    {
+        std::cerr<<"Error: --config <path> is required.\n\n";
+        printUsage();
+        return 1;
+    }
+
+    // ── Load config file ─────────────────────────────────────────
+    nlohmann::json cfg;
 
     try
     {
-        result=options.parse(argc, argv);
+        std::ifstream file(configPath);
+
+        if(!file.is_open())
+        {
+            std::cerr<<"Error: cannot open config file: "<<configPath<<"\n";
+            return 1;
+        }
+
+        cfg=nlohmann::json::parse(file, nullptr, true, true);
     }
-    catch(const cxxopts::exceptions::exception &e)
+    catch(const std::exception &e)
     {
-        std::cerr<<"Error parsing options: "<<e.what()<<std::endl;
-        std::cout<<options.help()<<std::endl;
+        std::cerr<<"Error parsing config file: "<<e.what()<<"\n";
         return 1;
     }
 
-    if(result.count("help"))
+    // ── Extract settings with defaults ───────────────────────────
+    std::string host=cfg.value("host", "0.0.0.0");
+    int port=cfg.value("port", 8080);
+
+    std::vector<std::string> modelConfigPaths;
+    if(cfg.contains("model_config_paths")&&cfg["model_config_paths"].is_array())
+    {
+        for(const nlohmann::json &p:cfg["model_config_paths"])
+        {
+            modelConfigPaths.push_back(p.get<std::string>());
+        }
+    }
+    if(modelConfigPaths.empty())
     {
-        std::cout<<options.help()<<std::endl;
-        return 0;
+        modelConfigPaths.push_back("config");
     }
 
-    // Configure logging
-    std::string logLevel=result["log-level"].as<std::string>();
+    std::string modelsDir=cfg.value("models_dir", "/models");
+    std::string defaultModel=cfg.value("default_model", "");
+    std::string defaultVariant=cfg.value("default_variant", "");
+    std::string overridePath=cfg.value("override_path", "");
+    int ramBudget=cfg.value("ram_budget_mb", 0);
+    int maxDownloads=cfg.value("max_concurrent_downloads", 2);
+
+    // Storage
+    nlohmann::json storageCfg=cfg.value("storage", nlohmann::json::object());
+    std::string storageLimitStr=storageCfg.value("limit", "0");
+    bool cleanupEnabled=storageCfg.value("cleanup_enabled", true);
+    int cleanupMaxAgeDays=storageCfg.value("cleanup_max_age_days", 30);
+    int cleanupIntervalHours=storageCfg.value("cleanup_interval_hours", 24);
+
+    // Hardware
+    nlohmann::json hwCfg=cfg.value("hardware", nlohmann::json::object());
+    nlohmann::json vramOverrides=hwCfg.value("vram_overrides", nlohmann::json::object());
+
+    // Logging
+    nlohmann::json logCfg=cfg.value("logging", nlohmann::json::object());
+    std::string logLevel=logCfg.value("level", "info");
+    std::string logDir=logCfg.value("directory", "");
+    int logRotateHour=logCfg.value("rotate_hour", 0);
+    int logRetainDays=logCfg.value("retain_days", 7);
+
+    if(logRotateHour<0||logRotateHour>23) logRotateHour=0;
+    if(logRetainDays<1) logRetainDays=1;
+
+    // ── Configure logging ────────────────────────────────────────
     if(logLevel=="trace")       spdlog::set_level(spdlog::level::trace);
     else if(logLevel=="debug")  spdlog::set_level(spdlog::level::debug);
     else if(logLevel=="info")   spdlog::set_level(spdlog::level::info);
@@ -61,59 +170,47 @@ int main(int argc, char *argv[])
     else if(logLevel=="error")  spdlog::set_level(spdlog::level::err);
     else                        spdlog::set_level(spdlog::level::info);
 
-    int port=result["port"].as<int>();
-    std::string host=result["host"].as<std::string>();
-    std::vector<std::string> configStrs=result["config"].as<std::vector<std::string>>();
-    std::string defaultModel=result["model"].as<std::string>();
-    std::string defaultVariant=result["variant"].as<std::string>();
-    std::string overridePath=result["override-path"].as<std::string>();
-    int ramBudget=result["ram-budget"].as<int>();
-    std::string modelsDir=result["models-dir"].as<std::string>();
-    std::string storageLimitStr=result["storage-limit"].as<std::string>();
-    bool cleanupEnabled=result["cleanup-enabled"].as<bool>();
-    int cleanupMaxAgeDays=result["cleanup-max-age"].as<int>();
-    int cleanupIntervalHours=result["cleanup-interval"].as<int>();
-
-    // Parse storage limit (supports suffixes: G, M, K)
-    int64_t storageLimitBytes=0;
-    if(!storageLimitStr.empty()&&storageLimitStr!="0")
-    {
-        char suffix=storageLimitStr.back();
-        std::string numStr=storageLimitStr;
-
-        if(suffix=='G'||suffix=='g')
-        {
-            numStr.pop_back();
-            storageLimitBytes=static_cast<int64_t>(std::stod(numStr)*1073741824);
-        }
-        else if(suffix=='M'||suffix=='m')
-        {
-            numStr.pop_back();
-            storageLimitBytes=static_cast<int64_t>(std::stod(numStr)*1048576);
-        }
-        else if(suffix=='K'||suffix=='k')
-        {
-            numStr.pop_back();
-            storageLimitBytes=static_cast<int64_t>(std::stod(numStr)*1024);
-        }
-        else
-        {
-            storageLimitBytes=std::stoll(storageLimitStr);
-        }
+    auto consoleSink=std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
+    auto bufferSink=std::make_shared<arbiterAI::server::LogBufferSink>(1000);
+    arbiterAI::server::logBufferSinkInstance()=bufferSink;
+
+    std::vector<spdlog::sink_ptr> sinks{consoleSink, bufferSink};
+
+    if(!logDir.empty())
+    {
+        std::filesystem::create_directories(logDir);
+        std::string logPath=logDir+"/arbiterAI-server.log";
+
+        auto fileSink=std::make_shared<spdlog::sinks::daily_file_sink_mt>(
+            logPath, logRotateHour, 0, false,
+            static_cast<uint16_t>(logRetainDays));
+        sinks.push_back(fileSink);
+    }
+
+    auto logger=std::make_shared<spdlog::logger>("arbiterAI",
+        sinks.begin(), sinks.end());
+    logger->set_level(spdlog::get_level());
+    spdlog::set_default_logger(logger);
+
+    spdlog::info("Loaded config from: {}", configPath);
+
+    if(!logDir.empty())
+    {
+        spdlog::info("File logging enabled: dir={}, rotate at {:02d}:00, retain {} days",
+            logDir, logRotateHour, logRetainDays);
     }
 
-    // Convert config paths
-    std::vector<std::filesystem::path> configPaths;
-    for(const std::string &s:configStrs)
+    // ── Initialize ArbiterAI ─────────────────────────────────────
+    std::vector<std::filesystem::path> fsPaths;
+    for(const std::string &s:modelConfigPaths)
     {
-        configPaths.push_back(s);
+        fsPaths.push_back(s);
     }
 
     spdlog::info("Initializing ArbiterAI...");
 
-    // Initialize ArbiterAI
     arbiterAI::ArbiterAI &ai=arbiterAI::ArbiterAI::instance();
-    arbiterAI::ErrorCode err=ai.initialize(configPaths);
+    arbiterAI::ErrorCode err=ai.initialize(fsPaths);
 
     if(err!=arbiterAI::ErrorCode::Success)
     {
@@ -123,7 +220,22 @@ int main(int argc, char *argv[])
 
     spdlog::info("ArbiterAI initialized successfully");
 
-    // Configure StorageManager
+    // ── Apply VRAM overrides ─────────────────────────────────────
+    for(auto it=vramOverrides.begin(); it!=vramOverrides.end(); ++it)
+    {
+        int gpuIndex=std::stoi(it.key());
+        int vramMb=it.value().get<int>();
+
+        if(vramMb>0)
+        {
+            arbiterAI::HardwareDetector::instance().setVramOverride(gpuIndex, vramMb);
+            spdlog::info("VRAM override set for GPU {}: {} MB", gpuIndex, vramMb);
+        }
+    }
+
+    // ── Configure StorageManager ─────────────────────────────────
+    int64_t storageLimitBytes=parseStorageLimit(storageLimitStr);
+
     arbiterAI::StorageManager &storage=arbiterAI::StorageManager::instance();
     storage.initialize(modelsDir);
 
@@ -141,14 +253,21 @@ int main(int argc, char *argv[])
     spdlog::info("Cleanup policy: enabled={}, maxAge={}d, interval={}h",
         cleanupEnabled, cleanupMaxAgeDays, cleanupIntervalHours);
 
-    // Set RAM budget if specified
+    // ── RAM budget ───────────────────────────────────────────────
     if(ramBudget>0)
     {
         arbiterAI::ModelRuntime::instance().setReadyRamBudget(ramBudget);
         spdlog::info("Ready model RAM budget set to {} MB", ramBudget);
     }
 
-    // Load default model if specified
+    // ── Concurrent download limit ────────────────────────────────
+    if(maxDownloads>0)
+    {
+        arbiterAI::ModelRuntime::instance().setMaxConcurrentDownloads(maxDownloads);
+        spdlog::info("Max concurrent downloads set to {}", maxDownloads);
+    }
+
+    // ── Load default model ───────────────────────────────────────
     if(!defaultModel.empty())
     {
         spdlog::info("Loading default model: {} (variant: {})", defaultModel, defaultVariant.empty()?"auto":defaultVariant);
@@ -168,20 +287,17 @@ int main(int argc, char *argv[])
         }
     }
 
-    // Create HTTP server
+    // ── HTTP server ──────────────────────────────────────────────
     httplib::Server server;
 
-    // Register routes
     arbiterAI::server::registerRoutes(server);
 
-    // Set override path for runtime model config persistence
     if(!overridePath.empty())
     {
         arbiterAI::server::setOverridePath(overridePath);
         spdlog::info("Runtime model config overrides will be saved to: {}", overridePath);
     }
 
-    // Log available endpoints
     spdlog::info("Server endpoints:");
     spdlog::info("  GET  /health                - Health check");
     spdlog::info("  POST /v1/chat/completions   - Chat completions (OpenAI-compatible)");
@@ -202,6 +318,8 @@ int main(int argc, char *argv[])
     spdlog::info("  GET  /api/stats/history      - Inference history");
     spdlog::info("  GET  /api/stats/swaps        - Swap history");
     spdlog::info("  GET  /api/hardware           - Hardware info");
+    spdlog::info("  POST /api/hardware/vram-override  - Set VRAM override");
+    spdlog::info("  DEL  /api/hardware/vram-override/:idx - Clear VRAM override");
     spdlog::info("  GET  /api/storage            - Storage overview");
     spdlog::info("  GET  /api/storage/models     - Downloaded models");
     spdlog::info("  GET  /api/storage/models/:n  - Model storage stats");
diff --git a/src/server/routes.cpp b/src/server/routes.cpp
index 527a819..ef178e6 100644
--- a/src/server/routes.cpp
+++ b/src/server/routes.cpp
@@ -1,5 +1,6 @@
 #include "routes.h"
 #include "dashboard.h"
+#include "logBuffer.h"
 
 #include "arbiterAI/arbiterAI.h"
 #include "arbiterAI/modelManager.h"
@@ -71,7 +72,8 @@ nlohmann::json gpuInfoToJson(const GpuInfo &gpu)
         {"vram_free_mb", gpu.vramFreeMb},
         {"compute_capability", gpu.computeCapability},
         {"utilization_percent", gpu.utilizationPercent},
-        {"unified_memory", gpu.unifiedMemory}
+        {"unified_memory", gpu.unifiedMemory},
+        {"vram_overridden", gpu.vramOverridden}
     };
 
     if(gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0)
@@ -80,6 +82,27 @@ nlohmann::json gpuInfoToJson(const GpuInfo &gpu)
         j["gpu_accessible_ram_free_mb"]=gpu.gpuAccessibleRamFreeMb;
     }
 
+    if(!gpu.memoryHeaps.empty())
+    {
+        nlohmann::json heaps=nlohmann::json::array();
+        for(const MemoryHeapInfo &heap:gpu.memoryHeaps)
+        {
+            nlohmann::json h={
+                {"index", heap.index},
+                {"device_local", heap.deviceLocal},
+                {"size_mb", heap.sizeMb}
+            };
+            if(gpu.hasMemoryBudget)
+            {
+                h["budget_mb"]=heap.budgetMb;
+                h["usage_mb"]=heap.usageMb;
+            }
+            heaps.push_back(h);
+        }
+        j["memory_heaps"]=heaps;
+        j["has_memory_budget"]=gpu.hasMemoryBudget;
+    }
+
     return j;
 }
 
@@ -129,6 +152,7 @@ nlohmann::json loadedModelToJson(const LoadedModel &m)
         {"ram_usage_mb", m.ramUsageMb},
         {"estimated_vram_mb", m.estimatedVramUsageMb},
         {"context_size", m.contextSize},
+        {"max_context_size", m.maxContextSize},
         {"gpu_indices", gpuIndices},
         {"pinned", m.pinned}
     };
@@ -140,10 +164,14 @@ nlohmann::json inferenceStatsToJson(const InferenceStats &s)
         {"model", s.model},
         {"variant", s.variant},
         {"tokens_per_second", s.tokensPerSecond},
+        {"prompt_tokens_per_second", s.promptTokensPerSecond},
+        {"generation_tokens_per_second", s.generationTokensPerSecond},
         {"prompt_tokens", s.promptTokens},
         {"completion_tokens", s.completionTokens},
         {"latency_ms", s.latencyMs},
-        {"total_time_ms", s.totalTimeMs}
+        {"total_time_ms", s.totalTimeMs},
+        {"prompt_time_ms", s.promptTimeMs},
+        {"generation_time_ms", s.generationTimeMs}
     };
 }
 
@@ -278,6 +306,11 @@ void registerRoutes(httplib::Server &server)
     server.Get("/api/stats/history", handleGetStatsHistory);
     server.Get("/api/stats/swaps", handleGetStatsSwaps);
     server.Get("/api/hardware", handleGetHardware);
+    server.Post("/api/hardware/vram-override", handleSetVramOverride);
+    server.Delete(R"(/api/hardware/vram-override/(\d+))", handleClearVramOverride);
+
+    // Logs
+    server.Get("/api/logs", handleGetLogs);
 
     // Storage management
     server.Get("/api/storage", handleGetStorage);
@@ -778,7 +811,8 @@ void handleGetVersion(const httplib::Request &, httplib::Response &res)
         {"version", ver.toString()},
         {"major", ver.major},
         {"minor", ver.minor},
-        {"patch", ver.patch}
+        {"patch", ver.patch},
+        {"llamaCppBuild", ver.llamaCppBuild}
     };
     res.set_content(j.dump(), "application/json");
 }
@@ -846,16 +880,48 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
         std::string variant;
         int contextSize=0;
 
+        // Accept parameters from query string
         if(req.has_param("variant"))
             variant=req.get_param_value("variant");
         if(req.has_param("context"))
             contextSize=std::stoi(req.get_param_value("context"));
 
+        // Also accept from JSON body (body takes precedence)
+        if(!req.body.empty())
+        {
+            try
+            {
+                nlohmann::json body=nlohmann::json::parse(req.body);
+                if(body.contains("variant")&&body["variant"].is_string())
+                    variant=body["variant"].get<std::string>();
+                if(body.contains("context")&&body["context"].is_number_integer())
+                    contextSize=body["context"].get<int>();
+                if(body.contains("context_size")&&body["context_size"].is_number_integer())
+                    contextSize=body["context_size"].get<int>();
+            }
+            catch(const nlohmann::json::parse_error &)
+            {
+                // Not JSON body — ignore, use query params
+            }
+        }
+
+        spdlog::info("Load request: model='{}' variant='{}' context={}", modelName, variant, contextSize);
+
         ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize);
 
         if(err==ErrorCode::Success)
         {
-            res.set_content(nlohmann::json{{"status", "loaded"}, {"model", modelName}}.dump(), "application/json");
+            // Include the actual context info in the response
+            nlohmann::json response={{"status", "loaded"}, {"model", modelName}};
+
+            std::optional<LoadedModel> state=ModelRuntime::instance().getModelState(modelName);
+            if(state.has_value())
+            {
+                response["context_size"]=state->contextSize;
+                response["max_context_size"]=state->maxContextSize;
+            }
+
+            res.set_content(response.dump(), "application/json");
         }
         else if(err==ErrorCode::ModelDownloading)
         {
@@ -882,8 +948,88 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res)
         }
         else
         {
+            std::string errCode=errorCodeToString(err);
+            std::string reason;
+
+            switch(err)
+            {
+                case ErrorCode::ModelNotFound:
+                    reason="Model '"+modelName+"' is not defined in any loaded configuration file. "
+                        "Check that the model name matches a config entry and that the config path is correct.";
+                    break;
+                case ErrorCode::ModelLoadError:
+                {
+                    LoadErrorDetail detail=ModelRuntime::instance().getLastLoadError();
+                    reason=detail.summary;
+                    if(reason.empty())
+                    {
+                        reason="Failed to load model '"+modelName+"'";
+                        if(!variant.empty()) reason+=" variant '"+variant+"'";
+                        reason+=". Check the server log for details.";
+                    }
+                    break;
+                }
+                case ErrorCode::ModelDownloadFailed:
+                    reason="Download failed for model '"+modelName+"'";
+                    if(!variant.empty()) reason+=" variant '"+variant+"'";
+                    reason+=". The download URL may be unreachable or the SHA256 hash did not match.";
+                    break;
+                case ErrorCode::InvalidRequest:
+                    reason="Invalid request for model '"+modelName+"'. "
+                        "The model may be a local provider without variants defined.";
+                    break;
+                case ErrorCode::UnsupportedProvider:
+                    reason="The provider for model '"+modelName+"' is not supported or not enabled in this build.";
+                    break;
+                default:
+                    reason="Unexpected error loading model '"+modelName+"': "+errCode;
+                    break;
+            }
+
+            spdlog::warn("Load failed: model='{}' variant='{}' error={} — {}", modelName, variant, errCode, reason);
+
+            // Build detail payload; include structured error fields for programmatic handling
+            LoadErrorDetail loadDetail=ModelRuntime::instance().getLastLoadError();
+
+            nlohmann::json details={
+                {"model", modelName},
+                {"variant", variant.empty()?nlohmann::json(nullptr):nlohmann::json(variant)},
+                {"context_requested", contextSize},
+                {"error_code", errCode},
+                {"reason", loadFailureReasonToString(loadDetail.reason)},
+                {"recoverable", loadDetail.recoverable}
+            };
+
+            if(!loadDetail.action.empty())
+            {
+                details["action"]=loadDetail.action;
+            }
+            if(!loadDetail.suggestion.empty())
+            {
+                details["suggestion"]=loadDetail.suggestion;
+            }
+            if(!loadDetail.llamaLog.empty())
+            {
+                // Trim to a reasonable size for the response
+                std::string logSnippet=loadDetail.llamaLog;
+                if(logSnippet.size()>2000)
+                {
+                    logSnippet=logSnippet.substr(logSnippet.size()-2000);
+                }
+                details["llama_log"]=logSnippet;
+            }
+
+            nlohmann::json body={
+                {"error", {
+                    {"message", reason},
+                    {"type", "invalid_request_error"},
+                    {"code", errCode},
+                    {"param", "model"},
+                    {"details", details}
+                }}
+            };
             res.status=400;
-            res.set_content(errorJson("Failed to load model: "+errorCodeToString(err), "invalid_request_error", "model", errorCodeToString(err)).dump(), "application/json");
+            res.set_content(body.dump(), "application/json");
         }
     }
     catch(const std::exception &e)
@@ -949,12 +1095,11 @@ void handleDownloadModel(const httplib::Request &req, httplib::Response &res)
 {
     std::string modelName=req.matches[1];
 
-    // Initiate download via loadModel (which triggers download if file not present)
     std::string variant;
     if(req.has_param("variant"))
         variant=req.get_param_value("variant");
 
-    ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant);
+    ErrorCode err=ArbiterAI::instance().downloadModel(modelName, variant);
 
     if(err==ErrorCode::ModelDownloading)
     {
@@ -1238,6 +1383,8 @@ void handleGetStats(const httplib::Request &, httplib::Response &res)
         {"hardware", systemInfoToJson(snapshot.hardware)},
         {"models", models},
         {"avg_tokens_per_second", snapshot.avgTokensPerSecond},
+        {"avg_prompt_tokens_per_second", snapshot.avgPromptTokensPerSecond},
+        {"avg_generation_tokens_per_second", snapshot.avgGenerationTokensPerSecond},
         {"active_requests", snapshot.activeRequests}
     };
 
@@ -1286,6 +1433,66 @@ void handleGetHardware(const httplib::Request &, httplib::Response &res)
     res.set_content(systemInfoToJson(hw).dump(), "application/json");
 }
 
+void handleSetVramOverride(const httplib::Request &req, httplib::Response &res)
+{
+    nlohmann::json body;
+
+    try
+    {
+        body=nlohmann::json::parse(req.body);
+    }
+    catch(const std::exception &)
+    {
+        res.status=400;
+        res.set_content(R"({"error":"Invalid JSON body"})", "application/json");
+        return;
+    }
+
+    if(!body.contains("gpu_index")||!body.contains("vram_mb"))
+    {
+        res.status=400;
+        res.set_content(R"({"error":"Missing required fields: gpu_index, vram_mb"})", "application/json");
+        return;
+    }
+
+    int gpuIndex=body["gpu_index"].get<int>();
+    int vramMb=body["vram_mb"].get<int>();
+
+    if(vramMb<=0)
+    {
+        res.status=400;
+        res.set_content(R"({"error":"vram_mb must be positive"})", "application/json");
+        return;
+    }
+
+    HardwareDetector::instance().setVramOverride(gpuIndex, vramMb);
+
+    nlohmann::json result={
+        {"status", "ok"},
+        {"gpu_index", gpuIndex},
+        {"vram_mb", vramMb}
+    };
+
+    res.set_content(result.dump(), "application/json");
+}
+
+void handleClearVramOverride(const httplib::Request &req, httplib::Response &res)
+{
+    int gpuIndex=std::stoi(req.matches[1]);
+
+    HardwareDetector::instance().clearVramOverride(gpuIndex);
+
+    // Refresh to restore detected values
+    HardwareDetector::instance().refresh();
+
+    nlohmann::json result={
+        {"status", "ok"},
+        {"gpu_index", gpuIndex}
+    };
+
+    res.set_content(result.dump(), "application/json");
+}
+
 // ========== Storage Management ==========
 
 namespace
@@ -1323,7 +1530,7 @@ std::string timePointToIsoStr(const std::chrono::system_clock::time_point &tp)
 
 nlohmann::json downloadedModelToJson(const DownloadedModelFile &f)
 {
-    return {
+    nlohmann::json j={
         {"model", f.modelName},
         {"variant", f.variant},
         {"filename", f.filename},
@@ -1334,8 +1541,14 @@ nlohmann::json downloadedModelToJson(const DownloadedModelFile &f)
         {"usage_count", f.usageCount},
         {"hot_ready", f.hotReady},
         {"protected", f.isProtected},
-        {"runtime_state", f.runtimeState}
+        {"runtime_state", f.runtimeState},
+        {"file_count", 1+static_cast<int>(f.additionalFiles.size())}
     };
+    if(!f.additionalFiles.empty())
+    {
+        j["additional_files"]=f.additionalFiles;
+    }
+    return j;
 }
 
 } // anonymous namespace
@@ -1366,15 +1579,18 @@ void handleGetStorageModels(const httplib::Request &req, httplib::Response &res)
     std::string sortField=req.has_param("sort")?req.get_param_value("sort"):"last_used";
     std::string sortOrder=req.has_param("order")?req.get_param_value("order"):"desc";
 
+    bool ascending=(sortOrder=="asc");
+
     auto compare=[&](const DownloadedModelFile &a, const DownloadedModelFile &b) -> bool
     {
-        bool result=false;
-        if(sortField=="name") result=a.modelName<b.modelName;
-        else if(sortField=="size") result=a.fileSizeBytes<b.fileSizeBytes;
-        else if(sortField=="usage_count") result=a.usageCount<b.usageCount;
-        else result=a.lastUsedAt<b.lastUsedAt; // default: last_used
+        // For descending, swap a and b so strict weak ordering is maintained.
+        const DownloadedModelFile &lhs=ascending?a:b;
+        const DownloadedModelFile &rhs=ascending?b:a;
 
-        return sortOrder=="asc"?result:!result;
+        if(sortField=="name") return lhs.modelName<rhs.modelName;
+        if(sortField=="size") return lhs.fileSizeBytes<rhs.fileSizeBytes;
+        if(sortField=="usage_count") return lhs.usageCount<rhs.usageCount;
+        return lhs.lastUsedAt<rhs.lastUsedAt; // default: last_used
     };
 
     std::sort(models.begin(), models.end(), compare);
@@ -1495,8 +1711,24 @@ void handleDeleteModelFiles(const httplib::Request &req, httplib::Response &res)
         return;
     }
 
-    // Unload from ModelRuntime if loaded
+    // Reject deletion of models that are currently downloading
     std::optional<LoadedModel> state=ModelRuntime::instance().getModelState(modelName);
+    if(state.has_value()&&state->state==ModelState::Downloading)
+    {
+        if(variant.empty()||state->variant==variant)
+        {
+            res.status=409;
+            res.set_content(nlohmann::json{
+                {"error", {
+                    {"message", "Cannot delete model '"+modelName+"': download is in progress"},
+                    {"type", "invalid_request_error"}
+                }}
+            }.dump(), "application/json");
+            return;
+        }
+    }
+
+    // Unload from ModelRuntime if loaded
     if(state.has_value()&&(state->state==ModelState::Loaded||state->state==ModelState::Ready))
     {
         if(variant.empty()||state->variant==variant)
@@ -1757,6 +1989,64 @@ void handleGetActiveDownloads(const httplib::Request &, httplib::Response &res)
     res.set_content(nlohmann::json{{"downloads", downloads}}.dump(), "application/json");
 }
 
+// ========== Logs ==========
+
+void handleGetLogs(const httplib::Request &req, httplib::Response &res)
+{
+    auto &sink=logBufferSinkInstance();
+    if(!sink)
+    {
+        res.set_content(nlohmann::json{{"logs", nlohmann::json::array()}}.dump(), "application/json");
+        return;
+    }
+
+    size_t count=200;
+    std::string levelFilter;
+
+    if(req.has_param("count"))
+    {
+        count=static_cast<size_t>(std::stoi(req.get_param_value("count")));
+        if(count>1000) count=1000;
+    }
+    if(req.has_param("level"))
+    {
+        levelFilter=req.get_param_value("level");
+    }
+
+    std::deque<LogEntry> entries=sink->getEntries(count);
+
+    nlohmann::json logs=nlohmann::json::array();
+    for(const LogEntry &entry:entries)
+    {
+        if(!levelFilter.empty()&&entry.level!=levelFilter)
+            continue;
+
+        auto epochMs=std::chrono::duration_cast<std::chrono::milliseconds>(
+            entry.timestamp.time_since_epoch()).count();
+
+        // Format ISO timestamp
+        std::time_t t=std::chrono::system_clock::to_time_t(entry.timestamp);
+        std::tm tm{};
+        gmtime_r(&t, &tm);
+
+        auto ms=epochMs%1000;
+        char buf[32];
+        std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", &tm);
+
+        std::ostringstream ts;
+        ts<<buf<<"."<<std::setfill('0')<<std::setw(3)<<ms<<"Z";
+
+        logs.push_back({
+            {"timestamp", ts.str()},
+            {"epoch_ms", epochMs},
+            {"level", entry.level},
+            {"message", entry.message}
+        });
+    }
+
+    res.set_content(nlohmann::json{{"logs", logs}}.dump(), "application/json");
+}
+
 // ========== Dashboard ==========
 
 void handleDashboard(const httplib::Request &, httplib::Response &res)
diff --git a/src/server/routes.h b/src/server/routes.h
index 3a120b4..ed79d13 100644
--- a/src/server/routes.h
+++ b/src/server/routes.h
@@ -56,6 +56,12 @@ void handleGetStats(const httplib::Request &req, httplib::Response &res);
 void handleGetStatsHistory(const httplib::Request &req, httplib::Response &res);
 void handleGetStatsSwaps(const httplib::Request &req, httplib::Response &res);
 void handleGetHardware(const httplib::Request &req, httplib::Response &res);
+void handleSetVramOverride(const httplib::Request &req, httplib::Response &res);
+void handleClearVramOverride(const httplib::Request &req, httplib::Response &res);
+
+// ========== Logs ==========
+
+void handleGetLogs(const httplib::Request &req, httplib::Response &res);
 
 // ========== Storage Management ==========
 
diff --git a/tests/hardwareDetectorTests.cpp b/tests/hardwareDetectorTests.cpp
index ea1cdaf..878bf95 100644
--- a/tests/hardwareDetectorTests.cpp
+++ b/tests/hardwareDetectorTests.cpp
@@ -404,4 +404,199 @@ TEST_F(ModelFitCalculatorTest, UnifiedMemoryFallbackToVramWhenNoAccessibleInfo)
     EXPECT_GT(fit.maxContextSize, 4096);
 }
 
+// --- VRAM Override tests ---
+
+class VramOverrideTest : public ::testing::Test
+{
+protected:
+    void TearDown() override
+    {
+        HardwareDetector::instance().clearAllVramOverrides();
+    }
+};
+
+TEST_F(VramOverrideTest, SetAndQueryOverride)
+{
+    HardwareDetector &hw=HardwareDetector::instance();
+
+    EXPECT_FALSE(hw.hasVramOverride(0));
+    EXPECT_EQ(hw.getVramOverride(0), 0);
+
+    hw.setVramOverride(0, 32000);
+
+    EXPECT_TRUE(hw.hasVramOverride(0));
+    EXPECT_EQ(hw.getVramOverride(0), 32000);
+}
+
+TEST_F(VramOverrideTest, ClearOverride)
+{
+    HardwareDetector &hw=HardwareDetector::instance();
+
+    hw.setVramOverride(0, 32000);
+    EXPECT_TRUE(hw.hasVramOverride(0));
+
+    hw.clearVramOverride(0);
+    EXPECT_FALSE(hw.hasVramOverride(0));
+    EXPECT_EQ(hw.getVramOverride(0), 0);
+}
+
+TEST_F(VramOverrideTest, ClearAllOverrides)
+{
+    HardwareDetector &hw=HardwareDetector::instance();
+
+    hw.setVramOverride(0, 32000);
+    hw.setVramOverride(1, 16000);
+    EXPECT_TRUE(hw.hasVramOverride(0));
+    EXPECT_TRUE(hw.hasVramOverride(1));
+
+    hw.clearAllVramOverrides();
+    EXPECT_FALSE(hw.hasVramOverride(0));
+    EXPECT_FALSE(hw.hasVramOverride(1));
+}
+
+TEST_F(VramOverrideTest, OverridePersistsAcrossRefresh)
+{
+    HardwareDetector &hw=HardwareDetector::instance();
+    std::vector<GpuInfo> gpus=hw.getGpus();
+
+    if(gpus.empty())
+    {
+        GTEST_SKIP()<<"No GPUs detected, cannot test override persistence";
+    }
+
+    int gpuIdx=gpus[0].index;
+    int overrideValue=12345;
+
+    hw.setVramOverride(gpuIdx, overrideValue);
+    hw.refresh();
+
+    SystemInfo info=hw.getSystemInfo();
+    ASSERT_FALSE(info.gpus.empty());
+
+    bool found=false;
+    for(const GpuInfo &gpu:info.gpus)
+    {
+        if(gpu.index==gpuIdx)
+        {
+            EXPECT_EQ(gpu.vramTotalMb, overrideValue);
+            EXPECT_LE(gpu.vramFreeMb, overrideValue);
+            EXPECT_TRUE(gpu.vramOverridden);
+            found=true;
+            break;
+        }
+    }
+    EXPECT_TRUE(found);
+}
+
+TEST_F(VramOverrideTest, OverrideAppliedToGpuInfo)
+{
+    HardwareDetector &hw=HardwareDetector::instance();
+    std::vector<GpuInfo> gpus=hw.getGpus();
+
+    if(gpus.empty())
+    {
+        GTEST_SKIP()<<"No GPUs detected, cannot test override application";
+    }
+
+    int gpuIdx=gpus[0].index;
+    int originalTotal=gpus[0].vramTotalMb;
+    int overrideValue=originalTotal/2;
+
+    if(overrideValue<=0) overrideValue=1024;
+
+    hw.setVramOverride(gpuIdx, overrideValue);
+
+    SystemInfo info=hw.getSystemInfo();
+
+    for(const GpuInfo &gpu:info.gpus)
+    {
+        if(gpu.index==gpuIdx)
+        {
+            EXPECT_EQ(gpu.vramTotalMb, overrideValue);
+            EXPECT_TRUE(gpu.vramOverridden);
+            break;
+        }
+    }
+}
+
+TEST_F(VramOverrideTest, NonOverriddenGpuUnaffected)
+{
+    HardwareDetector &hw=HardwareDetector::instance();
+    std::vector<GpuInfo> gpus=hw.getGpus();
+
+    if(gpus.empty())
+    {
+        GTEST_SKIP()<<"No GPUs detected";
+    }
+
+    int gpuIdx=gpus[0].index;
+    int originalTotal=gpus[0].vramTotalMb;
+
+    // Override a non-existent GPU index
+    hw.setVramOverride(999, 99999);
+    hw.refresh();
+
+    SystemInfo info=hw.getSystemInfo();
+    for(const GpuInfo &gpu:info.gpus)
+    {
+        if(gpu.index==gpuIdx)
+        {
+            // Original GPU should be unaffected
+            EXPECT_FALSE(gpu.vramOverridden);
+            break;
+        }
+    }
+}
+
+TEST_F(VramOverrideTest, OverrideAffectsModelFitCalculation)
+{
+    GpuInfo gpu;
+    gpu.index=0;
+    gpu.name="Test GPU";
+    gpu.backend=GpuBackend::CUDA;
+    gpu.vramTotalMb=2000;
+    gpu.vramFreeMb=2000;
+
+    SystemInfo hw;
+    hw.totalRamMb=32000;
+    hw.freeRamMb=24000;
+    hw.cpuCores=8;
+    hw.gpus={gpu};
+
+    ModelInfo model;
+    model.model="test-model";
+    model.provider="llama";
+
+    HardwareRequirements hwReqs;
+    hwReqs.minSystemRamMb=8192;
+    hwReqs.parameterCount="7B";
+    model.hardwareRequirements=hwReqs;
+
+    ContextScaling scaling;
+    scaling.baseContext=4096;
+    scaling.maxContext=131072;
+    scaling.vramPer1kContextMb=64;
+    model.contextScaling=scaling;
+
+    ModelVariant variant;
+    variant.quantization="Q4_K_M";
+    variant.fileSizeMb=4370;
+    variant.minVramMb=4096;
+    variant.recommendedVramMb=8192;
+
+    // Without override: 2000 MB VRAM cannot fit 4096 minVram, CPU fallback gives base context
+    ModelFit fitBefore=ModelFitCalculator::calculateModelFit(model, variant, hw);
+    EXPECT_EQ(fitBefore.maxContextSize, 4096);
+
+    // Apply override: 16000 MB VRAM now fits model + allows extra context
+    hw.gpus[0].vramTotalMb=16000;
+    hw.gpus[0].vramFreeMb=16000;
+    hw.gpus[0].vramOverridden=true;
+
+    ModelFit fitAfter=ModelFitCalculator::calculateModelFit(model, variant, hw);
+
+    EXPECT_TRUE(fitAfter.canRun);
+    EXPECT_GT(fitAfter.maxContextSize, fitBefore.maxContextSize);
+}
+
 } // namespace arbiterAI
diff --git a/tests/modelManagerTests.cpp b/tests/modelManagerTests.cpp
index 75d06a4..8b31798 100644
--- a/tests/modelManagerTests.cpp
+++ b/tests/modelManagerTests.cpp
@@ -628,4 +628,178 @@ TEST_F(ModelManagerConfigInjectionTest, AddModelFromJson_AllOptionalFields)
     EXPECT_DOUBLE_EQ(info->pricing.completion_token_cost, 0.0006);
 }
 
+TEST_F(ModelManagerTest, ParseSplitGGUFVariant)
+{
+    std::filesystem::create_directory("config_split");
+    std::filesystem::create_directory("config_split/models");
+    std::ofstream outfile("config_split/models/split_model.json");
+    outfile << R"({
+        "models": [
+            {
+                "model": "big-model-120b",
+                "provider": "llama",
+                "variants": [
+                    {
+                        "quantization": "Q4_K_M",
+                        "file_size_mb": 70000,
+                        "min_vram_mb": 40000,
+                        "recommended_vram_mb": 48000,
+                        "download": {
+                            "url": "https://example.com/big-model-Q4_K_M-00001-of-00002.gguf",
+                            "sha256": "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1",
+                            "filename": "big-model-Q4_K_M-00001-of-00002.gguf"
+                        },
+                        "files": [
+                            {
+                                "url": "https://example.com/big-model-Q4_K_M-00001-of-00002.gguf",
+                                "sha256": "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1",
+                                "filename": "big-model-Q4_K_M-00001-of-00002.gguf"
+                            },
+                            {
+                                "url": "https://example.com/big-model-Q4_K_M-00002-of-00002.gguf",
+                                "sha256": "bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb2",
+                                "filename": "big-model-Q4_K_M-00002-of-00002.gguf"
+                            }
+                        ]
+                    }
+                ]
+            }
+        ]
+    })";
+    outfile.close();
+
+    ModelManager &mm=ModelManager::instance();
+    mm.initialize({"config_split"});
+
+    auto model=mm.getModelInfo("big-model-120b");
+    ASSERT_TRUE(model.has_value());
+    ASSERT_EQ(model->variants.size(), 1u);
+
+    const ModelVariant &v=model->variants[0];
+    EXPECT_EQ(v.quantization, "Q4_K_M");
+    EXPECT_EQ(v.fileSizeMb, 70000);
+
+    // Single-file download field (backward compat)
+    EXPECT_EQ(v.download.url, "https://example.com/big-model-Q4_K_M-00001-of-00002.gguf");
+    EXPECT_EQ(v.download.filename, "big-model-Q4_K_M-00001-of-00002.gguf");
+
+    // Multi-file files array
+    ASSERT_EQ(v.files.size(), 2u);
+    EXPECT_EQ(v.files[0].filename, "big-model-Q4_K_M-00001-of-00002.gguf");
+    EXPECT_EQ(v.files[1].filename, "big-model-Q4_K_M-00002-of-00002.gguf");
+    EXPECT_EQ(v.files[1].sha256, "bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb2");
+
+    // Helper methods
+    EXPECT_TRUE(v.isSplit());
+    EXPECT_EQ(v.getPrimaryFilename(), "big-model-Q4_K_M-00001-of-00002.gguf");
+
+    std::vector<VariantDownload> allFiles=v.getAllFiles();
+    ASSERT_EQ(allFiles.size(), 2u);
+    EXPECT_EQ(allFiles[0].filename, "big-model-Q4_K_M-00001-of-00002.gguf");
+    EXPECT_EQ(allFiles[1].filename, "big-model-Q4_K_M-00002-of-00002.gguf");
+
+    std::filesystem::remove_all("config_split");
+}
+
+TEST_F(ModelManagerTest, SingleFileVariantBackwardCompat)
+{
+    // Single-file variant without "files" array should still work
+    ModelManager &mm=ModelManager::instance();
+    mm.initialize({"config1"});
+
+    auto model=mm.getModelInfo("model1");
+    ASSERT_TRUE(model.has_value());
+
+    // Build a single-file variant via JSON injection
+    nlohmann::json modelJson={
+        {"model", "single-file-model"},
+        {"provider", "llama"},
+        {"variants", {{
+            {"quantization", "Q8_0"},
+            {"file_size_mb", 8100},
+            {"min_vram_mb", 8192},
+            {"download", {
+                {"url", "https://example.com/single.gguf"},
+                {"sha256", "abc123def456abc123def456abc123def456abc123def456abc123def456abcd"},
+                {"filename", "single.gguf"}
+            }}
+        }}}
+    };
+
+    std::string error;
+    ASSERT_TRUE(mm.addModelFromJson(modelJson, error))<<error;
+
+    auto info=mm.getModelInfo("single-file-model");
+    ASSERT_TRUE(info.has_value());
+    ASSERT_EQ(info->variants.size(), 1u);
+
+    const ModelVariant &v=info->variants[0];
+
+    // files array should be empty
+    EXPECT_TRUE(v.files.empty());
+    EXPECT_FALSE(v.isSplit());
+
+    // getAllFiles() should fall back to the single download field
+    std::vector<VariantDownload> allFiles=v.getAllFiles();
+    ASSERT_EQ(allFiles.size(), 1u);
+    EXPECT_EQ(allFiles[0].filename, "single.gguf");
+    EXPECT_EQ(allFiles[0].url, "https://example.com/single.gguf");
+
+    // getPrimaryFilename() should return the single download filename
+    EXPECT_EQ(v.getPrimaryFilename(), "single.gguf");
+}
+
+TEST_F(ModelManagerTest, SplitVariantSerializationRoundTrip)
+{
+    nlohmann::json modelJson={
+        {"model", "roundtrip-split"},
+        {"provider", "llama"},
+        {"variants", {{
+            {"quantization", "Q4_K_M"},
+            {"file_size_mb", 70000},
+            {"min_vram_mb", 40000},
+            {"recommended_vram_mb", 48000},
+            {"download", {
+                {"url", "https://example.com/shard-00001-of-00003.gguf"},
+                {"sha256", "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1"},
+                {"filename", "shard-00001-of-00003.gguf"}
+            }},
+            {"files", {
+                {{"url", "https://example.com/shard-00001-of-00003.gguf"},
+                 {"sha256", "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1"},
+                 {"filename", "shard-00001-of-00003.gguf"}},
+                {{"url", "https://example.com/shard-00002-of-00003.gguf"},
+                 {"sha256", "bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb2"},
+                 {"filename", "shard-00002-of-00003.gguf"}},
+                {{"url", "https://example.com/shard-00003-of-00003.gguf"},
+                 {"sha256", "ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc3"},
+                 {"filename", "shard-00003-of-00003.gguf"}}
+            }}
+        }}}
+    };
+
+    ModelManager &mm=ModelManager::instance();
+    mm.initialize({"config1"});
+
+    std::string error;
+    ASSERT_TRUE(mm.addModelFromJson(modelJson, error))<<error;
+
+    auto info=mm.getModelInfo("roundtrip-split");
+    ASSERT_TRUE(info.has_value());
+
+    // Serialize back to JSON
+    nlohmann::json outputJson=ModelManager::modelInfoToJson(info.value());
+
+    // Verify files array is present in output
+    ASSERT_TRUE(outputJson.contains("variants"));
+    ASSERT_EQ(outputJson["variants"].size(), 1u);
+    ASSERT_TRUE(outputJson["variants"][0].contains("files"));
+    ASSERT_EQ(outputJson["variants"][0]["files"].size(), 3u);
+    EXPECT_EQ(outputJson["variants"][0]["files"][2]["filename"].get<std::string>(), "shard-00003-of-00003.gguf");
+
+    // Verify download field is also present (backward compat)
+    ASSERT_TRUE(outputJson["variants"][0].contains("download"));
+    EXPECT_EQ(outputJson["variants"][0]["download"]["filename"].get<std::string>(), "shard-00001-of-00003.gguf");
+}
+
 } // namespace arbiterAI
\ No newline at end of file
diff --git a/tests/modelRuntimeTests.cpp b/tests/modelRuntimeTests.cpp
index 03162ef..86ee084 100644
--- a/tests/modelRuntimeTests.cpp
+++ b/tests/modelRuntimeTests.cpp
@@ -1,6 +1,7 @@
 #include "arbiterAI/modelRuntime.h"
 #include "arbiterAI/modelManager.h"
 #include "arbiterAI/hardwareDetector.h"
+#include "arbiterAI/storageManager.h"
 #include <gtest/gtest.h>
 #include <fstream>
 #include <filesystem>
@@ -484,4 +485,271 @@ TEST_F(ModelRuntimeTest, LoadLocalModelInvalidVariantFails)
     EXPECT_EQ(result, ErrorCode::ModelNotFound);
 }
 
+// --- Download concurrency settings ---
+
+TEST_F(ModelRuntimeTest, DefaultMaxConcurrentDownloadsIsTwo)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    EXPECT_EQ(rt.getMaxConcurrentDownloads(), 2);
+}
+
+TEST_F(ModelRuntimeTest, SetMaxConcurrentDownloads)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    rt.setMaxConcurrentDownloads(4);
+    EXPECT_EQ(rt.getMaxConcurrentDownloads(), 4);
+
+    rt.setMaxConcurrentDownloads(1);
+    EXPECT_EQ(rt.getMaxConcurrentDownloads(), 1);
+}
+
+TEST_F(ModelRuntimeTest, SetMaxConcurrentDownloadsClampsToMinimumOne)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    rt.setMaxConcurrentDownloads(0);
+    EXPECT_EQ(rt.getMaxConcurrentDownloads(), 1);
+
+    rt.setMaxConcurrentDownloads(-5);
+    EXPECT_EQ(rt.getMaxConcurrentDownloads(), 1);
+}
+
+// --- downloadModel ---
+
+TEST_F(ModelRuntimeTest, DownloadModelNotFoundFails)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    ErrorCode result=rt.downloadModel("nonexistent-model");
+
+    EXPECT_EQ(result, ErrorCode::ModelNotFound);
+}
+
+TEST_F(ModelRuntimeTest, DownloadModelCloudModelReturnsSuccess)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    // Cloud models (mock) have no variants — nothing to download
+    ErrorCode result=rt.downloadModel("mock-model");
+
+    EXPECT_EQ(result, ErrorCode::Success);
+}
+
+TEST_F(ModelRuntimeTest, DownloadModelInvalidVariantFails)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    ErrorCode result=rt.downloadModel("test-local-7b", "INVALID_QUANT");
+
+    EXPECT_EQ(result, ErrorCode::ModelNotFound);
+}
+
+TEST_F(ModelRuntimeTest, DownloadModelAlreadyDownloadingReturnDownloading)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    // First call should initiate download (or fail fast if no URL, but will
+    // set state to Downloading if URL is empty — the background thread will
+    // finish immediately since there are no missing files with valid URLs).
+    // For this test, we rely on the test config having empty URLs:
+    // the variant URLs are empty, so no files are actually missing (they have
+    // no URL to download from). That means downloadModel returns Success.
+    // Instead, let's verify the state tracking for an already-loaded model.
+    rt.loadModel("mock-model");
+    auto state=rt.getModelState("mock-model");
+    ASSERT_TRUE(state.has_value());
+    EXPECT_EQ(state->state, ModelState::Loaded);
+
+    // Calling download on a loaded model returns Success
+    ErrorCode result=rt.downloadModel("mock-model");
+    EXPECT_EQ(result, ErrorCode::Success);
+}
+
+TEST_F(ModelRuntimeTest, LoadModelMissingFilesReturnsDownloading)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    // Create a config with a downloadable URL (fake but non-empty)
+    std::ofstream dlModel("rt_config/models/download_model.json");
+    dlModel << R"({
+        "models": [
+            {
+                "model": "test-download-7b",
+                "provider": "llama",
+                "ranking": 40,
+                "context_window": 4096,
+                "max_tokens": 2048,
+                "hardware_requirements": {
+                    "min_system_ram_mb": 4096,
+                    "parameter_count": "7B"
+                },
+                "context_scaling": {
+                    "base_context": 4096,
+                    "max_context": 32768,
+                    "vram_per_1k_context_mb": 64
+                },
+                "variants": [
+                    {
+                        "quantization": "Q4_K_M",
+                        "file_size_mb": 1,
+                        "min_vram_mb": 128,
+                        "recommended_vram_mb": 256,
+                        "download": {
+                            "url": "http://localhost:99999/fake.gguf",
+                            "sha256": "abc123",
+                            "filename": "test-dl-7b-q4.gguf"
+                        }
+                    }
+                ]
+            }
+        ]
+    })";
+    dlModel.close();
+
+    // Re-initialize ModelManager to pick up the new config
+    ModelManager::reset();
+    ModelManager::instance().initialize({"rt_config"});
+    ModelRuntime::reset();
+
+    // Initialize StorageManager so canDownload() succeeds
+    std::filesystem::create_directories("rt_test_models");
+    StorageManager::instance().initialize("rt_test_models");
+    StorageManager::instance().setStorageLimit(static_cast<int64_t>(10)*1024*1024*1024);
+
+    // loadModel should detect missing file and return ModelDownloading
+    ErrorCode result=rt.loadModel("test-download-7b", "Q4_K_M");
+
+    EXPECT_EQ(result, ErrorCode::ModelDownloading);
+
+    auto state=rt.getModelState("test-download-7b");
+    ASSERT_TRUE(state.has_value());
+    EXPECT_EQ(state->state, ModelState::Downloading);
+
+    std::filesystem::remove_all("rt_test_models");
+}
+
+TEST_F(ModelRuntimeTest, DownloadModelMissingFilesReturnsDownloading)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    // Create a config with a downloadable URL
+    std::ofstream dlModel("rt_config/models/download_model2.json");
+    dlModel << R"({
+        "models": [
+            {
+                "model": "test-download-7b-v2",
+                "provider": "llama",
+                "ranking": 40,
+                "context_window": 4096,
+                "max_tokens": 2048,
+                "hardware_requirements": {
+                    "min_system_ram_mb": 4096,
+                    "parameter_count": "7B"
+                },
+                "context_scaling": {
+                    "base_context": 4096,
+                    "max_context": 32768,
+                    "vram_per_1k_context_mb": 64
+                },
+                "variants": [
+                    {
+                        "quantization": "Q4_K_M",
+                        "file_size_mb": 1,
+                        "min_vram_mb": 128,
+                        "recommended_vram_mb": 256,
+                        "download": {
+                            "url": "http://localhost:99999/fake2.gguf",
+                            "sha256": "def456",
+                            "filename": "test-dl-7b-v2-q4.gguf"
+                        }
+                    }
+                ]
+            }
+        ]
+    })";
+    dlModel.close();
+
+    ModelManager::reset();
+    ModelManager::instance().initialize({"rt_config"});
+    ModelRuntime::reset();
+
+    // Initialize StorageManager so canDownload() succeeds
+    std::filesystem::create_directories("rt_test_models");
+    StorageManager::instance().initialize("rt_test_models");
+    StorageManager::instance().setStorageLimit(static_cast<int64_t>(10)*1024*1024*1024);
+
+    ErrorCode result=rt.downloadModel("test-download-7b-v2", "Q4_K_M");
+
+    EXPECT_EQ(result, ErrorCode::ModelDownloading);
+
+    auto state=rt.getModelState("test-download-7b-v2");
+    ASSERT_TRUE(state.has_value());
+    EXPECT_EQ(state->state, ModelState::Downloading);
+
+    std::filesystem::remove_all("rt_test_models");
+}
+
+TEST_F(ModelRuntimeTest, ResetClearsDownloadState)
+{
+    ModelRuntime &rt=ModelRuntime::instance();
+
+    // Create a config with a downloadable URL
+    std::ofstream dlModel("rt_config/models/download_model3.json");
+    dlModel << R"({
+        "models": [
+            {
+                "model": "test-download-reset",
+                "provider": "llama",
+                "ranking": 40,
+                "context_window": 4096,
+                "max_tokens": 2048,
+                "hardware_requirements": {
+                    "min_system_ram_mb": 4096,
+                    "parameter_count": "7B"
+                },
+                "context_scaling": {
+                    "base_context": 4096,
+                    "max_context": 32768,
+                    "vram_per_1k_context_mb": 64
+                },
+                "variants": [
+                    {
+                        "quantization": "Q4_K_M",
+                        "file_size_mb": 1,
+                        "min_vram_mb": 128,
+                        "recommended_vram_mb": 256,
+                        "download": {
+                            "url": "http://localhost:99999/fake3.gguf",
+                            "sha256": "ghi789",
+                            "filename": "test-dl-reset-q4.gguf"
+                        }
+                    }
+                ]
+            }
+        ]
+    })";
+    dlModel.close();
+
+    ModelManager::reset();
+    ModelManager::instance().initialize({"rt_config"});
+    ModelRuntime::reset();
+
+    // Initialize StorageManager so canDownload() succeeds
+    std::filesystem::create_directories("rt_test_models");
+    StorageManager::instance().initialize("rt_test_models");
+    StorageManager::instance().setStorageLimit(static_cast<int64_t>(10)*1024*1024*1024);
+
+    rt.downloadModel("test-download-reset", "Q4_K_M");
+
+    // Reset should clear all state and join threads
+    ModelRuntime::reset();
+
+    EXPECT_TRUE(rt.getModelStates().empty());
+    EXPECT_EQ(rt.getMaxConcurrentDownloads(), 2);
+
+    std::filesystem::remove_all("rt_test_models");
+}
+
 } // namespace arbiterAI
diff --git a/tests/serverConnectTests.cpp b/tests/serverConnectTests.cpp
new file mode 100644
index 0000000..dd60539
--- /dev/null
+++ b/tests/serverConnectTests.cpp
@@ -0,0 +1,330 @@
+#include <gtest/gtest.h>
+
+#include "arbiterAI/arbiterAI.h"
+#include "arbiterAI/chatClient.h"
+#include "arbiterAI/modelManager.h"
+#include "arbiterAI/providers/openai.h"
+
+#include <cpr/cpr.h>
+#include <nlohmann/json.hpp>
+#include <string>
+#include <vector>
+
+namespace arbiterAI
+{
+
+// Remote arbiterAI server endpoint
+static const std::string SERVER_URL="http://192.168.2.114:8080";
+static const std::string SERVER_API_URL=SERVER_URL+"/v1";
+
+// Helper: check whether the remote server is reachable.
+static bool isServerReachable()
+{
+    auto response=cpr::Get(
+        cpr::Url{SERVER_URL+"/health"},
+        cpr::Timeout{3000});
+
+    if(response.error || response.status_code!=200)
+        return false;
+
+    try
+    {
+        auto j=nlohmann::json::parse(response.text);
+        return j.value("status", "")=="ok";
+    }
+    catch(...)
+    {
+        return false;
+    }
+}
+
+// Helper: query /v1/models on the remote server and return the first model id.
+static std::string getFirstAvailableModel()
+{
+    auto response=cpr::Get(
+        cpr::Url{SERVER_API_URL+"/models"},
+        cpr::Timeout{5000});
+
+    if(response.error || response.status_code!=200)
+        return "";
+
+    try
+    {
+        auto j=nlohmann::json::parse(response.text);
+        if(j.contains("data") && j["data"].is_array() && !j["data"].empty())
+        {
+            return j["data"][0].value("id", "");
+        }
+    }
+    catch(...)
+    {
+    }
+    return "";
+}
+
+// Helper: find a model that is currently loaded and ready for inference
+// on the remote server.  Returns empty string if none.
+static std::string getLoadedModel()
+{
+    auto response=cpr::Get(
+        cpr::Url{SERVER_URL+"/api/models/loaded"},
+        cpr::Timeout{5000});
+
+    if(response.error || response.status_code!=200)
+        return "";
+
+    try
+    {
+        auto j=nlohmann::json::parse(response.text);
+        if(!j.contains("models") || !j["models"].is_array())
+            return "";
+
+        for(const auto &m : j["models"])
+        {
+            std::string state=m.value("state", "");
+            if(state=="Loaded")
+            {
+                return m.value("model", "");
+            }
+        }
+    }
+    catch(...)
+    {
+    }
+    return "";
+}
+
+class ServerConnectTest : public ::testing::Test {
+protected:
+    void SetUp() override
+    {
+        if(!isServerReachable())
+        {
+            GTEST_SKIP()<<"arbiterAI server not reachable at "<<SERVER_URL;
+        }
+    }
+};
+
+// Fixture for tests that require a model loaded and ready on the server.
+class ServerCompletionTest : public ::testing::Test {
+protected:
+    void SetUp() override
+    {
+        if(!isServerReachable())
+        {
+            GTEST_SKIP()<<"arbiterAI server not reachable at "<<SERVER_URL;
+        }
+
+        m_remoteModel=getLoadedModel();
+        if(m_remoteModel.empty())
+        {
+            GTEST_SKIP()<<"No model currently loaded on remote server";
+        }
+
+        // Reset library state for a clean test
+        ModelManager::reset();
+
+        // Register the remote model in the local ModelManager so the library
+        // can route requests through the OpenAI provider.
+        ModelInfo info;
+        info.model=m_remoteModel;
+        info.provider="openai";
+        info.contextWindow=131072;
+        info.maxTokens=4096;
+        ModelManager::instance().addModel(info);
+
+        // Create an ArbiterAI instance and wire up the OpenAI provider
+        // pointed at the remote arbiterAI server.
+        m_ai=std::make_unique<ArbiterAI>();
+
+        auto provider=std::make_unique<OpenAI>();
+        provider->setApiUrl(SERVER_API_URL);
+        provider->setApiKey("not-needed");
+        provider->initialize(ModelManager::instance().getModels("openai"));
+        m_ai->providers["openai"]=std::move(provider);
+        m_ai->initialized=true;
+    }
+
+    void TearDown() override
+    {
+        m_ai.reset();
+        ModelManager::reset();
+    }
+
+    std::string m_remoteModel;
+    std::unique_ptr<ArbiterAI> m_ai;
+};
+
+// ── Health ────────────────────────────────────────────────────
+
+TEST_F(ServerConnectTest, HealthEndpointReturnsOk)
+{
+    auto response=cpr::Get(
+        cpr::Url{SERVER_URL+"/health"},
+        cpr::Timeout{5000});
+
+    ASSERT_EQ(response.status_code, 200);
+
+    auto j=nlohmann::json::parse(response.text);
+    EXPECT_EQ(j.value("status", ""), "ok");
+    EXPECT_FALSE(j.value("version", "").empty());
+}
+
+// ── Model Listing ────────────────────────────────────────────
+
+TEST_F(ServerConnectTest, ListModelsReturnsNonEmpty)
+{
+    auto response=cpr::Get(
+        cpr::Url{SERVER_API_URL+"/models"},
+        cpr::Timeout{5000});
+
+    ASSERT_EQ(response.status_code, 200);
+
+    auto j=nlohmann::json::parse(response.text);
+    ASSERT_TRUE(j.contains("data"));
+    ASSERT_TRUE(j["data"].is_array());
+    EXPECT_GT(j["data"].size(), 0u);
+
+    // Each entry should have an "id"
+    for(const auto &model : j["data"])
+    {
+        EXPECT_TRUE(model.contains("id"));
+        EXPECT_FALSE(model["id"].get<std::string>().empty());
+    }
+}
+
+// ── Library Completion via OpenAI Provider ────────────────────
+
+TEST_F(ServerCompletionTest, CompletionThroughLibrary)
+{
+    CompletionRequest request;
+    request.model=m_remoteModel;
+    request.messages={{"user", "Say hello in exactly one word."}};
+    request.max_tokens=32;
+    request.temperature=0.0;
+    request.api_key="not-needed";
+
+    CompletionResponse response;
+    ErrorCode result=m_ai->completion(request, response);
+
+    ASSERT_EQ(result, ErrorCode::Success)
+        <<"Completion failed for model: "<<m_remoteModel;
+
+    EXPECT_FALSE(response.text.empty())
+        <<"Response text should not be empty";
+    EXPECT_GT(response.usage.total_tokens, 0)
+        <<"Token usage should be reported";
+}
+
+// ── Library Streaming Completion ──────────────────────────────
+
+TEST_F(ServerCompletionTest, StreamingCompletionThroughLibrary)
+{
+    CompletionRequest request;
+    request.model=m_remoteModel;
+    request.messages={{"user", "Count from 1 to 3."}};
+    request.max_tokens=64;
+    request.temperature=0.0;
+    request.api_key="not-needed";
+
+    std::string accumulated;
+    int chunkCount=0;
+
+    auto callback=[&](const std::string &chunk)
+    {
+        accumulated+=chunk;
+        chunkCount++;
+    };
+
+    ErrorCode result=m_ai->streamingCompletion(request, callback);
+
+    ASSERT_EQ(result, ErrorCode::Success)
+        <<"Streaming completion failed for model: "<<m_remoteModel;
+
+    EXPECT_FALSE(accumulated.empty())
+        <<"Accumulated streaming text should not be empty";
+    EXPECT_GT(chunkCount, 0)
+        <<"Should have received at least one chunk";
+}
+
+// ── ChatClient Session ────────────────────────────────────────
+
+TEST_F(ServerCompletionTest, ChatClientSessionCompletion)
+{
+    ChatConfig config;
+    config.model=m_remoteModel;
+    config.maxTokens=64;
+    config.temperature=0.0;
+    config.apiKey="not-needed";
+
+    auto client=m_ai->createChatClient(config);
+    ASSERT_NE(client, nullptr)
+        <<"Failed to create ChatClient for model: "<<m_remoteModel;
+
+    CompletionRequest request;
+    request.messages={{"user", "What is 2 + 2? Answer with just the number."}};
+    request.max_tokens=16;
+    request.temperature=0.0;
+
+    CompletionResponse response;
+    ErrorCode result=client->completion(request, response);
+
+    ASSERT_EQ(result, ErrorCode::Success);
+    EXPECT_FALSE(response.text.empty());
+
+    // Verify the history was updated
+    auto history=client->getHistory();
+    EXPECT_GE(history.size(), 2u)
+        <<"History should contain at least the user message and assistant response";
+
+    // Verify usage stats
+    UsageStats stats;
+    client->getUsageStats(stats);
+    EXPECT_GT(stats.totalTokens, 0);
+    EXPECT_EQ(stats.completionCount, 1);
+}
+
+// ── Multi-turn Conversation ───────────────────────────────────
+
+TEST_F(ServerCompletionTest, MultiTurnConversation)
+{
+    ChatConfig config;
+    config.model=m_remoteModel;
+    config.maxTokens=64;
+    config.temperature=0.0;
+    config.apiKey="not-needed";
+
+    auto client=m_ai->createChatClient(config);
+    ASSERT_NE(client, nullptr);
+
+    // Turn 1
+    CompletionRequest req1;
+    req1.messages={{"user", "Remember the number 42."}};
+    req1.max_tokens=64;
+    req1.temperature=0.0;
+
+    CompletionResponse resp1;
+    ASSERT_EQ(client->completion(req1, resp1), ErrorCode::Success);
+    EXPECT_FALSE(resp1.text.empty());
+
+    // Turn 2 — references turn 1
+    CompletionRequest req2;
+    req2.messages={{"user", "What number did I ask you to remember?"}};
+    req2.max_tokens=32;
+    req2.temperature=0.0;
+
+    CompletionResponse resp2;
+    ASSERT_EQ(client->completion(req2, resp2), ErrorCode::Success);
+    EXPECT_FALSE(resp2.text.empty());
+
+    // History should contain system (if any) + 4 messages (2 user + 2 assistant)
+    auto history=client->getHistory();
+    EXPECT_GE(history.size(), 4u);
+
+    // Usage should reflect both completions
+    UsageStats stats;
+    client->getUsageStats(stats);
+    EXPECT_EQ(stats.completionCount, 2);
+}
+
+} // namespace arbiterAI
diff --git a/vcpkg/custom_ports/llama-cpp/0001-external-ggml.patch b/vcpkg/custom_ports/llama-cpp/0001-external-ggml.patch
deleted file mode 100644
index 3a745b4..0000000
--- a/vcpkg/custom_ports/llama-cpp/0001-external-ggml.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -145,6 +145,8 @@ endif()
- # 3rd-party
- #
- 
-+find_package(ggml CONFIG REQUIRED)
-+add_library(ggml ALIAS ggml::ggml)
- if (NOT TARGET ggml)
-     add_subdirectory(ggml)
-     # ... otherwise assume ggml is added by a parent CMakeLists.txt
diff --git a/vcpkg/custom_ports/llama-cpp/portfile.cmake b/vcpkg/custom_ports/llama-cpp/portfile.cmake
index dc01956..7007666 100644
--- a/vcpkg/custom_ports/llama-cpp/portfile.cmake
+++ b/vcpkg/custom_ports/llama-cpp/portfile.cmake
@@ -2,26 +2,26 @@ vcpkg_from_github(
     OUT_SOURCE_PATH SOURCE_PATH
     REPO ggml-org/llama.cpp
     REF b${VERSION}
-    SHA512 e093f4c7d4b2de425932bb4960683527a8a3bba242132c2f5e5bfed8480f0e336a06f97baf2d20ee591c6deee7535e159d40884a5e3f7caf0ae0967b8a046850
+    SHA512 b05f130a2052d3c2cec483c3b098f71585fe7d00fa1971786c0a646717f82320211801780625b9aabc9fc1e1797f8995381e40661f3e8a115c72710f147083cd
     HEAD_REF master
-    PATCHES
-        0001-external-ggml.patch
 )
 
-file(REMOVE_RECURSE "${SOURCE_PATH}/ggml")
-
 vcpkg_cmake_configure(
     SOURCE_PATH "${SOURCE_PATH}"
     OPTIONS
       -DGGML_CCACHE=OFF
+      -DGGML_VULKAN=ON
       -DLLAMA_BUILD_TESTS=OFF
       -DLLAMA_BUILD_EXAMPLES=OFF
+      -DLLAMA_BUILD_TOOLS=OFF
+      -DLLAMA_BUILD_SERVER=OFF
       -DLLAMA_ALL_WARNINGS=OFF
       ${FEATURE_OPTIONS}
 )
 
 vcpkg_cmake_install()
-vcpkg_cmake_config_fixup(PACKAGE_NAME llama CONFIG_PATH "lib/cmake/llama")
+vcpkg_cmake_config_fixup(PACKAGE_NAME llama CONFIG_PATH "lib/cmake/llama" DO_NOT_DELETE_PARENT_CONFIG_PATH)
+vcpkg_cmake_config_fixup(PACKAGE_NAME ggml  CONFIG_PATH "lib/cmake/ggml")
 vcpkg_copy_pdbs()
 vcpkg_fixup_pkgconfig()
 
diff --git a/vcpkg/custom_ports/llama-cpp/vcpkg.json b/vcpkg/custom_ports/llama-cpp/vcpkg.json
index e20d77f..f6565d7 100644
--- a/vcpkg/custom_ports/llama-cpp/vcpkg.json
+++ b/vcpkg/custom_ports/llama-cpp/vcpkg.json
@@ -1,12 +1,11 @@
 {
     "name": "llama-cpp",
-    "version": "4743",
+    "version": "8573",
     "port-version": 1,
     "description": "LLM inference in C/C++",
     "homepage": "https://github.com/ggml-org/llama.cpp",
     "license": "MIT",
     "dependencies": [
-      "ggml",
       {
         "name": "vcpkg-cmake",
         "host": true