caseymcc · caseymcc · May 11, 2026 · May 2, 2026 · May 10, 2026
diff --git a/.gitignore b/.gitignore
@@ -45,4 +45,5 @@ models/
 # local info
 push-server.sh
 docs/tasks/
-tmp/
+tmp/
+ansible/
diff --git a/arbiterAI_config b/arbiterAI_config
diff --git a/docs/server.md b/docs/server.md
@@ -65,6 +65,11 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
     "models_dir": "/models",
     "default_model": "",
     "default_variant": "",
+    "startup_defaults": {
+      "cpu": {"model": "", "variant": ""},
+      "cuda": {"model": "", "variant": ""},
+      "vulkan": {"model": "", "variant": ""}
+    },
     "override_path": "",
     "ram_budget_mb": 0,
     "max_concurrent_downloads": 2,
@@ -99,6 +104,7 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
 | `models_dir` | `string` | `"/models"` | Directory for downloaded model files |
 | `default_model` | `string` | `""` | Model to load on startup |
 | `default_variant` | `string` | `""` | Default quantization variant (e.g., `Q4_K_M`) |
+| `startup_defaults` | `object` | `{}` | Per-accelerator startup defaults used on restart. Keys: `cpu`, `cuda`, `vulkan`, each with `model` and optional `variant`. If unset, the server falls back to `default_model` / `default_variant`. |
 | `override_path` | `string` | `""` | Path to write runtime model config overrides |
 | `ram_budget_mb` | `int` | `0` | Ready-model RAM budget in MB (`0` = auto 50%) |
 | `max_concurrent_downloads` | `int` | `2` | Maximum simultaneous model downloads |

diff --git a/examples/server_config.json b/examples/server_config.json
@@ -10,6 +10,32 @@
 
     "default_model": "",
     "default_variant": "",
+    "startup_defaults": {
+        "cpu": {
+            "model": "",
+            "variant": ""
+        },
+        "cuda": {
+            "model": "",
+            "variant": ""
+        },
+        "vulkan": {
+            "model": "",
+            "variant": ""
+        }
+    },
+
+    "startup_models": [
+        {
+            "model": "Qwen3.5-27B",
+            "variant": "Q4_K_M",
+            "context_size": 248832,
+            "runtime_options": {
+                "flash_attn": true
+            },
+            "devices": [1]
+        }
+    ],
 
     "override_path": "",
 

diff --git a/scripts/context_stress_test.sh b/scripts/context_stress_test.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# context_stress_test.sh — Progressively fill context on ai-lab to find the real limit
+#
+# The model (Qwen3.5-27B:Q4_K_M) is loaded with 248832 context on the MI50 32GB.
+# This script sends increasingly large prompts and observes when llama.cpp errors.
+#
+# Strategy: probe a fixed, ascending list of prompt sizes until the first failure,
+# then binary search between last-good and first-bad (to within ~2000 tokens).
+
+set -euo pipefail
+
+SERVER="http://192.168.2.101:8081"
+MODEL="Qwen3.5-27B"
+RESULTS_FILE="/tmp/context_stress_results.txt"
+
+echo "Context Stress Test - $(date)" | tee "$RESULTS_FILE"
+echo "Server: $SERVER" | tee -a "$RESULTS_FILE"
+echo "Model: $MODEL (Q4_K_M)" | tee -a "$RESULTS_FILE"
+echo "Configured context: 248832" | tee -a "$RESULTS_FILE"
+echo "========================================" | tee -a "$RESULTS_FILE"
+
+# Generate a repeating text block to fill context
+# ~4 chars per token for English text is a rough estimate
+# We'll use a simple repeating pattern
+generate_payload() {
+    local target_tokens=$1
+    # Each word "hello " is roughly 1-2 tokens; use ~3.5 chars/token estimate
+    local char_count=$((target_tokens * 4))
+
+    # Generate repeating text
+    local text=""
+    local block="The quick brown fox jumps over the lazy dog. This is a test of context window capacity. "
+    local block_len=${#block}
+    local repeats=$((char_count / block_len + 1))
+
+    # Use python for efficiency with large strings
+    python3 -c "
+import json, sys
+
+target_chars = $char_count
+block = 'The quick brown fox jumps over the lazy dog. This is a test of context window capacity. '
+text = (block * ($repeats))[:target_chars]
+
+payload = {
+    'model': '$MODEL',
+    'messages': [
+        {'role': 'system', 'content': 'You are a helpful assistant. Respond with exactly one word: OK'},
+        {'role': 'user', 'content': text}
+    ],
+    'max_tokens': 5,
+    'temperature': 0.0
+}
+
+json.dump(payload, sys.stdout)
+"
+}
+
+# Send a request and check if it succeeds
+test_context() {
+    local target_tokens=$1
+    local start_time=$(date +%s%N)
+
+    echo -n "  Testing ~${target_tokens} tokens... " | tee -a "$RESULTS_FILE"
+
+    # Generate payload and send
+    local response
+    local http_code
+
+    # Write payload to temp file to handle large sizes
+    generate_payload "$target_tokens" > /tmp/context_test_payload.json
+    local payload_size=$(wc -c < /tmp/context_test_payload.json)
+    echo -n "(payload: ${payload_size} bytes) " | tee -a "$RESULTS_FILE"
+
+    # Send request with extended timeout (large context = slow)
+    response=$(curl -sf -w "\n%{http_code}" \
+        --max-time 300 \
+        -X POST "${SERVER}/v1/chat/completions" \
+        -H "Content-Type: application/json" \
+        -d @/tmp/context_test_payload.json 2>&1) || {
+        local exit_code=$?
+        echo "CURL_ERROR (exit=$exit_code)" | tee -a "$RESULTS_FILE"
+        echo "  Response: $(echo "$response" | tail -5)" | tee -a "$RESULTS_FILE"
+        return 1
+    }
+
+    http_code=$(echo "$response" | tail -1)
+    local body=$(echo "$response" | sed '$d')
+
+    local end_time=$(date +%s%N)
+    local elapsed_ms=$(( (end_time - start_time) / 1000000 ))
+
+    if [ "$http_code" = "200" ]; then
+        local prompt_tokens=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('usage',{}).get('prompt_tokens','?'))" 2>/dev/null || echo "?")
+        echo "OK (HTTP 200, prompt_tokens=${prompt_tokens}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE"
+        return 0
+    else
+        local error_msg=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',{}).get('message','unknown')[:200])" 2>/dev/null || echo "$body" | head -c 200)
+        echo "FAILED (HTTP ${http_code}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE"
+        echo "  Error: ${error_msg}" | tee -a "$RESULTS_FILE"
+        return 1
+    fi
+}
+
+# Phase 1: walk an ascending list of prompt sizes to bracket where requests
+# start failing.
+echo "" | tee -a "$RESULTS_FILE"
+echo "Phase 1: Exponential probing" | tee -a "$RESULTS_FILE"
+echo "----------------------------------------" | tee -a "$RESULTS_FILE"
+
+# Context size the model was loaded with; used only for reporting below.
+CONFIGURED_CONTEXT=248832
+
+# Start with small amounts and increase
+TOKEN_SIZES=(1000 4000 8000 16000 32000 64000 96000 128000 160000 192000 224000 240000 248000)
+
+# 0 means "none seen yet" for both bounds.
+last_good=0
+first_bad=0
+
+for tokens in "${TOKEN_SIZES[@]}"; do
+    if test_context "$tokens"; then
+        last_good=$tokens
+    else
+        first_bad=$tokens
+        break
+    fi
+done
+
+if [ "$first_bad" -eq 0 ]; then
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "All tests passed! Model handled up to ~${last_good} tokens." | tee -a "$RESULTS_FILE"
+    echo "The full ${CONFIGURED_CONTEXT} context appears usable." | tee -a "$RESULTS_FILE"
+elif [ "$last_good" -eq 0 ]; then
+    # Not even the smallest probe succeeded, so there is no known-good lower
+    # bound. Reporting "0-${first_bad} tokens" as a context limit would be
+    # misleading; say so instead of running the search.
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "========================================" | tee -a "$RESULTS_FILE"
+    echo "RESULT: No probe succeeded (smallest attempt: ~${first_bad} tokens)" | tee -a "$RESULTS_FILE"
+    echo "  Cannot estimate a context limit. Verify that ${SERVER} is reachable and ${MODEL} is loaded." | tee -a "$RESULTS_FILE"
+else
+    # Phase 2: Binary search between last_good and first_bad
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "Phase 2: Binary search between ${last_good} and ${first_bad}" | tee -a "$RESULTS_FILE"
+    echo "----------------------------------------" | tee -a "$RESULTS_FILE"
+
+    low=$last_good
+    high=$first_bad
+
+    # Stop once the bracket is at most 2000 tokens wide; the token counts are
+    # estimates, so finer resolution would not be meaningful.
+    while [ $((high - low)) -gt 2000 ]; do
+        mid=$(( (low + high) / 2 ))
+        if test_context "$mid"; then
+            low=$mid
+        else
+            high=$mid
+        fi
+    done
+
+    echo "" | tee -a "$RESULTS_FILE"
+    echo "========================================" | tee -a "$RESULTS_FILE"
+    echo "RESULT: Maximum usable context is approximately ${low}-${high} tokens" | tee -a "$RESULTS_FILE"
+    echo "  Last successful: ~${low} tokens" | tee -a "$RESULTS_FILE"
+    echo "  First failure:   ~${high} tokens" | tee -a "$RESULTS_FILE"
+    echo "  Configured max:  ${CONFIGURED_CONTEXT} tokens" | tee -a "$RESULTS_FILE"
+    echo "  Utilization:     $(python3 -c "print(f'{${low}/${CONFIGURED_CONTEXT}*100:.1f}%')")" | tee -a "$RESULTS_FILE"
+fi
+
+echo "" | tee -a "$RESULTS_FILE"
+echo "Full results saved to: $RESULTS_FILE" | tee -a "$RESULTS_FILE"
+echo "Done - $(date)" | tee -a "$RESULTS_FILE"
diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp
@@ -533,14 +533,14 @@ ErrorCode ArbiterAI::getAvailableModels(std::vector<std::string>& models)
 // ========== Local Model Management ==========
 
 ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize,
-    const RuntimeOptions *optionsOverride)
+    const RuntimeOptions *optionsOverride, const std::vector<int> &targetDevices)
 {
     RuntimeOptions opts;
     if(optionsOverride)
     {
         opts=*optionsOverride;
     }
-    return ModelRuntime::instance().loadModel(model, variant, contextSize, opts);
+    return ModelRuntime::instance().loadModel(model, variant, contextSize, opts, targetDevices);
 }
 
 ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant)

diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h
@@ -650,7 +650,7 @@ class ArbiterAI
      * @return ErrorCode indicating success, ModelDownloading, or failure
      */
     ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0,
-        const RuntimeOptions *optionsOverride=nullptr);
+        const RuntimeOptions *optionsOverride=nullptr, const std::vector<int> &targetDevices={});
 
     /**
      * @brief Download model files without loading into VRAM

diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp
@@ -560,7 +560,8 @@ void HardwareDetector::detectNvmlGpus()
             }
         }
 
-        spdlog::info("NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})",
+        spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
+            "NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})",
             gpu.index, gpu.name, gpu.vramTotalMb, gpu.vramFreeMb, gpu.computeCapability);
 
         m_systemInfo.gpus.push_back(gpu);
@@ -764,8 +765,12 @@ void HardwareDetector::detectVulkanGpus()
 
             const VkPhysicalDeviceMemoryProperties &mp=memProps2.memoryProperties;
 
-            // Sum DEVICE_LOCAL heaps — on discrete GPUs this is dedicated VRAM,
-            // on UMA systems this is the GPU-accessible portion of system RAM.
+            // Collect DEVICE_LOCAL heap info for budget and usage tracking.
+            // Cards like the MI50 32GB expose multiple DEVICE_LOCAL heaps
+            // (e.g. CPU-visible BAR heap + GPU-only heap). The budget from
+            // VK_EXT_memory_budget is the authoritative measure of how much
+            // VRAM is actually allocatable — it accounts for BAR limitations,
+            // other processes, and driver reservations.
             uint64_t deviceLocalBudgetBytes=0;
             uint64_t deviceLocalUsageBytes=0;
             uint64_t deviceLocalSizeBytes=0;
@@ -799,20 +804,39 @@ void HardwareDetector::detectVulkanGpus()
 
             gpu.hasMemoryBudget=true;
 
-            // Budget is the best estimate of how much this process can allocate.
-            // On UMA, budget may be significantly larger than the raw heap size
-            // (driver exposes most of system RAM as available to the GPU).
             uint64_t budgetTotalMb=deviceLocalBudgetBytes/(1024ULL*1024ULL);
-            uint64_t budgetUsedMb=deviceLocalUsageBytes/(1024ULL*1024ULL);
             uint64_t heapSizeMb=deviceLocalSizeBytes/(1024ULL*1024ULL);
 
-            // Use the larger of heap size and budget for total — on some UMA
-            // drivers the budget exceeds the reported heap size.
-            uint64_t effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
+            uint64_t effectiveTotalMb;
             uint64_t effectiveFreeMb=(deviceLocalBudgetBytes>deviceLocalUsageBytes)
                 ? (deviceLocalBudgetBytes-deviceLocalUsageBytes)/(1024ULL*1024ULL)
                 : 0;
 
+            if(isIntegrated)
+            {
+                // UMA/integrated GPUs: budget may exceed heap size (driver
+                // exposes system RAM as GPU-accessible). Use the larger value.
+                effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
+            }
+            else
+            {
+                // Discrete GPUs: budget is the authoritative allocatable total.
+                // When a device has multiple DEVICE_LOCAL heaps (e.g. visible
+                // BAR heap + GPU-only heap), the budget for the BAR heap may
+                // be much smaller than its physical size if Resizable BAR is
+                // not enabled. Using heap size would over-report and cause
+                // model loads that overcommit VRAM and spill to system RAM.
+                effectiveTotalMb=budgetTotalMb;
+
+                if(budgetTotalMb<heapSizeMb*90/100)
+                {
+                    spdlog::warn("Vulkan GPU {}: allocatable budget ({}MB) is significantly less than "
+                        "physical VRAM ({}MB). This typically means Resizable BAR / Above 4G Decoding "
+                        "is not enabled in BIOS. Enable it to unlock the full VRAM.",
+                        gpu.index, budgetTotalMb, heapSizeMb);
+                }
+            }
+
             gpu.vramTotalMb=static_cast<int>(effectiveTotalMb);
             gpu.vramFreeMb=static_cast<int>(effectiveFreeMb);
 
@@ -826,8 +850,8 @@ void HardwareDetector::detectVulkanGpus()
             }
 
             spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
-                "Vulkan GPU {}: {} (budget: {}MB total, {}MB free, "
-                "heap size: {}MB, integrated={}, memoryBudget=true)",
+                "Vulkan GPU {}: {} (allocatable: {}MB, free: {}MB, "
+                "physical: {}MB, integrated={}, memoryBudget=true)",
                 gpu.index, gpu.name,
                 gpu.vramTotalMb, gpu.vramFreeMb,
                 static_cast<int>(heapSizeMb), gpu.unifiedMemory);
@@ -852,7 +876,7 @@ void HardwareDetector::detectVulkanGpus()
 
                 if(deviceLocal)
                 {
-                    vramTotalMb+=static_cast<int>(memProps.memoryHeaps[h].size/(1024*1024));
+                    vramTotalMb+=heapInfo.sizeMb;
                 }
             }
 

diff --git a/src/arbiterAI/modelFitCalculator.h b/src/arbiterAI/modelFitCalculator.h
@@ -33,7 +33,6 @@ class ModelFitCalculator {
         const std::vector<ModelInfo> &models,
         const SystemInfo &hw);
 
-private:
     /// Sum free VRAM across a set of GPU indices.
     /// For unified memory GPUs, uses gpuAccessibleRamFreeMb when available.
     static int sumFreeVram(const SystemInfo &hw, const std::vector<int> &gpuIndices);
@@ -42,6 +41,7 @@ class ModelFitCalculator {
     /// For unified memory GPUs, uses gpuAccessibleRamMb when available.
     static int sumTotalVram(const SystemInfo &hw, const std::vector<int> &gpuIndices);
 
+private:
     /// Get all GPU indices from the system info.
     static std::vector<int> allGpuIndices(const SystemInfo &hw);
 

diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp
@@ -72,6 +72,7 @@ void RuntimeOptions::mergeFrom(const RuntimeOptions &other)
     if(other.swaFull.has_value()) swaFull=other.swaFull;
     if(other.nGpuLayers.has_value()) nGpuLayers=other.nGpuLayers;
     if(other.overrideTensor.has_value()) overrideTensor=other.overrideTensor;
+    if(other.vulkanNoHostVisibleVram.has_value()) vulkanNoHostVisibleVram=other.vulkanNoHostVisibleVram;
 }
 
 ModelManager &ModelManager::instance()
@@ -365,6 +366,22 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in
                     variant.files.push_back(vd);
                 }
             }
+
+            // Skip CLIP/mmproj variants — these are multimodal projection
+            // files, not standalone models.  Loading them as the main model
+            // causes llama.cpp to fail with "CLIP cannot be used as main model".
+            std::string primaryFile=variant.getPrimaryFilename();
+            std::string primaryLower=primaryFile;
+            std::transform(primaryLower.begin(), primaryLower.end(), primaryLower.begin(), ::tolower);
+            if(primaryLower.find("mmproj")!=std::string::npos||
+                primaryLower.find("clip-")!=std::string::npos||
+                primaryLower.find("vision-")!=std::string::npos)
+            {
+                spdlog::debug("Skipping multimodal projection variant '{}' for model '{}' (file: {})",
+                    variant.quantization, info.model, primaryFile);
+                continue;
+            }
+
             info.variants.push_back(variant);
         }
     }

diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h
@@ -51,6 +51,7 @@ struct RuntimeOptions {
     std::optional<bool> swaFull;                // --swa-full: full SWA (sliding window attention)
     std::optional<int> nGpuLayers;              // -ngl: number of GPU layers (99=all)
     std::optional<std::string> overrideTensor;  // -ot: tensor override pattern (e.g. "per_layer_token_embd.weight=CPU")
+    std::optional<bool> vulkanNoHostVisibleVram; // GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM: skip BAR-mapped heap, force device-local only
 
     /// Merge another set of options on top of this one (override only non-empty fields).
     void mergeFrom(const RuntimeOptions &other);