Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ models/
# local info
push-server.sh
docs/tasks/
tmp/
tmp/
ansible/
2 changes: 1 addition & 1 deletion arbiterAI_config
6 changes: 6 additions & 0 deletions docs/server.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
"models_dir": "/models",
"default_model": "",
"default_variant": "",
"startup_defaults": {
"cpu": {"model": "", "variant": ""},
"cuda": {"model": "", "variant": ""},
"vulkan": {"model": "", "variant": ""}
},
"override_path": "",
"ram_budget_mb": 0,
"max_concurrent_downloads": 2,
Expand Down Expand Up @@ -99,6 +104,7 @@ All server settings are defined in a JSON configuration file. See [`examples/ser
| `models_dir` | `string` | `"/models"` | Directory for downloaded model files |
| `default_model` | `string` | `""` | Model to load on startup |
| `default_variant` | `string` | `""` | Default quantization variant (e.g., `Q4_K_M`) |
| `startup_defaults` | `object` | `{}` | Per-accelerator startup defaults used on restart. Keys: `cpu`, `cuda`, `vulkan`, each with `model` and optional `variant`. If unset, the server falls back to `default_model` / `default_variant`. |
| `override_path` | `string` | `""` | Path to write runtime model config overrides |
| `ram_budget_mb` | `int` | `0` | Ready-model RAM budget in MB (`0` = auto 50%) |
| `max_concurrent_downloads` | `int` | `2` | Maximum simultaneous model downloads |
Expand Down
26 changes: 26 additions & 0 deletions examples/server_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,32 @@

"default_model": "",
"default_variant": "",
"startup_defaults": {
"cpu": {
"model": "",
"variant": ""
},
"cuda": {
"model": "",
"variant": ""
},
"vulkan": {
"model": "",
"variant": ""
}
},

"startup_models": [
{
"model": "Qwen3.5-27B",
"variant": "Q4_K_M",
"context_size": 248832,
"runtime_options": {
"flash_attn": true
},
"devices": [1]
}
],

"override_path": "",

Expand Down
157 changes: 157 additions & 0 deletions scripts/context_stress_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/bin/bash
# context_stress_test.sh — Progressively fill context on ai-lab to find the real limit
#
# The model (Qwen3.5-27B:Q4_K_M) is loaded with 248832 context on the MI50 32GB.
# This script sends increasingly large prompts and observes when llama.cpp errors.
#
# Strategy: Use a binary search approach. Start with a known-good token count,
# then double until failure, then binary search between last-good and first-bad.

set -euo pipefail

SERVER="http://192.168.2.101:8081"
MODEL="Qwen3.5-27B"
RESULTS_FILE="/tmp/context_stress_results.txt"

echo "Context Stress Test - $(date)" | tee "$RESULTS_FILE"
echo "Server: $SERVER" | tee -a "$RESULTS_FILE"
echo "Model: $MODEL (Q4_K_M)" | tee -a "$RESULTS_FILE"
echo "Configured context: 248832" | tee -a "$RESULTS_FILE"
echo "========================================" | tee -a "$RESULTS_FILE"

# Generate a repeating text block to fill context
# ~4 chars per token for English text is a rough estimate
# We'll use a simple repeating pattern
generate_payload() {
local target_tokens=$1
# Each word "hello " is roughly 1-2 tokens; use ~3.5 chars/token estimate
local char_count=$((target_tokens * 4))

# Generate repeating text
local text=""
local block="The quick brown fox jumps over the lazy dog. This is a test of context window capacity. "
local block_len=${#block}
local repeats=$((char_count / block_len + 1))

# Use python for efficiency with large strings
python3 -c "
import json, sys

target_chars = $char_count
block = 'The quick brown fox jumps over the lazy dog. This is a test of context window capacity. '
text = (block * ($repeats))[:target_chars]

payload = {
'model': '$MODEL',
'messages': [
{'role': 'system', 'content': 'You are a helpful assistant. Respond with exactly one word: OK'},
{'role': 'user', 'content': text}
],
'max_tokens': 5,
'temperature': 0.0
}

json.dump(payload, sys.stdout)
"
}

# Send a request and check if it succeeds
test_context() {
local target_tokens=$1
local start_time=$(date +%s%N)

echo -n " Testing ~${target_tokens} tokens... " | tee -a "$RESULTS_FILE"

# Generate payload and send
local response
local http_code

# Write payload to temp file to handle large sizes
generate_payload "$target_tokens" > /tmp/context_test_payload.json
local payload_size=$(wc -c < /tmp/context_test_payload.json)
echo -n "(payload: ${payload_size} bytes) " | tee -a "$RESULTS_FILE"

# Send request with extended timeout (large context = slow)
response=$(curl -sf -w "\n%{http_code}" \
--max-time 300 \
-X POST "${SERVER}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d @/tmp/context_test_payload.json 2>&1) || {
local exit_code=$?
echo "CURL_ERROR (exit=$exit_code)" | tee -a "$RESULTS_FILE"
echo " Response: $(echo "$response" | tail -5)" | tee -a "$RESULTS_FILE"
return 1
}

http_code=$(echo "$response" | tail -1)
local body=$(echo "$response" | sed '$d')

local end_time=$(date +%s%N)
local elapsed_ms=$(( (end_time - start_time) / 1000000 ))

if [ "$http_code" = "200" ]; then
local prompt_tokens=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('usage',{}).get('prompt_tokens','?'))" 2>/dev/null || echo "?")
echo "OK (HTTP 200, prompt_tokens=${prompt_tokens}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE"
return 0
else
local error_msg=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',{}).get('message','unknown')[:200])" 2>/dev/null || echo "$body" | head -c 200)
echo "FAILED (HTTP ${http_code}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE"
echo " Error: ${error_msg}" | tee -a "$RESULTS_FILE"
return 1
fi
}

# Phase 1: Exponential probing - find the ballpark where it fails
echo "" | tee -a "$RESULTS_FILE"
echo "Phase 1: Exponential probing" | tee -a "$RESULTS_FILE"
echo "----------------------------------------" | tee -a "$RESULTS_FILE"

# Start with small amounts and increase
TOKEN_SIZES=(1000 4000 8000 16000 32000 64000 96000 128000 160000 192000 224000 240000 248000)

last_good=0
first_bad=0

for tokens in "${TOKEN_SIZES[@]}"; do
if test_context "$tokens"; then
last_good=$tokens
else
first_bad=$tokens
break
fi
done

if [ "$first_bad" -eq 0 ]; then
echo "" | tee -a "$RESULTS_FILE"
echo "All tests passed! Model handled up to ~${last_good} tokens." | tee -a "$RESULTS_FILE"
echo "The full 248832 context appears usable." | tee -a "$RESULTS_FILE"
else
# Phase 2: Binary search between last_good and first_bad
echo "" | tee -a "$RESULTS_FILE"
echo "Phase 2: Binary search between ${last_good} and ${first_bad}" | tee -a "$RESULTS_FILE"
echo "----------------------------------------" | tee -a "$RESULTS_FILE"

low=$last_good
high=$first_bad

while [ $((high - low)) -gt 2000 ]; do
mid=$(( (low + high) / 2 ))
if test_context "$mid"; then
low=$mid
else
high=$mid
fi
done

echo "" | tee -a "$RESULTS_FILE"
echo "========================================" | tee -a "$RESULTS_FILE"
echo "RESULT: Maximum usable context is approximately ${low}-${high} tokens" | tee -a "$RESULTS_FILE"
echo " Last successful: ~${low} tokens" | tee -a "$RESULTS_FILE"
echo " First failure: ~${high} tokens" | tee -a "$RESULTS_FILE"
echo " Configured max: 248832 tokens" | tee -a "$RESULTS_FILE"
echo " Utilization: $(python3 -c "print(f'{${low}/248832*100:.1f}%')")" | tee -a "$RESULTS_FILE"
fi

echo "" | tee -a "$RESULTS_FILE"
echo "Full results saved to: $RESULTS_FILE" | tee -a "$RESULTS_FILE"
echo "Done - $(date)" | tee -a "$RESULTS_FILE"
4 changes: 2 additions & 2 deletions src/arbiterAI/arbiterAI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,14 +533,14 @@ ErrorCode ArbiterAI::getAvailableModels(std::vector<std::string>& models)
// ========== Local Model Management ==========

ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize,
const RuntimeOptions *optionsOverride)
const RuntimeOptions *optionsOverride, const std::vector<int> &targetDevices)
{
RuntimeOptions opts;
if(optionsOverride)
{
opts=*optionsOverride;
}
return ModelRuntime::instance().loadModel(model, variant, contextSize, opts);
return ModelRuntime::instance().loadModel(model, variant, contextSize, opts, targetDevices);
}

ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant)
Expand Down
2 changes: 1 addition & 1 deletion src/arbiterAI/arbiterAI.h
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ class ArbiterAI
* @return ErrorCode indicating success, ModelDownloading, or failure
*/
ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0,
const RuntimeOptions *optionsOverride=nullptr);
const RuntimeOptions *optionsOverride=nullptr, const std::vector<int> &targetDevices={});

/**
* @brief Download model files without loading into VRAM
Expand Down
50 changes: 37 additions & 13 deletions src/arbiterAI/hardwareDetector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,8 @@ void HardwareDetector::detectNvmlGpus()
}
}

spdlog::info("NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})",
spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
"NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})",
gpu.index, gpu.name, gpu.vramTotalMb, gpu.vramFreeMb, gpu.computeCapability);

m_systemInfo.gpus.push_back(gpu);
Expand Down Expand Up @@ -764,8 +765,12 @@ void HardwareDetector::detectVulkanGpus()

const VkPhysicalDeviceMemoryProperties &mp=memProps2.memoryProperties;

// Sum DEVICE_LOCAL heaps — on discrete GPUs this is dedicated VRAM,
// on UMA systems this is the GPU-accessible portion of system RAM.
// Collect DEVICE_LOCAL heap info for budget and usage tracking.
// Cards like the MI50 32GB expose multiple DEVICE_LOCAL heaps
// (e.g. CPU-visible BAR heap + GPU-only heap). The budget from
// VK_EXT_memory_budget is the authoritative measure of how much
// VRAM is actually allocatable — it accounts for BAR limitations,
// other processes, and driver reservations.
uint64_t deviceLocalBudgetBytes=0;
uint64_t deviceLocalUsageBytes=0;
uint64_t deviceLocalSizeBytes=0;
Expand Down Expand Up @@ -799,20 +804,39 @@ void HardwareDetector::detectVulkanGpus()

gpu.hasMemoryBudget=true;

// Budget is the best estimate of how much this process can allocate.
// On UMA, budget may be significantly larger than the raw heap size
// (driver exposes most of system RAM as available to the GPU).
uint64_t budgetTotalMb=deviceLocalBudgetBytes/(1024ULL*1024ULL);
uint64_t budgetUsedMb=deviceLocalUsageBytes/(1024ULL*1024ULL);
uint64_t heapSizeMb=deviceLocalSizeBytes/(1024ULL*1024ULL);

// Use the larger of heap size and budget for total — on some UMA
// drivers the budget exceeds the reported heap size.
uint64_t effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
uint64_t effectiveTotalMb;
uint64_t effectiveFreeMb=(deviceLocalBudgetBytes>deviceLocalUsageBytes)
? (deviceLocalBudgetBytes-deviceLocalUsageBytes)/(1024ULL*1024ULL)
: 0;

if(isIntegrated)
{
// UMA/integrated GPUs: budget may exceed heap size (driver
// exposes system RAM as GPU-accessible). Use the larger value.
effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb;
}
else
{
// Discrete GPUs: budget is the authoritative allocatable total.
// When a device has multiple DEVICE_LOCAL heaps (e.g. visible
// BAR heap + GPU-only heap), the budget for the BAR heap may
// be much smaller than its physical size if Resizable BAR is
// not enabled. Using heap size would over-report and cause
// model loads that overcommit VRAM and spill to system RAM.
effectiveTotalMb=budgetTotalMb;

if(budgetTotalMb<heapSizeMb*90/100)
{
spdlog::warn("Vulkan GPU {}: allocatable budget ({}MB) is significantly less than "
"physical VRAM ({}MB). This typically means Resizable BAR / Above 4G Decoding "
"is not enabled in BIOS. Enable it to unlock the full VRAM.",
gpu.index, budgetTotalMb, heapSizeMb);
}
}

gpu.vramTotalMb=static_cast<int>(effectiveTotalMb);
gpu.vramFreeMb=static_cast<int>(effectiveFreeMb);

Expand All @@ -826,8 +850,8 @@ void HardwareDetector::detectVulkanGpus()
}

spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info,
"Vulkan GPU {}: {} (budget: {}MB total, {}MB free, "
"heap size: {}MB, integrated={}, memoryBudget=true)",
"Vulkan GPU {}: {} (allocatable: {}MB, free: {}MB, "
"physical: {}MB, integrated={}, memoryBudget=true)",
gpu.index, gpu.name,
gpu.vramTotalMb, gpu.vramFreeMb,
static_cast<int>(heapSizeMb), gpu.unifiedMemory);
Expand All @@ -852,7 +876,7 @@ void HardwareDetector::detectVulkanGpus()

if(deviceLocal)
{
vramTotalMb+=static_cast<int>(memProps.memoryHeaps[h].size/(1024*1024));
vramTotalMb+=heapInfo.sizeMb;
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/arbiterAI/modelFitCalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class ModelFitCalculator {
const std::vector<ModelInfo> &models,
const SystemInfo &hw);

private:
/// Sum free VRAM across a set of GPU indices.
/// For unified memory GPUs, uses gpuAccessibleRamFreeMb when available.
static int sumFreeVram(const SystemInfo &hw, const std::vector<int> &gpuIndices);
Expand All @@ -42,6 +41,7 @@ class ModelFitCalculator {
/// For unified memory GPUs, uses gpuAccessibleRamMb when available.
static int sumTotalVram(const SystemInfo &hw, const std::vector<int> &gpuIndices);

private:
/// Get all GPU indices from the system info.
static std::vector<int> allGpuIndices(const SystemInfo &hw);

Expand Down
17 changes: 17 additions & 0 deletions src/arbiterAI/modelManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ void RuntimeOptions::mergeFrom(const RuntimeOptions &other)
if(other.swaFull.has_value()) swaFull=other.swaFull;
if(other.nGpuLayers.has_value()) nGpuLayers=other.nGpuLayers;
if(other.overrideTensor.has_value()) overrideTensor=other.overrideTensor;
if(other.vulkanNoHostVisibleVram.has_value()) vulkanNoHostVisibleVram=other.vulkanNoHostVisibleVram;
}

ModelManager &ModelManager::instance()
Expand Down Expand Up @@ -365,6 +366,22 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in
variant.files.push_back(vd);
}
}

// Skip CLIP/mmproj variants — these are multimodal projection
// files, not standalone models. Loading them as the main model
// causes llama.cpp to fail with "CLIP cannot be used as main model".
std::string primaryFile=variant.getPrimaryFilename();
std::string primaryLower=primaryFile;
std::transform(primaryLower.begin(), primaryLower.end(), primaryLower.begin(), ::tolower);
if(primaryLower.find("mmproj")!=std::string::npos||
primaryLower.find("clip-")!=std::string::npos||
primaryLower.find("vision-")!=std::string::npos)
{
spdlog::debug("Skipping multimodal projection variant '{}' for model '{}' (file: {})",
variant.quantization, info.model, primaryFile);
continue;
}

info.variants.push_back(variant);
}
}
Expand Down
1 change: 1 addition & 0 deletions src/arbiterAI/modelManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ struct RuntimeOptions {
std::optional<bool> swaFull; // --swa-full: full SWA (sliding window attention)
std::optional<int> nGpuLayers; // -ngl: number of GPU layers (99=all)
std::optional<std::string> overrideTensor; // -ot: tensor override pattern (e.g. "per_layer_token_embd.weight=CPU")
std::optional<bool> vulkanNoHostVisibleVram; // GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM: skip BAR-mapped heap, force device-local only

/// Merge another set of options on top of this one (override only non-empty fields).
void mergeFrom(const RuntimeOptions &other);
Expand Down
Loading
Loading