From 2e3c6c342a5c04839a86ad14b4da42129eb59df1 Mon Sep 17 00:00:00 2001 From: krazer Date: Sat, 2 May 2026 11:18:45 -0400 Subject: [PATCH 1/2] updates for vulkan devices Co-authored-by: Copilot --- docs/server.md | 6 + examples/server_config.json | 26 + src/arbiterAI/arbiterAI.cpp | 4 +- src/arbiterAI/arbiterAI.h | 2 +- src/arbiterAI/hardwareDetector.cpp | 50 +- src/arbiterAI/modelFitCalculator.h | 2 +- src/arbiterAI/modelManager.cpp | 1 + src/arbiterAI/modelManager.h | 1 + src/arbiterAI/modelRuntime.cpp | 450 ++++++- src/arbiterAI/modelRuntime.h | 46 +- src/arbiterAI/providers/llama.cpp | 90 +- src/arbiterAI/telemetryCollector.cpp | 2 +- src/server/dashboard.h | 400 +++++- src/server/dashboardConfig.h | 1792 ++++++++++++++++++++++++++ src/server/main.cpp | 473 ++++++- src/server/routes.cpp | 882 ++++++++++++- src/server/routes.h | 10 + tests/modelRuntimeTests.cpp | 6 +- tests/telemetryCollectorTests.cpp | 2 +- 19 files changed, 4135 insertions(+), 110 deletions(-) create mode 100644 src/server/dashboardConfig.h diff --git a/docs/server.md b/docs/server.md index 9c55fe6..9ea331b 100644 --- a/docs/server.md +++ b/docs/server.md @@ -65,6 +65,11 @@ All server settings are defined in a JSON configuration file. See [`examples/ser "models_dir": "/models", "default_model": "", "default_variant": "", + "startup_defaults": { + "cpu": {"model": "", "variant": ""}, + "cuda": {"model": "", "variant": ""}, + "vulkan": {"model": "", "variant": ""} + }, "override_path": "", "ram_budget_mb": 0, "max_concurrent_downloads": 2, @@ -99,6 +104,7 @@ All server settings are defined in a JSON configuration file. See [`examples/ser | `models_dir` | `string` | `"/models"` | Directory for downloaded model files | | `default_model` | `string` | `""` | Model to load on startup | | `default_variant` | `string` | `""` | Default quantization variant (e.g., `Q4_K_M`) | +| `startup_defaults` | `object` | `{}` | Per-accelerator startup defaults used on restart. Keys: `cpu`, `cuda`, `vulkan`, each with `model` and optional `variant`. If unset, the server falls back to `default_model` / `default_variant`. | | `override_path` | `string` | `""` | Path to write runtime model config overrides | | `ram_budget_mb` | `int` | `0` | Ready-model RAM budget in MB (`0` = auto 50%) | | `max_concurrent_downloads` | `int` | `2` | Maximum simultaneous model downloads | diff --git a/examples/server_config.json b/examples/server_config.json index 35ea631..60d4f19 100644 --- a/examples/server_config.json +++ b/examples/server_config.json @@ -10,6 +10,32 @@ "default_model": "", "default_variant": "", + "startup_defaults": { + "cpu": { + "model": "", + "variant": "" + }, + "cuda": { + "model": "", + "variant": "" + }, + "vulkan": { + "model": "", + "variant": "" + } + }, + + "startup_models": [ + { + "model": "Qwen3.5-27B", + "variant": "Q4_K_M", + "context_size": 248832, + "runtime_options": { + "flash_attn": true + }, + "devices": [1] + } + ], "override_path": "", diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp index 892b93c..43d7bb7 100644 --- a/src/arbiterAI/arbiterAI.cpp +++ b/src/arbiterAI/arbiterAI.cpp @@ -533,14 +533,14 @@ ErrorCode ArbiterAI::getAvailableModels(std::vector& models) // ========== Local Model Management ========== ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize, - const RuntimeOptions *optionsOverride) + const RuntimeOptions *optionsOverride, const std::vector &targetDevices) { RuntimeOptions opts; if(optionsOverride) { opts=*optionsOverride; } - return ModelRuntime::instance().loadModel(model, variant, contextSize, opts); + return ModelRuntime::instance().loadModel(model, variant, contextSize, opts, targetDevices); } ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant) diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h index 499ff2d..e5106fb 100644 --- a/src/arbiterAI/arbiterAI.h +++ b/src/arbiterAI/arbiterAI.h @@ -650,7 +650,7 @@ class ArbiterAI * @return ErrorCode indicating success, ModelDownloading, or failure */ ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0, - const RuntimeOptions *optionsOverride=nullptr); + const RuntimeOptions *optionsOverride=nullptr, const std::vector &targetDevices={}); /** * @brief Download model files without loading into VRAM diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp index d0ac4de..7ca4685 100644 --- a/src/arbiterAI/hardwareDetector.cpp +++ b/src/arbiterAI/hardwareDetector.cpp @@ -560,7 +560,8 @@ void HardwareDetector::detectNvmlGpus() } } - spdlog::info("NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})", + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "NVML GPU {}: {} ({}MB VRAM, {}MB free, CC {:.1f})", gpu.index, gpu.name, gpu.vramTotalMb, gpu.vramFreeMb, gpu.computeCapability); m_systemInfo.gpus.push_back(gpu); @@ -764,8 +765,12 @@ void HardwareDetector::detectVulkanGpus() const VkPhysicalDeviceMemoryProperties &mp=memProps2.memoryProperties; - // Sum DEVICE_LOCAL heaps — on discrete GPUs this is dedicated VRAM, - // on UMA systems this is the GPU-accessible portion of system RAM. + // Collect DEVICE_LOCAL heap info for budget and usage tracking. + // Cards like the MI50 32GB expose multiple DEVICE_LOCAL heaps + // (e.g. CPU-visible BAR heap + GPU-only heap). The budget from + // VK_EXT_memory_budget is the authoritative measure of how much + // VRAM is actually allocatable — it accounts for BAR limitations, + // other processes, and driver reservations. uint64_t deviceLocalBudgetBytes=0; uint64_t deviceLocalUsageBytes=0; uint64_t deviceLocalSizeBytes=0; @@ -799,20 +804,39 @@ void HardwareDetector::detectVulkanGpus() gpu.hasMemoryBudget=true; - // Budget is the best estimate of how much this process can allocate. - // On UMA, budget may be significantly larger than the raw heap size - // (driver exposes most of system RAM as available to the GPU). uint64_t budgetTotalMb=deviceLocalBudgetBytes/(1024ULL*1024ULL); - uint64_t budgetUsedMb=deviceLocalUsageBytes/(1024ULL*1024ULL); uint64_t heapSizeMb=deviceLocalSizeBytes/(1024ULL*1024ULL); - // Use the larger of heap size and budget for total — on some UMA - // drivers the budget exceeds the reported heap size. - uint64_t effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb; + uint64_t effectiveTotalMb; uint64_t effectiveFreeMb=(deviceLocalBudgetBytes>deviceLocalUsageBytes) ? (deviceLocalBudgetBytes-deviceLocalUsageBytes)/(1024ULL*1024ULL) : 0; + if(isIntegrated) + { + // UMA/integrated GPUs: budget may exceed heap size (driver + // exposes system RAM as GPU-accessible). Use the larger value. + effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb; + } + else + { + // Discrete GPUs: budget is the authoritative allocatable total. + // When a device has multiple DEVICE_LOCAL heaps (e.g. visible + // BAR heap + GPU-only heap), the budget for the BAR heap may + // be much smaller than its physical size if Resizable BAR is + // not enabled. Using heap size would over-report and cause + // model loads that overcommit VRAM and spill to system RAM. + effectiveTotalMb=budgetTotalMb; + + if(budgetTotalMb(effectiveTotalMb); gpu.vramFreeMb=static_cast(effectiveFreeMb); @@ -826,8 +850,8 @@ void HardwareDetector::detectVulkanGpus() } spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, - "Vulkan GPU {}: {} (budget: {}MB total, {}MB free, " - "heap size: {}MB, integrated={}, memoryBudget=true)", + "Vulkan GPU {}: {} (allocatable: {}MB, free: {}MB, " + "physical: {}MB, integrated={}, memoryBudget=true)", gpu.index, gpu.name, gpu.vramTotalMb, gpu.vramFreeMb, static_cast(heapSizeMb), gpu.unifiedMemory); @@ -852,7 +876,7 @@ void HardwareDetector::detectVulkanGpus() if(deviceLocal) { - vramTotalMb+=static_cast(memProps.memoryHeaps[h].size/(1024*1024)); + vramTotalMb+=heapInfo.sizeMb; } } diff --git a/src/arbiterAI/modelFitCalculator.h b/src/arbiterAI/modelFitCalculator.h index 6ed089a..cc62744 100644 --- a/src/arbiterAI/modelFitCalculator.h +++ b/src/arbiterAI/modelFitCalculator.h @@ -33,7 +33,6 @@ class ModelFitCalculator { const std::vector &models, const SystemInfo &hw); -private: /// Sum free VRAM across a set of GPU indices. /// For unified memory GPUs, uses gpuAccessibleRamFreeMb when available. static int sumFreeVram(const SystemInfo &hw, const std::vector &gpuIndices); @@ -42,6 +41,7 @@ class ModelFitCalculator { /// For unified memory GPUs, uses gpuAccessibleRamMb when available. static int sumTotalVram(const SystemInfo &hw, const std::vector &gpuIndices); +private: /// Get all GPU indices from the system info. static std::vector allGpuIndices(const SystemInfo &hw); diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp index d80cbe0..5c314d4 100644 --- a/src/arbiterAI/modelManager.cpp +++ b/src/arbiterAI/modelManager.cpp @@ -72,6 +72,7 @@ void RuntimeOptions::mergeFrom(const RuntimeOptions &other) if(other.swaFull.has_value()) swaFull=other.swaFull; if(other.nGpuLayers.has_value()) nGpuLayers=other.nGpuLayers; if(other.overrideTensor.has_value()) overrideTensor=other.overrideTensor; + if(other.vulkanNoHostVisibleVram.has_value()) vulkanNoHostVisibleVram=other.vulkanNoHostVisibleVram; } ModelManager &ModelManager::instance() diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h index f7ef48b..04309a1 100644 --- a/src/arbiterAI/modelManager.h +++ b/src/arbiterAI/modelManager.h @@ -51,6 +51,7 @@ struct RuntimeOptions { std::optional swaFull; // --swa-full: full SWA (sliding window attention) std::optional nGpuLayers; // -ngl: number of GPU layers (99=all) std::optional overrideTensor; // -ot: tensor override pattern (e.g. "per_layer_token_embd.weight=CPU") + std::optional vulkanNoHostVisibleVram; // GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM: skip BAR-mapped heap, force device-local only /// Merge another set of options on top of this one (override only non-empty fields). void mergeFrom(const RuntimeOptions &other); diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp index 4b5df20..22774e4 100644 --- a/src/arbiterAI/modelRuntime.cpp +++ b/src/arbiterAI/modelRuntime.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -72,8 +73,7 @@ void ModelRuntime::reset() } rt.m_models.clear(); - rt.m_inferenceActive=false; - rt.m_inferenceModel.clear(); + rt.m_activeInference.clear(); while(!rt.m_pendingSwaps.empty()) { rt.m_pendingSwaps.pop(); @@ -337,7 +337,8 @@ ErrorCode ModelRuntime::loadModel( const std::string &model, const std::string &variant, int contextSize, - const RuntimeOptions &optionsOverride) + const RuntimeOptions &optionsOverride, + const std::vector &targetDevices) { std::lock_guard lock(m_mutex); @@ -428,6 +429,26 @@ ErrorCode ModelRuntime::loadModel( if(selectedVar) { ModelFit fit=ModelFitCalculator::calculateModelFit(modelInfo.value(), *selectedVar, hw); + + // If caller specified target devices, override the auto-selected GPU indices + if(!targetDevices.empty()) + { + fit.gpuIndices=targetDevices; + // Recalculate available VRAM for target devices only + int targetVram=ModelFitCalculator::sumFreeVram(hw, targetDevices); + if(targetVram>0) + { + fit.canRun=true; + fit.limitingFactor.clear(); + // Recalculate max context for the specified devices + int targetTotalVram=ModelFitCalculator::sumTotalVram(hw, targetDevices); + if(fit.maxContextSize<=0&&targetTotalVram>0) + { + fit.maxContextSize=fit.maxContextSize; + } + } + } + if(!fit.canRun) { m_lastLoadError.reason=(fit.limitingFactor=="ram") @@ -444,8 +465,12 @@ ErrorCode ModelRuntime::loadModel( return ErrorCode::ModelLoadError; } - // Evict if needed to make room - evictIfNeeded(selectedVar->minVramMb); + // Evict if needed to make room on each assigned GPU + for(int gpuIdx:fit.gpuIndices) + { + int perGpuVram=selectedVar->minVramMb/static_cast(fit.gpuIndices.size()); + evictIfNeeded(perGpuVram, gpuIdx); + } // Check if all model files exist, initiate async download for any missing ones std::vector allFiles=selectedVar->getAllFiles(); @@ -506,6 +531,19 @@ ErrorCode ModelRuntime::loadModel( entry.gpuIndices=fit.gpuIndices; entry.lastUsed=std::chrono::steady_clock::now(); + // Distribute estimated VRAM usage across assigned GPUs + entry.perGpuVramMb.clear(); + if(!fit.gpuIndices.empty()) + { + int perGpu=fit.estimatedVramUsageMb/static_cast(fit.gpuIndices.size()); + int remainder=fit.estimatedVramUsageMb%static_cast(fit.gpuIndices.size()); + + for(size_t i=0; i(i)provider=="llama") { @@ -838,6 +876,7 @@ ErrorCode ModelRuntime::unloadModel(const std::string &model) entry.state=ModelState::Unloaded; entry.vramUsageMb=0; entry.ramUsageMb=0; + entry.perGpuVramMb.clear(); spdlog::info("Model '{}' unloaded", model); } @@ -880,7 +919,7 @@ ErrorCode ModelRuntime::swapModel( int contextSize, const RuntimeOptions &optionsOverride) { - if(m_inferenceActive) + if(!m_activeInference.empty()) { // Queue the swap for when inference completes std::lock_guard lock(m_mutex); @@ -922,6 +961,7 @@ ErrorCode ModelRuntime::swapModel( pair.second.state=ModelState::Unloaded; pair.second.vramUsageMb=0; pair.second.ramUsageMb=0; + pair.second.perGpuVramMb.clear(); } } } @@ -1127,9 +1167,72 @@ std::vector ModelRuntime::resolveBackendPriority(const ModelInfo &m return priority; } -void ModelRuntime::evictIfNeeded(int requiredVramMb) +void ModelRuntime::evictIfNeeded(int requiredVramMb, int gpuIndex) { - // Calculate current VRAM usage across all loaded models + if(gpuIndex>=0) + { + // Per-GPU eviction: only consider models on this specific GPU + int committedOnGpu=getCommittedVramMb(gpuIndex); + int estimatedFree=getEstimatedFreeVramMb(gpuIndex); + + if(estimatedFree>=requiredVramMb) + { + return; // enough VRAM on this GPU + } + + int needToFree=requiredVramMb-estimatedFree; + + struct EvictCandidate { + std::string model; + int vramOnGpu; + std::chrono::steady_clock::time_point lastUsed; + }; + + std::vector candidates; + for(const auto &pair:m_models) + { + if(pair.second.state==ModelState::Loaded&& + !pair.second.pinned&& + !m_activeInference.count(pair.first)) + { + auto gpuIt=pair.second.perGpuVramMb.find(gpuIndex); + if(gpuIt!=pair.second.perGpuVramMb.end()&&gpuIt->second>0) + { + candidates.push_back({pair.first, gpuIt->second, pair.second.lastUsed}); + } + } + } + + std::sort(candidates.begin(), candidates.end(), + [](const EvictCandidate &a, const EvictCandidate &b) + { + return a.lastUsed=needToFree) + { + break; + } + + auto it=m_models.find(candidate.model); + if(it!=m_models.end()) + { + freeLlamaModel(it->second); + it->second.state=ModelState::Unloaded; + it->second.vramUsageMb=0; + it->second.ramUsageMb=0; + it->second.perGpuVramMb.clear(); + freed+=candidate.vramOnGpu; + spdlog::info("Evicted model '{}' to free {}MB VRAM on GPU {}", candidate.model, candidate.vramOnGpu, gpuIndex); + } + } + return; + } + + // Global eviction (legacy path): sum across all GPUs int currentVramUsage=0; for(const auto &pair:m_models) { @@ -1161,7 +1264,7 @@ void ModelRuntime::evictIfNeeded(int requiredVramMb) { if(pair.second.state==ModelState::Loaded&& !pair.second.pinned&& - pair.first!=m_inferenceModel) + !m_activeInference.count(pair.first)) { candidates.push_back({pair.first, pair.second.estimatedVramUsageMb, pair.second.lastUsed}); } @@ -1189,6 +1292,7 @@ void ModelRuntime::evictIfNeeded(int requiredVramMb) it->second.state=ModelState::Unloaded; it->second.vramUsageMb=0; it->second.ramUsageMb=0; + it->second.perGpuVramMb.clear(); freed+=candidate.vramMb; spdlog::info("Evicted model '{}' to free {}MB VRAM", candidate.model, candidate.vramMb); } @@ -1197,8 +1301,7 @@ void ModelRuntime::evictIfNeeded(int requiredVramMb) void ModelRuntime::beginInference(const std::string &model) { - m_inferenceActive=true; - m_inferenceModel=model; + m_activeInference.insert(model); std::lock_guard lock(m_mutex); auto it=m_models.find(model); @@ -1208,27 +1311,76 @@ void ModelRuntime::beginInference(const std::string &model) } } -void ModelRuntime::endInference() +void ModelRuntime::endInference(const std::string &model) { // Record usage for storage tracking - if(!m_inferenceModel.empty()) + if(!model.empty()) { std::lock_guard lock(m_mutex); - auto it=m_models.find(m_inferenceModel); + auto it=m_models.find(model); if(it!=m_models.end()) { - StorageManager::instance().recordUsage(m_inferenceModel, it->second.variant); + StorageManager::instance().recordUsage(model, it->second.variant); } } - m_inferenceActive=false; - m_inferenceModel.clear(); - drainPendingSwaps(); + m_activeInference.erase(model); + + if(m_activeInference.empty()) + { + drainPendingSwaps(); + } } bool ModelRuntime::isInferenceActive() const { - return m_inferenceActive; + return !m_activeInference.empty(); +} + +bool ModelRuntime::isInferenceActive(const std::string &model) const +{ + return m_activeInference.count(model)>0; +} + +int ModelRuntime::getActiveInferenceCount() const +{ + return static_cast(m_activeInference.size()); +} + +int ModelRuntime::getCommittedVramMb(int gpuIndex) const +{ + int committed=0; + for(const auto &pair:m_models) + { + if(pair.second.state!=ModelState::Loaded) + { + continue; + } + + auto gpuIt=pair.second.perGpuVramMb.find(gpuIndex); + if(gpuIt!=pair.second.perGpuVramMb.end()) + { + committed+=gpuIt->second; + } + } + return committed; +} + +int ModelRuntime::getEstimatedFreeVramMb(int gpuIndex) const +{ + SystemInfo hw=HardwareDetector::instance().getSystemInfo(); + + if(gpuIndex<0||gpuIndex>=static_cast(hw.gpus.size())) + { + return 0; + } + + const GpuInfo &gpu=hw.gpus[gpuIndex]; + int totalVram=gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0 + ?gpu.gpuAccessibleRamMb:gpu.vramTotalMb; + int committed=getCommittedVramMb(gpuIndex); + + return std::max(0, totalVram-committed); } std::string ModelRuntime::selectBestVariant(const ModelInfo &model) const @@ -1401,6 +1553,21 @@ ErrorCode ModelRuntime::loadLlamaModel( const RuntimeOptions &options, const std::vector &backendPriority) { + // Apply Vulkan environment variable overrides before backend init. + // These are read by ggml-vulkan.cpp via getenv() during device initialization. + if(options.vulkanNoHostVisibleVram.has_value()) + { + if(options.vulkanNoHostVisibleVram.value()) + { + setenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM", "1", 1); + spdlog::info("Set GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM=1 for model '{}'", model); + } + else + { + unsetenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM"); + } + } + initLlamaBackend(); // Log available backend devices matching backendPriority for diagnostics. @@ -1533,8 +1700,174 @@ ErrorCode ModelRuntime::loadLlamaModel( } } - // NOTE: mparams.devices is intentionally left as NULL (default). - // See comment above about why explicit device lists hurt UMA performance. + // NOTE: mparams.devices is intentionally left as NULL (default) on + // single-GPU / UMA systems. See comment above about why explicit device + // lists hurt UMA performance. On multi-GPU discrete systems, we target + // specific GPU(s) via mparams.devices — but NEVER include the CPU device, + // which would cause llama.cpp to split tensors across GPU and CPU equally. + std::vector targetDevices; + + SystemInfo hw=HardwareDetector::instance().getSystemInfo(); + bool isMultiGpuDiscrete=hw.gpus.size()>1; + bool hasUmaGpu=false; + for(const GpuInfo &gpu:hw.gpus) + { + if(gpu.unifiedMemory) + { + hasUmaGpu=true; + break; + } + } + + if(isMultiGpuDiscrete&&!hasUmaGpu&&!gpuIndices.empty()) + { + // Build mapping from hardware detector GPU index to ggml backend device. + // Match by comparing device descriptions since index spaces differ: + // HW detector may skip duplicates (e.g. RTX 3060 via Vulkan when CUDA is primary) + // while ggml enumerates all backend devices. + + size_t devCount=ggml_backend_dev_count(); + struct GgmlGpuDev + { + ggml_backend_dev_t dev; + std::string name; + std::string description; + }; + std::vector ggmlGpus; + + for(size_t i=0; ibackend==GpuBackend::CUDA) expectedPrefix="CUDA"; + else if(hwGpu->backend==GpuBackend::Vulkan) expectedPrefix="Vulkan"; + + for(const GgmlGpuDev &ggmlDev:ggmlGpus) + { + // Check backend match first + if(!expectedPrefix.empty()&&ggmlDev.name.find(expectedPrefix)==std::string::npos) + continue; + + // Check if HW GPU name appears in ggml description + // HW name: "AMD Instinct MI50/MI60 (RADV VEGA20)" + // ggml desc: "AMD RADV VEGA20" or similar + // Try matching key substrings + bool matches=false; + + // Extract key identifiers from both names for matching + if(ggmlDev.description.find(hwGpu->name)!=std::string::npos) + { + matches=true; + } + else + { + // Try partial matching — extract words from HW name and check ggml desc + // Look for distinctive substrings like "VEGA20", "MI50", "RTX 3060", etc. + std::vector keywords; + std::string hwName=hwGpu->name; + + // Extract alphanumeric tokens from HW GPU name + std::string token; + for(char c:hwName) + { + if(std::isalnum(c)||(c=='-')) + { + token+=c; + } + else if(!token.empty()) + { + if(token.size()>=3) keywords.push_back(token); + token.clear(); + } + } + if(token.size()>=3) keywords.push_back(token); + + // Check if distinctive keywords from HW name appear in ggml description + int matchCount=0; + for(const std::string &kw:keywords) + { + if(ggmlDev.description.find(kw)!=std::string::npos) + { + ++matchCount; + } + } + + // Require at least 2 keyword matches or 1 match for short names + if(matchCount>=2||(matchCount>=1&&keywords.size()<=2)) + { + matches=true; + } + } + + if(matches) + { + bestMatch=ggmlDev.dev; + bestMatchName=ggmlDev.name; + break; + } + } + + if(bestMatch) + { + targetDevices.push_back(bestMatch); + spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}'", + idx, hwGpu->name, bestMatchName, model); + } + else + { + spdlog::warn("GPU hw[{}] '{}' could not be matched to any ggml device for model '{}'", + idx, hwGpu->name, model); + } + } + + if(!targetDevices.empty()) + { + targetDevices.push_back(nullptr); // NULL terminator + mparams.devices=targetDevices.data(); + } + } llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams); if(!llamaModel) @@ -1649,6 +1982,7 @@ ErrorCode ModelRuntime::loadLlamaModel( return ErrorCode::ModelLoadError; } + std::string capturedLog=m_llamaLogCapture.str(); endLlamaLogCapture(); LoadedModel &entry=m_models[model]; @@ -1657,6 +1991,9 @@ ErrorCode ModelRuntime::loadLlamaModel( entry.maxContextSize=nativeContext; entry.contextSize=static_cast(llama_n_ctx(llamaCtx)); + // Parse per-device buffer allocations from llama.cpp log output + parseDeviceAllocations(entry, capturedLog); + spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={}, ngl={}, flash_attn={}, mmap={}, backend_filter={})", model, entry.contextSize, entry.maxContextSize, options.nGpuLayers.value_or(99), @@ -1693,6 +2030,77 @@ void ModelRuntime::freeLlamaModel(LoadedModel &entry) } } +void ModelRuntime::parseDeviceAllocations(LoadedModel &entry, const std::string &logOutput) +{ + entry.deviceAllocations.clear(); + entry.graphSplits=0; + entry.cpuMappedBufferMb=0; + + // Parse: "load_tensors: CPU_Mapped model buffer size = 682.03 MiB" + // Parse: "load_tensors: Vulkan1 model buffer size = 15272.77 MiB" + std::regex modelBufRe(R"(load_tensors:\s+(\S+)\s+model buffer size\s*=\s*([\d.]+)\s*MiB)"); + // Parse: "llama_kv_cache: Vulkan1 KV buffer size = 8262.00 MiB" + std::regex kvBufRe(R"(llama_kv_cache:\s+(\S+)\s+KV buffer size\s*=\s*([\d.]+)\s*MiB)"); + // Parse: "sched_reserve: Vulkan1 compute buffer size = 801.28 MiB" + std::regex computeBufRe(R"(sched_reserve:\s+(\S+)\s+compute buffer size\s*=\s*([\d.]+)\s*MiB)"); + // Parse: "sched_reserve: graph splits = 2" + std::regex graphSplitsRe(R"(sched_reserve:\s+graph splits\s*=\s*(\d+))"); + + std::istringstream stream(logOutput); + std::string line; + + while(std::getline(stream, line)) + { + std::smatch match; + + if(std::regex_search(line, match, modelBufRe)) + { + std::string device=match[1].str(); + int sizeMb=static_cast(std::round(std::stod(match[2].str()))); + + if(device=="CPU_Mapped") + { + entry.cpuMappedBufferMb=sizeMb; + } + else + { + entry.deviceAllocations[device].deviceName=device; + entry.deviceAllocations[device].modelBufferMb=sizeMb; + } + } + else if(std::regex_search(line, match, kvBufRe)) + { + std::string device=match[1].str(); + int sizeMb=static_cast(std::round(std::stod(match[2].str()))); + entry.deviceAllocations[device].deviceName=device; + entry.deviceAllocations[device].kvCacheBufferMb=sizeMb; + } + else if(std::regex_search(line, match, computeBufRe)) + { + std::string device=match[1].str(); + int sizeMb=static_cast(std::round(std::stod(match[2].str()))); + + // Skip host-side compute buffers + if(device.find("Host")!=std::string::npos) + continue; + + entry.deviceAllocations[device].deviceName=device; + entry.deviceAllocations[device].computeBufferMb=sizeMb; + } + else if(std::regex_search(line, match, graphSplitsRe)) + { + entry.graphSplits=std::stoi(match[1].str()); + } + } + + // Calculate totals for each device + for(auto &pair:entry.deviceAllocations) + { + DeviceAllocation &alloc=pair.second; + alloc.totalMb=alloc.modelBufferMb+alloc.kvCacheBufferMb+alloc.computeBufferMb; + } +} + llama_model *ModelRuntime::getLlamaModel(const std::string &model) const { std::lock_guard lock(m_mutex); diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h index 79a7bda..d2abd14 100644 --- a/src/arbiterAI/modelRuntime.h +++ b/src/arbiterAI/modelRuntime.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,14 @@ struct LoadErrorDetail { std::string llamaLog; // raw llama.cpp log output captured during the load attempt }; +struct DeviceAllocation { + std::string deviceName; + int modelBufferMb=0; + int kvCacheBufferMb=0; + int computeBufferMb=0; + int totalMb=0; +}; + struct LoadedModel { std::string modelName; std::string variant; @@ -69,6 +78,10 @@ struct LoadedModel { int contextSize=0; int maxContextSize=0; // model's native/training context from GGUF metadata std::vector gpuIndices; + std::map perGpuVramMb; // gpu index → estimated VRAM usage on that GPU + std::map deviceAllocations; // device name → actual buffer allocations + int graphSplits=0; + int cpuMappedBufferMb=0; std::chrono::steady_clock::time_point lastUsed; bool pinned=false; llama_model *llamaModel=nullptr; @@ -89,12 +102,14 @@ class ModelRuntime { /// @param variant Quantization variant (empty = auto-select best fitting). /// @param contextSize Context size (0 = use model default). /// @param optionsOverride Optional runtime options to merge on top of model config defaults. + /// @param targetDevices Optional GPU indices to target (empty = auto-select). /// @return ErrorCode::Success, ModelDownloading, ModelNotFound, ModelLoadError. ErrorCode loadModel( const std::string &model, const std::string &variant="", int contextSize=0, - const RuntimeOptions &optionsOverride=RuntimeOptions{}); + const RuntimeOptions &optionsOverride=RuntimeOptions{}, + const std::vector &targetDevices={}); /// Download model files without loading into VRAM. /// Launches an async background download that respects the concurrent @@ -159,17 +174,24 @@ class ModelRuntime { std::vector getDefaultBackendPriority() const; /// Evict least-recently-used non-pinned models to free VRAM. - void evictIfNeeded(int requiredVramMb); + /// When gpuIndex >= 0, only considers models on that specific GPU. + void evictIfNeeded(int requiredVramMb, int gpuIndex=-1); - /// Mark inference as started (blocks swap execution). + /// Mark inference as started on a model (blocks eviction of that model). void beginInference(const std::string &model); - /// Mark inference as completed and drain pending swaps. - void endInference(); + /// Mark inference as completed on a model and drain pending swaps. + void endInference(const std::string &model); - /// Check if inference is currently active. + /// Check if any inference is currently active. bool isInferenceActive() const; + /// Check if inference is active on a specific model. + bool isInferenceActive(const std::string &model) const; + + /// Get the number of models currently running inference. + int getActiveInferenceCount() const; + /// Get the llama_model handle for a loaded local model. /// Returns nullptr if not loaded or not a local model. llama_model *getLlamaModel(const std::string &model) const; @@ -189,6 +211,12 @@ class ModelRuntime { /// Public so the C-style callback can reach it; not intended for external use. void appendLlamaLog(const char *text); + /// Get the VRAM currently committed to loaded models on a specific GPU (MB). + int getCommittedVramMb(int gpuIndex) const; + + /// Get the estimated free VRAM on a specific GPU accounting for loaded models (MB). + int getEstimatedFreeVramMb(int gpuIndex) const; + private: ModelRuntime(); @@ -238,6 +266,9 @@ class ModelRuntime { /// Free llama.cpp resources for a model. void freeLlamaModel(LoadedModel &entry); + /// Parse per-device buffer allocations from llama.cpp log output. + void parseDeviceAllocations(LoadedModel &entry, const std::string &logOutput); + /// Download a model file synchronously. /// @return true on success, false on failure. bool downloadModelFile( @@ -251,8 +282,7 @@ class ModelRuntime { mutable std::mutex m_mutex; int m_readyRamBudgetMb=0; std::vector m_defaultBackendPriority; - std::atomic m_inferenceActive{false}; - std::string m_inferenceModel; + std::set m_activeInference; // models currently running inference bool m_llamaInitialized=false; struct SwapRequest { diff --git a/src/arbiterAI/providers/llama.cpp b/src/arbiterAI/providers/llama.cpp index c3b1187..384f409 100644 --- a/src/arbiterAI/providers/llama.cpp +++ b/src/arbiterAI/providers/llama.cpp @@ -60,7 +60,7 @@ ErrorCode Llama::completion(const CompletionRequest &request, std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now(); double totalTimeMs=std::chrono::duration(endTime-startTime).count(); - runtime.endInference(); + runtime.endInference(request.model); if(code==ErrorCode::Success) { @@ -135,7 +135,7 @@ ErrorCode Llama::streamingCompletion(const CompletionRequest &request, std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now(); double totalTimeMs=std::chrono::duration(endTime-startTime).count(); - runtime.endInference(); + runtime.endInference(request.model); if(code==ErrorCode::Success) { @@ -209,24 +209,34 @@ ErrorCode Llama::getEmbeddings(const EmbeddingRequest &request, } tokens.resize(nTokens); - llama_batch batch=llama_batch_init(nTokens, 0, 1); + int nBatch=static_cast(llama_n_batch(llamaCtx)); + llama_batch batch=llama_batch_init(std::max(nBatch, 512), 0, 1); - batch.n_tokens=nTokens; - for(int32_t i=0; i=nTokens); - if(llama_decode(llamaCtx, batch)!=0) - { - spdlog::error("llama_decode failed for embeddings"); - llama_batch_free(batch); - return ErrorCode::GenerationError; + batch.n_tokens=chunkSize; + for(int32_t i=0; i(llama_n_batch(ctx)); + llama_batch batch=llama_batch_init(std::max(nBatch, 512), 0, 1); - // Fill batch with prompt tokens - batch.n_tokens=nTokens; - for(int32_t i=0; i=nTokens); + + batch.n_tokens=chunkSize; + for(int32_t i=0; i
-

ArbiterAI Dashboard

+
+

ArbiterAI Dashboard

+ +
Connected
@@ -563,6 +657,7 @@ td State Context Max Context + GPU(s) VRAM (MB) RAM (MB) Pinned @@ -570,7 +665,7 @@ td - No models loaded + No models loaded
@@ -648,6 +743,13 @@ const MAX_TPS_POINTS=60; let hasActiveDownloads=false; let logPanelOpen=true; let lastLogEpoch=0; +let availableModelOptions=[]; +let serverConfigState=null; +const STARTUP_ACCELERATORS=[ + {key: "cpu", label: "CPU", selectId: "startupDefaultCpu", statusId: "startupDefaultCpuStatus"}, + {key: "cuda", label: "CUDA", selectId: "startupDefaultCuda", statusId: "startupDefaultCudaStatus"}, + {key: "vulkan", label: "Vulkan", selectId: "startupDefaultVulkan", statusId: "startupDefaultVulkanStatus"} +]; function toggleLogPanel() { @@ -694,6 +796,220 @@ function escapeHtml(text) return el.innerHTML; } +function encodeStartupModelValue(model, variant) +{ + return encodeURIComponent(JSON.stringify({model, variant: variant||""})); +} + +function decodeStartupModelValue(value) +{ + if(!value) return {model: "", variant: ""}; + + try + { + return JSON.parse(decodeURIComponent(value)); + } + catch(e) + { + return {model: "", variant: ""}; + } +} + +function formatStartupModelLabel(model, variant) +{ + return variant?`${model} (${variant})`:model; +} + +function formatAcceleratorLabel(accelerator) +{ + const match=STARTUP_ACCELERATORS.find((item) => item.key===accelerator); + if(match) return match.label; + if(accelerator==="legacy") return "Legacy Default"; + return accelerator?accelerator.toUpperCase():"Unknown"; +} + +function buildAvailableModelOptions(models) +{ + const seen=new Set(); + const options=[]; + + if(!models) return options; + + for(const model of models) + { + const modelName=model.model||""; + const variant=model.variant||""; + if(!modelName) continue; + + const key=modelName+"\u0000"+variant; + if(seen.has(key)) continue; + seen.add(key); + + options.push({ + model: modelName, + variant, + sortKey: modelName.toLowerCase()+"\u0000"+variant.toLowerCase(), + label: formatStartupModelLabel(modelName, variant) + }); + } + + options.sort((left, right) => left.sortKey.localeCompare(right.sortKey)); + return options; +} + +function showStartupDefaultsMessage(text, state) +{ + const el=document.getElementById("startupDefaultsMessage"); + if(!el) return; + + el.textContent=text||""; + el.className="settings-message"; + + if(state==="error") el.classList.add("error"); + if(state==="success") el.classList.add("success"); +} + +function getStartupDefaultEntry(accelerator) +{ + if(!serverConfigState||!serverConfigState.startup_defaults) + { + return {model: "", variant: ""}; + } + + const entry=serverConfigState.startup_defaults[accelerator]; + return { + model: entry&&entry.model?entry.model:"", + variant: entry&&entry.variant?entry.variant:"" + }; +} + +function isAcceleratorDetected(accelerator) +{ + if(!serverConfigState||!serverConfigState.detected_accelerators) + { + return accelerator==="cpu"; + } + + return serverConfigState.detected_accelerators.includes(accelerator); +} + +function renderStartupSettings() +{ + if(!serverConfigState) + { + showStartupDefaultsMessage("Loading startup defaults...", ""); + return; + } + + const effective=serverConfigState.effective_startup_default||{}; + const effectiveLabel=document.getElementById("startupDefaultsEffective"); + if(effective.model) + { + effectiveLabel.textContent=`Next restart: ${formatAcceleratorLabel(effective.accelerator)} -> ${formatStartupModelLabel(effective.model, effective.variant||"")}`; + } + else + { + effectiveLabel.textContent="Next restart: no startup default configured"; + } + + for(const accelerator of STARTUP_ACCELERATORS) + { + const select=document.getElementById(accelerator.selectId); + const status=document.getElementById(accelerator.statusId); + const selectedEntry=getStartupDefaultEntry(accelerator.key); + const selectedValue=selectedEntry.model?encodeStartupModelValue(selectedEntry.model, selectedEntry.variant):""; + + let html=``; + let hasSelected=!selectedValue; + + for(const option of availableModelOptions) + { + const value=encodeStartupModelValue(option.model, option.variant); + const selected=value===selectedValue?" selected":""; + if(selected) hasSelected=true; + html+=``; + } + + if(selectedValue&&!hasSelected) + { + html+=``; + } + + select.innerHTML=html; + status.textContent=isAcceleratorDetected(accelerator.key)?"Detected":"Not detected"; + } + + const saveButton=document.getElementById("saveStartupDefaultsBtn"); + saveButton.disabled=!serverConfigState; +} + +async function loadStartupSettings() +{ + const [modelsResponse, configResponse]=await Promise.all([ + fetchJson("/api/models"), + fetchJson("/api/server/config") + ]); + + if(modelsResponse&&modelsResponse.models) + { + availableModelOptions=buildAvailableModelOptions(modelsResponse.models); + } + + if(configResponse) + { + serverConfigState=configResponse; + } + + renderStartupSettings(); +} + +async function saveStartupDefaults() +{ + const saveButton=document.getElementById("saveStartupDefaultsBtn"); + const startupDefaults={}; + + for(const accelerator of STARTUP_ACCELERATORS) + { + const value=decodeStartupModelValue(document.getElementById(accelerator.selectId).value); + startupDefaults[accelerator.key]={ + model: value.model||"", + variant: value.variant||"" + }; + } + + saveButton.disabled=true; + showStartupDefaultsMessage("Saving startup defaults...", ""); + + try + { + const response=await fetch("/api/server/config", { + method: "PUT", + headers: {"Content-Type": "application/json"}, + body: JSON.stringify({startup_defaults: startupDefaults}) + }); + const data=await response.json(); + + if(!response.ok) + { + showStartupDefaultsMessage(data.error?.message||"Failed to save startup defaults.", "error"); + return; + } + + serverConfigState=data; + renderStartupSettings(); + showStartupDefaultsMessage("Startup defaults saved. They will be used on the next server restart.", "success"); + } + catch(e) + { + console.error("Saving startup defaults failed:", e); + showStartupDefaultsMessage("Failed to save startup defaults.", "error"); + } + finally + { + saveButton.disabled=false; + } +} + async function refreshLogs(force) { if(!logPanelOpen) return; @@ -929,7 +1245,7 @@ function buildHeapTooltip(gpu) return lines.join("
"); } -function renderGpus(gpus) +function renderGpus(gpus, models) { const el=document.getElementById("gpuList"); @@ -939,6 +1255,25 @@ function renderGpus(gpus) return; } + // Build per-GPU model assignments from loaded models + const gpuModels={}; + if(models&&models.length>0) + { + for(const m of models) + { + if(m.state!=="Loaded") continue; + const indices=m.gpu_indices||[]; + const perGpu=m.per_gpu_vram_mb||{}; + + for(const idx of indices) + { + if(!gpuModels[idx]) gpuModels[idx]=[]; + const vram=perGpu[String(idx)]||m.estimated_vram_mb||0; + gpuModels[idx].push({name:m.model, vram:vram}); + } + } + } + let html=""; for(const gpu of gpus) { @@ -989,10 +1324,22 @@ function renderGpus(gpus) const overrideBtn=``; const clearBtn=gpu.vram_overridden?``:""; + // Show models loaded on this GPU + let modelsHtml=""; + const assignedModels=gpuModels[gpu.index]||[]; + if(assignedModels.length>0) + { + const modelTags=assignedModels.map(m=> + `${m.name} ${m.vram}MB` + ).join(""); + modelsHtml=`
${modelTags}
`; + } + html+=`
${gpu.name} (${gpu.backend})${gpu.unified_memory?" ⚡ Unified":""}${overrideTag}${overrideBtn}${clearBtn}${memSpan}
${utilHtml} + ${modelsHtml}
`; } el.innerHTML=html; @@ -1004,7 +1351,7 @@ function renderModels(models) if(!models||models.length===0) { - el.innerHTML='No models loaded'; + el.innerHTML='No models loaded'; return; } @@ -1020,13 +1367,52 @@ function renderModels(models) const ctxDisplay=m.context_size? m.context_size.toLocaleString() : "-"; const maxCtxDisplay=m.max_context_size? m.max_context_size.toLocaleString() : "-"; + let gpuDisplay="-"; + if(m.gpu_indices&&m.gpu_indices.length>0) + { + gpuDisplay=m.gpu_indices.join(", "); + } + + let vramDisplay=`${m.estimated_vram_mb||m.vram_usage_mb||0}`; + + // Show device allocation breakdown if available + if(m.device_allocations&&Object.keys(m.device_allocations).length>0) + { + let allocHtml='
'; + for(const [devKey,alloc] of Object.entries(m.device_allocations)) + { + const devName=alloc.device_name||devKey; + allocHtml+=`
${devName}: ` + +`Model ${alloc.model_buffer_mb||0} MB` + +` | KV ${alloc.kv_cache_buffer_mb||0} MB` + +` | Compute ${alloc.compute_buffer_mb||0} MB` + +` | ${alloc.total_mb||0} MB total
`; + } + if(m.cpu_mapped_buffer_mb&&m.cpu_mapped_buffer_mb>0) + { + allocHtml+=`
CPU: ${m.cpu_mapped_buffer_mb} MB mapped
`; + } + if(m.graph_splits&&m.graph_splits>1) + { + allocHtml+=`
Graph splits: ${m.graph_splits}
`; + } + allocHtml+='
'; + vramDisplay+=allocHtml; + } + else if(m.per_gpu_vram_mb&&Object.keys(m.per_gpu_vram_mb).length>1) + { + const parts=Object.entries(m.per_gpu_vram_mb).map(([k,v])=>`GPU${k}:${v}`); + vramDisplay+=` (${parts.join(", ")})`; + } + html+=` ${m.model} ${m.variant||"-"} ${m.state} ${ctxDisplay} ${maxCtxDisplay} - ${m.vram_usage_mb||0} + ${gpuDisplay} + ${vramDisplay} ${m.ram_usage_mb||0} ${m.pinned?"Yes":"No"} ${actions.join("")} @@ -1259,8 +1645,8 @@ async function refresh() } // GPUs - if(hw&&hw.gpus) renderGpus(hw.gpus); - else if(stats.hardware&&stats.hardware.gpus) renderGpus(stats.hardware.gpus); + if(hw&&hw.gpus) renderGpus(hw.gpus, stats.models||[]); + else if(stats.hardware&&stats.hardware.gpus) renderGpus(stats.hardware.gpus, stats.models||[]); // Models if(stats.models) renderModels(stats.models); diff --git a/src/server/dashboardConfig.h b/src/server/dashboardConfig.h new file mode 100644 index 0000000..af4f079 --- /dev/null +++ b/src/server/dashboardConfig.h @@ -0,0 +1,1792 @@ +#ifndef _ARBITERAI_SERVER_DASHBOARDCONFIG_H_ +#define _ARBITERAI_SERVER_DASHBOARDCONFIG_H_ + +#include + +namespace arbiterAI +{ +namespace server +{ + +const std::string DASHBOARD_CONFIG_HTML=R"HTML( + + + + +ArbiterAI - Configuration + + + +
+
+ ← Dashboard + Downloaded Models + Configuration + +
+
Connected
+
+
+
+
Startup Configuration
+
This is the server configuration. Models listed here are loaded on server startup. Adding a model saves it to the config and loads it immediately. Removing a model unloads it and removes it from the config.
+
+ + +
+
+
+
+
Startup Models
+
Select models to load on startup and assign compute devices.
+
+ +
+
+
+
+ + + + +
+

VRAM Overrides

+
Override the reported VRAM for each GPU. Useful when the driver reports incorrect values or to simulate different hardware. Changes take effect immediately for fit calculations.
+
Loading GPU info...
+
+ +
+
+
+
+ + + +)HTML"; + +} // namespace server +} // namespace arbiterAI + +#endif//_ARBITERAI_SERVER_DASHBOARDCONFIG_H_ \ No newline at end of file diff --git a/src/server/main.cpp b/src/server/main.cpp index 7fb4f67..4d91c27 100644 --- a/src/server/main.cpp +++ b/src/server/main.cpp @@ -3,6 +3,7 @@ #include "arbiterAI/arbiterAI.h" #include "arbiterAI/hardwareDetector.h" +#include "arbiterAI/modelManager.h" #include "arbiterAI/modelRuntime.h" #include "arbiterAI/storageManager.h" @@ -15,12 +16,382 @@ #include #include #include +#include #include #include +#include namespace { +struct StartupDefaultSelection { + std::string model; + std::string variant; + int contextSize=0; + arbiterAI::RuntimeOptions runtimeOptions; +}; + +struct StartupModelEntry { + std::string model; + std::string variant; + int contextSize=0; + arbiterAI::RuntimeOptions runtimeOptions; + std::vector devices; +}; + +arbiterAI::RuntimeOptions parseStartupRuntimeOptions(const nlohmann::json &j) +{ + arbiterAI::RuntimeOptions opts; + if(!j.is_object()) return opts; + if(j.contains("flash_attn")&&j["flash_attn"].is_boolean()) + opts.flashAttn=j["flash_attn"].get(); + if(j.contains("kv_cache_type_k")&&j["kv_cache_type_k"].is_string()) + opts.kvCacheTypeK=j["kv_cache_type_k"].get(); + if(j.contains("kv_cache_type_v")&&j["kv_cache_type_v"].is_string()) + opts.kvCacheTypeV=j["kv_cache_type_v"].get(); + if(j.contains("no_mmap")&&j["no_mmap"].is_boolean()) + opts.noMmap=j["no_mmap"].get(); + if(j.contains("reasoning_budget")&&j["reasoning_budget"].is_number_integer()) + opts.reasoningBudget=j["reasoning_budget"].get(); + if(j.contains("swa_full")&&j["swa_full"].is_boolean()) + opts.swaFull=j["swa_full"].get(); + if(j.contains("n_gpu_layers")&&j["n_gpu_layers"].is_number_integer()) + opts.nGpuLayers=j["n_gpu_layers"].get(); + if(j.contains("override_tensor")&&j["override_tensor"].is_string()) + opts.overrideTensor=j["override_tensor"].get(); + if(j.contains("vulkan_no_host_visible_vram")&&j["vulkan_no_host_visible_vram"].is_boolean()) + opts.vulkanNoHostVisibleVram=j["vulkan_no_host_visible_vram"].get(); + return opts; +} + +int sanitizeContextSize(int contextSize) +{ + return contextSize>0?contextSize:0; +} + +std::string toLowerCopy(const std::string &value) +{ + std::string lower=value; + std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) + { + return static_cast(std::tolower(c)); + }); + return lower; +} + +std::string normalizeAcceleratorKey(const std::string &value) +{ + std::string lower=toLowerCopy(value); + + if(lower=="cpu"||lower=="cuda"||lower=="vulkan") + { + return lower; + } + + return ""; +} + +std::map parseStartupDefaults(const nlohmann::json &cfg) +{ + std::map startupDefaults={ + {"cpu", StartupDefaultSelection{}}, + {"cuda", StartupDefaultSelection{}}, + {"vulkan", StartupDefaultSelection{}} + }; + + nlohmann::json defaultsJson=cfg.value("startup_defaults", nlohmann::json::object()); + if(!defaultsJson.is_object()) + { + return startupDefaults; + } + + for(auto it=defaultsJson.begin(); it!=defaultsJson.end(); ++it) + { + std::string key=normalizeAcceleratorKey(it.key()); + if(key.empty()) + { + continue; + } + + if(it.value().is_string()) + { + startupDefaults[key].model=it.value().get(); + startupDefaults[key].variant.clear(); + startupDefaults[key].contextSize=0; + continue; + } + + if(!it.value().is_object()) + { + continue; + } + + startupDefaults[key].model=it.value().value("model", ""); + startupDefaults[key].variant=it.value().value("variant", ""); + startupDefaults[key].contextSize=sanitizeContextSize(it.value().value("context_size", 0)); + if(it.value().contains("runtime_options")) + { + startupDefaults[key].runtimeOptions=parseStartupRuntimeOptions(it.value()["runtime_options"]); + } + } + + return startupDefaults; +} + +std::vector parseStartupModels(const nlohmann::json &cfg) +{ + std::vector entries; + + if(!cfg.contains("startup_models")||!cfg["startup_models"].is_array()) + { + return entries; + } + + for(const nlohmann::json &item:cfg["startup_models"]) + { + if(!item.is_object()||!item.contains("model")) + continue; + + StartupModelEntry entry; + entry.model=item.value("model", ""); + entry.variant=item.value("variant", ""); + entry.contextSize=sanitizeContextSize(item.value("context_size", 0)); + + if(item.contains("runtime_options")) + { + entry.runtimeOptions=parseStartupRuntimeOptions(item["runtime_options"]); + } + + if(item.contains("devices")&&item["devices"].is_array()) + { + for(const nlohmann::json &d:item["devices"]) + { + if(d.is_number_integer()) + { + entry.devices.push_back(d.get()); + } + } + } + + if(!entry.model.empty()) + { + entries.push_back(std::move(entry)); + } + } + + return entries; +} + +bool hasAccelerator(const arbiterAI::SystemInfo &hw, const std::string &accelerator) +{ + if(accelerator=="cpu") + { + return true; + } + + for(const arbiterAI::GpuInfo &gpu:hw.gpus) + { + if(accelerator=="cuda"&&gpu.backend==arbiterAI::GpuBackend::CUDA) + { + return true; + } + if(accelerator=="vulkan"&&gpu.backend==arbiterAI::GpuBackend::Vulkan) + { + return true; + } + } + + return false; +} + +std::vector buildStartupAcceleratorOrder( + const arbiterAI::SystemInfo &hw, + const std::vector &defaultBackendPriority) +{ + std::vector order; + + auto appendIfAvailable=[&order, &hw](const std::string &accelerator) + { + if(accelerator.empty()||!hasAccelerator(hw, accelerator)) + { + return; + } + if(std::find(order.begin(), order.end(), accelerator)==order.end()) + { + order.push_back(accelerator); + } + }; + + for(const std::string &backend:defaultBackendPriority) + { + appendIfAvailable(normalizeAcceleratorKey(backend)); + } + + appendIfAvailable("cuda"); + appendIfAvailable("vulkan"); + appendIfAvailable("cpu"); + + return order; +} + +StartupDefaultSelection selectStartupDefault( + const arbiterAI::SystemInfo &hw, + const std::map &startupDefaults, + const std::vector &defaultBackendPriority, + const std::string &legacyDefaultModel, + const std::string &legacyDefaultVariant, + std::string &selectedAccelerator) +{ + for(const std::string &accelerator:buildStartupAcceleratorOrder(hw, defaultBackendPriority)) + { + auto it=startupDefaults.find(accelerator); + if(it!=startupDefaults.end()&&!it->second.model.empty()) + { + selectedAccelerator=accelerator; + return it->second; + } + } + + selectedAccelerator.clear(); + return {legacyDefaultModel, legacyDefaultVariant, 0}; +} + +void scheduleStartupLoadAfterDownload( + const StartupDefaultSelection &selection, + const std::string &accelerator) +{ + std::thread([selection, accelerator]() + { + std::string variant=selection.variant; + + for(int attempt=0; attempt<300; ++attempt) + { + std::this_thread::sleep_for(std::chrono::seconds(2)); + + std::optional state= + arbiterAI::ModelRuntime::instance().getModelState(selection.model); + if(!state.has_value()) + { + spdlog::warn("Startup default model '{}' disappeared while waiting for download to finish", selection.model); + return; + } + + if(!state->variant.empty()) + { + variant=state->variant; + } + + if(state->state==arbiterAI::ModelState::Downloading) + { + continue; + } + + if(state->state==arbiterAI::ModelState::Loaded||state->state==arbiterAI::ModelState::Ready) + { + spdlog::info("Startup default model '{}' is ready after background download for {}", + selection.model, + accelerator.empty()?"legacy startup":accelerator); + return; + } + + if(state->state!=arbiterAI::ModelState::Unloaded) + { + spdlog::warn("Startup default model '{}' ended in unexpected state {} after download", + selection.model, + static_cast(state->state)); + return; + } + + spdlog::info("Startup default model '{}' finished downloading; loading now for {}", + selection.model, + accelerator.empty()?"legacy startup":accelerator); + + arbiterAI::RuntimeOptions opts=selection.runtimeOptions; + arbiterAI::ErrorCode loadErr=arbiterAI::ArbiterAI::instance().loadModel( + selection.model, + variant, + selection.contextSize, + &opts); + + if(loadErr==arbiterAI::ErrorCode::Success) + { + spdlog::info("Startup default model '{}' loaded successfully after download", selection.model); + return; + } + + if(loadErr==arbiterAI::ErrorCode::ModelDownloading) + { + continue; + } + + spdlog::warn("Failed to load startup default model '{}' after download (error={})", + selection.model, + static_cast(loadErr)); + return; + } + + spdlog::warn("Timed out waiting for startup default model '{}' to finish downloading", selection.model); + }).detach(); +} + +void scheduleStartupModelLoadAfterDownload(const StartupModelEntry &entry) +{ + std::thread([entry]() + { + std::string variant=entry.variant; + + for(int attempt=0; attempt<300; ++attempt) + { + std::this_thread::sleep_for(std::chrono::seconds(2)); + + std::optional state= + arbiterAI::ModelRuntime::instance().getModelState(entry.model); + if(!state.has_value()) + { + spdlog::warn("Startup model '{}' disappeared while waiting for download", entry.model); + return; + } + + if(!state->variant.empty()) + variant=state->variant; + + if(state->state==arbiterAI::ModelState::Downloading) + continue; + + if(state->state==arbiterAI::ModelState::Loaded||state->state==arbiterAI::ModelState::Ready) + { + spdlog::info("Startup model '{}' is ready after background download", entry.model); + return; + } + + if(state->state!=arbiterAI::ModelState::Unloaded) + { + spdlog::warn("Startup model '{}' ended in unexpected state {}", entry.model, static_cast(state->state)); + return; + } + + spdlog::info("Startup model '{}' finished downloading; loading now", entry.model); + + arbiterAI::RuntimeOptions opts=entry.runtimeOptions; + arbiterAI::ErrorCode loadErr=arbiterAI::ArbiterAI::instance().loadModel( + entry.model, variant, entry.contextSize, &opts, entry.devices); + + if(loadErr==arbiterAI::ErrorCode::Success) + { + spdlog::info("Startup model '{}' loaded successfully after download", entry.model); + return; + } + if(loadErr==arbiterAI::ErrorCode::ModelDownloading) + continue; + + spdlog::warn("Failed to load startup model '{}' after download (error={})", entry.model, static_cast(loadErr)); + return; + } + + spdlog::warn("Timed out waiting for startup model '{}' to finish downloading", entry.model); + }).detach(); +} + int64_t parseStorageLimit(const std::string &str) { if(str.empty()||str=="0") return 0; @@ -137,6 +508,7 @@ int main(int argc, char *argv[]) std::string modelsDir=cfg.value("models_dir", "/models"); std::string defaultModel=cfg.value("default_model", ""); std::string defaultVariant=cfg.value("default_variant", ""); + std::map startupDefaults=parseStartupDefaults(cfg); std::string overridePath=cfg.value("override_path", ""); std::string injectedConfigDir=cfg.value("injected_config_dir", ""); int ramBudget=cfg.value("ram_budget_mb", 0); @@ -294,23 +666,96 @@ int main(int argc, char *argv[]) spdlog::info("Max concurrent downloads set to {}", maxDownloads); } - // ── Load default model ─────────────────────────────────────── - if(!defaultModel.empty()) - { - spdlog::info("Loading default model: {} (variant: {})", defaultModel, defaultVariant.empty()?"auto":defaultVariant); - arbiterAI::ErrorCode loadErr=ai.loadModel(defaultModel, defaultVariant); + // ── Load startup models ───────────────────────────────────── + arbiterAI::HardwareDetector::instance().refresh(); + arbiterAI::SystemInfo startupHardware=arbiterAI::HardwareDetector::instance().getSystemInfo(); - if(loadErr==arbiterAI::ErrorCode::Success) - { - spdlog::info("Default model '{}' loaded successfully", defaultModel); - } - else if(loadErr==arbiterAI::ErrorCode::ModelDownloading) + // New format: startup_models array (preferred) + std::vector startupModels=parseStartupModels(cfg); + + if(!startupModels.empty()) + { + for(const StartupModelEntry &entry:startupModels) { - spdlog::info("Default model '{}' is downloading...", defaultModel); + std::string devicesStr; + if(!entry.devices.empty()) + { + for(size_t i=0; i0) devicesStr+=", "; + devicesStr+=std::to_string(entry.devices[i]); + } + } + else + { + devicesStr="auto"; + } + + spdlog::info("Loading startup model: {} (variant: {}, devices: [{}])", + entry.model, entry.variant.empty()?"auto":entry.variant, devicesStr); + + arbiterAI::RuntimeOptions startupOpts=entry.runtimeOptions; + arbiterAI::ErrorCode loadErr=ai.loadModel( + entry.model, entry.variant, entry.contextSize, &startupOpts, entry.devices); + + if(loadErr==arbiterAI::ErrorCode::Success) + { + spdlog::info("Startup model '{}' loaded successfully", entry.model); + } + else if(loadErr==arbiterAI::ErrorCode::ModelDownloading) + { + spdlog::info("Startup model '{}' is downloading...", entry.model); + scheduleStartupModelLoadAfterDownload(entry); + } + else + { + spdlog::warn("Failed to load startup model '{}' (error={})", entry.model, static_cast(loadErr)); + } } - else + } + else + { + // Legacy format: startup_defaults keyed by accelerator + std::string selectedAccelerator; + StartupDefaultSelection startupSelection=selectStartupDefault( + startupHardware, + startupDefaults, + defaultBackendPriority, + defaultModel, + defaultVariant, + selectedAccelerator); + + if(!startupSelection.model.empty()) { - spdlog::warn("Failed to load default model '{}' (error={})", defaultModel, static_cast(loadErr)); + if(selectedAccelerator.empty()) + { + spdlog::info("Loading legacy default model: {} (variant: {})", startupSelection.model, startupSelection.variant.empty()?"auto":startupSelection.variant); + } + else + { + spdlog::info("Loading startup default model for {}: {} (variant: {})", selectedAccelerator, startupSelection.model, startupSelection.variant.empty()?"auto":startupSelection.variant); + } + + arbiterAI::RuntimeOptions startupOpts=startupSelection.runtimeOptions; + arbiterAI::ErrorCode loadErr=ai.loadModel( + startupSelection.model, + startupSelection.variant, + startupSelection.contextSize, + &startupOpts); + + if(loadErr==arbiterAI::ErrorCode::Success) + { + spdlog::info("Startup default model '{}' loaded successfully", startupSelection.model); + } + else if(loadErr==arbiterAI::ErrorCode::ModelDownloading) + { + spdlog::info("Startup default model '{}' is downloading...", startupSelection.model); + scheduleStartupLoadAfterDownload(startupSelection, selectedAccelerator); + } + else + { + spdlog::warn("Failed to load startup default model '{}' (error={})", startupSelection.model, static_cast(loadErr)); + } } } @@ -318,6 +763,7 @@ int main(int argc, char *argv[]) httplib::Server server; arbiterAI::server::registerRoutes(server); + arbiterAI::server::setServerConfigPath(configPath); if(!overridePath.empty()) { @@ -360,6 +806,7 @@ int main(int argc, char *argv[]) spdlog::info(" POST /api/storage/cleanup/run - Run cleanup"); spdlog::info(" GET /api/downloads - Active downloads"); spdlog::info(" GET /dashboard - Live dashboard"); + spdlog::info(" GET /dashboard/config - Startup configuration"); spdlog::info("Starting server on {}:{}", host, port); spdlog::info("Dashboard: http://{}:{}/dashboard", host=="0.0.0.0"?"localhost":host, port); diff --git a/src/server/routes.cpp b/src/server/routes.cpp index 843b88b..7fd8dc2 100644 --- a/src/server/routes.cpp +++ b/src/server/routes.cpp @@ -1,5 +1,6 @@ #include "routes.h" #include "dashboard.h" +#include "dashboardConfig.h" #include "logBuffer.h" #include "arbiterAI/arbiterAI.h" @@ -12,10 +13,13 @@ #include #include +#include #include +#include #include #include #include +#include namespace arbiterAI { @@ -26,6 +30,643 @@ namespace { std::string g_overridePath; +std::string g_serverConfigPath; +std::mutex g_serverConfigMutex; +constexpr const char *STARTUP_ACCELERATOR_CPU="cpu"; +constexpr const char *STARTUP_ACCELERATOR_CUDA="cuda"; +constexpr const char *STARTUP_ACCELERATOR_VULKAN="vulkan"; + +int sanitizeContextSize(int contextSize) +{ + return contextSize>0?contextSize:0; +} + +std::string toLowerCopy(const std::string &value) +{ + std::string lower=value; + std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) + { + return static_cast(std::tolower(c)); + }); + return lower; +} + +std::string normalizeAcceleratorKey(const std::string &value) +{ + std::string lower=toLowerCopy(value); + + if(lower==STARTUP_ACCELERATOR_CPU) + { + return STARTUP_ACCELERATOR_CPU; + } + if(lower==STARTUP_ACCELERATOR_CUDA) + { + return STARTUP_ACCELERATOR_CUDA; + } + if(lower==STARTUP_ACCELERATOR_VULKAN) + { + return STARTUP_ACCELERATOR_VULKAN; + } + + return ""; +} + +nlohmann::json defaultStartupDefaultsJson() +{ + return { + {STARTUP_ACCELERATOR_CPU, {{"model", ""}, {"variant", ""}, {"context_size", 0}, {"runtime_options", nlohmann::json::object()}}}, + {STARTUP_ACCELERATOR_CUDA, {{"model", ""}, {"variant", ""}, {"context_size", 0}, {"runtime_options", nlohmann::json::object()}}}, + {STARTUP_ACCELERATOR_VULKAN, {{"model", ""}, {"variant", ""}, {"context_size", 0}, {"runtime_options", nlohmann::json::object()}}} + }; +} + +nlohmann::json sanitizeStartupDefaults(const nlohmann::json &startupDefaults) +{ + nlohmann::json sanitized=defaultStartupDefaultsJson(); + + if(!startupDefaults.is_object()) + { + return sanitized; + } + + for(auto it=startupDefaults.begin(); it!=startupDefaults.end(); ++it) + { + std::string key=normalizeAcceleratorKey(it.key()); + if(key.empty()) + { + continue; + } + + if(it.value().is_string()) + { + sanitized[key]={ + {"model", it.value().get()}, + {"variant", ""}, + {"context_size", 0}, + {"runtime_options", nlohmann::json::object()} + }; + continue; + } + + if(!it.value().is_object()) + { + continue; + } + + sanitized[key]={ + {"model", it.value().value("model", "")}, + {"variant", it.value().value("variant", "")}, + {"context_size", sanitizeContextSize(it.value().value("context_size", 0))}, + {"runtime_options", it.value().value("runtime_options", nlohmann::json::object())} + }; + } + + return sanitized; +} + +bool hasAccelerator(const SystemInfo &hw, const std::string &accelerator); + +int defaultStartupContextSize(const ModelInfo &model) +{ + if(model.contextScaling.has_value()&&model.contextScaling->baseContext>0) + { + return model.contextScaling->baseContext; + } + + return model.contextWindow>0?model.contextWindow:0; +} + +int effectiveStartupContextSize(const ModelInfo &model, int requestedContextSize) +{ + if(requestedContextSize>0) + { + return requestedContextSize; + } + + return defaultStartupContextSize(model); +} + +SystemInfo filterSystemInfoForAccelerator(const SystemInfo &hw, const std::string &accelerator) +{ + SystemInfo filtered=hw; + filtered.gpus.clear(); + + if(accelerator==STARTUP_ACCELERATOR_CPU) + { + return filtered; + } + + for(const GpuInfo &gpu:hw.gpus) + { + if(accelerator==STARTUP_ACCELERATOR_CUDA&&gpu.backend==GpuBackend::CUDA) + { + filtered.gpus.push_back(gpu); + } + else if(accelerator==STARTUP_ACCELERATOR_VULKAN&&gpu.backend==GpuBackend::Vulkan) + { + filtered.gpus.push_back(gpu); + } + } + + return filtered; +} + +int sumEffectiveFreeMemoryMb(const SystemInfo &hw) +{ + int total=0; + + for(const GpuInfo &gpu:hw.gpus) + { + if(gpu.unifiedMemory&&gpu.gpuAccessibleRamFreeMb>0) + { + total+=gpu.gpuAccessibleRamFreeMb; + } + else + { + total+=gpu.vramFreeMb; + } + } + + return total; +} + +int sumEffectiveTotalMemoryMb(const SystemInfo &hw) +{ + int total=0; + + for(const GpuInfo &gpu:hw.gpus) + { + if(gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0) + { + total+=gpu.gpuAccessibleRamMb; + } + else + { + total+=gpu.vramTotalMb; + } + } + + return total; +} + +SystemInfo asStartupSystemInfo(const SystemInfo &hw) +{ + SystemInfo startup=hw; + startup.freeRamMb=hw.totalRamMb; + + for(GpuInfo &gpu:startup.gpus) + { + gpu.vramFreeMb=gpu.vramTotalMb; + + if(gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0) + { + gpu.gpuAccessibleRamFreeMb=gpu.gpuAccessibleRamMb; + } + } + + return startup; +} + +int estimateStartupRequiredVramMb( + const ModelInfo &model, + const ModelVariant &variant, + int requestedContextSize) +{ + int requiredVramMb=variant.minVramMb; + + if(model.contextScaling.has_value()&&model.contextScaling->vramPer1kContextMb>0) + { + int effectiveContext=effectiveStartupContextSize(model, requestedContextSize); + int baseContext=model.contextScaling->baseContext; + if(effectiveContext0) + { + int extraChunks=(extraContext+1023)/1024; + requiredVramMb+=extraChunks*model.contextScaling->vramPer1kContextMb; + } + } + + return requiredVramMb; +} + +int estimateStartupRequiredRamMb( + const ModelInfo &model, + const ModelVariant &variant, + int requestedContextSize) +{ + int requiredRamMb=std::max( + model.hardwareRequirements.has_value()?model.hardwareRequirements->minSystemRamMb:0, + variant.fileSizeMb); + + if(model.contextScaling.has_value()&&model.contextScaling->vramPer1kContextMb>0) + { + int effectiveContext=effectiveStartupContextSize(model, requestedContextSize); + int baseContext=model.contextScaling->baseContext; + if(effectiveContext0) + { + int extraChunks=(extraContext+1023)/1024; + requiredRamMb+=extraChunks*model.contextScaling->vramPer1kContextMb; + } + } + + return requiredRamMb; +} + +std::string startupCompatibilityLabel(const std::string &compatibility) +{ + if(compatibility=="likely") + { + return "Likely"; + } + if(compatibility=="tight") + { + return "Tight fit"; + } + if(compatibility=="cloud") + { + return "Cloud"; + } + if(compatibility=="undetected") + { + return "No device"; + } + + return "Unlikely"; +} + +int startupCompatibilitySortRank(const std::string &compatibility) +{ + if(compatibility=="likely") + { + return 0; + } + if(compatibility=="tight") + { + return 1; + } + if(compatibility=="cloud") + { + return 2; + } + if(compatibility=="unlikely") + { + return 3; + } + + return 4; +} + +nlohmann::json buildStartupOptionJson( + const std::string &accelerator, + const SystemInfo &hw, + const ModelInfo &model, + const std::string &variantName, + int requestedContextSize) +{ + SystemInfo startupHw=asStartupSystemInfo(hw); + + nlohmann::json option={ + {"model", model.model}, + {"variant", variantName}, + {"provider", model.provider}, + {"requested_context_size", sanitizeContextSize(requestedContextSize)}, + {"effective_context_size", effectiveStartupContextSize(model, requestedContextSize)}, + {"max_context_size", 0}, + {"required_vram_mb", 0}, + {"required_ram_mb", 0}, + {"available_vram_mb", 0}, + {"available_ram_mb", startupHw.freeRamMb}, + {"can_run", true}, + {"compatibility", "cloud"}, + {"compatibility_label", "Cloud"}, + {"compatibility_reason", "Provider-managed model; no local download or VRAM requirement."}, + {"sort_rank", startupCompatibilitySortRank("cloud")} + }; + + if(model.variants.empty()) + { + return option; + } + + const ModelVariant *selectedVariant=nullptr; + for(const ModelVariant &candidate:model.variants) + { + if(candidate.quantization==variantName) + { + selectedVariant=&candidate; + break; + } + } + + if(!selectedVariant) + { + option["can_run"]=false; + option["compatibility"]="unlikely"; + option["compatibility_label"]=startupCompatibilityLabel("unlikely"); + option["compatibility_reason"]="Variant metadata is missing from the live model catalog."; + option["sort_rank"]=startupCompatibilitySortRank("unlikely"); + return option; + } + + SystemInfo acceleratorHw=filterSystemInfoForAccelerator(startupHw, accelerator); + bool acceleratorDetected=hasAccelerator(hw, accelerator); + int availableVramMb=sumEffectiveTotalMemoryMb(acceleratorHw); + int requiredVramMb=estimateStartupRequiredVramMb(model, *selectedVariant, requestedContextSize); + int requiredRamMb=estimateStartupRequiredRamMb(model, *selectedVariant, requestedContextSize); + int desiredContextSize=effectiveStartupContextSize(model, requestedContextSize); + + option["required_vram_mb"]=requiredVramMb; + option["required_ram_mb"]=requiredRamMb; + option["available_vram_mb"]=availableVramMb; + option["available_ram_mb"]=startupHw.freeRamMb; + option["base_memory_mb"]=static_cast(selectedVariant->minVramMb); + option["base_context_size"]=model.contextScaling.has_value()?model.contextScaling->baseContext:0; + option["memory_per_1k_context_mb"]=model.contextScaling.has_value()?model.contextScaling->vramPer1kContextMb:0; + + if(accelerator==STARTUP_ACCELERATOR_CPU) + { + int maxContextSize=model.contextScaling.has_value() + ? model.contextScaling->maxContext + : model.contextWindow; + bool contextFits=maxContextSize<=0||desiredContextSize<=0||desiredContextSize<=maxContextSize; + bool canRun=requiredRamMb<=startupHw.freeRamMb&&contextFits; + std::string compatibility=canRun + ? (requiredRamMb>=static_cast(startupHw.freeRamMb*0.85f)?"tight":"likely") + : "unlikely"; + std::string reason; + + if(canRun) + { + reason="Fits in total system RAM for CPU startup."; + } + else if(!contextFits) + { + reason="Requested context exceeds the CPU startup limit."; + } + else + { + reason="Needs more system RAM than the device has for CPU startup."; + } + + option["max_context_size"]=maxContextSize; + option["can_run"]=canRun; + option["compatibility"]=compatibility; + option["compatibility_label"]=startupCompatibilityLabel(compatibility); + option["compatibility_reason"]=reason; + option["sort_rank"]=startupCompatibilitySortRank(compatibility); + return option; + } + + ModelFit fit=ModelFitCalculator::calculateModelFit(model, *selectedVariant, acceleratorHw); + bool contextFits=fit.maxContextSize<=0||desiredContextSize<=0||desiredContextSize<=fit.maxContextSize; + bool wouldFallbackToCpu=fit.canRun&&fit.gpuIndices.empty(); + bool canRun=acceleratorDetected&&fit.canRun&&!wouldFallbackToCpu&&contextFits&&requiredVramMb<=availableVramMb; + std::string compatibility; + std::string reason; + + if(!acceleratorDetected) + { + compatibility="undetected"; + reason="No compatible accelerator is currently detected for this startup slot."; + } + else if(wouldFallbackToCpu) + { + compatibility="unlikely"; + reason="Total VRAM would force a CPU fallback instead of using this accelerator."; + } + else if(!fit.canRun) + { + compatibility="unlikely"; + if(fit.limitingFactor=="ram") + { + reason="Insufficient system RAM for this model on the device."; + } + else + { + reason="Insufficient total VRAM on the device."; + } + } + else if(!contextFits) + { + compatibility="unlikely"; + reason="Requested context is higher than the model can sustain on the device."; + } + else if(requiredVramMb>=static_cast(availableVramMb*0.85f)) + { + compatibility="tight"; + reason="Fits, but VRAM is tight for the requested context."; + } + else + { + compatibility="likely"; + reason="Fits comfortably on the device for the requested context."; + } + + option["max_context_size"]=fit.maxContextSize; + option["can_run"]=canRun; + option["compatibility"]=compatibility; + option["compatibility_label"]=startupCompatibilityLabel(compatibility); + option["compatibility_reason"]=reason; + option["sort_rank"]=startupCompatibilitySortRank(compatibility); + return option; +} + +std::vector parseDefaultBackendPriority(const nlohmann::json &cfg) +{ + std::vector priority; + + nlohmann::json hardwareCfg=cfg.value("hardware", nlohmann::json::object()); + if(hardwareCfg.contains("default_backend_priority")&&hardwareCfg["default_backend_priority"].is_array()) + { + for(const nlohmann::json &backend:hardwareCfg["default_backend_priority"]) + { + if(backend.is_string()) + { + priority.push_back(toLowerCopy(backend.get())); + } + } + } + + return priority; +} + +bool hasAccelerator(const SystemInfo &hw, const std::string &accelerator) +{ + if(accelerator==STARTUP_ACCELERATOR_CPU) + { + return true; + } + + for(const GpuInfo &gpu:hw.gpus) + { + if(accelerator==STARTUP_ACCELERATOR_CUDA&&gpu.backend==GpuBackend::CUDA) + { + return true; + } + if(accelerator==STARTUP_ACCELERATOR_VULKAN&&gpu.backend==GpuBackend::Vulkan) + { + return true; + } + } + + return false; +} + +std::vector buildStartupAcceleratorOrder( + const SystemInfo &hw, + const std::vector &defaultBackendPriority) +{ + std::vector order; + + auto appendIfAvailable=[&order, &hw](const std::string &accelerator) + { + if(accelerator.empty()||!hasAccelerator(hw, accelerator)) + { + return; + } + if(std::find(order.begin(), order.end(), accelerator)==order.end()) + { + order.push_back(accelerator); + } + }; + + for(const std::string &backend:defaultBackendPriority) + { + appendIfAvailable(normalizeAcceleratorKey(backend)); + } + + appendIfAvailable(STARTUP_ACCELERATOR_CUDA); + appendIfAvailable(STARTUP_ACCELERATOR_VULKAN); + appendIfAvailable(STARTUP_ACCELERATOR_CPU); + + return order; +} + +nlohmann::json resolveEffectiveStartupDefault(const nlohmann::json &cfg, const SystemInfo &hw) +{ + nlohmann::json startupDefaults=sanitizeStartupDefaults(cfg.value("startup_defaults", nlohmann::json::object())); + std::vector acceleratorOrder=buildStartupAcceleratorOrder(hw, parseDefaultBackendPriority(cfg)); + + for(const std::string &accelerator:acceleratorOrder) + { + nlohmann::json entry=startupDefaults.value(accelerator, nlohmann::json::object()); + std::string model=entry.value("model", ""); + if(!model.empty()) + { + return { + {"accelerator", accelerator}, + {"model", model}, + {"variant", entry.value("variant", "")}, + {"context_size", sanitizeContextSize(entry.value("context_size", 0))}, + {"runtime_options", entry.value("runtime_options", nlohmann::json::object())} + }; + } + } + + return { + {"accelerator", "legacy"}, + {"model", cfg.value("default_model", "")}, + {"variant", cfg.value("default_variant", "")}, + {"context_size", 0} + }; +} + +bool loadServerConfigJson(nlohmann::json &cfg, std::string &error) +{ + if(g_serverConfigPath.empty()) + { + error="Server config path is not set"; + return false; + } + + std::ifstream file(g_serverConfigPath); + if(!file.is_open()) + { + error="Cannot open server config file"; + return false; + } + + try + { + cfg=nlohmann::json::parse(file, nullptr, true, true); + } + catch(const std::exception &e) + { + error=e.what(); + return false; + } + + if(!cfg.is_object()) + { + error="Server config root must be a JSON object"; + return false; + } + + return true; +} + +bool saveServerConfigJson(const nlohmann::json &cfg, std::string &error) +{ + if(g_serverConfigPath.empty()) + { + error="Server config path is not set"; + return false; + } + + std::ofstream file(g_serverConfigPath, std::ios::trunc); + if(!file.is_open()) + { + error="Cannot open server config file for writing"; + return false; + } + + file<(); if(j.contains("override_tensor")&&j["override_tensor"].is_string()) opts.overrideTensor=j["override_tensor"].get(); + if(j.contains("vulkan_no_host_visible_vram")&&j["vulkan_no_host_visible_vram"].is_boolean()) + opts.vulkanNoHostVisibleVram=j["vulkan_no_host_visible_vram"].get(); return opts; } @@ -218,9 +863,37 @@ nlohmann::json loadedModelToJson(const LoadedModel &m) {"context_size", m.contextSize}, {"max_context_size", m.maxContextSize}, {"gpu_indices", gpuIndices}, - {"pinned", m.pinned} + {"pinned", m.pinned}, + {"graph_splits", m.graphSplits}, + {"cpu_mapped_buffer_mb", m.cpuMappedBufferMb} }; + if(!m.perGpuVramMb.empty()) + { + nlohmann::json perGpuJson=nlohmann::json::object(); + for(const auto &pair:m.perGpuVramMb) + { + perGpuJson[std::to_string(pair.first)]=pair.second; + } + j["per_gpu_vram_mb"]=perGpuJson; + } + + if(!m.deviceAllocations.empty()) + { + nlohmann::json allocations=nlohmann::json::object(); + for(const auto &pair:m.deviceAllocations) + { + allocations[pair.first]={ + {"device_name", pair.second.deviceName}, + {"model_buffer_mb", pair.second.modelBufferMb}, + {"kv_cache_buffer_mb", pair.second.kvCacheBufferMb}, + {"compute_buffer_mb", pair.second.computeBufferMb}, + {"total_mb", pair.second.totalMb} + }; + } + j["device_allocations"]=allocations; + } + nlohmann::json activeOpts=runtimeOptionsToJson(m.activeOptions); if(!activeOpts.empty()) { @@ -329,6 +1002,12 @@ std::pair parseModelVariant(const std::string &modelId // ========== Override Path ========== +void setServerConfigPath(const std::string &path) +{ + std::lock_guard lock(g_serverConfigMutex); + g_serverConfigPath=path; +} + void setOverridePath(const std::string &path) { g_overridePath=path; @@ -355,7 +1034,7 @@ void registerRoutes(httplib::Server &server) server.Options(R"(.*)", [](const httplib::Request &, httplib::Response &res) { res.set_header("Access-Control-Allow-Origin", "*"); - res.set_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS, DELETE"); + res.set_header("Access-Control-Allow-Methods", "GET, POST, PUT, OPTIONS, DELETE"); res.set_header("Access-Control-Allow-Headers", "Content-Type, Authorization"); res.set_header("Access-Control-Max-Age", "86400"); res.status=204; @@ -376,6 +1055,11 @@ void registerRoutes(httplib::Server &server) // Version server.Get("/api/version", handleGetVersion); + // Server config + server.Get("/api/server/config", handleGetServerConfig); + server.Put("/api/server/config", handleSetServerConfig); + server.Get("/api/server/startup-options", handleGetStartupOptions); + // Chat completions (OpenAI-compatible) server.Post("/v1/chat/completions", handleChatCompletions); server.Get("/v1/models", handleListModelsV1); @@ -432,6 +1116,7 @@ void registerRoutes(httplib::Server &server) server.Get("/api/downloads", handleGetActiveDownloads); // Dashboard + server.Get("/dashboard/config", handleDashboardConfig); server.Get("/dashboard/storage", handleDashboardStorage); server.Get("/dashboard", handleDashboard); @@ -976,6 +1661,181 @@ void handleGetVersion(const httplib::Request &, httplib::Response &res) res.set_content(j.dump(), "application/json"); } +// ========== Server Config ========== + +void handleGetServerConfig(const httplib::Request &, httplib::Response &res) +{ + std::lock_guard lock(g_serverConfigMutex); + + nlohmann::json cfg; + std::string error; + if(!loadServerConfigJson(cfg, error)) + { + res.status=500; + res.set_content(errorJson("Failed to load server config: "+error).dump(), "application/json"); + return; + } + + res.set_content(buildServerConfigResponse(cfg).dump(), "application/json"); +} + +void handleSetServerConfig(const httplib::Request &req, httplib::Response &res) +{ + nlohmann::json body; + try + { + body=nlohmann::json::parse(req.body); + } + catch(const std::exception &) + { + res.status=400; + res.set_content(errorJson("Invalid JSON body", "invalid_request_error", "", "parse_error").dump(), "application/json"); + return; + } + + std::lock_guard lock(g_serverConfigMutex); + + nlohmann::json cfg; + std::string error; + if(!loadServerConfigJson(cfg, error)) + { + res.status=500; + res.set_content(errorJson("Failed to load server config: "+error).dump(), "application/json"); + return; + } + + if(body.contains("default_model")) + { + if(!body["default_model"].is_string()) + { + res.status=400; + res.set_content(errorJson("'default_model' must be a string", "invalid_request_error", "default_model", "invalid_type").dump(), "application/json"); + return; + } + cfg["default_model"]=body["default_model"].get(); + } + + if(body.contains("default_variant")) + { + if(!body["default_variant"].is_string()) + { + res.status=400; + res.set_content(errorJson("'default_variant' must be a string", "invalid_request_error", "default_variant", "invalid_type").dump(), "application/json"); + return; + } + cfg["default_variant"]=body["default_variant"].get(); + } + + if(body.contains("startup_defaults")) + { + if(!body["startup_defaults"].is_object()) + { + res.status=400; + res.set_content(errorJson("'startup_defaults' must be an object", "invalid_request_error", "startup_defaults", "invalid_type").dump(), "application/json"); + return; + } + + cfg["startup_defaults"]=sanitizeStartupDefaults(body["startup_defaults"]); + } + + if(body.contains("startup_models")) + { + if(!body["startup_models"].is_array()) + { + res.status=400; + res.set_content(errorJson("'startup_models' must be an array", "invalid_request_error", "startup_models", "invalid_type").dump(), "application/json"); + return; + } + + cfg["startup_models"]=body["startup_models"]; + } + + if(!saveServerConfigJson(cfg, error)) + { + res.status=500; + res.set_content(errorJson("Failed to save server config: "+error).dump(), "application/json"); + return; + } + + res.set_content(buildServerConfigResponse(cfg).dump(), "application/json"); +} + +void handleGetStartupOptions(const httplib::Request &req, httplib::Response &res) +{ + std::string accelerator=normalizeAcceleratorKey(req.has_param("accelerator") + ? req.get_param_value("accelerator") + : ""); + if(accelerator.empty()) + { + res.status=400; + res.set_content(errorJson("Missing or invalid 'accelerator' query parameter", "invalid_request_error", "accelerator", "invalid_value").dump(), "application/json"); + return; + } + + int contextSize=0; + if(req.has_param("context_size")) + { + try + { + contextSize=sanitizeContextSize(std::stoi(req.get_param_value("context_size"))); + } + catch(const std::exception &) + { + res.status=400; + res.set_content(errorJson("'context_size' must be an integer", "invalid_request_error", "context_size", "invalid_value").dump(), "application/json"); + return; + } + } + + HardwareDetector::instance().refresh(); + SystemInfo hw=HardwareDetector::instance().getSystemInfo(); + + std::vector models=ModelManager::instance().getModelsByRanking(); + std::vector options; + options.reserve(models.size()); + + for(const ModelInfo &model:models) + { + if(model.variants.empty()) + { + options.push_back(buildStartupOptionJson(accelerator, hw, model, "", contextSize)); + continue; + } + + for(const ModelVariant &variant:model.variants) + { + options.push_back(buildStartupOptionJson(accelerator, hw, model, variant.quantization, contextSize)); + } + } + + std::sort(options.begin(), options.end(), [](const nlohmann::json &left, const nlohmann::json &right) + { + int leftRank=left.value("sort_rank", 99); + int rightRank=right.value("sort_rank", 99); + if(leftRank!=rightRank) + { + return leftRank targetDevices; // Accept parameters from query string if(req.has_param("variant")) @@ -1093,6 +1954,14 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) contextSize=body["context_size"].get(); if(body.contains("runtime_options")&&body["runtime_options"].is_object()) optionsOverride=parseRuntimeOptions(body["runtime_options"]); + if(body.contains("devices")&&body["devices"].is_array()) + { + for(const auto &d:body["devices"]) + { + if(d.is_number_integer()) + targetDevices.push_back(d.get()); + } + } } catch(const nlohmann::json::parse_error &) { @@ -1100,9 +1969,9 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) } } - spdlog::info("Load request: model='{}' variant='{}' context={}", modelName, variant, contextSize); + spdlog::info("Load request: model='{}' variant='{}' context={} devices={}", modelName, variant, contextSize, targetDevices.size()); - ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize, &optionsOverride); + ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize, &optionsOverride, targetDevices); if(err==ErrorCode::Success) { @@ -2335,6 +3204,11 @@ void handleDashboard(const httplib::Request &, httplib::Response &res) res.set_content(DASHBOARD_HTML, "text/html"); } +void handleDashboardConfig(const httplib::Request &, httplib::Response &res) +{ + res.set_content(DASHBOARD_CONFIG_HTML, "text/html"); +} + void handleDashboardStorage(const httplib::Request &, httplib::Response &res) { res.set_content(DASHBOARD_STORAGE_HTML, "text/html"); diff --git a/src/server/routes.h b/src/server/routes.h index 418daf4..6b20831 100644 --- a/src/server/routes.h +++ b/src/server/routes.h @@ -11,6 +11,9 @@ namespace server /// Register all route handlers on the given HTTP server. void registerRoutes(httplib::Server &server); +/// Set the main server configuration JSON path for persisted dashboard updates. +void setServerConfigPath(const std::string &path); + /// Set the override path for persisting runtime model configs. void setOverridePath(const std::string &path); @@ -32,6 +35,12 @@ void handleHealth(const httplib::Request &req, httplib::Response &res); void handleGetVersion(const httplib::Request &req, httplib::Response &res); +// ========== Server Config ========== + +void handleGetServerConfig(const httplib::Request &req, httplib::Response &res); +void handleSetServerConfig(const httplib::Request &req, httplib::Response &res); +void handleGetStartupOptions(const httplib::Request &req, httplib::Response &res); + // ========== Model Management ========== void handleGetModels(const httplib::Request &req, httplib::Response &res); @@ -88,6 +97,7 @@ void handleGetActiveDownloads(const httplib::Request &req, httplib::Response &re // ========== Dashboard ========== void handleDashboard(const httplib::Request &req, httplib::Response &res); +void handleDashboardConfig(const httplib::Request &req, httplib::Response &res); void handleDashboardStorage(const httplib::Request &req, httplib::Response &res); } // namespace server diff --git a/tests/modelRuntimeTests.cpp b/tests/modelRuntimeTests.cpp index 86ee084..b76ef3f 100644 --- a/tests/modelRuntimeTests.cpp +++ b/tests/modelRuntimeTests.cpp @@ -304,7 +304,7 @@ TEST_F(ModelRuntimeTest, EndInferenceDrainsSwapQueue) rt.swapModel("mock-model-2"); // End inference — should drain the queue and execute swap - rt.endInference(); + rt.endInference("mock-model"); EXPECT_FALSE(rt.isInferenceActive()); @@ -330,7 +330,7 @@ TEST_F(ModelRuntimeTest, MultipleQueuedSwapsOnlyExecutesLatest) rt.swapModel("mock-model-2"); rt.swapModel("mock-model"); // swap back to mock-model - rt.endInference(); + rt.endInference("mock-model"); // mock-model should be loaded (the latest swap target) auto state=rt.getModelState("mock-model"); @@ -350,7 +350,7 @@ TEST_F(ModelRuntimeTest, BeginEndInferenceTracksState) rt.beginInference("mock-model"); EXPECT_TRUE(rt.isInferenceActive()); - rt.endInference(); + rt.endInference("mock-model"); EXPECT_FALSE(rt.isInferenceActive()); } diff --git a/tests/telemetryCollectorTests.cpp b/tests/telemetryCollectorTests.cpp index 35f4d00..ce95299 100644 --- a/tests/telemetryCollectorTests.cpp +++ b/tests/telemetryCollectorTests.cpp @@ -266,7 +266,7 @@ TEST_F(TelemetryCollectorTest, SnapshotActiveRequests) SystemSnapshot snapshot2=tc.getSnapshot(); EXPECT_EQ(snapshot2.activeRequests, 1); - ModelRuntime::instance().endInference(); + ModelRuntime::instance().endInference("tel-mock-1"); SystemSnapshot snapshot3=tc.getSnapshot(); EXPECT_EQ(snapshot3.activeRequests, 0); From 6d44752aa05339ead2b9b8e426ad6c8355d24edd Mon Sep 17 00:00:00 2001 From: krazer Date: Sun, 10 May 2026 12:02:08 -0400 Subject: [PATCH 2/2] feat: context size auto-select and VRAM fit visualization - Dashboard config page: context slider allows 0 (auto), shows VRAM fit gradient (green/yellow/red) based on selected compute devices, hard-caps slider at max VRAM capacity, info banner for auto mode - Model runtime: context_size=0 now selects largest context fitting in available VRAM (min of hardware max and native training context) - Add context stress test script - Update .gitignore for ansible/ and tmp/ --- .gitignore | 3 +- arbiterAI_config | 2 +- scripts/context_stress_test.sh | 157 +++++++++++++++++++++++++++++++++ src/arbiterAI/modelManager.cpp | 16 ++++ src/arbiterAI/modelRuntime.cpp | 89 ++++++++++++++++--- src/arbiterAI/modelRuntime.h | 7 ++ src/server/dashboardConfig.h | 103 +++++++++++++++++++-- src/server/main.cpp | 3 + src/server/routes.cpp | 36 ++++---- 9 files changed, 375 insertions(+), 41 deletions(-) create mode 100755 scripts/context_stress_test.sh diff --git a/.gitignore b/.gitignore index 99f7b81..e9d191a 100644 --- a/.gitignore +++ b/.gitignore @@ -45,4 +45,5 @@ models/ # local info push-server.sh docs/tasks/ -tmp/ \ No newline at end of file +tmp/ +ansible/ diff --git a/arbiterAI_config b/arbiterAI_config index cffe407..e6a4342 160000 --- a/arbiterAI_config +++ b/arbiterAI_config @@ -1 +1 @@ -Subproject commit cffe4077ba15986e26035d2e69ad8341f6dcc83d +Subproject commit e6a4342141f6e84f229be0141ae1374b16194110 diff --git a/scripts/context_stress_test.sh b/scripts/context_stress_test.sh new file mode 100755 index 0000000..49e13a3 --- /dev/null +++ b/scripts/context_stress_test.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# context_stress_test.sh — Progressively fill context on ai-lab to find the real limit +# +# The model (Qwen3.5-27B:Q4_K_M) is loaded with 248832 context on the MI50 32GB. +# This script sends increasingly large prompts and observes when llama.cpp errors. +# +# Strategy: Use a binary search approach. Start with a known-good token count, +# then double until failure, then binary search between last-good and first-bad. + +set -euo pipefail + +SERVER="http://192.168.2.101:8081" +MODEL="Qwen3.5-27B" +RESULTS_FILE="/tmp/context_stress_results.txt" + +echo "Context Stress Test - $(date)" | tee "$RESULTS_FILE" +echo "Server: $SERVER" | tee -a "$RESULTS_FILE" +echo "Model: $MODEL (Q4_K_M)" | tee -a "$RESULTS_FILE" +echo "Configured context: 248832" | tee -a "$RESULTS_FILE" +echo "========================================" | tee -a "$RESULTS_FILE" + +# Generate a repeating text block to fill context +# ~4 chars per token for English text is a rough estimate +# We'll use a simple repeating pattern +generate_payload() { + local target_tokens=$1 + # Each word "hello " is roughly 1-2 tokens; use ~3.5 chars/token estimate + local char_count=$((target_tokens * 4)) + + # Generate repeating text + local text="" + local block="The quick brown fox jumps over the lazy dog. This is a test of context window capacity. " + local block_len=${#block} + local repeats=$((char_count / block_len + 1)) + + # Use python for efficiency with large strings + python3 -c " +import json, sys + +target_chars = $char_count +block = 'The quick brown fox jumps over the lazy dog. This is a test of context window capacity. ' +text = (block * ($repeats))[:target_chars] + +payload = { + 'model': '$MODEL', + 'messages': [ + {'role': 'system', 'content': 'You are a helpful assistant. Respond with exactly one word: OK'}, + {'role': 'user', 'content': text} + ], + 'max_tokens': 5, + 'temperature': 0.0 +} + +json.dump(payload, sys.stdout) +" +} + +# Send a request and check if it succeeds +test_context() { + local target_tokens=$1 + local start_time=$(date +%s%N) + + echo -n " Testing ~${target_tokens} tokens... " | tee -a "$RESULTS_FILE" + + # Generate payload and send + local response + local http_code + + # Write payload to temp file to handle large sizes + generate_payload "$target_tokens" > /tmp/context_test_payload.json + local payload_size=$(wc -c < /tmp/context_test_payload.json) + echo -n "(payload: ${payload_size} bytes) " | tee -a "$RESULTS_FILE" + + # Send request with extended timeout (large context = slow) + response=$(curl -sf -w "\n%{http_code}" \ + --max-time 300 \ + -X POST "${SERVER}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d @/tmp/context_test_payload.json 2>&1) || { + local exit_code=$? + echo "CURL_ERROR (exit=$exit_code)" | tee -a "$RESULTS_FILE" + echo " Response: $(echo "$response" | tail -5)" | tee -a "$RESULTS_FILE" + return 1 + } + + http_code=$(echo "$response" | tail -1) + local body=$(echo "$response" | sed '$d') + + local end_time=$(date +%s%N) + local elapsed_ms=$(( (end_time - start_time) / 1000000 )) + + if [ "$http_code" = "200" ]; then + local prompt_tokens=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('usage',{}).get('prompt_tokens','?'))" 2>/dev/null || echo "?") + echo "OK (HTTP 200, prompt_tokens=${prompt_tokens}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE" + return 0 + else + local error_msg=$(echo "$body" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',{}).get('message','unknown')[:200])" 2>/dev/null || echo "$body" | head -c 200) + echo "FAILED (HTTP ${http_code}, ${elapsed_ms}ms)" | tee -a "$RESULTS_FILE" + echo " Error: ${error_msg}" | tee -a "$RESULTS_FILE" + return 1 + fi +} + +# Phase 1: Exponential probing - find the ballpark where it fails +echo "" | tee -a "$RESULTS_FILE" +echo "Phase 1: Exponential probing" | tee -a "$RESULTS_FILE" +echo "----------------------------------------" | tee -a "$RESULTS_FILE" + +# Start with small amounts and increase +TOKEN_SIZES=(1000 4000 8000 16000 32000 64000 96000 128000 160000 192000 224000 240000 248000) + +last_good=0 +first_bad=0 + +for tokens in "${TOKEN_SIZES[@]}"; do + if test_context "$tokens"; then + last_good=$tokens + else + first_bad=$tokens + break + fi +done + +if [ "$first_bad" -eq 0 ]; then + echo "" | tee -a "$RESULTS_FILE" + echo "All tests passed! Model handled up to ~${last_good} tokens." | tee -a "$RESULTS_FILE" + echo "The full 248832 context appears usable." | tee -a "$RESULTS_FILE" +else + # Phase 2: Binary search between last_good and first_bad + echo "" | tee -a "$RESULTS_FILE" + echo "Phase 2: Binary search between ${last_good} and ${first_bad}" | tee -a "$RESULTS_FILE" + echo "----------------------------------------" | tee -a "$RESULTS_FILE" + + low=$last_good + high=$first_bad + + while [ $((high - low)) -gt 2000 ]; do + mid=$(( (low + high) / 2 )) + if test_context "$mid"; then + low=$mid + else + high=$mid + fi + done + + echo "" | tee -a "$RESULTS_FILE" + echo "========================================" | tee -a "$RESULTS_FILE" + echo "RESULT: Maximum usable context is approximately ${low}-${high} tokens" | tee -a "$RESULTS_FILE" + echo " Last successful: ~${low} tokens" | tee -a "$RESULTS_FILE" + echo " First failure: ~${high} tokens" | tee -a "$RESULTS_FILE" + echo " Configured max: 248832 tokens" | tee -a "$RESULTS_FILE" + echo " Utilization: $(python3 -c "print(f'{${low}/248832*100:.1f}%')")" | tee -a "$RESULTS_FILE" +fi + +echo "" | tee -a "$RESULTS_FILE" +echo "Full results saved to: $RESULTS_FILE" | tee -a "$RESULTS_FILE" +echo "Done - $(date)" | tee -a "$RESULTS_FILE" diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp index 5c314d4..11f629a 100644 --- a/src/arbiterAI/modelManager.cpp +++ b/src/arbiterAI/modelManager.cpp @@ -366,6 +366,22 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in variant.files.push_back(vd); } } + + // Skip CLIP/mmproj variants — these are multimodal projection + // files, not standalone models. Loading them as the main model + // causes llama.cpp to fail with "CLIP cannot be used as main model". + std::string primaryFile=variant.getPrimaryFilename(); + std::string primaryLower=primaryFile; + std::transform(primaryLower.begin(), primaryLower.end(), primaryLower.begin(), ::tolower); + if(primaryLower.find("mmproj")!=std::string::npos|| + primaryLower.find("clip-")!=std::string::npos|| + primaryLower.find("vision-")!=std::string::npos) + { + spdlog::debug("Skipping multimodal projection variant '{}' for model '{}' (file: {})", + variant.quantization, info.model, primaryFile); + continue; + } + info.variants.push_back(variant); } } diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp index 22774e4..ccd3d0c 100644 --- a/src/arbiterAI/modelRuntime.cpp +++ b/src/arbiterAI/modelRuntime.cpp @@ -279,6 +279,19 @@ LoadErrorDetail ModelRuntime::classifyLoadFailure( return detail; } + // Check for CLIP / multimodal projection file loaded as main model + if(logLower.find("clip cannot be used as main model")!=std::string::npos|| + logLower.find("mmproj")!=std::string::npos&&logLower.find("clip")!=std::string::npos) + { + detail.reason=LoadFailureReason::UnsupportedArch; + detail.summary="File is a CLIP/mmproj multimodal projection, not a standalone model: "+filePath; + detail.suggestion="This file is a vision encoder projection used with --mmproj, not a model. " + "Remove this variant from the model config and use the correct GGUF model file instead."; + detail.action="fix_config"; + detail.recoverable=false; + return detail; + } + // Check for unsupported architecture if(logLower.find("unknown model architecture")!=std::string::npos|| logLower.find("unsupported model")!=std::string::npos|| @@ -333,6 +346,23 @@ int ModelRuntime::getMaxConcurrentDownloads() const return m_maxConcurrentDownloads; } +void ModelRuntime::setModelsDir(const std::string &dir) +{ + std::lock_guard lock(m_mutex); + m_modelsDir=dir; + // Ensure trailing slash for path concatenation + if(!m_modelsDir.empty()&&m_modelsDir.back()!='/') + { + m_modelsDir+='/'; + } +} + +std::string ModelRuntime::getModelsDir() const +{ + std::lock_guard lock(m_mutex); + return m_modelsDir; +} + ErrorCode ModelRuntime::loadModel( const std::string &model, const std::string &variant, @@ -481,7 +511,7 @@ ErrorCode ModelRuntime::loadModel( bool anyMissing=false; for(const VariantDownload &file:allFiles) { - std::string filePath="/models/"+file.filename; + std::string filePath=m_modelsDir+file.filename; if(!std::filesystem::exists(filePath)&&!file.url.empty()) { anyMissing=true; @@ -555,7 +585,7 @@ ErrorCode ModelRuntime::loadModel( // Resolve backend priority: model config > architecture rule > server default std::vector effectiveBackendPriority=resolveBackendPriority(*modelInfo); - std::string filePath="/models/"+primaryFilename; + std::string filePath=m_modelsDir+primaryFilename; ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices, fit.maxContextSize, resolvedOptions, effectiveBackendPriority); if(loadResult!=ErrorCode::Success) @@ -660,7 +690,7 @@ ErrorCode ModelRuntime::downloadModel( bool anyMissing=false; for(const VariantDownload &file:allFiles) { - std::string filePath="/models/"+file.filename; + std::string filePath=m_modelsDir+file.filename; if(!std::filesystem::exists(filePath)&&!file.url.empty()) { anyMissing=true; @@ -752,7 +782,7 @@ void ModelRuntime::runBackgroundDownload( std::vector missingFiles; for(const VariantDownload &file:allFiles) { - std::string filePath="/models/"+file.filename; + std::string filePath=m_modelsDir+file.filename; if(!std::filesystem::exists(filePath)&&!file.url.empty()) { missingFiles.push_back(&file); @@ -773,7 +803,7 @@ void ModelRuntime::runBackgroundDownload( bool allDownloadsOk=true; for(const VariantDownload *file:missingFiles) { - std::string filePath="/models/"+file->filename; + std::string filePath=m_modelsDir+file->filename; bool downloadOk=downloadModelFile( file->url, filePath, @@ -812,7 +842,7 @@ void ModelRuntime::runBackgroundDownload( std::vector extraFiles; for(size_t i=0; ibackend==GpuBackend::CUDA) expectedPrefix="CUDA"; else if(hwGpu->backend==GpuBackend::Vulkan) expectedPrefix="Vulkan"; - for(const GgmlGpuDev &ggmlDev:ggmlGpus) + // Try matching with backend prefix first, then without (fallback). + // The HW detector may report a GPU as CUDA while ggml only has + // Vulkan backends available (or vice versa). + std::vector prefixesToTry={expectedPrefix, ""}; + + for(const std::string &prefix:prefixesToTry) { - // Check backend match first - if(!expectedPrefix.empty()&&ggmlDev.name.find(expectedPrefix)==std::string::npos) - continue; + if(bestMatch) break; + + for(const GgmlGpuDev &ggmlDev:ggmlGpus) + { + // Check backend match first + if(!prefix.empty()&&ggmlDev.name.find(prefix)==std::string::npos) + continue; // Check if HW GPU name appears in ggml description // HW name: "AMD Instinct MI50/MI60 (RADV VEGA20)" @@ -1847,13 +1886,23 @@ ErrorCode ModelRuntime::loadLlamaModel( bestMatchName=ggmlDev.name; break; } + } } if(bestMatch) { targetDevices.push_back(bestMatch); - spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}'", - idx, hwGpu->name, bestMatchName, model); + bool backendFallback=!expectedPrefix.empty()&&bestMatchName.find(expectedPrefix)==std::string::npos; + if(backendFallback) + { + spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}' (backend fallback: {} not available)", + idx, hwGpu->name, bestMatchName, model, expectedPrefix); + } + else + { + spdlog::info("Targeting GPU hw[{}] '{}': ggml device '{}' for model '{}'", + idx, hwGpu->name, bestMatchName, model); + } } else { @@ -1895,12 +1944,24 @@ ErrorCode ModelRuntime::loadLlamaModel( // Resolve actual context to allocate: // contextSize > 0 → user/config requested explicit size - // contextSize == 0 → use model's native training context + // contextSize == 0 → auto-select the largest context that fits in + // available VRAM, capped by the model's native + // training context // In both cases, cap by the hardware-fit maximum. int actualContext=contextSize; if(actualContext<=0) { - actualContext=nativeContext; + // Auto-select: use hardware maximum, but don't exceed native context + if(maxHardwareContext>0) + { + actualContext=std::min(maxHardwareContext, nativeContext); + spdlog::info("Auto-selecting context size {} (hardware max={}, native={}) for model '{}'", + actualContext, maxHardwareContext, nativeContext, model); + } + else + { + actualContext=nativeContext; + } } if(maxHardwareContext>0&&actualContext>maxHardwareContext) { diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h index d2abd14..b238851 100644 --- a/src/arbiterAI/modelRuntime.h +++ b/src/arbiterAI/modelRuntime.h @@ -119,6 +119,12 @@ class ModelRuntime { const std::string &model, const std::string &variant=""); + /// Set the base directory for model files (default: "/models"). + void setModelsDir(const std::string &dir); + + /// Get the current models directory. + std::string getModelsDir() const; + /// Set the maximum number of concurrent model downloads (default: 2). void setMaxConcurrentDownloads(int max); @@ -280,6 +286,7 @@ class ModelRuntime { std::map m_models; mutable std::mutex m_mutex; + std::string m_modelsDir="/models/"; int m_readyRamBudgetMb=0; std::vector m_defaultBackendPriority; std::set m_activeInference; // models currently running inference diff --git a/src/server/dashboardConfig.h b/src/server/dashboardConfig.h index af4f079..6eb5dc8 100644 --- a/src/server/dashboardConfig.h +++ b/src/server/dashboardConfig.h @@ -1248,7 +1248,7 @@ function renderEffectiveStartup() let html='Next restart will load: '; const parts=active.map(e=>{ const label=formatStartupModelLabel(e.model, e.variant); - const ctx=e.context_size>0?formatContextSize(e.context_size):'default'; + const ctx=e.context_size>0?formatContextSize(e.context_size):'auto'; const devs=e.devices&&e.devices.length>0?'GPU '+e.devices.join(','):'auto'; return ''+escapeHtml(label)+' (ctx: '+escapeHtml(ctx)+', devices: '+escapeHtml(devs)+')'; }); @@ -1489,13 +1489,94 @@ function updateStartupModelDevice(index, gpuIndex, checked) { entry.devices=entry.devices.filter(d=>d!==gpuIndex); } + updateStartupModelSliderGradient(index); } function updateStartupModelContext(index, value) { - startupModelsState[index].context_size=parseInt(value, 10)||0; + const parsed=parseInt(value, 10)||0; + startupModelsState[index].context_size=parsed; const label=document.getElementById('smCtxLabel_'+index); - if(label) label.textContent=formatContextSize(parseInt(value, 10)); + if(label) label.textContent=parsed===0?'Auto':formatContextSize(parsed); + const autoInfo=document.getElementById('smCtxAutoInfo_'+index); + if(autoInfo) autoInfo.style.display=parsed===0?'block':'none'; + updateStartupModelSliderGradient(index); +} + +function getSelectedDevicesVram(index) +{ + const entry=startupModelsState[index]; + if(!entry||!entry.devices||entry.devices.length===0) + { + // No devices selected: sum all GPU VRAM + let total=0; + for(const gpu of availableGpus) total+=gpu.vram_total_mb||0; + return total; + } + let total=0; + for(const gpuIdx of entry.devices) + { + const gpu=availableGpus.find(g=>g.index===gpuIdx); + if(gpu) total+=gpu.vram_total_mb||0; + } + return total; +} + +function updateStartupModelSliderGradient(index) +{ + const slider=document.getElementById('smCtxSlider_'+index); + if(!slider) return; + const entry=startupModelsState[index]; + const modelOpt=availableModelOptions.find(o=>o.model===entry.model&&o.variant===entry.variant); + + if(!modelOpt||!modelOpt.memory_per_1k_context_mb||modelOpt.memory_per_1k_context_mb<=0) + { + slider.style.background='#32384b'; + return; + } + + const min=parseInt(slider.min); + const max=parseInt(slider.max); + const range=max-min; + if(range<=0){ slider.style.background='#32384b'; return; } + + const baseMemory=modelOpt.base_memory_mb||0; + const baseContext=modelOpt.base_context_size||0; + const memPer1k=modelOpt.memory_per_1k_context_mb; + const availableMemory=getSelectedDevicesVram(index); + if(availableMemory<=0){ slider.style.background='#32384b'; return; } + + const likelyCtx=baseContext+((0.85*availableMemory-baseMemory)/memPer1k)*1024; + const tightCtx=baseContext+((availableMemory-baseMemory)/memPer1k)*1024; + + // Hard max: clamp slider to the tight max + const hardMaxCtx=Math.floor(tightCtx/1024)*1024; + if(hardMaxCtx>0&&hardMaxCtxparseInt(slider.max)) + { + slider.value=slider.max; + updateStartupModelContext(index, slider.value); + return; + } + } + + const likelyPct=Math.max(0, Math.min(100, ((likelyCtx-min)/range)*100)); + const tightPct=Math.max(0, Math.min(100, ((tightCtx-min)/range)*100)); + + if(likelyPct>=100) + { + slider.style.background='linear-gradient(to right, rgba(76,175,80,0.35) 0%, rgba(76,175,80,0.35) 100%)'; + } + else if(tightPct<=0) + { + slider.style.background='linear-gradient(to right, rgba(255,96,96,0.35) 0%, rgba(255,96,96,0.35) 100%)'; + } + else + { + slider.style.background='linear-gradient(to right, rgba(76,175,80,0.35) 0%, rgba(76,175,80,0.35) '+likelyPct+'%, rgba(240,192,64,0.35) '+likelyPct+'%, rgba(240,192,64,0.35) '+tightPct+'%, rgba(255,96,96,0.35) '+tightPct+'%, rgba(255,96,96,0.35) 100%)'; + } } function readStartupModelRuntimeOpts(index) @@ -1551,8 +1632,9 @@ function renderStartupModels() // Context slider const modelOpt=availableModelOptions.find(o=>o.model===entry.model&&o.variant===entry.variant); const maxCtx=modelOpt?modelOpt.max_context_size||131072:131072; - const ctxVal=entry.context_size>0?Math.min(entry.context_size, maxCtx):4096; - const ctxLabel=formatContextSize(ctxVal); + const ctxVal=entry.context_size>0?Math.min(entry.context_size, maxCtx):0; + const ctxLabel=ctxVal===0?'Auto':formatContextSize(ctxVal); + const isAutoCtx=ctxVal===0; // Device checkboxes let devicesHtml=''; @@ -1584,8 +1666,11 @@ function renderStartupModels() +'' +'
' +'' - +'' + +'
' + +'\u2139\uFE0F Auto: The server will select the largest context size that fits in the available VRAM of the assigned device(s).' + +'
' +'
' +'
' +'' @@ -1608,6 +1693,12 @@ function renderStartupModels() } el.innerHTML=html; + + // Apply VRAM color gradients after DOM is updated + for(let i=0; i0) { storage.setStorageLimit(storageLimitBytes); diff --git a/src/server/routes.cpp b/src/server/routes.cpp index 7fd8dc2..f2e5880 100644 --- a/src/server/routes.cpp +++ b/src/server/routes.cpp @@ -1471,38 +1471,36 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) void handleListModelsV1(const httplib::Request &, httplib::Response &res) { - std::vector modelNames; - ArbiterAI::instance().getAvailableModels(modelNames); + // Return only currently loaded models (OpenAI-compatible: models ready for inference) + std::vector states=ModelRuntime::instance().getModelStates(); auto created=static_cast(std::time(nullptr)); nlohmann::json data=nlohmann::json::array(); - for(const std::string &name:modelNames) + for(const LoadedModel &m:states) { - // Always emit the bare model name + if(m.state!=ModelState::Loaded) + continue; + + // Emit bare model name data.push_back({ - {"id", name}, + {"id", m.modelName}, {"object", "model"}, {"created", created}, {"owned_by", "arbiterai"}, {"permission", nlohmann::json::array()} }); - // For models with variants, also emit "model:variant" entries - ModelInfo info; - if(ArbiterAI::instance().getModelInfo(name, info)==ErrorCode::Success - &&!info.variants.empty()) + // Also emit "model:variant" if a variant is loaded + if(!m.variant.empty()) { - for(const ModelVariant &v:info.variants) - { - data.push_back({ - {"id", name+":"+v.quantization}, - {"object", "model"}, - {"created", created}, - {"owned_by", "arbiterai"}, - {"permission", nlohmann::json::array()} - }); - } + data.push_back({ + {"id", m.modelName+":"+m.variant}, + {"object", "model"}, + {"created", created}, + {"owned_by", "arbiterai"}, + {"permission", nlohmann::json::array()} + }); } }