diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index ca54224..bffdcf5 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -21,11 +21,12 @@ A C++17 library providing a unified interface for multiple LLM providers. 1. **All commands** must go through `./runDocker.sh ...`. 2. **All development** (building, testing, running) must be done inside the Docker container. The host environment is not guaranteed to have the correct tools or dependencies. -3. **Do not** use `python`, `pip`, `pytest` — the host may not have the correct Python version or dependencies. -4. **Do not** create or use a virtualenv on the host. The container is the virtualenv. -5. The project source is **bind-mounted** at `/app` inside the container. Edits to files on the host are immediately visible inside the container. -6. If you change the `Dockerfile`, run `./runDocker.sh --rebuild`. -7. Don't launch the server, ask the user to launch so that its not running in the agents terminal. +3. **Do not** run commands in the terminal with `2>&1`, because the user then cannot verify whether the command is running. +4. **Do not** use `python`, `pip`, `pytest` — the host may not have the correct Python version or dependencies. +5. **Do not** create or use a virtualenv on the host. The container is the virtualenv. +6. The project source is **bind-mounted** at `/app` inside the container. Edits to files on the host are immediately visible inside the container. +7. If you change the `Dockerfile`, run `./runDocker.sh --rebuild`. +8. Don't launch the server; ask the user to launch it so that it's not running in the agent's terminal. 
## Active Tasks diff --git a/.gitignore b/.gitignore index 004da88..99f7b81 100644 --- a/.gitignore +++ b/.gitignore @@ -35,9 +35,6 @@ build/ vcpkg_installed/ -# Cloned config repository -arbiterAI_config/ - # Generated at build time by CMake **/generated/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a60ce16 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "arbiterAI_config"] + path = arbiterAI_config + url = https://github.com/caseymcc/arbiterAI_config.git \ No newline at end of file diff --git a/arbiterAI_config b/arbiterAI_config new file mode 160000 index 0000000..cffe407 --- /dev/null +++ b/arbiterAI_config @@ -0,0 +1 @@ +Subproject commit cffe4077ba15986e26035d2e69ad8341f6dcc83d diff --git a/docker/Dockerfile b/docker/Dockerfile index 1e72eaf..079cfad 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,11 +1,12 @@ # syntax=docker/dockerfile:1 -ARG DOCKER_VERSION=1.2.0 +ARG DOCKER_VERSION=1.2.1 FROM ubuntu:24.04 # Install basic build tools, Python 3, and GPU libraries. # Vulkan headers + glslc are needed at build time for llama.cpp's Vulkan backend. -# At runtime, GPU inference requires a Vulkan ICD on the host; without one -# llama.cpp falls back to CPU-only. +# ROCm HIP SDK is needed at build time for llama.cpp's HIP/ROCm backend. +# At runtime, GPU inference requires a Vulkan ICD or ROCm driver on the host; +# without one llama.cpp falls back to CPU-only. 
RUN apt-get update && apt-get install -y \ build-essential \ cmake \ @@ -34,8 +35,25 @@ RUN apt-get update && apt-get install -y \ mesa-vulkan-drivers \ glslc \ glslang-tools \ + wget \ && rm -rf /var/lib/apt/lists/* +# Install ROCm HIP SDK (build-time only — no kernel driver needed in container) +# Use hiplibsdk usecase to get HIP development libraries and CMake configs +# (the plain "hip" usecase only installs runtime, missing hip-lang-config.cmake) +RUN wget -q https://repo.radeon.com/amdgpu-install/7.2.1/ubuntu/noble/amdgpu-install_7.2.1.70201-1_all.deb && \ + apt-get update && \ + apt-get install -y ./amdgpu-install_7.2.1.70201-1_all.deb && \ + apt-get update && \ + amdgpu-install -y --usecase=hiplibsdk --no-dkms && \ + rm -f amdgpu-install_7.2.1.70201-1_all.deb && \ + rm -rf /var/lib/apt/lists/* + +ENV ROCM_PATH=/opt/rocm +ENV HIP_PATH=/opt/rocm +ENV PATH="${ROCM_PATH}/bin:${PATH}" +ENV CMAKE_PREFIX_PATH="${ROCM_PATH}:${CMAKE_PREFIX_PATH}" + RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ apt-get install -y nodejs && \ rm -rf /var/lib/apt/lists/* diff --git a/docs/server.md b/docs/server.md index 84b655c..9c55fe6 100644 --- a/docs/server.md +++ b/docs/server.md @@ -77,7 +77,8 @@ All server settings are defined in a JSON configuration file. See [`examples/ser "hardware": { "vram_overrides": { "0": 32000 - } + }, + "default_backend_priority": ["vulkan"] }, "logging": { "level": "info", @@ -116,6 +117,7 @@ All server settings are defined in a JSON configuration file. See [`examples/ser | Field | Type | Default | Description | |-------|------|---------|-------------| | `vram_overrides` | `object` | `{}` | GPU index → VRAM MB overrides (e.g., `{"0": 32000}`) | +| `default_backend_priority` | `string[]` | `[]` | Default GPU backend preference for models without their own `backend_priority` (e.g., `["vulkan"]`). Empty = all backends. 
| **`logging` object:** diff --git a/examples/server_config.json b/examples/server_config.json index ef8c883..35ea631 100644 --- a/examples/server_config.json +++ b/examples/server_config.json @@ -24,7 +24,8 @@ }, "hardware": { - "vram_overrides": {} + "vram_overrides": {}, + "default_backend_priority": [] }, "logging": { diff --git a/schemas/model_config.schema.json b/schemas/model_config.schema.json index d112f68..537777b 100644 --- a/schemas/model_config.schema.json +++ b/schemas/model_config.schema.json @@ -247,6 +247,58 @@ } } } + }, + "runtime_options": { + "type": "object", + "description": "Runtime options for llama.cpp model loading and inference. Applied as defaults; can be overridden at load time via the API.", + "properties": { + "flash_attn": { + "type": "boolean", + "description": "Enable or disable flash attention (-fa)" + }, + "kv_cache_type_k": { + "type": "string", + "description": "KV cache data type for keys (-ctk)", + "enum": ["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"] + }, + "kv_cache_type_v": { + "type": "string", + "description": "KV cache data type for values (-ctv)", + "enum": ["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"] + }, + "no_mmap": { + "type": "boolean", + "description": "Disable memory-mapped file I/O (--no-mmap)" + }, + "reasoning_budget": { + "type": "integer", + "description": "Reasoning token budget (--reasoning-budget). 0 disables reasoning tokens.", + "minimum": 0 + }, + "swa_full": { + "type": "boolean", + "description": "Use full-size sliding window attention cache (--swa-full)" + }, + "n_gpu_layers": { + "type": "integer", + "description": "Number of layers to offload to GPU (-ngl). 99 offloads all.", + "minimum": 0 + }, + "override_tensor": { + "type": "string", + "description": "Tensor override pattern (-ot) for routing tensors to CPU/GPU" + } + }, + "additionalProperties": false + }, + "backend_priority": { + "type": "array", + "description": "Ordered preference for GPU backends. 
First available backend is used.", + "items": { + "type": "string", + "enum": ["vulkan", "rocm", "cuda"] + }, + "uniqueItems": true } } } diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp index 9d8b013..892b93c 100644 --- a/src/arbiterAI/arbiterAI.cpp +++ b/src/arbiterAI/arbiterAI.cpp @@ -532,9 +532,15 @@ ErrorCode ArbiterAI::getAvailableModels(std::vector& models) // ========== Local Model Management ========== -ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize) +ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &variant, int contextSize, + const RuntimeOptions *optionsOverride) { - return ModelRuntime::instance().loadModel(model, variant, contextSize); + RuntimeOptions opts; + if(optionsOverride) + { + opts=*optionsOverride; + } + return ModelRuntime::instance().loadModel(model, variant, contextSize, opts); } ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant) diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h index 7f817ea..0f8d540 100644 --- a/src/arbiterAI/arbiterAI.h +++ b/src/arbiterAI/arbiterAI.h @@ -34,6 +34,7 @@ struct ModelFit; struct LoadedModel; struct SystemSnapshot; struct InferenceStats; +struct RuntimeOptions; /** * @struct VersionInfo @@ -241,6 +242,42 @@ inline void from_json(const nlohmann::json &j, ToolCall &t) * - "assistant": may include tool_calls when the model invokes tools * - "tool": includes tool_call_id linking the result back to a specific tool call */ + +/// Extract text from an OpenAI `content` field. +/// The spec allows content as either a plain string or an array of content +/// parts (e.g. [{"type":"text","text":"..."},{"type":"image_url",...}]). +/// This helper concatenates all "text" parts and ignores non-text entries. 
+inline std::string contentToString(const nlohmann::json &contentJson) +{ + if(contentJson.is_string()) + return contentJson.get(); + + if(contentJson.is_array()) + { + std::string result; + for(const nlohmann::json &part:contentJson) + { + if(part.is_string()) + { + if(!result.empty()) result+=' '; + result+=part.get(); + } + else if(part.is_object() + && part.contains("type") + && part.at("type").get()=="text" + && part.contains("text")) + { + if(!result.empty()) result+=' '; + result+=part.at("text").get(); + } + // Skip non-text parts (image_url, etc.) + } + return result; + } + + return {}; +} + struct Message { std::string role; @@ -262,7 +299,7 @@ inline void from_json(const nlohmann::json &j, Message &m) { j.at("role").get_to(m.role); if(j.contains("content") && !j.at("content").is_null()) - j.at("content").get_to(m.content); + m.content=contentToString(j.at("content")); if(j.contains("tool_call_id")) m.toolCallId=j.at("tool_call_id").get(); if(j.contains("tool_calls")) @@ -605,9 +642,11 @@ class ArbiterAI * @param model Model name * @param variant Quantization variant (empty = auto-select) * @param contextSize Context size (0 = model default) + * @param optionsOverride Optional runtime options to merge on top of model config defaults (nullptr = use config defaults) * @return ErrorCode indicating success, ModelDownloading, or failure */ - ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0); + ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0, + const RuntimeOptions *optionsOverride=nullptr); /** * @brief Download model files without loading into VRAM diff --git a/src/arbiterAI/configDownloader.cpp b/src/arbiterAI/configDownloader.cpp index 7571dac..d4d8198 100644 --- a/src/arbiterAI/configDownloader.cpp +++ b/src/arbiterAI/configDownloader.cpp @@ -195,20 +195,24 @@ ConfigDownloadStatus ConfigDownloader::checkoutVersion() spdlog::info("Checking out version: {}", 
m_version); - // Try to resolve the version as a direct ref, remote branch, or tag + // Try to resolve the version — prefer remote branch refs first so that + // a fetch+checkout always picks up the latest remote commit rather than + // a stale local branch ref that was never fast-forwarded. git_object *obj=nullptr; - error=git_revparse_single(&obj, repo, m_version.c_str()); + + // 1. Try as a remote-tracking branch (most common path after fetch) + std::string remoteBranch="refs/remotes/origin/"+m_version; + error=git_revparse_single(&obj, repo, remoteBranch.c_str()); if(error!=0) { - // Try as a remote branch - std::string remoteBranch="refs/remotes/origin/"+m_version; - error=git_revparse_single(&obj, repo, remoteBranch.c_str()); + // 2. Try as a direct ref / local branch / SHA + error=git_revparse_single(&obj, repo, m_version.c_str()); } if(error!=0) { - // Try as a tag + // 3. Try as a tag std::string tag="refs/tags/"+m_version; error=git_revparse_single(&obj, repo, tag.c_str()); } diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp index 1523cd2..d80cbe0 100644 --- a/src/arbiterAI/modelManager.cpp +++ b/src/arbiterAI/modelManager.cpp @@ -62,6 +62,18 @@ bool ModelVariant::isSplit() const return files.size()>1; } +void RuntimeOptions::mergeFrom(const RuntimeOptions &other) +{ + if(other.flashAttn.has_value()) flashAttn=other.flashAttn; + if(other.kvCacheTypeK.has_value()) kvCacheTypeK=other.kvCacheTypeK; + if(other.kvCacheTypeV.has_value()) kvCacheTypeV=other.kvCacheTypeV; + if(other.noMmap.has_value()) noMmap=other.noMmap; + if(other.reasoningBudget.has_value()) reasoningBudget=other.reasoningBudget; + if(other.swaFull.has_value()) swaFull=other.swaFull; + if(other.nGpuLayers.has_value()) nGpuLayers=other.nGpuLayers; + if(other.overrideTensor.has_value()) overrideTensor=other.overrideTensor; +} + ModelManager &ModelManager::instance() { static ModelManager instance; @@ -104,6 +116,13 @@ bool ModelManager::initialize(const std::vector 
&configPa } } } + + // Load GPU backend rules from the config repo + auto backendsPath=m_configDownloader.getLocalPath()/"configs"/"defaults"/"backends"/"gpu_backends.json"; + if(std::filesystem::exists(backendsPath)) + { + loadGpuBackendRules(backendsPath); + } } else { @@ -350,6 +369,53 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in } } + // Runtime options (llama.cpp model load/inference parameters) + if(modelJson.contains("runtime_options")&&modelJson["runtime_options"].is_object()) + { + auto &ro=modelJson["runtime_options"]; + + if(ro.contains("flash_attn")&&ro["flash_attn"].is_boolean()) + info.runtimeOptions.flashAttn=ro["flash_attn"].get(); + if(ro.contains("kv_cache_type_k")&&ro["kv_cache_type_k"].is_string()) + info.runtimeOptions.kvCacheTypeK=ro["kv_cache_type_k"].get(); + if(ro.contains("kv_cache_type_v")&&ro["kv_cache_type_v"].is_string()) + info.runtimeOptions.kvCacheTypeV=ro["kv_cache_type_v"].get(); + if(ro.contains("no_mmap")&&ro["no_mmap"].is_boolean()) + info.runtimeOptions.noMmap=ro["no_mmap"].get(); + if(ro.contains("reasoning_budget")&&ro["reasoning_budget"].is_number_integer()) + info.runtimeOptions.reasoningBudget=ro["reasoning_budget"].get(); + if(ro.contains("swa_full")&&ro["swa_full"].is_boolean()) + info.runtimeOptions.swaFull=ro["swa_full"].get(); + if(ro.contains("n_gpu_layers")&&ro["n_gpu_layers"].is_number_integer()) + info.runtimeOptions.nGpuLayers=ro["n_gpu_layers"].get(); + if(ro.contains("override_tensor")&&ro["override_tensor"].is_string()) + info.runtimeOptions.overrideTensor=ro["override_tensor"].get(); + } + + // Backend priority (ordered preference for GPU compute backends) + if(modelJson.contains("backend_priority")&&modelJson["backend_priority"].is_array()) + { + for(const auto &bp:modelJson["backend_priority"]) + { + if(bp.is_string()) + { + info.backendPriority.push_back(bp.get()); + } + } + } + + // Disabled backends (model-level override to exclude specific backends) + 
if(modelJson.contains("disabled_backends")&&modelJson["disabled_backends"].is_array()) + { + for(const auto &db:modelJson["disabled_backends"]) + { + if(db.is_string()) + { + info.disabledBackends.push_back(db.get()); + } + } + } + return true; } @@ -632,6 +698,7 @@ bool ModelManager::addModelFromJson(const nlohmann::json &modelJson, std::string m_models.push_back(info); m_modelProviderMap[info.model]=info.provider; m_runtimeModels.insert(info.model); + saveInjectedConfig(info.model); return true; } @@ -683,6 +750,7 @@ bool ModelManager::updateModelFromJson(const nlohmann::json &modelJson, std::str m_modelProviderMap[info.model]=info.provider; m_runtimeModels.insert(info.model); + saveInjectedConfig(info.model); return true; } @@ -696,10 +764,129 @@ bool ModelManager::removeModel(const std::string &modelName) m_models.erase(it); m_modelProviderMap.erase(modelName); - m_runtimeModels.erase(modelName); + + if(m_runtimeModels.count(modelName)) + { + m_runtimeModels.erase(modelName); + removeInjectedConfig(modelName); + } + return true; } +bool ModelManager::loadGpuBackendRules(const std::filesystem::path &filePath) +{ + spdlog::info("Loading GPU backend rules from: {}", filePath.string()); + + try + { + std::ifstream file(filePath); + + if(!file.is_open()) + { + spdlog::warn("Cannot open GPU backend rules file: {}", filePath.string()); + return false; + } + + nlohmann::json j=nlohmann::json::parse(file, nullptr, true, true); + + if(!j.contains("gpu_backends")||!j["gpu_backends"].is_array()) + { + spdlog::warn("GPU backend rules file missing 'gpu_backends' array"); + return false; + } + + m_gpuBackendRules.clear(); + + for(const nlohmann::json &entry:j["gpu_backends"]) + { + GpuBackendRule rule; + rule.name=entry.value("name", ""); + + if(entry.contains("match")&&entry["match"].is_array()) + { + for(const nlohmann::json &m:entry["match"]) + { + rule.match.push_back(m.get()); + } + } + + if(entry.contains("disabled_backends")&&entry["disabled_backends"].is_array()) + { 
+ for(const nlohmann::json &d:entry["disabled_backends"]) + { + rule.disabledBackends.push_back(d.get()); + } + } + + if(entry.contains("backend_priority")&&entry["backend_priority"].is_array()) + { + for(const nlohmann::json &bp:entry["backend_priority"]) + { + rule.backendPriority.push_back(bp.get()); + } + } + + rule.notes=entry.value("notes", ""); + + if(!rule.match.empty()) + { + spdlog::info(" GPU backend rule '{}': match=[{}], priority=[{}], disabled=[{}]", + rule.name, + [&]() + { + std::string s; + for(const std::string &m:rule.match) { if(!s.empty()) s+=", "; s+=m; } + return s; + }(), + [&]() + { + std::string s; + for(const std::string &p:rule.backendPriority) { if(!s.empty()) s+=", "; s+=p; } + return s; + }(), + [&]() + { + std::string s; + for(const std::string &d:rule.disabledBackends) { if(!s.empty()) s+=", "; s+=d; } + return s; + }()); + m_gpuBackendRules.push_back(std::move(rule)); + } + } + + spdlog::info("Loaded {} GPU backend rules", m_gpuBackendRules.size()); + return true; + } + catch(const std::exception &e) + { + spdlog::warn("Failed to parse GPU backend rules: {}", e.what()); + return false; + } +} + +std::optional ModelManager::findGpuBackendRule(const std::string &gpuName) const +{ + std::string gpuLower=gpuName; + std::transform(gpuLower.begin(), gpuLower.end(), gpuLower.begin(), ::tolower); + + for(const GpuBackendRule &rule:m_gpuBackendRules) + { + for(const std::string &pattern:rule.match) + { + std::string patternLower=pattern; + std::transform(patternLower.begin(), patternLower.end(), patternLower.begin(), ::tolower); + + if(gpuLower.find(patternLower)!=std::string::npos) + { + return rule; + } + } + } + + return std::nullopt; +} + nlohmann::json ModelManager::modelInfoToJson(const ModelInfo &info) { nlohmann::json j; @@ -802,6 +989,41 @@ nlohmann::json ModelManager::modelInfoToJson(const ModelInfo &info) j["variants"]=variants; } + // Runtime options + { + nlohmann::json ro; + if(info.runtimeOptions.flashAttn.has_value()) + 
ro["flash_attn"]=info.runtimeOptions.flashAttn.value(); + if(info.runtimeOptions.kvCacheTypeK.has_value()) + ro["kv_cache_type_k"]=info.runtimeOptions.kvCacheTypeK.value(); + if(info.runtimeOptions.kvCacheTypeV.has_value()) + ro["kv_cache_type_v"]=info.runtimeOptions.kvCacheTypeV.value(); + if(info.runtimeOptions.noMmap.has_value()) + ro["no_mmap"]=info.runtimeOptions.noMmap.value(); + if(info.runtimeOptions.reasoningBudget.has_value()) + ro["reasoning_budget"]=info.runtimeOptions.reasoningBudget.value(); + if(info.runtimeOptions.swaFull.has_value()) + ro["swa_full"]=info.runtimeOptions.swaFull.value(); + if(info.runtimeOptions.nGpuLayers.has_value()) + ro["n_gpu_layers"]=info.runtimeOptions.nGpuLayers.value(); + if(info.runtimeOptions.overrideTensor.has_value()) + ro["override_tensor"]=info.runtimeOptions.overrideTensor.value(); + if(!ro.empty()) + j["runtime_options"]=ro; + } + + // Backend priority + if(!info.backendPriority.empty()) + { + j["backend_priority"]=info.backendPriority; + } + + // Disabled backends + if(!info.disabledBackends.empty()) + { + j["disabled_backends"]=info.disabledBackends; + } + return j; } @@ -849,4 +1071,177 @@ bool ModelManager::saveOverrides(const std::filesystem::path &overridePath) cons return true; } +std::string ModelManager::sanitizeFilename(const std::string &name) +{ + std::string result; + result.reserve(name.size()); + + for(char c:name) + { + if(std::isalnum(static_cast(c))||c=='-'||c=='_'||c=='.') + { + result+=c; + } + else + { + result+='_'; + } + } + + return result; +} + +void ModelManager::setInjectedConfigDir(const std::filesystem::path &dir) +{ + m_injectedConfigDir=dir; + + if(!dir.empty()) + { + std::error_code ec; + std::filesystem::create_directories(dir, ec); + + if(ec) + { + spdlog::error("Failed to create injected config directory '{}': {}", dir.string(), ec.message()); + } + else + { + spdlog::info("Injected model configs will be persisted to: {}", dir.string()); + } + } +} + +int 
ModelManager::loadInjectedConfigs() +{ + if(m_injectedConfigDir.empty()||!std::filesystem::exists(m_injectedConfigDir)) + { + return 0; + } + + int loaded=0; + + for(const auto &entry:std::filesystem::directory_iterator(m_injectedConfigDir)) + { + if(entry.path().extension()!=".json") + continue; + + try + { + std::ifstream file(entry.path()); + if(!file.is_open()) + { + spdlog::warn("Failed to open injected config: {}", entry.path().string()); + continue; + } + + nlohmann::json j=nlohmann::json::parse(file); + + if(!j.contains("model")||!j["model"].is_string()) + { + spdlog::warn("Injected config missing 'model' field: {}", entry.path().string()); + continue; + } + + std::string modelName=j["model"].get(); + + // Skip if a model with this name already exists (repo configs take precedence) + auto existing=std::find_if(m_models.begin(), m_models.end(), + [&modelName](const ModelInfo &info) { return info.model==modelName; }); + + if(existing!=m_models.end()) + { + spdlog::debug("Skipping injected config for '{}' — already loaded from config repo", modelName); + continue; + } + + std::string error; + if(addModelFromJson(j, error)) + { + spdlog::info("Restored injected model config: {}", modelName); + ++loaded; + } + else + { + spdlog::warn("Failed to restore injected config '{}': {}", modelName, error); + } + } + catch(const nlohmann::json::parse_error &e) + { + spdlog::warn("Failed to parse injected config '{}': {}", entry.path().string(), e.what()); + } + } + + if(loaded>0) + { + spdlog::info("Restored {} injected model config(s) from {}", loaded, m_injectedConfigDir.string()); + } + + return loaded; +} + +bool ModelManager::saveInjectedConfig(const std::string &modelName) const +{ + if(m_injectedConfigDir.empty()) + return true; // no persistence configured — not an error + + // Find the model info + auto it=std::find_if(m_models.begin(), m_models.end(), + [&modelName](const ModelInfo &info) { return info.model==modelName; }); + + if(it==m_models.end()) + return 
false; + + nlohmann::json j=modelInfoToJson(*it); + + std::string filename=sanitizeFilename(modelName)+".json"; + std::filesystem::path filePath=m_injectedConfigDir/filename; + std::filesystem::path tempPath=filePath.string()+".tmp"; + + std::ofstream file(tempPath); + if(!file.is_open()) + { + spdlog::error("Failed to write injected config for '{}': cannot open {}", modelName, tempPath.string()); + return false; + } + + file< {}", modelName, filePath.string()); + return true; +} + +bool ModelManager::removeInjectedConfig(const std::string &modelName) const +{ + if(m_injectedConfigDir.empty()) + return true; + + std::string filename=sanitizeFilename(modelName)+".json"; + std::filesystem::path filePath=m_injectedConfigDir/filename; + + std::error_code ec; + if(std::filesystem::exists(filePath, ec)) + { + std::filesystem::remove(filePath, ec); + if(ec) + { + spdlog::error("Failed to remove injected config for '{}': {}", modelName, ec.message()); + return false; + } + spdlog::info("Removed injected model config: {}", filePath.string()); + } + + return true; +} + } // namespace arbiterAI diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h index 703cb0f..f7ef48b 100644 --- a/src/arbiterAI/modelManager.h +++ b/src/arbiterAI/modelManager.h @@ -40,6 +40,22 @@ struct VariantDownload { std::string filename; }; +/// Runtime options that control llama.cpp model loading and inference behavior. +/// These can be set per-model in the config and overridden at load time via the API. +struct RuntimeOptions { + std::optional flashAttn; // -fa: enable/disable flash attention + std::optional kvCacheTypeK; // -ctk: KV cache type for keys (e.g. 
"f16", "q8_0", "q4_0") + std::optional kvCacheTypeV; // -ctv: KV cache type for values + std::optional noMmap; // --no-mmap: disable memory mapping + std::optional reasoningBudget; // --reasoning-budget: reasoning token budget (0=disabled) + std::optional swaFull; // --swa-full: full SWA (sliding window attention) + std::optional nGpuLayers; // -ngl: number of GPU layers (99=all) + std::optional overrideTensor; // -ot: tensor override pattern (e.g. "per_layer_token_embd.weight=CPU") + + /// Merge another set of options on top of this one (override only non-empty fields). + void mergeFrom(const RuntimeOptions &other); +}; + struct ModelVariant { std::string quantization; int fileSizeMb=0; @@ -103,11 +119,24 @@ struct ModelInfo std::optional hardwareRequirements; std::optional contextScaling; std::vector variants; + RuntimeOptions runtimeOptions; // Per-model llama.cpp runtime options + std::vector backendPriority; // Ordered preference: ["vulkan", "rocm", "cuda"] + std::vector disabledBackends; // Backends to exclude (model-level override) bool isCompatible(const std::string &clientVersion) const; bool isSchemaCompatible(const std::string &schemaVersion) const; }; +/// GPU architecture backend configuration entry. +/// Matched against detected GPU names to determine default backend behavior. +struct GpuBackendRule { + std::string name; // Human-readable name (e.g. "AMD RDNA 3.5 (Strix Point)") + std::vector match; // Case-insensitive substrings to match against GPU name + std::vector disabledBackends; // Backends to disable for this architecture + std::vector backendPriority; // Preferred backend order + std::string notes; // Human-readable notes +}; + class ModelManager { public: @@ -129,6 +158,24 @@ class ModelManager static nlohmann::json modelInfoToJson(const ModelInfo &info); bool saveOverrides(const std::filesystem::path &overridePath) const; + /// Set the directory where injected model configs are persisted as individual + /// JSON files. 
Each file is named after the model (sanitized). On add/update + /// the file is written; on delete the file is removed. Call loadInjectedConfigs() + /// after initialize() to restore previously injected models. + void setInjectedConfigDir(const std::filesystem::path &dir); + + /// Load all previously persisted injected model configs from the injected + /// config directory. Models that already exist (from the config repo) are + /// skipped — injected configs never shadow repo configs. + int loadInjectedConfigs(); + + /// Find the first GpuBackendRule whose match patterns hit the given GPU name. + /// Returns nullopt if no rule matches. + std::optional findGpuBackendRule(const std::string &gpuName) const; + + /// Get all loaded GPU backend rules (for diagnostics / API). + const std::vector &getGpuBackendRules() const { return m_gpuBackendRules; } + public: static int compareVersions(const std::string &v1, const std::string &v2); @@ -140,11 +187,24 @@ class ModelManager bool validateModelJson(const nlohmann::json &modelJson, std::string &error) const; void mergeModelInfo(ModelInfo &existing, const ModelInfo &source, const nlohmann::json &sourceJson) const; + /// Persist a single injected model config to the injected config directory. + bool saveInjectedConfig(const std::string &modelName) const; + + /// Remove a single injected model config file from the injected config directory. + bool removeInjectedConfig(const std::string &modelName) const; + + /// Sanitize a model name into a safe filename (alphanumeric, hyphens, underscores). 
+ static std::string sanitizeFilename(const std::string &name); + std::vector m_models; std::map m_modelProviderMap; std::set m_runtimeModels; + std::vector m_gpuBackendRules; ConfigDownloader m_configDownloader; + std::filesystem::path m_injectedConfigDir; bool m_initialized{ false }; + + bool loadGpuBackendRules(const std::filesystem::path &filePath); }; } // namespace arbiterAI diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp index f4f80dd..4b5df20 100644 --- a/src/arbiterAI/modelRuntime.cpp +++ b/src/arbiterAI/modelRuntime.cpp @@ -1,10 +1,12 @@ #include "arbiterAI/modelRuntime.h" #include "arbiterAI/hardwareDetector.h" +#include "arbiterAI/modelManager.h" #include "arbiterAI/telemetryCollector.h" #include "arbiterAI/storageManager.h" #include #include +#include #include #include #include @@ -14,6 +16,23 @@ namespace arbiterAI { +/// Map a string KV cache type name to the corresponding ggml_type enum value. +/// Returns GGML_TYPE_COUNT if the string is not recognized. 
+static ggml_type parseGgmlType(const std::string &name) +{ + if(name=="f32") return GGML_TYPE_F32; + if(name=="f16") return GGML_TYPE_F16; + if(name=="bf16") return GGML_TYPE_BF16; + if(name=="q8_0") return GGML_TYPE_Q8_0; + if(name=="q4_0") return GGML_TYPE_Q4_0; + if(name=="q4_1") return GGML_TYPE_Q4_1; + if(name=="q5_0") return GGML_TYPE_Q5_0; + if(name=="q5_1") return GGML_TYPE_Q5_1; + + spdlog::warn("Unknown ggml type '{}', ignoring", name); + return GGML_TYPE_COUNT; +} + ModelRuntime &ModelRuntime::instance() { static ModelRuntime runtime; @@ -151,6 +170,7 @@ const char *loadFailureReasonToString(LoadFailureReason reason) case LoadFailureReason::ContextTooLarge: return "context_too_large"; case LoadFailureReason::UnsupportedArch: return "unsupported_arch"; case LoadFailureReason::BackendError: return "backend_error"; + case LoadFailureReason::VulkanDeviceLost: return "vulkan_device_lost"; default: return "unknown"; } } @@ -197,6 +217,23 @@ LoadErrorDetail ModelRuntime::classifyLoadFailure( return detail; } + // Check for Vulkan device lost (GPU driver reset or hung pipeline) + if(logLower.find("errordevicelost")!=std::string::npos|| + logLower.find("error_device_lost")!=std::string::npos|| + logLower.find("device lost")!=std::string::npos|| + logLower.find("vk_error_device_lost")!=std::string::npos|| + logLower.find("vk::queue::submit")!=std::string::npos&&logLower.find("lost")!=std::string::npos) + { + detail.reason=LoadFailureReason::VulkanDeviceLost; + detail.summary="Vulkan device lost during model load — the GPU driver may have reset"; + detail.suggestion="The Vulkan backend will be reinitialized automatically. " + "If this persists, restart the server. 
Possible causes: GPU overheating, " + "driver bug, or another process crashing the GPU."; + detail.action="reinit_backend"; + detail.recoverable=true; + return detail; + } + // Check for VRAM / GPU memory allocation failures if(logLower.find("out of memory")!=std::string::npos|| logLower.find("cuda error")!=std::string::npos|| @@ -299,7 +336,8 @@ int ModelRuntime::getMaxConcurrentDownloads() const ErrorCode ModelRuntime::loadModel( const std::string &model, const std::string &variant, - int contextSize) + int contextSize, + const RuntimeOptions &optionsOverride) { std::lock_guard lock(m_mutex); @@ -471,9 +509,17 @@ ErrorCode ModelRuntime::loadModel( // Actually load llama.cpp model for local providers if(modelInfo->provider=="llama") { + // Resolve runtime options: model config defaults + API override + RuntimeOptions resolvedOptions=modelInfo->runtimeOptions; + resolvedOptions.mergeFrom(optionsOverride); + entry.activeOptions=resolvedOptions; + + // Resolve backend priority: model config > architecture rule > server default + std::vector effectiveBackendPriority=resolveBackendPriority(*modelInfo); + std::string filePath="/models/"+primaryFilename; ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices, - fit.maxContextSize); + fit.maxContextSize, resolvedOptions, effectiveBackendPriority); if(loadResult!=ErrorCode::Success) { m_models.erase(model); @@ -831,7 +877,8 @@ ErrorCode ModelRuntime::unpinModel(const std::string &model) ErrorCode ModelRuntime::swapModel( const std::string &newModel, const std::string &variant, - int contextSize) + int contextSize, + const RuntimeOptions &optionsOverride) { if(m_inferenceActive) { @@ -841,6 +888,7 @@ ErrorCode ModelRuntime::swapModel( req.model=newModel; req.variant=variant; req.contextSize=contextSize; + req.optionsOverride=optionsOverride; m_pendingSwaps.push(req); spdlog::info("Swap to '{}' queued (inference active)", newModel); return ErrorCode::ModelDownloading; // "queued" status @@ 
-879,7 +927,7 @@ ErrorCode ModelRuntime::swapModel( } } - ErrorCode result=loadModel(newModel, variant, contextSize); + ErrorCode result=loadModel(newModel, variant, contextSize, optionsOverride); // Record swap telemetry std::chrono::steady_clock::time_point swapEnd=std::chrono::steady_clock::now(); @@ -943,6 +991,142 @@ int ModelRuntime::getReadyRamBudget() const return m_readyRamBudgetMb; } +void ModelRuntime::setDefaultBackendPriority(const std::vector &priority) +{ + std::lock_guard lock(m_mutex); + m_defaultBackendPriority=priority; + + if(!priority.empty()) + { + std::string joined; + for(const std::string &p:priority) + { + if(!joined.empty()) joined+=", "; + joined+=p; + } + spdlog::info("Default backend priority set to [{}]", joined); + } + else + { + spdlog::info("Default backend priority cleared (all backends)"); + } +} + +std::vector ModelRuntime::getDefaultBackendPriority() const +{ + std::lock_guard lock(m_mutex); + return m_defaultBackendPriority; +} + +std::vector ModelRuntime::resolveBackendPriority(const ModelInfo &model) const +{ + // Layered resolution: + // 1. Model config backend_priority (highest priority) + // 2. GPU architecture rule from config repo (matched by GPU name) + // 3. Server default_backend_priority (lowest priority) + // + // disabled_backends are collected from all layers (union) and removed + // from the final priority list. 
+ + std::vector priority; + std::vector disabled; + + // Collect disabled backends from model config + for(const std::string &d:model.disabledBackends) + { + std::string lower=d; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + disabled.push_back(lower); + } + + // Look up architecture rule based on detected GPU names + std::optional archRule; + std::vector gpus=HardwareDetector::instance().getGpus(); + + for(const GpuInfo &gpu:gpus) + { + archRule=ModelManager::instance().findGpuBackendRule(gpu.name); + if(archRule) + { + spdlog::debug("GPU '{}' matched architecture rule '{}'", gpu.name, archRule->name); + break; + } + } + + // Collect disabled backends from architecture rule + if(archRule) + { + for(const std::string &d:archRule->disabledBackends) + { + std::string lower=d; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + if(std::find(disabled.begin(), disabled.end(), lower)==disabled.end()) + disabled.push_back(lower); + } + } + + // Determine priority: model config > architecture rule > server default + if(!model.backendPriority.empty()) + { + priority=model.backendPriority; + } + else if(archRule&&!archRule->backendPriority.empty()) + { + priority=archRule->backendPriority; + } + else + { + priority=m_defaultBackendPriority; + } + + // Remove disabled backends from the priority list + if(!disabled.empty()&&!priority.empty()) + { + std::vector filtered; + + for(const std::string &p:priority) + { + std::string lower=p; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + + // Also check rocm/hip aliasing + bool isDisabled=false; + for(const std::string &d:disabled) + { + if(lower==d) { isDisabled=true; break; } + if((lower=="rocm"||lower=="hip")&&(d=="rocm"||d=="hip")) { isDisabled=true; break; } + } + + if(!isDisabled) + filtered.push_back(p); + } + + priority=filtered; + } + + if(!priority.empty()||!disabled.empty()) + { + spdlog::info("Backend resolution for '{}': priority=[{}], 
disabled=[{}], source={}", + model.model, + [&]() + { + std::string s; + for(const std::string &p:priority) { if(!s.empty()) s+=", "; s+=p; } + return s.empty()?"(all)":s; + }(), + [&]() + { + std::string s; + for(const std::string &d:disabled) { if(!s.empty()) s+=", "; s+=d; } + return s.empty()?"(none)":s; + }(), + !model.backendPriority.empty()?"model config": + (archRule?"architecture rule '"+archRule->name+"'":"server default")); + } + + return priority; +} + void ModelRuntime::evictIfNeeded(int requiredVramMb) { // Calculate current VRAM usage across all loaded models @@ -1113,7 +1297,7 @@ void ModelRuntime::drainPendingSwaps() // Release lock before calling swapModel (it acquires its own lock) m_mutex.unlock(); - swapModel(latest.model, latest.variant, latest.contextSize); + swapModel(latest.model, latest.variant, latest.contextSize, latest.optionsOverride); m_mutex.lock(); } @@ -1189,94 +1373,310 @@ void ModelRuntime::initLlamaBackend() } } +void ModelRuntime::reinitLlamaBackend() +{ + spdlog::warn("Reinitializing llama.cpp backend (Vulkan device recovery)"); + + if(m_llamaInitialized) + { + llama_backend_free(); + m_llamaInitialized=false; + } + + // Brief pause to let the GPU driver settle after a device-lost event + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + llama_backend_init(); + m_llamaInitialized=true; + + spdlog::info("llama.cpp backend reinitialized successfully"); +} + ErrorCode ModelRuntime::loadLlamaModel( const std::string &model, const std::string &filePath, int contextSize, const std::vector &gpuIndices, - int maxHardwareContext) + int maxHardwareContext, + const RuntimeOptions &options, + const std::vector &backendPriority) { initLlamaBackend(); - // Start capturing llama.cpp log output for diagnostics - beginLlamaLogCapture(); + // Log available backend devices matching backendPriority for diagnostics. 
+ // NOTE: We intentionally do NOT set mparams.devices — llama.cpp's default + // device selection (devices=NULL) produces much better tensor placement on + // UMA/iGPU systems. When an explicit device list is provided that includes + // both GPU and CPU, llama.cpp treats them as co-equal allocation targets and + // splits model tensors and KV cache across both, which dramatically hurts + // performance on unified-memory architectures. Leaving devices=NULL lets + // llama.cpp's internal logic keep everything on the GPU device. + if(!backendPriority.empty()) + { + size_t devCount=ggml_backend_dev_count(); + std::vector matchedDevices; - llama_model_params mparams=llama_model_default_params(); - mparams.n_gpu_layers=99; // offload all layers to GPU by default + for(const std::string &preferred:backendPriority) + { + std::string prefLower=preferred; + std::transform(prefLower.begin(), prefLower.end(), prefLower.begin(), ::tolower); - llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams); - if(!llamaModel) - { - std::string captured=m_llamaLogCapture.str(); - endLlamaLogCapture(); + for(size_t i=0; i 0 → user/config requested explicit size - // contextSize == 0 → use model's native training context - // In both cases, cap by the hardware-fit maximum. 
- int actualContext=contextSize; - if(actualContext<=0) - { - actualContext=nativeContext; - } - if(maxHardwareContext>0&&actualContext>maxHardwareContext) - { - spdlog::info("Capping context from {} to {} (hardware limit) for model '{}'", - actualContext, maxHardwareContext, model); - actualContext=maxHardwareContext; + std::string regName=ggml_backend_reg_name(reg); + std::string regLower=regName; + std::transform(regLower.begin(), regLower.end(), regLower.begin(), ::tolower); + + bool match=(regLower==prefLower); + if(!match&&(prefLower=="rocm"||prefLower=="hip")) + match=(regLower=="rocm"||regLower=="hip"); + if(!match&&prefLower=="cuda") + match=(regLower=="cuda"); + + if(match) + { + std::string desc=std::string(ggml_backend_dev_name(dev)) + +" ("+ggml_backend_dev_description(dev)+") ["+regName+"]"; + matchedDevices.push_back(desc); + } + } + } + + if(!matchedDevices.empty()) + { + spdlog::info("Backend priority for '{}': [{}] — {} GPU device(s) available", + model, + [&]() + { + std::string s; + for(const std::string &p:backendPriority) + { + if(!s.empty()) s+=", "; + s+=p; + } + return s; + }(), + matchedDevices.size()); + + for(size_t i=0; i(actualContext); - cparams.n_threads=std::thread::hardware_concurrency(); - cparams.n_threads_batch=std::thread::hardware_concurrency(); + int maxAttempts=2; // 1 normal + 1 retry after backend reinit - llama_context *llamaCtx=llama_init_from_model(llamaModel, cparams); - if(!llamaCtx) + for(int attempt=0; attempt0) + { + spdlog::warn("Retrying model load for '{}' (attempt {}/{})", model, attempt+1, maxAttempts); + } + + // Start capturing llama.cpp log output for diagnostics + beginLlamaLogCapture(); - m_lastLoadError=classifyLoadFailure(captured, model, filePath, actualContext); + llama_model_params mparams=llama_model_default_params(); + mparams.n_gpu_layers=options.nGpuLayers.value_or(99); - // If classification didn't catch a specific VRAM/context issue, - // context creation failure is almost always a memory issue 
- if(m_lastLoadError.reason==LoadFailureReason::Unknown|| - m_lastLoadError.reason==LoadFailureReason::BackendError) + if(options.noMmap.has_value()&&options.noMmap.value()) { - m_lastLoadError.reason=LoadFailureReason::InsufficientVram; - m_lastLoadError.summary="Failed to create context (size="+std::to_string(actualContext)+ - ") — likely insufficient GPU memory"; - m_lastLoadError.suggestion="Try a smaller context size or use a smaller quantization variant. " - "You can also unload other models to free VRAM."; - m_lastLoadError.action="reduce_context"; - m_lastLoadError.recoverable=true; + mparams.use_mmap=false; } - spdlog::error("Failed to create llama context for model: {} — {}", model, m_lastLoadError.summary); - llama_model_free(llamaModel); - return ErrorCode::ModelLoadError; - } + // On UMA/iGPU systems (e.g. AMD APUs), mmap causes model tensors to be + // imported as host-visible "CPU_Mapped" buffers via VK_EXT_external_memory_host + // instead of being allocated as device-local memory. This bypasses the Vulkan + // backend's normal allocation path and results in dramatically slower GPU access. + // Auto-disable mmap when any active GPU is a unified-memory device, unless + // the user explicitly set no_mmap=false. + if(!options.noMmap.has_value()&&mparams.use_mmap) + { + std::vector gpus=HardwareDetector::instance().getGpus(); + + for(const GpuInfo &gpu:gpus) + { + if(gpu.unifiedMemory) + { + spdlog::info("UMA device detected ({}), disabling mmap for model '{}' " + "to ensure device-local memory allocation", + gpu.name, model); + mparams.use_mmap=false; + break; + } + } + } + + // NOTE: mparams.devices is intentionally left as NULL (default). + // See comment above about why explicit device lists hurt UMA performance. 
- endLlamaLogCapture(); + llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams); + if(!llamaModel) + { + std::string captured=m_llamaLogCapture.str(); + endLlamaLogCapture(); - LoadedModel &entry=m_models[model]; - entry.llamaModel=llamaModel; - entry.llamaCtx=llamaCtx; - entry.maxContextSize=nativeContext; - entry.contextSize=static_cast(llama_n_ctx(llamaCtx)); + m_lastLoadError=classifyLoadFailure(captured, model, filePath, contextSize); + spdlog::error("Failed to load llama model from: {} — {}", filePath, m_lastLoadError.summary); - spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={})", - model, entry.contextSize, entry.maxContextSize); - return ErrorCode::Success; + // If Vulkan device lost and we haven't retried yet, reinit and try again + if(m_lastLoadError.reason==LoadFailureReason::VulkanDeviceLost&&attempt+1 0 → user/config requested explicit size + // contextSize == 0 → use model's native training context + // In both cases, cap by the hardware-fit maximum. 
+ int actualContext=contextSize; + if(actualContext<=0) + { + actualContext=nativeContext; + } + if(maxHardwareContext>0&&actualContext>maxHardwareContext) + { + spdlog::info("Capping context from {} to {} (hardware limit) for model '{}'", + actualContext, maxHardwareContext, model); + actualContext=maxHardwareContext; + } + + llama_context_params cparams=llama_context_default_params(); + cparams.n_ctx=static_cast(actualContext); + cparams.n_threads=std::thread::hardware_concurrency(); + cparams.n_threads_batch=std::thread::hardware_concurrency(); + + // Apply runtime options to context params + if(options.flashAttn.has_value()) + { + cparams.flash_attn_type=options.flashAttn.value() + ?LLAMA_FLASH_ATTN_TYPE_ENABLED + :LLAMA_FLASH_ATTN_TYPE_DISABLED; + } + + if(options.kvCacheTypeK.has_value()) + { + ggml_type kType=parseGgmlType(options.kvCacheTypeK.value()); + if(kType!=GGML_TYPE_COUNT) + { + cparams.type_k=kType; + } + } + + if(options.kvCacheTypeV.has_value()) + { + ggml_type vType=parseGgmlType(options.kvCacheTypeV.value()); + if(vType!=GGML_TYPE_COUNT) + { + cparams.type_v=vType; + } + } + + if(options.swaFull.has_value()) + { + cparams.swa_full=options.swaFull.value(); + } + + llama_context *llamaCtx=llama_init_from_model(llamaModel, cparams); + if(!llamaCtx) + { + std::string captured=m_llamaLogCapture.str(); + endLlamaLogCapture(); + + m_lastLoadError=classifyLoadFailure(captured, model, filePath, actualContext); + + // If Vulkan device lost and we haven't retried yet, reinit and try again + if(m_lastLoadError.reason==LoadFailureReason::VulkanDeviceLost&&attempt+1(llama_n_ctx(llamaCtx)); + + spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={}, ngl={}, flash_attn={}, mmap={}, backend_filter={})", + model, entry.contextSize, entry.maxContextSize, + options.nGpuLayers.value_or(99), + options.flashAttn.has_value()?(options.flashAttn.value()?"enabled":"disabled"):"auto", + mparams.use_mmap?"on":"off", + backendPriority.empty()?"all":[&]() + { + 
std::string s; + for(const std::string &p:backendPriority) + { + if(!s.empty()) s+=","; + s+=p; + } + return s; + }()); + return ErrorCode::Success; + } + + // Should not reach here, but just in case + return ErrorCode::ModelLoadError; } void ModelRuntime::freeLlamaModel(LoadedModel &entry) diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h index d8c07e2..79a7bda 100644 --- a/src/arbiterAI/modelRuntime.h +++ b/src/arbiterAI/modelRuntime.h @@ -42,7 +42,8 @@ enum class LoadFailureReason { InsufficientRam, // not enough system RAM ContextTooLarge, // requested context exceeds model or hardware limits UnsupportedArch, // model architecture not supported by this llama.cpp build - BackendError // llama.cpp internal error + BackendError, // llama.cpp internal error + VulkanDeviceLost // Vulkan device lost — GPU driver reset or hung pipeline }; /// Convert a LoadFailureReason to a stable, snake_case string for API responses. @@ -72,6 +73,7 @@ struct LoadedModel { bool pinned=false; llama_model *llamaModel=nullptr; llama_context *llamaCtx=nullptr; + RuntimeOptions activeOptions; // llama.cpp options active for this loaded model }; class ModelRuntime { @@ -86,11 +88,13 @@ class ModelRuntime { /// @param model Model name from ModelManager. /// @param variant Quantization variant (empty = auto-select best fitting). /// @param contextSize Context size (0 = use model default). + /// @param optionsOverride Optional runtime options to merge on top of model config defaults. /// @return ErrorCode::Success, ModelDownloading, ModelNotFound, ModelLoadError. ErrorCode loadModel( const std::string &model, const std::string &variant="", - int contextSize=0); + int contextSize=0, + const RuntimeOptions &optionsOverride=RuntimeOptions{}); /// Download model files without loading into VRAM. 
/// Launches an async background download that respects the concurrent @@ -121,7 +125,8 @@ class ModelRuntime { ErrorCode swapModel( const std::string &newModel, const std::string &variant="", - int contextSize=0); + int contextSize=0, + const RuntimeOptions &optionsOverride=RuntimeOptions{}); /// Get the state of all tracked models. std::vector getModelStates() const; @@ -145,6 +150,14 @@ class ModelRuntime { /// Get the current RAM budget for "Ready" tier models. int getReadyRamBudget() const; + /// Set the default backend priority used when a model config does not + /// specify its own backend_priority. E.g. ["vulkan"] to force Vulkan + /// on systems where ROCm is unstable. + void setDefaultBackendPriority(const std::vector &priority); + + /// Get the current default backend priority. + std::vector getDefaultBackendPriority() const; + /// Evict least-recently-used non-pinned models to free VRAM. void evictIfNeeded(int requiredVramMb); @@ -197,15 +210,30 @@ class ModelRuntime { /// Initialize the llama.cpp backend (called once on first local model load). void initLlamaBackend(); + /// Resolve the effective backend priority for a model, applying layered + /// rules: (1) model config, (2) architecture rule from config repo matched + /// against GPU name, (3) server default. Disabled backends from all layers + /// are removed from the result. + std::vector resolveBackendPriority(const ModelInfo &model) const; + + /// Tear down and reinitialize the llama.cpp backend. + /// Used to recover from Vulkan device-lost errors. + void reinitLlamaBackend(); + /// Load a GGUF file into llama.cpp. /// @param contextSize Requested context (0 = use model's native training context). /// @param maxHardwareContext Hardware-fit limit (0 = no limit). + /// @param options Resolved runtime options to apply. + /// @param backendPriority Ordered backend preference (e.g. ["vulkan","rocm"]). + /// Empty = use all available backends (default). 
ErrorCode loadLlamaModel( const std::string &model, const std::string &filePath, int contextSize, const std::vector &gpuIndices, - int maxHardwareContext=0); + int maxHardwareContext=0, + const RuntimeOptions &options=RuntimeOptions{}, + const std::vector &backendPriority={}); /// Free llama.cpp resources for a model. void freeLlamaModel(LoadedModel &entry); @@ -222,6 +250,7 @@ class ModelRuntime { std::map m_models; mutable std::mutex m_mutex; int m_readyRamBudgetMb=0; + std::vector m_defaultBackendPriority; std::atomic m_inferenceActive{false}; std::string m_inferenceModel; bool m_llamaInitialized=false; @@ -230,6 +259,7 @@ class ModelRuntime { std::string model; std::string variant; int contextSize=0; + RuntimeOptions optionsOverride; }; std::queue m_pendingSwaps; diff --git a/src/server/dashboard.h b/src/server/dashboard.h index d68ed32..90d123c 100644 --- a/src/server/dashboard.h +++ b/src/server/dashboard.h @@ -554,7 +554,7 @@ td
-

Loaded Models

+

Loaded Models Downloaded Models →

@@ -575,41 +575,27 @@ td
-

Downloaded Models

-
-
- Used: - - Limit: - -
-
-
-
-
-
-
- Auto-cleanup: - - -
-
+

Active Requests

- - - - - - - - - - - - - - - - - - -
ModelVariantSizeDownloadedLast UsedUsesStateHot ReadyProtectedActions
No downloaded models
+
+ + + + + + + + + + + + + + + + +
ModelStatusInput TokensOutput TokensPrompt t/sGen t/sLatencyTotal Time
No recent requests
+
@@ -1136,7 +1122,6 @@ async function toggleHotReady(name, variant, currentlyHotReady) const method=currentlyHotReady?"DELETE":"POST"; const url="/api/models/"+encodeURIComponent(name)+"/variants/"+encodeURIComponent(variant)+"/hot-ready"; await fetch(url, {method}); - await refreshStorage(); } async function toggleProtected(name, variant, currentlyProtected) @@ -1144,37 +1129,6 @@ async function toggleProtected(name, variant, currentlyProtected) const method=currentlyProtected?"DELETE":"POST"; const url="/api/models/"+encodeURIComponent(name)+"/variants/"+encodeURIComponent(variant)+"/protected"; await fetch(url, {method}); - await refreshStorage(); -} - -async function deleteModelFile(name, variant) -{ - if(!confirm("Delete "+name+" "+variant+"? This cannot be undone.")) return; - const url="/api/models/"+encodeURIComponent(name)+"/files"+(variant?"?variant="+encodeURIComponent(variant):""); - const resp=await fetch(url, {method:"DELETE"}); - if(resp.status===409) - { - const data=await resp.json(); - alert(data.error?.message||"Cannot delete: variant is guarded"); - } - await refreshStorage(); -} - -function renderStorageBar(storage) -{ - if(!storage) return; - - const used=storage.used_by_models_bytes||0; - const limit=storage.storage_limit_bytes; - const free=storage.free_disk_bytes||0; - const total=limit>0?limit:(used+free); - const pct=total>0?(used/total*100):0; - - document.getElementById("storageUsedLabel").textContent="Used: "+formatBytesJs(used); - document.getElementById("storageLimitLabel").textContent=limit>0?"Limit: "+formatBytesJs(limit):"Limit: All free space"; - document.getElementById("storageBarFill").style.width=pct.toFixed(1)+"%"; - document.getElementById("storageBarText").textContent=formatBytesJs(used)+" / "+formatBytesJs(total)+" ("+pct.toFixed(1)+"%)"; - document.getElementById("storageCleanupLabel").textContent="Auto-cleanup: "+(storage.cleanup_enabled?"ON":"OFF"); } function renderDownloadProgress(downloads) @@ -1213,64 +1167,46 @@ 
function renderDownloadProgress(downloads) el.innerHTML=html; } -function renderDownloadedModels(models) +function renderActiveRequests(history) { - const el=document.getElementById("downloadedModelTable"); + const el=document.getElementById("activeRequestTable"); - if(!models||models.length===0) + if(!history||history.length===0) { - el.innerHTML='No downloaded models'; + el.innerHTML='No recent requests'; return; } + const recent=history.slice(-20).reverse(); let html=""; - for(const m of models) + for(const s of recent) { - const ageClass=rowAgeClass(m.last_used_at); - const guarded=m.hot_ready||m.protected; - const hrClass=m.hot_ready?"btn-toggle active":"btn-toggle"; - const prClass=m.protected?"btn-toggle active":"btn-toggle"; - const deleteDisabled=guarded?"btn-disabled":""; - const deleteTitle=guarded?"Clear hot_ready and protected first":"Delete model file"; + const promptTps=s.prompt_tokens_per_second||0; + const genTps=s.generation_tokens_per_second||0; + const totalMs=s.total_time_ms||0; + const latencyMs=s.latency_ms||0; + const isActive=(totalMs===0&&latencyMs===0); - html+=` - ${m.model} - ${m.variant||"-"} - ${m.file_size_display||formatBytesJs(m.file_size_bytes)} - ${formatDate(m.downloaded_at)} - ${formatDate(m.last_used_at)} - ${m.usage_count||0} - ${m.runtime_state||"Unloaded"} - - - + html+=` + ${s.model} + ${isActive?"Running":"Done"} + ${s.prompt_tokens.toLocaleString()} + ${s.completion_tokens.toLocaleString()} + ${promptTps.toFixed(1)} + ${genTps.toFixed(1)} + ${latencyMs.toFixed(0)} ms + ${totalMs.toFixed(0)} ms `; } el.innerHTML=html; } -async function refreshStorage() +async function refreshDownloads() { - const [storage, storageModels, downloads, cleanupPreview]=await Promise.all([ - fetchJson("/api/storage"), - fetchJson("/api/storage/models"), - fetchJson("/api/downloads"), - fetchJson("/api/storage/cleanup/preview") - ]); - - renderStorageBar(storage); + const downloads=await fetchJson("/api/downloads"); 
if(downloads&&downloads.downloads) renderDownloadProgress(downloads.downloads); else renderDownloadProgress([]); - - if(storageModels&&storageModels.models) renderDownloadedModels(storageModels.models); - else renderDownloadedModels([]); - - if(cleanupPreview) - { - const count=cleanupPreview.candidate_count||0; - document.getElementById("storageCandidatesLabel").textContent=count>0?count+" cleanup candidate"+(count>1?"s":""):""; - } } async function refresh() @@ -1332,11 +1268,14 @@ async function refresh() // Inference history if(history) renderInferences(history); + // Active requests summary + if(history) renderActiveRequests(history); + // Swaps if(swaps) renderSwaps(swaps); - // Storage (runs in parallel) - refreshStorage(); + // Active downloads + refreshDownloads(); } async function loadVersion() @@ -1361,6 +1300,532 @@ setInterval(refreshLogs, POLL_INTERVAL); )HTML"; +const std::string DASHBOARD_STORAGE_HTML=R"HTML( + + + + +ArbiterAI — Downloaded Models + + + +
+
+ ← Dashboard + Downloaded Models + +
+
Connected
+
+
+
+

Storage

+
+
+ Used: - + Limit: - +
+
+
+
-
+
+
+ Auto-cleanup: - + +
+
+
+
+
+

Model Files

+ + + + + + + + + + + + + + + + + + +
ModelVariantSizeDownloadedLast UsedUsesStateHot ReadyProtectedActions
No downloaded models
+
+
+ + +)HTML"; + } // namespace server } // namespace arbiterAI diff --git a/src/server/main.cpp b/src/server/main.cpp index 5e156fc..7fb4f67 100644 --- a/src/server/main.cpp +++ b/src/server/main.cpp @@ -138,6 +138,7 @@ int main(int argc, char *argv[]) std::string defaultModel=cfg.value("default_model", ""); std::string defaultVariant=cfg.value("default_variant", ""); std::string overridePath=cfg.value("override_path", ""); + std::string injectedConfigDir=cfg.value("injected_config_dir", ""); int ramBudget=cfg.value("ram_budget_mb", 0); int maxDownloads=cfg.value("max_concurrent_downloads", 2); @@ -152,6 +153,15 @@ int main(int argc, char *argv[]) nlohmann::json hwCfg=cfg.value("hardware", nlohmann::json::object()); nlohmann::json vramOverrides=hwCfg.value("vram_overrides", nlohmann::json::object()); + std::vector defaultBackendPriority; + if(hwCfg.contains("default_backend_priority")&&hwCfg["default_backend_priority"].is_array()) + { + for(const nlohmann::json &bp:hwCfg["default_backend_priority"]) + { + defaultBackendPriority.push_back(bp.get()); + } + } + // Logging nlohmann::json logCfg=cfg.value("logging", nlohmann::json::object()); std::string logLevel=logCfg.value("level", "info"); @@ -220,6 +230,17 @@ int main(int argc, char *argv[]) spdlog::info("ArbiterAI initialized successfully"); + // ── Restore injected model configs ─────────────────────────── + if(!injectedConfigDir.empty()) + { + arbiterAI::ModelManager::instance().setInjectedConfigDir(injectedConfigDir); + int restored=arbiterAI::ModelManager::instance().loadInjectedConfigs(); + if(restored>0) + { + spdlog::info("Restored {} injected model config(s)", restored); + } + } + // ── Apply VRAM overrides ───────────────────────────────────── for(auto it=vramOverrides.begin(); it!=vramOverrides.end(); ++it) { @@ -260,6 +281,12 @@ int main(int argc, char *argv[]) spdlog::info("Ready model RAM budget set to {} MB", ramBudget); } + // ── Default backend priority ───────────────────────────────── + 
if(!defaultBackendPriority.empty()) + { + arbiterAI::ModelRuntime::instance().setDefaultBackendPriority(defaultBackendPriority); + } + // ── Concurrent download limit ──────────────────────────────── if(maxDownloads>0) { diff --git a/src/server/routes.cpp b/src/server/routes.cpp index ef178e6..843b88b 100644 --- a/src/server/routes.cpp +++ b/src/server/routes.cpp @@ -111,7 +111,23 @@ nlohmann::json systemInfoToJson(const SystemInfo &hw) nlohmann::json gpus=nlohmann::json::array(); for(const GpuInfo &gpu:hw.gpus) { - gpus.push_back(gpuInfoToJson(gpu)); + nlohmann::json gpuJson=gpuInfoToJson(gpu); + + // Attach matched architecture rule (if any) + std::optional rule=ModelManager::instance().findGpuBackendRule(gpu.name); + if(rule) + { + nlohmann::json ruleJson={ + {"name", rule->name} + }; + if(!rule->disabledBackends.empty()) + ruleJson["disabled_backends"]=rule->disabledBackends; + if(!rule->backendPriority.empty()) + ruleJson["backend_priority"]=rule->backendPriority; + gpuJson["architecture_rule"]=ruleJson; + } + + gpus.push_back(gpuJson); } return { @@ -136,6 +152,54 @@ std::string modelStateToString(ModelState state) } } +nlohmann::json runtimeOptionsToJson(const RuntimeOptions &opts) +{ + nlohmann::json j=nlohmann::json::object(); + + if(opts.flashAttn.has_value()) + j["flash_attn"]=opts.flashAttn.value(); + if(opts.kvCacheTypeK.has_value()) + j["kv_cache_type_k"]=opts.kvCacheTypeK.value(); + if(opts.kvCacheTypeV.has_value()) + j["kv_cache_type_v"]=opts.kvCacheTypeV.value(); + if(opts.noMmap.has_value()) + j["no_mmap"]=opts.noMmap.value(); + if(opts.reasoningBudget.has_value()) + j["reasoning_budget"]=opts.reasoningBudget.value(); + if(opts.swaFull.has_value()) + j["swa_full"]=opts.swaFull.value(); + if(opts.nGpuLayers.has_value()) + j["n_gpu_layers"]=opts.nGpuLayers.value(); + if(opts.overrideTensor.has_value()) + j["override_tensor"]=opts.overrideTensor.value(); + + return j; +} + +RuntimeOptions parseRuntimeOptions(const nlohmann::json &j) +{ + 
RuntimeOptions opts; + + if(j.contains("flash_attn")&&j["flash_attn"].is_boolean()) + opts.flashAttn=j["flash_attn"].get(); + if(j.contains("kv_cache_type_k")&&j["kv_cache_type_k"].is_string()) + opts.kvCacheTypeK=j["kv_cache_type_k"].get(); + if(j.contains("kv_cache_type_v")&&j["kv_cache_type_v"].is_string()) + opts.kvCacheTypeV=j["kv_cache_type_v"].get(); + if(j.contains("no_mmap")&&j["no_mmap"].is_boolean()) + opts.noMmap=j["no_mmap"].get(); + if(j.contains("reasoning_budget")&&j["reasoning_budget"].is_number_integer()) + opts.reasoningBudget=j["reasoning_budget"].get(); + if(j.contains("swa_full")&&j["swa_full"].is_boolean()) + opts.swaFull=j["swa_full"].get(); + if(j.contains("n_gpu_layers")&&j["n_gpu_layers"].is_number_integer()) + opts.nGpuLayers=j["n_gpu_layers"].get(); + if(j.contains("override_tensor")&&j["override_tensor"].is_string()) + opts.overrideTensor=j["override_tensor"].get(); + + return opts; +} + nlohmann::json loadedModelToJson(const LoadedModel &m) { nlohmann::json gpuIndices=nlohmann::json::array(); @@ -144,7 +208,7 @@ nlohmann::json loadedModelToJson(const LoadedModel &m) gpuIndices.push_back(idx); } - return { + nlohmann::json j={ {"model", m.modelName}, {"variant", m.variant}, {"state", modelStateToString(m.state)}, @@ -156,6 +220,14 @@ nlohmann::json loadedModelToJson(const LoadedModel &m) {"gpu_indices", gpuIndices}, {"pinned", m.pinned} }; + + nlohmann::json activeOpts=runtimeOptionsToJson(m.activeOptions); + if(!activeOpts.empty()) + { + j["runtime_options"]=activeOpts; + } + + return j; } nlohmann::json inferenceStatsToJson(const InferenceStats &s) @@ -226,6 +298,33 @@ std::string errorCodeToString(ErrorCode code) } } +/// Parse a model identifier that may contain a ":variant" suffix. +/// Examples: +/// "Qwen3.5-27B:Q4_K_M" → ("Qwen3.5-27B", "Q4_K_M") +/// "gpt-4" → ("gpt-4", "") +/// "gpt-oss-120b:Q8_0" → ("gpt-oss-120b", "Q8_0") +/// Uses rfind to handle model names that may themselves contain colons. 
+std::pair parseModelVariant(const std::string &modelId) +{ + // Only split on ':' if the suffix looks like a quantization variant + // (starts with Q, F, IQ, or BF — e.g. Q4_K_M, F16, IQ4_XS, BF16). + // This avoids breaking model names that contain colons for other reasons. + size_t pos=modelId.rfind(':'); + if(pos!=std::string::npos&&pos+1=2&&(suffix.substr(0, 2)=="IQ"||suffix.substr(0, 2)=="iq" + ||suffix.substr(0, 2)=="BF"||suffix.substr(0, 2)=="bf"))) + { + return {modelId.substr(0, pos), suffix}; + } + } + return {modelId, ""}; +} + } // anonymous namespace // ========== Override Path ========== @@ -312,6 +411,9 @@ void registerRoutes(httplib::Server &server) // Logs server.Get("/api/logs", handleGetLogs); + // Runtime options + server.Get("/api/runtime-options", handleGetRuntimeOptions); + // Storage management server.Get("/api/storage", handleGetStorage); server.Get("/api/storage/models", handleGetStorageModels); @@ -330,6 +432,7 @@ void registerRoutes(httplib::Server &server) server.Get("/api/downloads", handleGetActiveDownloads); // Dashboard + server.Get("/dashboard/storage", handleDashboardStorage); server.Get("/dashboard", handleDashboard); spdlog::info("Registered all HTTP routes"); @@ -358,6 +461,31 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) { arbiterRequest.model=requestJson.at("model"); + // Parse "model:variant" syntax (e.g. "Qwen3.5-27B:Q4_K_M") + // Strip the variant from the model name so the core API gets the bare + // model name. If a variant was specified, pre-load it so the llama + // provider uses the right quantization. 
+ auto [baseName, requestedVariant]=parseModelVariant(arbiterRequest.model); + arbiterRequest.model=baseName; + + if(!requestedVariant.empty()) + { + ErrorCode loadErr=ArbiterAI::instance().loadModel(baseName, requestedVariant); + if(loadErr==ErrorCode::ModelDownloading) + { + res.status=503; + res.set_content(errorJson("Model '"+baseName+"' variant '"+requestedVariant + +"' is still downloading", "server_error", "model", "model_downloading").dump(), + "application/json"); + return; + } + if(loadErr!=ErrorCode::Success) + { + spdlog::warn("Failed to pre-load model '{}' variant '{}' (error={})", + baseName, requestedVariant, errorCodeToString(loadErr)); + } + } + // Parse messages with full OpenAI message format support for(const nlohmann::json &msg:requestJson.at("messages")) { @@ -365,8 +493,9 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) m.role=msg.at("role").get(); // content can be null for assistant messages with tool_calls + // content can be a string or an array of content parts (OpenAI spec) if(msg.contains("content") && !msg.at("content").is_null()) - m.content=msg.at("content").get(); + m.content=contentToString(msg.at("content")); // tool_call_id for role="tool" messages if(msg.contains("tool_call_id")) @@ -466,6 +595,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) bool stream=requestJson.value("stream", false); std::string requestId=generateId("chatcmpl-"); auto created=std::time(nullptr); + std::string responseModelId=requestJson.at("model").get(); // Check for stream_options.include_usage bool includeUsage=false; @@ -478,14 +608,14 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) { res.set_chunked_content_provider( "text/event-stream", - [arbiterRequest, requestId, created, includeUsage](size_t, httplib::DataSink &sink) + [arbiterRequest, requestId, created, includeUsage, responseModelId](size_t, httplib::DataSink &sink) { // Send initial chunk 
with role nlohmann::json roleChunk={ {"id", requestId}, {"object", "chat.completion.chunk"}, {"created", created}, - {"model", arbiterRequest.model}, + {"model", responseModelId}, {"system_fingerprint", nullptr}, {"choices", {{ {"index", 0}, @@ -503,7 +633,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) {"id", requestId}, {"object", "chat.completion.chunk"}, {"created", created}, - {"model", arbiterRequest.model}, + {"model", responseModelId}, {"system_fingerprint", nullptr}, {"choices", {{ {"index", 0}, @@ -529,7 +659,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) {"id", requestId}, {"object", "chat.completion.chunk"}, {"created", created}, - {"model", arbiterRequest.model}, + {"model", responseModelId}, {"system_fingerprint", nullptr}, {"choices", {{ {"index", 0}, @@ -547,7 +677,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) {"id", requestId}, {"object", "chat.completion.chunk"}, {"created", created}, - {"model", arbiterRequest.model}, + {"model", responseModelId}, {"system_fingerprint", nullptr}, {"choices", nlohmann::json::array()}, {"usage", { @@ -636,7 +766,7 @@ void handleChatCompletions(const httplib::Request &req, httplib::Response &res) {"id", requestId}, {"object", "chat.completion"}, {"created", created}, - {"model", arbiterResponse.model}, + {"model", responseModelId}, {"system_fingerprint", nullptr}, {"choices", {{ {"index", 0}, @@ -664,6 +794,7 @@ void handleListModelsV1(const httplib::Request &, httplib::Response &res) nlohmann::json data=nlohmann::json::array(); for(const std::string &name:modelNames) { + // Always emit the bare model name data.push_back({ {"id", name}, {"object", "model"}, @@ -671,6 +802,23 @@ void handleListModelsV1(const httplib::Request &, httplib::Response &res) {"owned_by", "arbiterai"}, {"permission", nlohmann::json::array()} }); + + // For models with variants, also emit "model:variant" entries + ModelInfo info; 
+ if(ArbiterAI::instance().getModelInfo(name, info)==ErrorCode::Success + &&!info.variants.empty()) + { + for(const ModelVariant &v:info.variants) + { + data.push_back({ + {"id", name+":"+v.quantization}, + {"object", "model"}, + {"created", created}, + {"owned_by", "arbiterai"}, + {"permission", nlohmann::json::array()} + }); + } + } } nlohmann::json response={ @@ -684,18 +832,29 @@ void handleListModelsV1(const httplib::Request &, httplib::Response &res) void handleGetModelV1(const httplib::Request &req, httplib::Response &res) { std::string modelId=req.matches[1]; + auto [baseName, variantName]=parseModelVariant(modelId); - std::vector modelNames; - ArbiterAI::instance().getAvailableModels(modelNames); + ModelInfo info; + bool found=(ArbiterAI::instance().getModelInfo(baseName, info)==ErrorCode::Success); - bool found=false; - for(const std::string &name:modelNames) + // If a variant was specified, verify it exists on this model + if(found&&!variantName.empty()&&!info.variants.empty()) { - if(name==modelId) + bool variantFound=false; + for(const ModelVariant &v:info.variants) { - found=true; - break; + if(v.quantization==variantName) + { + variantFound=true; + break; + } } + if(!variantFound) found=false; + } + else if(!variantName.empty()&&info.variants.empty()) + { + // Variant requested but model has no variants + found=false; } if(!found) @@ -830,7 +989,24 @@ void handleGetModels(const httplib::Request &, httplib::Response &res) // Add models with hardware fit info for(const ModelFit &f:fits) { - models.push_back(modelFitToJson(f)); + nlohmann::json modelJson=modelFitToJson(f); + + // Include runtime_options and backend_priority from model config + ModelInfo info; + if(ArbiterAI::instance().getModelInfo(f.model, info)==ErrorCode::Success) + { + nlohmann::json opts=runtimeOptionsToJson(info.runtimeOptions); + if(!opts.empty()) + { + modelJson["runtime_options"]=opts; + } + if(!info.backendPriority.empty()) + { + 
modelJson["backend_priority"]=info.backendPriority; + } + } + + models.push_back(modelJson); } // Add cloud models (no fit data) @@ -844,7 +1020,7 @@ void handleGetModels(const httplib::Request &, httplib::Response &res) { if(fitModels.find(name)==fitModels.end()) { - models.push_back({ + nlohmann::json modelJson={ {"model", name}, {"variant", ""}, {"can_run", true}, @@ -852,7 +1028,23 @@ void handleGetModels(const httplib::Request &, httplib::Response &res) {"limiting_factor", ""}, {"estimated_vram_mb", 0}, {"gpu_indices", nlohmann::json::array()} - }); + }; + + ModelInfo info; + if(ArbiterAI::instance().getModelInfo(name, info)==ErrorCode::Success) + { + nlohmann::json opts=runtimeOptionsToJson(info.runtimeOptions); + if(!opts.empty()) + { + modelJson["runtime_options"]=opts; + } + if(!info.backendPriority.empty()) + { + modelJson["backend_priority"]=info.backendPriority; + } + } + + models.push_back(modelJson); } } @@ -879,6 +1071,7 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) std::string modelName=req.matches[1]; std::string variant; int contextSize=0; + RuntimeOptions optionsOverride; // Accept parameters from query string if(req.has_param("variant")) @@ -898,6 +1091,8 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) contextSize=body["context"].get(); if(body.contains("context_size")&&body["context_size"].is_number_integer()) contextSize=body["context_size"].get(); + if(body.contains("runtime_options")&&body["runtime_options"].is_object()) + optionsOverride=parseRuntimeOptions(body["runtime_options"]); } catch(const nlohmann::json::parse_error &) { @@ -907,7 +1102,7 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) spdlog::info("Load request: model='{}' variant='{}' context={}", modelName, variant, contextSize); - ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize); + ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize, 
&optionsOverride); if(err==ErrorCode::Success) { @@ -919,6 +1114,12 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) { response["context_size"]=state->contextSize; response["max_context_size"]=state->maxContextSize; + + nlohmann::json activeOpts=runtimeOptionsToJson(state->activeOptions); + if(!activeOpts.empty()) + { + response["runtime_options"]=activeOpts; + } } res.set_content(response.dump(), "application/json"); @@ -1247,7 +1448,7 @@ void handleAddModelConfig(const httplib::Request &req, httplib::Response &res) added.push_back(modelJson["model"].get()); } - // Persist if override path is set + // Legacy: persist to single override file if configured if(!g_overridePath.empty()) { mm.saveOverrides(g_overridePath); @@ -1315,7 +1516,7 @@ void handleUpdateModelConfig(const httplib::Request &req, httplib::Response &res created.push_back(modelName); } - // Persist if override path is set + // Legacy: persist to single override file if configured if(!g_overridePath.empty()) { mm.saveOverrides(g_overridePath); @@ -1358,7 +1559,7 @@ void handleDeleteModelConfig(const httplib::Request &req, httplib::Response &res return; } - // Persist if override path is set + // Legacy: persist to single override file if configured if(!g_overridePath.empty()) { mm.saveOverrides(g_overridePath); @@ -1430,7 +1631,15 @@ void handleGetHardware(const httplib::Request &, httplib::Response &res) HardwareDetector::instance().refresh(); SystemInfo hw=HardwareDetector::instance().getSystemInfo(); - res.set_content(systemInfoToJson(hw).dump(), "application/json"); + nlohmann::json j=systemInfoToJson(hw); + + std::vector defaultBP=ModelRuntime::instance().getDefaultBackendPriority(); + if(!defaultBP.empty()) + { + j["default_backend_priority"]=defaultBP; + } + + res.set_content(j.dump(), "application/json"); } void handleSetVramOverride(const httplib::Request &req, httplib::Response &res) @@ -1989,6 +2198,78 @@ void handleGetActiveDownloads(const httplib::Request &, 
httplib::Response &res)
     res.set_content(nlohmann::json{{"downloads", downloads}}.dump(), "application/json");
 }
 
+// ========== Runtime Options ==========
+
+void handleGetRuntimeOptions(const httplib::Request &, httplib::Response &res)
+{
+    // Publishes a fixed schema of the runtime options a caller may set: each
+    // entry carries a name, JSON type, description and advertised default, and
+    // the KV cache types and backend priority also list their accepted values.
+
+    // Accepted data types, shared by both KV cache options.
+    const nlohmann::json kvCacheTypes=nlohmann::json::array(
+        {"f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"});
+
+    // Builds one schema entry with the four keys shared by every option.
+    // Passing nullptr as the default serializes it as JSON null.
+    auto describe=[](const char *name, const char *type, const char *description,
+        const nlohmann::json &defaultValue)
+    {
+        return nlohmann::json{
+            {"name", name},
+            {"type", type},
+            {"description", description},
+            {"default", defaultValue}
+        };
+    };
+
+    nlohmann::json keyCache=describe("kv_cache_type_k", "string",
+        "KV cache data type for keys (-ctk). Lower precision uses less VRAM.", "f16");
+    keyCache["valid_values"]=kvCacheTypes;
+
+    nlohmann::json valueCache=describe("kv_cache_type_v", "string",
+        "KV cache data type for values (-ctv). Lower precision uses less VRAM.", "f16");
+    valueCache["valid_values"]=kvCacheTypes;
+
+    // The array keeps insertion order, so entries are pushed in a fixed sequence.
+    nlohmann::json options=nlohmann::json::array();
+    options.push_back(describe("flash_attn", "boolean",
+        "Enable or disable flash attention (-fa). Some models crash with it enabled.",
+        nullptr));
+    options.push_back(keyCache);
+    options.push_back(valueCache);
+    options.push_back(describe("no_mmap", "boolean",
+        "Disable memory-mapped file I/O (--no-mmap). Required for some models/systems.",
+        false));
+    options.push_back(describe("reasoning_budget", "integer",
+        "Reasoning token budget (--reasoning-budget). 0 disables reasoning/thinking tokens.",
+        nullptr));
+    options.push_back(describe("swa_full", "boolean",
+        "Use full-size sliding window attention cache (--swa-full).",
+        nullptr));
+    options.push_back(describe("n_gpu_layers", "integer",
+        "Number of layers to offload to GPU (-ngl). 99 offloads all layers.",
+        99));
+    options.push_back(describe("override_tensor", "string",
+        "Tensor override pattern (-ot). Advanced: route specific tensors to CPU/GPU.",
+        nullptr));
+
+    // backend_priority is described next to, not inside, the option list.
+    const nlohmann::json backendPriorityInfo={
+        {"name", "backend_priority"},
+        {"type", "array of strings"},
+        {"description", "Ordered preference for GPU backends. First available backend is used."},
+        {"valid_values", {"vulkan", "rocm", "cuda"}}
+    };
+
+    // Serialize both sections as one response object.
+    const nlohmann::json payload={
+        {"runtime_options", options},
+        {"backend_priority", backendPriorityInfo}
+    };
+    res.set_content(payload.dump(), "application/json");
+}
+
 // ========== Logs ==========
 
 void handleGetLogs(const httplib::Request &req, httplib::Response &res)
@@ -2054,5 +2335,11 @@ void handleDashboard(const httplib::Request &, httplib::Response &res)
     res.set_content(DASHBOARD_HTML, "text/html");
 }
 
+// Responds with the DASHBOARD_STORAGE_HTML page as text/html.
+void handleDashboardStorage(const httplib::Request &, httplib::Response &res)
+{
+    res.set_content(DASHBOARD_STORAGE_HTML, "text/html");
+}
+
 } // namespace server
 } // namespace arbiterAI
diff --git a/src/server/routes.h b/src/server/routes.h
index ed79d13..418daf4 100644
--- a/src/server/routes.h
+++ b/src/server/routes.h
@@ -59,6 +59,10 @@ void handleGetHardware(const httplib::Request &req, httplib::Response &res);
 void handleSetVramOverride(const httplib::Request &req, httplib::Response &res);
 void handleClearVramOverride(const httplib::Request &req, httplib::Response &res);
 
+// ========== Runtime Options ==========
+
+void handleGetRuntimeOptions(const httplib::Request &req, httplib::Response &res);
+
 // ========== Logs ==========
 
 void handleGetLogs(const httplib::Request &req, httplib::Response &res);
@@ -84,6 +88,7 @@ void handleGetActiveDownloads(const httplib::Request &req, httplib::Response &re
 // ========== Dashboard ==========
 
 void handleDashboard(const httplib::Request &req, httplib::Response &res);
+void handleDashboardStorage(const httplib::Request &req, 
httplib::Response &res);
 
 } // namespace server
 } // namespace arbiterAI
diff --git a/vcpkg/custom_ports/llama-cpp/portfile.cmake b/vcpkg/custom_ports/llama-cpp/portfile.cmake
index 7007666..d734586 100644
--- a/vcpkg/custom_ports/llama-cpp/portfile.cmake
+++ b/vcpkg/custom_ports/llama-cpp/portfile.cmake
@@ -1,16 +1,49 @@
-vcpkg_from_github(
-    OUT_SOURCE_PATH SOURCE_PATH
-    REPO ggml-org/llama.cpp
-    REF b${VERSION}
-    SHA512 b05f130a2052d3c2cec483c3b098f71585fe7d00fa1971786c0a646717f82320211801780625b9aabc9fc1e1797f8995381e40661f3e8a115c72710f147083cd
-    HEAD_REF master
-)
+if("head" IN_LIST FEATURES)
+    # Build from the latest upstream HEAD by cloning with git directly.
+    # vcpkg_from_github only honours HEAD_REF when vcpkg runs with `--head`,
+    # so the clone is done manually for manifest-mode compatibility.
+    # The resulting source tree is whatever upstream's default branch points
+    # at when the port is built, so this feature is not reproducible.
+    # git is resolved through vcpkg instead of relying on the ambient PATH.
+    vcpkg_find_acquire_program(GIT)
+    set(SOURCE_PATH "${CURRENT_BUILDTREES_DIR}/src/llama-cpp-master")
+    if(NOT EXISTS "${SOURCE_PATH}/.git")
+        # Discard any partial checkout, and create the parent directory because
+        # it is the working directory of the clone and may not exist yet.
+        file(REMOVE_RECURSE "${SOURCE_PATH}")
+        file(MAKE_DIRECTORY "${CURRENT_BUILDTREES_DIR}/src")
+        vcpkg_execute_required_process(
+            ALLOW_IN_DOWNLOAD_MODE
+            COMMAND "${GIT}" clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${SOURCE_PATH}"
+            WORKING_DIRECTORY "${CURRENT_BUILDTREES_DIR}/src"
+            LOGNAME clone-llama-cpp
+        )
+    else()
+        # Reuse the existing checkout and fast-forward it to upstream.
+        vcpkg_execute_required_process(
+            ALLOW_IN_DOWNLOAD_MODE
+            COMMAND "${GIT}" pull --ff-only
+            WORKING_DIRECTORY "${SOURCE_PATH}"
+            LOGNAME pull-llama-cpp
+        )
+    endif()
+else()
+    # Build from a pinned release tag
+    vcpkg_from_github(
+        OUT_SOURCE_PATH SOURCE_PATH
+        REPO ggml-org/llama.cpp
+        REF b${VERSION}
+        SHA512 6be3482ef58872ee4a386ba831175e53ce0d93c6992e4389ffd97f9af3cc7becdd1356fda575702681f55261e7fe81bc1baa12edd0d5f809aa80684f5c890bac
+        HEAD_REF master
+    )
+endif()
 
 vcpkg_cmake_configure(
     SOURCE_PATH "${SOURCE_PATH}"
     OPTIONS
         -DGGML_CCACHE=OFF
         -DGGML_VULKAN=ON
+        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
         -DLLAMA_BUILD_TESTS=OFF
         -DLLAMA_BUILD_EXAMPLES=OFF
         -DLLAMA_BUILD_TOOLS=OFF
diff --git a/vcpkg/custom_ports/llama-cpp/vcpkg.json b/vcpkg/custom_ports/llama-cpp/vcpkg.json
index f6565d7..cc672ae 100644
--- a/vcpkg/custom_ports/llama-cpp/vcpkg.json
+++ 
b/vcpkg/custom_ports/llama-cpp/vcpkg.json @@ -1,7 +1,7 @@ { "name": "llama-cpp", - "version": "8573", - "port-version": 1, + "version": "8748", + "port-version": 0, "description": "LLM inference in C/C++", "homepage": "https://github.com/ggml-org/llama.cpp", "license": "MIT", @@ -14,5 +14,10 @@ "name": "vcpkg-cmake-config", "host": true } - ] + ], + "features": { + "head": { + "description": "Build from latest master HEAD instead of a pinned release tag" + } + } } \ No newline at end of file