diff --git a/CMakeLists.txt b/CMakeLists.txt index eccca43..a9cb298 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,10 @@ cmake_minimum_required(VERSION 3.15) project(arbiterAI VERSION 0.1.6 LANGUAGES C CXX) +# Read llama-cpp build number from our custom vcpkg port +file(READ "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg/custom_ports/llama-cpp/vcpkg.json" LLAMA_PORT_JSON) +string(JSON ARBITERAI_LLAMACPP_BUILD GET "${LLAMA_PORT_JSON}" "version") + # Generate version header from template configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/src/arbiterAI/version.h.in" @@ -139,6 +143,7 @@ target_link_libraries(arbiterai tests/telemetryCollectorTests.cpp tests/llamaProviderTests.cpp tests/storageManagerTests.cpp + tests/serverConnectTests.cpp ) target_link_libraries(arbiterai_tests @@ -185,7 +190,6 @@ target_link_libraries(arbiterAI-server PRIVATE arbiterai httplib::httplib - cxxopts::cxxopts spdlog::spdlog ) diff --git a/docker/Dockerfile b/docker/Dockerfile index 95a8318..1e72eaf 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,8 +1,11 @@ # syntax=docker/dockerfile:1 -ARG DOCKER_VERSION=1.1.1 -FROM ubuntu:22.04 +ARG DOCKER_VERSION=1.2.0 +FROM ubuntu:24.04 -# Install basic build tools and Python 3 +# Install basic build tools, Python 3, and GPU libraries. +# Vulkan headers + glslc are needed at build time for llama.cpp's Vulkan backend. +# At runtime, GPU inference requires a Vulkan ICD on the host; without one +# llama.cpp falls back to CPU-only. RUN apt-get update && apt-get install -y \ build-essential \ cmake \ @@ -26,17 +29,11 @@ RUN apt-get update && apt-get install -y \ libxrandr-dev \ libxinerama-dev \ libglu1-mesa-dev \ - && apt-get remove -y nodejs npm \ - && rm -rf /var/lib/apt/lists/* - -# Install NVIDIA CUDA toolkit and Vulkan SDK for GPU-accelerated inference. -# These are optional at runtime — builds and tests work without a GPU, -# but llama.cpp inference falls back to CPU-only. -RUN apt-get update && apt-get install -y \ - nvidia-cuda-toolkit \ vulkan-tools \ libvulkan-dev \ mesa-vulkan-drivers \ + glslc \ + glslang-tools \ && rm -rf /var/lib/apt/lists/* RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ diff --git a/docs/server.md b/docs/server.md index 99f025c..84b655c 100644 --- a/docs/server.md +++ b/docs/server.md @@ -12,8 +12,9 @@ Standalone HTTP server that wraps the ArbiterAI library, providing an OpenAI-com - [Model Config Injection](#33-model-config-injection) - [Telemetry](#34-telemetry) - [Storage Management](#35-storage-management) - - [Health & Version](#36-health--version) - - [Dashboard](#37-dashboard) + - [Logs](#36-logs) + - [Health & Version](#37-health--version) + - [Dashboard](#38-dashboard) 4. [Configuration Persistence](#4-configuration-persistence) 5. [Error Format](#5-error-format) @@ -45,40 +46,94 @@ The server supports: ### CLI Options -| Option | Default | Description | -|--------|---------|-------------| -| `-p, --port` | `8080` | HTTP port | -| `-H, --host` | `0.0.0.0` | Bind address | -| `-c, --config` | `config` | Model config directory path(s) | -| `-m, --model` | *(none)* | Default model to load on startup | -| `-v, --variant` | *(none)* | Default quantization variant (e.g., `Q4_K_M`) | -| `--override-path` | *(none)* | Path to write runtime model config overrides (enables persistence) | -| `--ram-budget` | `0` (auto 50%) | Ready-model RAM budget in MB | -| `--models-dir` | `/models` | Directory where downloaded model files are stored | -| `--storage-limit` | `0` (unlimited) | Maximum storage for model files (e.g., `50G`, `500M`). `0` = use all free disk space. | -| `--cleanup-enabled` | `true` | Enable automated storage cleanup | -| `--cleanup-max-age` | `720` | Max age in hours before a variant becomes a cleanup candidate (default: 30 days) | -| `--cleanup-interval` | `24` | Hours between automated cleanup runs | -| `--log-level` | `info` | Log level (`trace`, `debug`, `info`, `warn`, `error`) | -| `-h, --help` | | Print usage | +The server accepts only two command-line options: -### Examples +| Option | Description | +|--------|-------------| +| `-c, --config ` | Path to server configuration JSON file (**required**) | +| `-h, --help` | Print usage | -```bash -# Start with defaults -./arbiterAI-server +### Configuration File + +All server settings are defined in a JSON configuration file. See [`examples/server_config.json`](../examples/server_config.json) for a complete example. + +```json +{ + "host": "0.0.0.0", + "port": 8080, + "model_config_paths": ["config"], + "models_dir": "/models", + "default_model": "", + "default_variant": "", + "override_path": "", + "ram_budget_mb": 0, + "max_concurrent_downloads": 2, + "storage": { + "limit": "0", + "cleanup_enabled": true, + "cleanup_max_age_days": 30, + "cleanup_interval_hours": 24 + }, + "hardware": { + "vram_overrides": { + "0": 32000 + } + }, + "logging": { + "level": "info", + "directory": "", + "rotate_hour": 0, + "retain_days": 7 + } +} +``` + +#### Configuration Reference + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `host` | `string` | `"0.0.0.0"` | Bind address | +| `port` | `int` | `8080` | HTTP port | +| `model_config_paths` | `string[]` | `["config"]` | Model config directory paths | +| `models_dir` | `string` | `"/models"` | Directory for downloaded model files | +| `default_model` | `string` | `""` | Model to load on startup | +| `default_variant` | `string` | `""` | Default quantization variant (e.g., `Q4_K_M`) | +| `override_path` | `string` | `""` | Path to write runtime model config overrides | +| `ram_budget_mb` | `int` | `0` | Ready-model RAM budget in MB (`0` = auto 50%) | +| `max_concurrent_downloads` | `int` | `2` | Maximum simultaneous model downloads | + +**`storage` object:** -# Custom port, load a model on startup -./arbiterAI-server -p 9090 -m gpt-4 --log-level debug +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `limit` | `string` | `"0"` | Max storage for model files (e.g., `"50G"`, `"500M"`). `"0"` = all free disk. | +| `cleanup_enabled` | `bool` | `true` | Enable automated storage cleanup | +| `cleanup_max_age_days` | `int` | `30` | Days since last use before cleanup candidacy | +| `cleanup_interval_hours` | `int` | `24` | Hours between automated cleanup runs | -# Enable runtime config persistence -./arbiterAI-server --override-path /data/overrides.json +**`hardware` object:** -# Load a local model with a specific variant -./arbiterAI-server -m qwen2.5-7b-instruct -v Q4_K_M --ram-budget 8192 +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `vram_overrides` | `object` | `{}` | GPU index → VRAM MB overrides (e.g., `{"0": 32000}`) | -# Limit model storage to 50 GB with cleanup every 12 hours -./arbiterAI-server --models-dir /data/models --storage-limit 50G --cleanup-interval 12 +**`logging` object:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `level` | `string` | `"info"` | Log level (`trace`, `debug`, `info`, `warn`, `error`) | +| `directory` | `string` | `""` | Directory for log files (empty = console only) | +| `rotate_hour` | `int` | `0` | Hour of day (0–23) to rotate log files | +| `retain_days` | `int` | `7` | Number of daily log files to keep | + +### Examples + +```bash +# Start with a config file +./arbiterAI-server --config /etc/arbiterai/server_config.json + +# Short form +./arbiterAI-server -c server_config.json ``` --- @@ -338,6 +393,56 @@ Load a model into VRAM for inference. **Response (202):** `{"status": "downloading", "model": "qwen2.5-7b-instruct"}` — model file is being downloaded. +**Response (400):** Model load failed. The response includes structured error details so callers can programmatically react to the failure. + +```json +{ + "error": { + "message": "Model architecture is not supported by this llama.cpp build", + "type": "invalid_request_error", + "code": "model_load_error", + "param": "model", + "details": { + "model": "qwen3.5-27b", + "variant": "Q4_K_M", + "context_requested": 4096, + "error_code": "model_load_error", + "reason": "unsupported_arch", + "recoverable": false, + "action": "update_server", + "suggestion": "Update the server to a newer version that supports this model architecture, or use a different model.", + "llama_log": "llama_model_load: error loading model architecture: unknown model architecture: 'qwen35'\nllama_model_load_from_file_impl: failed to load model" + } + } +} +``` + +**`details.reason` values:** + +| Reason | Description | Recoverable | +|--------|-------------|-------------| +| `file_not_found` | GGUF file does not exist at the expected path | Yes | +| `file_corrupt` | GGUF header invalid, bad magic, or file truncated | Yes | +| `insufficient_vram` | Not enough GPU memory to load model at requested context | Yes | +| `insufficient_ram` | Not enough system RAM | Yes | +| `context_too_large` | Requested context size exceeds model or hardware limits | Yes | +| `unsupported_arch` | Model architecture not supported by this llama.cpp build | No | +| `backend_error` | Generic llama.cpp internal error | No | +| `unknown` | Could not classify the failure | No | + +**`details.action` values:** + +| Action | Description | +|--------|-------------| +| `redownload` | Re-download the model file (file missing) | +| `delete_and_redownload` | Delete the corrupt file, then re-download | +| `reduce_context` | Retry with a smaller context size | +| `use_smaller_variant` | Try a smaller quantization variant | +| `update_server` | Update the server to a newer version | +| `check_logs` | Inspect the `llama_log` field or server logs for details | + +**`details.recoverable`** is `true` when the caller can take an automated action (re-download, reduce context, switch variant) to resolve the failure. When `false`, human intervention or a server update is required. + **Response (507):** Insufficient storage — the model file won't fit within the configured storage limit. Includes `available_bytes`, `required_bytes`, and `storage_limit_bytes` for programmatic decision-making. #### `POST /api/models/:name/unload` @@ -944,7 +1049,45 @@ List all active downloads with progress, speed, and ETA. --- -### 3.6 Health & Version +### 3.6 Logs + +#### `GET /api/logs` + +Retrieve recent server log entries from the in-memory ring buffer. Useful for debugging model load failures, provider errors, and server behaviour without SSH access. + +**Query parameters:** + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `count` | `200` | Number of log entries to return (max `1000`) | +| `level` | *(all)* | Filter by minimum level: `trace`, `debug`, `info`, `warning`, `error`, `critical` | + +**Response:** + +```json +{ + "logs": [ + { + "timestamp": "2025-01-15T14:30:05.123Z", + "epoch_ms": 1736952605123, + "level": "info", + "message": "Loading model qwen2.5-7b-instruct variant Q4_K_M context 4096" + }, + { + "timestamp": "2025-01-15T14:30:06.456Z", + "epoch_ms": 1736952606456, + "level": "error", + "message": "Model load failed: insufficient VRAM" + } + ] +} +``` + +The ring buffer holds the most recent 1000 entries. Entries are returned in chronological order (oldest first). The dashboard polls this endpoint to display a live scrolling log panel. + +--- + +### 3.7 Health & Version #### `GET /health` (or `/v1/health`) @@ -969,7 +1112,7 @@ Library version. --- -### 3.7 Dashboard +### 3.8 Dashboard #### `GET /dashboard` @@ -983,6 +1126,7 @@ Returns an HTML page with a live-updating dashboard showing: - **Download progress** — Active downloads with progress bar, bytes transferred, speed (MB/s), and ETA - **Row age coloring** — Fresh (green, <14 days), stale (yellow, 14–30 days), old (red, >30 days) - Model deletion (guarded variants show disabled delete button with tooltip) +- **Server log panel** — Collapsible live-scrolling log viewer with level filtering (trace/debug/info/warning/error/critical) and auto-scroll toggle. Polls `/api/logs` every 2 seconds. Open in a browser: `http://localhost:8080/dashboard` diff --git a/examples/server_config.json b/examples/server_config.json new file mode 100644 index 0000000..ef8c883 --- /dev/null +++ b/examples/server_config.json @@ -0,0 +1,36 @@ +{ + "host": "0.0.0.0", + "port": 8080, + + "model_config_paths": [ + "config" + ], + + "models_dir": "/models", + + "default_model": "", + "default_variant": "", + + "override_path": "", + + "ram_budget_mb": 0, + "max_concurrent_downloads": 2, + + "storage": { + "limit": "0", + "cleanup_enabled": true, + "cleanup_max_age_days": 30, + "cleanup_interval_hours": 24 + }, + + "hardware": { + "vram_overrides": {} + }, + + "logging": { + "level": "info", + "directory": "", + "rotate_hour": 0, + "retain_days": 7 + } +} diff --git a/schemas/model_config.schema.json b/schemas/model_config.schema.json index c282b51..d112f68 100644 --- a/schemas/model_config.schema.json +++ b/schemas/model_config.schema.json @@ -201,7 +201,7 @@ }, "download": { "type": "object", - "description": "Download metadata for this variant", + "description": "Download metadata for this variant (single file)", "properties": { "url": { "type": "string", @@ -219,6 +219,31 @@ } }, "required": ["url", "filename"] + }, + "files": { + "type": "array", + "description": "Download metadata for split/multi-shard GGUF variants. When present, takes precedence over the single download field.", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "Download URL for this shard" + }, + "sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$", + "description": "SHA256 checksum for this shard" + }, + "filename": { + "type": "string", + "description": "Local filename for this shard" + } + }, + "required": ["url", "filename"] + }, + "minItems": 1 } } } diff --git a/src/arbiterAI/arbiterAI.cpp b/src/arbiterAI/arbiterAI.cpp index 53869fc..9d8b013 100644 --- a/src/arbiterAI/arbiterAI.cpp +++ b/src/arbiterAI/arbiterAI.cpp @@ -29,7 +29,8 @@ std::string VersionInfo::toString() const VersionInfo getVersion() { - return {ARBITERAI_VERSION_MAJOR, ARBITERAI_VERSION_MINOR, ARBITERAI_VERSION_PATCH}; + return {ARBITERAI_VERSION_MAJOR, ARBITERAI_VERSION_MINOR, ARBITERAI_VERSION_PATCH, + "b" ARBITERAI_LLAMACPP_BUILD}; } VersionInfo ArbiterAI::getVersion() @@ -87,8 +88,8 @@ ErrorCode ArbiterAI::initialize(const std::vector &config // Mark global singleton initialized so subsequent operations succeed ArbiterAI::instance().initialized = true; - // Initialize StorageManager with default models directory - StorageManager::instance().initialize("/models"); + // Note: StorageManager is initialized by the application (e.g., server main.cpp) + // with the user-configured models directory path, not here. return ErrorCode::Success; } @@ -536,6 +537,21 @@ ErrorCode ArbiterAI::loadModel(const std::string &model, const std::string &vari return ModelRuntime::instance().loadModel(model, variant, contextSize); } +ErrorCode ArbiterAI::downloadModel(const std::string &model, const std::string &variant) +{ + return ModelRuntime::instance().downloadModel(model, variant); +} + +void ArbiterAI::setMaxConcurrentDownloads(int max) +{ + ModelRuntime::instance().setMaxConcurrentDownloads(max); +} + +int ArbiterAI::getMaxConcurrentDownloads() const +{ + return ModelRuntime::instance().getMaxConcurrentDownloads(); +} + ErrorCode ArbiterAI::unloadModel(const std::string &model) { return ModelRuntime::instance().unloadModel(model); diff --git a/src/arbiterAI/arbiterAI.h b/src/arbiterAI/arbiterAI.h index 2a6e2cb..7f817ea 100644 --- a/src/arbiterAI/arbiterAI.h +++ b/src/arbiterAI/arbiterAI.h @@ -43,6 +43,7 @@ struct VersionInfo { int major; int minor; int patch; + std::string llamaCppBuild; ///< llama.cpp build number (e.g. "b8573") /// Returns the version as "major.minor.patch". std::string toString() const; @@ -608,6 +609,30 @@ class ArbiterAI */ ErrorCode loadModel(const std::string &model, const std::string &variant="", int contextSize=0); + /** + * @brief Download model files without loading into VRAM + * + * Launches an async background download that respects the concurrent + * download limit. Returns ModelDownloading on success (download started), + * Success if files are already present, or an error code. + * @param model Model name + * @param variant Quantization variant (empty = auto-select) + * @return ErrorCode::ModelDownloading, Success, ModelNotFound, InsufficientStorage + */ + ErrorCode downloadModel(const std::string &model, const std::string &variant=""); + + /** + * @brief Set the maximum number of concurrent model downloads + * @param max Maximum concurrent downloads (minimum 1, default 2) + */ + void setMaxConcurrentDownloads(int max); + + /** + * @brief Get the current concurrent download limit + * @return Maximum concurrent downloads + */ + int getMaxConcurrentDownloads() const; + /** * @brief Unload a model from VRAM/RAM * @param model Model name diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp index 0d292d0..d0ac4de 100644 --- a/src/arbiterAI/hardwareDetector.cpp +++ b/src/arbiterAI/hardwareDetector.cpp @@ -54,7 +54,9 @@ typedef enum typedef enum { VK_STRUCTURE_TYPE_APPLICATION_INFO=0, - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO=1 + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO=1, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2=1000059006, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT=1000237000 } VkStructureType; typedef enum @@ -98,8 +100,8 @@ typedef VkInstance_T *VkInstance; struct VkPhysicalDevice_T; typedef VkPhysicalDevice_T *VkPhysicalDevice; -typedef struct VkPhysicalDeviceLimits { - char padding[504]; // exact size of VkPhysicalDeviceLimits +typedef struct alignas(8) VkPhysicalDeviceLimits { + char padding[504]; // exact size and alignment of VkPhysicalDeviceLimits } VkPhysicalDeviceLimits; typedef struct VkPhysicalDeviceSparseProperties { @@ -147,6 +149,29 @@ typedef VkResult (*VkEnumeratePhysicalDevicesFunc)(VkInstance, uint32_t *, VkPhy typedef void (*VkGetPhysicalDevicePropertiesFunc)(VkPhysicalDevice, VkPhysicalDeviceProperties *); typedef void (*VkGetPhysicalDeviceMemoryPropertiesFunc)(VkPhysicalDevice, VkPhysicalDeviceMemoryProperties *); +// Vulkan 1.1+ / VK_EXT_memory_budget structures and function pointers + +typedef struct VkPhysicalDeviceMemoryProperties2 { + VkStructureType sType; + void *pNext; + VkPhysicalDeviceMemoryProperties memoryProperties; +} VkPhysicalDeviceMemoryProperties2; + +typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT { + VkStructureType sType; + void *pNext; + uint64_t heapBudget[16]; // VK_MAX_MEMORY_HEAPS + uint64_t heapUsage[16]; +} VkPhysicalDeviceMemoryBudgetPropertiesEXT; + +typedef struct VkExtensionProperties { + char extensionName[256]; // VK_MAX_EXTENSION_NAME_SIZE + uint32_t specVersion; +} VkExtensionProperties; + +typedef void (*VkGetPhysicalDeviceMemoryProperties2Func)(VkPhysicalDevice, VkPhysicalDeviceMemoryProperties2 *); +typedef VkResult (*VkEnumerateDeviceExtensionPropertiesFunc)(VkPhysicalDevice, const char *, uint32_t *, VkExtensionProperties *); + const int NVML_DEVICE_NAME_BUFFER_SIZE=96; } // anonymous namespace @@ -184,6 +209,9 @@ void HardwareDetector::refresh() detectNvmlGpus(); detectVulkanGpus(); detectUnifiedMemory(); + applyVramOverrides(); + + m_firstRefreshDone=true; } SystemInfo HardwareDetector::getSystemInfo() const @@ -226,6 +254,95 @@ bool HardwareDetector::isVulkanAvailable() const return m_vulkanLoaded; } +void HardwareDetector::setVramOverride(int gpuIndex, int vramMb) +{ + std::lock_guard lock(m_mutex); + m_vramOverrides[gpuIndex]=vramMb; + + // Apply immediately to current data + for(GpuInfo &gpu:m_systemInfo.gpus) + { + if(gpu.index==gpuIndex) + { + // Preserve the actual used amount: free = newTotal - used + int usedMb=gpu.vramTotalMb-gpu.vramFreeMb; + gpu.vramTotalMb=vramMb; + gpu.vramFreeMb=std::max(0, vramMb-usedMb); + gpu.vramOverridden=true; + + if(gpu.unifiedMemory) + { + int accessibleUsedMb=gpu.gpuAccessibleRamMb-gpu.gpuAccessibleRamFreeMb; + gpu.gpuAccessibleRamMb=vramMb; + gpu.gpuAccessibleRamFreeMb=std::max(0, vramMb-accessibleUsedMb); + } + + spdlog::info("VRAM override applied to GPU {}: {} MB", gpuIndex, vramMb); + break; + } + } +} + +void HardwareDetector::clearVramOverride(int gpuIndex) +{ + std::lock_guard lock(m_mutex); + m_vramOverrides.erase(gpuIndex); + spdlog::info("VRAM override cleared for GPU {}", gpuIndex); +} + +void HardwareDetector::clearAllVramOverrides() +{ + std::lock_guard lock(m_mutex); + m_vramOverrides.clear(); + spdlog::info("All VRAM overrides cleared"); +} + +bool HardwareDetector::hasVramOverride(int gpuIndex) const +{ + std::lock_guard lock(m_mutex); + return m_vramOverrides.find(gpuIndex)!=m_vramOverrides.end(); +} + +int HardwareDetector::getVramOverride(int gpuIndex) const +{ + std::lock_guard lock(m_mutex); + auto it=m_vramOverrides.find(gpuIndex); + + if(it!=m_vramOverrides.end()) + { + return it->second; + } + return 0; +} + +void HardwareDetector::applyVramOverrides() +{ + for(GpuInfo &gpu:m_systemInfo.gpus) + { + auto it=m_vramOverrides.find(gpu.index); + + if(it==m_vramOverrides.end()) + { + continue; + } + + int overrideMb=it->second; + + // Preserve the actual used amount: free = newTotal - used + int usedMb=gpu.vramTotalMb-gpu.vramFreeMb; + gpu.vramTotalMb=overrideMb; + gpu.vramFreeMb=std::max(0, overrideMb-usedMb); + gpu.vramOverridden=true; + + if(gpu.unifiedMemory) + { + int accessibleUsedMb=gpu.gpuAccessibleRamMb-gpu.gpuAccessibleRamFreeMb; + gpu.gpuAccessibleRamMb=overrideMb; + gpu.gpuAccessibleRamFreeMb=std::max(0, overrideMb-accessibleUsedMb); + } + } +} + // --- System RAM detection --- void HardwareDetector::detectSystemRam() @@ -474,6 +591,10 @@ bool HardwareDetector::loadVulkan() m_vkGetPhysicalDeviceProperties=dlsym(m_vulkanLib, "vkGetPhysicalDeviceProperties"); m_vkGetPhysicalDeviceMemoryProperties=dlsym(m_vulkanLib, "vkGetPhysicalDeviceMemoryProperties"); + // Vulkan 1.1+ optional symbols for memory budget queries + m_vkGetPhysicalDeviceMemoryProperties2=dlsym(m_vulkanLib, "vkGetPhysicalDeviceMemoryProperties2"); + m_vkEnumerateDeviceExtensionProperties=dlsym(m_vulkanLib, "vkEnumerateDeviceExtensionProperties"); + if(!m_vkCreateInstance||!m_vkDestroyInstance|| !m_vkEnumeratePhysicalDevices||!m_vkGetPhysicalDeviceProperties|| !m_vkGetPhysicalDeviceMemoryProperties) @@ -505,6 +626,8 @@ void HardwareDetector::unloadVulkan() m_vkEnumeratePhysicalDevices=nullptr; m_vkGetPhysicalDeviceProperties=nullptr; m_vkGetPhysicalDeviceMemoryProperties=nullptr; + m_vkGetPhysicalDeviceMemoryProperties2=nullptr; + m_vkEnumerateDeviceExtensionProperties=nullptr; #endif } @@ -521,12 +644,25 @@ void HardwareDetector::detectVulkanGpus() VkGetPhysicalDevicePropertiesFunc getProperties=reinterpret_cast(m_vkGetPhysicalDeviceProperties); VkGetPhysicalDeviceMemoryPropertiesFunc getMemProperties=reinterpret_cast(m_vkGetPhysicalDeviceMemoryProperties); - // Create a minimal Vulkan instance + // Optional Vulkan 1.1+ function for memory budget queries + VkGetPhysicalDeviceMemoryProperties2Func getMemProperties2=nullptr; + VkEnumerateDeviceExtensionPropertiesFunc enumDeviceExtensions=nullptr; + + if(m_vkGetPhysicalDeviceMemoryProperties2) + { + getMemProperties2=reinterpret_cast(m_vkGetPhysicalDeviceMemoryProperties2); + } + if(m_vkEnumerateDeviceExtensionProperties) + { + enumDeviceExtensions=reinterpret_cast(m_vkEnumerateDeviceExtensionProperties); + } + + // Create a Vulkan instance — request API 1.1 for vkGetPhysicalDeviceMemoryProperties2 VkApplicationInfo appInfo{}; appInfo.sType=VK_STRUCTURE_TYPE_APPLICATION_INFO; appInfo.pApplicationName="ArbiterAI HardwareDetector"; appInfo.applicationVersion=1; - appInfo.apiVersion=(1u<<22)|(0u<<12)|0u; // VK_MAKE_API_VERSION(0,1,0,0) + appInfo.apiVersion=(1u<<22)|(1u<<12)|0u; // VK_MAKE_API_VERSION(0,1,1,0) VkInstanceCreateInfo createInfo{}; createInfo.sType=VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; @@ -582,29 +718,151 @@ void HardwareDetector::detectVulkanGpus() continue; } - VkPhysicalDeviceMemoryProperties memProps{}; - getMemProperties(devices[i], &memProps); - - // Sum device-local heap sizes for total VRAM - int vramTotalMb=0; - for(uint32_t h=0; h0) { - vramTotalMb+=static_cast(memProps.memoryHeaps[h].size/(1024*1024)); + std::vector exts(extCount); + if(enumDeviceExtensions(devices[i], nullptr, &extCount, exts.data())==VK_SUCCESS) + { + for(uint32_t e=0; e(i); gpu.name=props.deviceName; gpu.backend=GpuBackend::Vulkan; - gpu.vramTotalMb=vramTotalMb; - gpu.vramFreeMb=vramTotalMb; // Vulkan doesn't provide free VRAM — approximate as total - gpu.unifiedMemory=(props.deviceType==VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU); + gpu.unifiedMemory=isIntegrated; + + // Use VK_EXT_memory_budget for accurate per-heap budget/usage data. + // This is the authoritative runtime signal for how much memory + // the GPU can actually use, especially on UMA/APU systems where + // heap sizes alone can be misleading. + if(hasBudgetExt&&getMemProperties2) + { + VkPhysicalDeviceMemoryBudgetPropertiesEXT budgetProps{}; + budgetProps.sType=VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT; + + VkPhysicalDeviceMemoryProperties2 memProps2{}; + memProps2.sType=VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2; + memProps2.pNext=&budgetProps; + + getMemProperties2(devices[i], &memProps2); + + const VkPhysicalDeviceMemoryProperties &mp=memProps2.memoryProperties; - spdlog::info("Vulkan GPU {}: {} ({}MB VRAM, integrated={})", - gpu.index, gpu.name, gpu.vramTotalMb, gpu.unifiedMemory); + // Sum DEVICE_LOCAL heaps — on discrete GPUs this is dedicated VRAM, + // on UMA systems this is the GPU-accessible portion of system RAM. + uint64_t deviceLocalBudgetBytes=0; + uint64_t deviceLocalUsageBytes=0; + uint64_t deviceLocalSizeBytes=0; + + for(uint32_t h=0; h(mp.memoryHeaps[h].size)/(1024.0*1024.0), + static_cast(budgetProps.heapBudget[h])/(1024.0*1024.0), + static_cast(budgetProps.heapUsage[h])/(1024.0*1024.0)); + + MemoryHeapInfo heapInfo; + heapInfo.index=static_cast(h); + heapInfo.deviceLocal=deviceLocal; + heapInfo.sizeMb=static_cast(mp.memoryHeaps[h].size/(1024ULL*1024ULL)); + heapInfo.budgetMb=static_cast(budgetProps.heapBudget[h]/(1024ULL*1024ULL)); + heapInfo.usageMb=static_cast(budgetProps.heapUsage[h]/(1024ULL*1024ULL)); + gpu.memoryHeaps.push_back(heapInfo); + + if(deviceLocal) + { + deviceLocalSizeBytes+=mp.memoryHeaps[h].size; + deviceLocalBudgetBytes+=budgetProps.heapBudget[h]; + deviceLocalUsageBytes+=budgetProps.heapUsage[h]; + } + } + + gpu.hasMemoryBudget=true; + + // Budget is the best estimate of how much this process can allocate. + // On UMA, budget may be significantly larger than the raw heap size + // (driver exposes most of system RAM as available to the GPU). + uint64_t budgetTotalMb=deviceLocalBudgetBytes/(1024ULL*1024ULL); + uint64_t budgetUsedMb=deviceLocalUsageBytes/(1024ULL*1024ULL); + uint64_t heapSizeMb=deviceLocalSizeBytes/(1024ULL*1024ULL); + + // Use the larger of heap size and budget for total — on some UMA + // drivers the budget exceeds the reported heap size. + uint64_t effectiveTotalMb=(budgetTotalMb>heapSizeMb) ? budgetTotalMb : heapSizeMb; + uint64_t effectiveFreeMb=(deviceLocalBudgetBytes>deviceLocalUsageBytes) + ? (deviceLocalBudgetBytes-deviceLocalUsageBytes)/(1024ULL*1024ULL) + : 0; + + gpu.vramTotalMb=static_cast(effectiveTotalMb); + gpu.vramFreeMb=static_cast(effectiveFreeMb); + + if(isIntegrated) + { + // For UMA, the GPU-accessible pool is the DEVICE_LOCAL budget — + // this is the closest Vulkan gives to "how much memory the GPU + // can actually use right now" on unified-memory systems. + gpu.gpuAccessibleRamMb=static_cast(effectiveTotalMb); + gpu.gpuAccessibleRamFreeMb=static_cast(effectiveFreeMb); + } + + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "Vulkan GPU {}: {} (budget: {}MB total, {}MB free, " + "heap size: {}MB, integrated={}, memoryBudget=true)", + gpu.index, gpu.name, + gpu.vramTotalMb, gpu.vramFreeMb, + static_cast(heapSizeMb), gpu.unifiedMemory); + } + else + { + // Fallback: no memory budget extension — use raw heap sizes. + // Free VRAM is unknown, approximate as total. + VkPhysicalDeviceMemoryProperties memProps{}; + getMemProperties(devices[i], &memProps); + + int vramTotalMb=0; + for(uint32_t h=0; h(h); + heapInfo.deviceLocal=deviceLocal; + heapInfo.sizeMb=static_cast(memProps.memoryHeaps[h].size/(1024ULL*1024ULL)); + gpu.memoryHeaps.push_back(heapInfo); + + if(deviceLocal) + { + vramTotalMb+=static_cast(memProps.memoryHeaps[h].size/(1024*1024)); + } + } + + gpu.vramTotalMb=vramTotalMb; + gpu.vramFreeMb=vramTotalMb; + + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "Vulkan GPU {}: {} ({}MB VRAM, integrated={}, memoryBudget=false)", + gpu.index, gpu.name, gpu.vramTotalMb, gpu.unifiedMemory); + } m_systemInfo.gpus.push_back(gpu); } @@ -613,23 +871,14 @@ void HardwareDetector::detectVulkanGpus() } // --- Unified memory detection via sysfs (amdgpu) --- +// Supplements Vulkan memory budget data with amdgpu kernel-side diagnostics. +// If VK_EXT_memory_budget already provided gpuAccessibleRam, sysfs is used +// only for logging correlation. If budget was not available, sysfs provides +// the fallback for GPU-accessible memory detection. void HardwareDetector::detectUnifiedMemory() { #ifdef __linux__ - // For each GPU flagged as integrated, attempt to read amdgpu sysfs - // to get the actual GPU-accessible memory pool (GTT + VRAM). - // - // amdgpu exposes per-device files under /sys/class/drm/card*/device/: - // mem_info_vram_total — dedicated VRAM in bytes - // mem_info_vram_used — used VRAM in bytes - // mem_info_gtt_total — system RAM accessible to GPU (GTT) in bytes - // mem_info_gtt_used — used GTT in bytes - // - // On unified memory APUs (e.g. Ryzen AI Max+), the GPU can address - // GTT + VRAM as its working memory pool. The GTT size is typically - // most of system RAM minus a kernel reservation. - for(GpuInfo &gpu:m_systemInfo.gpus) { if(!gpu.unifiedMemory) @@ -637,7 +886,15 @@ void HardwareDetector::detectUnifiedMemory() continue; } + // If Vulkan budget already set gpuAccessibleRamMb, we have good data. + // Still attempt sysfs for diagnostic logging, but don't overwrite. + bool hasBudgetData=(gpu.gpuAccessibleRamMb>0); + // Find the matching DRM card by scanning /sys/class/drm/card* + // Match by reading mem_info_vram_total from sysfs and comparing + // to the basic Vulkan heap size. Note: when budget is available, + // gpu.vramTotalMb may reflect the budget (much larger on UMA), + // so we read the raw sysfs VRAM and look for any reasonable match. std::string matchedCardPath; for(int card=0; card<16; ++card) @@ -651,107 +908,109 @@ void HardwareDetector::detectUnifiedMemory() continue; } - // Verify this card's VRAM matches what Vulkan reported - // (to handle multi-GPU systems correctly) + // On UMA APUs, the sysfs VRAM is typically a small carveout + // (e.g. 512MB) that doesn't match the Vulkan budget-derived + // vramTotalMb. Accept the match if this is the only amdgpu card + // with sysfs data available, or if the values are close. long long vramBytes=0; testFile >> vramBytes; - int vramMb=static_cast(vramBytes/(1024LL*1024LL)); - - // Allow 1% tolerance for rounding - int diff=vramMb-gpu.vramTotalMb; - if(diff<0) - { - diff=-diff; - } - if(diff<=gpu.vramTotalMb/100+1) - { - matchedCardPath=cardPath; - break; - } + // Just accept the first card that has sysfs data for UMA — + // multi-GPU UMA systems are extremely rare. + matchedCardPath=cardPath; + break; } if(matchedCardPath.empty()) { - // No sysfs match — fall back to system RAM as GPU-accessible pool. - // Unified memory GPUs share system RAM, so the GPU can access most of it. - // Use total system RAM as the accessible pool estimate. - gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb; - gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb; - - spdlog::info("Unified memory GPU {}: {} — no sysfs match, " - "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool", - gpu.index, gpu.name, - gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb); + if(!hasBudgetData) + { + // No sysfs and no budget — fall back to system RAM + gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb; + gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb; + + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "Unified memory GPU {}: {} — no sysfs or budget data, " + "falling back to system RAM ({}MB total, {}MB free)", + gpu.index, gpu.name, + gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb); + } continue; } - // Read VRAM used (better than Vulkan's "assume all free") + // Read sysfs VRAM and GTT for diagnostic logging + long long sysfsVramTotal=0, sysfsVramUsed=0; + long long gttTotalBytes=0, gttUsedBytes=0; + + { + std::ifstream file(matchedCardPath+"/mem_info_vram_total"); + if(file.is_open()) file >> sysfsVramTotal; + } { std::ifstream file(matchedCardPath+"/mem_info_vram_used"); - if(file.is_open()) - { - long long usedBytes=0; - file >> usedBytes; - - int usedMb=static_cast(usedBytes/(1024LL*1024LL)); - gpu.vramFreeMb=gpu.vramTotalMb-usedMb; - if(gpu.vramFreeMb<0) - { - gpu.vramFreeMb=0; - } - } + if(file.is_open()) file >> sysfsVramUsed; } - - // Read GTT total and used - long long gttTotalBytes=0; - long long gttUsedBytes=0; - { std::ifstream file(matchedCardPath+"/mem_info_gtt_total"); - if(file.is_open()) - { - file >> gttTotalBytes; - } + if(file.is_open()) file >> gttTotalBytes; } { std::ifstream file(matchedCardPath+"/mem_info_gtt_used"); - if(file.is_open()) - { - file >> gttUsedBytes; - } + if(file.is_open()) file >> gttUsedBytes; } - if(gttTotalBytes>0) + int sysfsVramTotalMb=static_cast(sysfsVramTotal/(1024LL*1024LL)); + int sysfsVramUsedMb=static_cast(sysfsVramUsed/(1024LL*1024LL)); + int gttTotalMb=static_cast(gttTotalBytes/(1024LL*1024LL)); + int gttUsedMb=static_cast(gttUsedBytes/(1024LL*1024LL)); + int gttFreeMb=gttTotalMb-gttUsedMb; + + if(gttFreeMb<0) gttFreeMb=0; + + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "Unified memory GPU {}: {} — sysfs: VRAM {}MB ({}MB used), " + "GTT {}MB ({}MB used), budgetAlreadySet={}", + gpu.index, gpu.name, + sysfsVramTotalMb, sysfsVramUsedMb, + gttTotalMb, gttUsedMb, hasBudgetData); + + if(hasBudgetData) { - int gttTotalMb=static_cast(gttTotalBytes/(1024LL*1024LL)); - int gttUsedMb=static_cast(gttUsedBytes/(1024LL*1024LL)); - int gttFreeMb=gttTotalMb-gttUsedMb; + // Vulkan budget is authoritative for allocation decisions. + // Log sysfs as a diagnostic side channel only. + continue; + } - if(gttFreeMb<0) - { - gttFreeMb=0; - } + // No Vulkan budget — use sysfs data for GPU-accessible memory. + // Refine VRAM free from sysfs (more accurate than "assume all free"). + if(sysfsVramTotal>0) + { + gpu.vramTotalMb=sysfsVramTotalMb; + gpu.vramFreeMb=sysfsVramTotalMb-sysfsVramUsedMb; + if(gpu.vramFreeMb<0) gpu.vramFreeMb=0; + } + if(gttTotalBytes>0) + { // GPU-accessible memory = VRAM + GTT (system RAM mapped to GPU) gpu.gpuAccessibleRamMb=gpu.vramTotalMb+gttTotalMb; gpu.gpuAccessibleRamFreeMb=gpu.vramFreeMb+gttFreeMb; - spdlog::info("Unified memory GPU {}: {} — VRAM {}MB ({}MB free), " - "GTT {}MB ({}MB free), total accessible {}MB ({}MB free)", + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "Unified memory GPU {}: {} — sysfs fallback: " + "total accessible {}MB ({}MB free)", gpu.index, gpu.name, - gpu.vramTotalMb, gpu.vramFreeMb, - gttTotalMb, gttFreeMb, gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb); } else { - // sysfs card matched but no GTT info — fall back to system RAM + // No GTT info — fall back to system RAM gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb; gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb; - spdlog::info("Unified memory GPU {}: {} — no GTT info, " - "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool", + spdlog::log(m_firstRefreshDone ? spdlog::level::debug : spdlog::level::info, + "Unified memory GPU {}: {} — no GTT info, " + "falling back to system RAM ({}MB total, {}MB free)", gpu.index, gpu.name, gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb); } diff --git a/src/arbiterAI/hardwareDetector.h b/src/arbiterAI/hardwareDetector.h index 4dd511a..069191b 100644 --- a/src/arbiterAI/hardwareDetector.h +++ b/src/arbiterAI/hardwareDetector.h @@ -3,6 +3,7 @@ #include #include +#include #include namespace arbiterAI @@ -14,6 +15,14 @@ enum class GpuBackend { Vulkan }; +struct MemoryHeapInfo { + int index=0; + bool deviceLocal=false; + int sizeMb=0; + int budgetMb=0; // 0 if VK_EXT_memory_budget not available + int usageMb=0; // 0 if VK_EXT_memory_budget not available +}; + struct GpuInfo { int index=0; std::string name; @@ -25,6 +34,9 @@ struct GpuInfo { bool unifiedMemory=false; // true for APUs/iGPUs sharing system RAM int gpuAccessibleRamMb=0; // total RAM the GPU can access (GTT+VRAM on APUs, 0 for discrete) int gpuAccessibleRamFreeMb=0; // free RAM the GPU can access + bool hasMemoryBudget=false; // true if VK_EXT_memory_budget was used + bool vramOverridden=false; // true if VRAM values were overridden by user + std::vector memoryHeaps; // per-heap details from Vulkan }; struct SystemInfo { @@ -60,6 +72,22 @@ class HardwareDetector { /// Check whether Vulkan is available at runtime bool isVulkanAvailable() const; + /// Override the reported VRAM for a GPU (persists across refresh cycles). + /// For unified memory devices, also overrides gpuAccessibleRamMb. + void setVramOverride(int gpuIndex, int vramMb); + + /// Clear a VRAM override for a specific GPU + void clearVramOverride(int gpuIndex); + + /// Clear all VRAM overrides + void clearAllVramOverrides(); + + /// Check whether a GPU has a VRAM override set + bool hasVramOverride(int gpuIndex) const; + + /// Get the VRAM override value for a GPU (0 if not set) + int getVramOverride(int gpuIndex) const; + private: HardwareDetector(); ~HardwareDetector(); @@ -73,6 +101,7 @@ class HardwareDetector { void detectNvmlGpus(); void detectVulkanGpus(); void detectUnifiedMemory(); + void applyVramOverrides(); // NVML dlopen handles bool loadNvml(); @@ -84,6 +113,7 @@ class HardwareDetector { SystemInfo m_systemInfo; mutable std::mutex m_mutex; + std::map m_vramOverrides; // gpuIndex → vramMb // Runtime library handles void *m_nvmlLib=nullptr; @@ -107,10 +137,15 @@ class HardwareDetector { void *m_vkEnumeratePhysicalDevices=nullptr; void *m_vkGetPhysicalDeviceProperties=nullptr; void *m_vkGetPhysicalDeviceMemoryProperties=nullptr; + void *m_vkGetPhysicalDeviceMemoryProperties2=nullptr; + void *m_vkEnumerateDeviceExtensionProperties=nullptr; // CPU utilization tracking (for delta calculation) long long m_prevCpuIdle=0; long long m_prevCpuTotal=0; + + // First refresh logs at info, subsequent refreshes at debug + bool m_firstRefreshDone=false; }; } // namespace arbiterAI diff --git a/src/arbiterAI/modelDownloader.cpp b/src/arbiterAI/modelDownloader.cpp index 33d5109..a8af745 100644 --- a/src/arbiterAI/modelDownloader.cpp +++ b/src/arbiterAI/modelDownloader.cpp @@ -20,68 +20,8 @@ ModelDownloader::ModelDownloader(std::shared_ptr fileVerifier) : std::future ModelDownloader::downloadModel(const std::string &downloadUrl, const std::string &filePathStr, const std::optional &fileHash, const std::optional &minVersion, const std::optional &maxVersion) { - return std::async(std::launch::async, [this, downloadUrl, filePathStr, fileHash, minVersion, maxVersion]() - { - // Check version compatibility first - if(minVersion||maxVersion) - { - std::string clientVersion="1.0.0"; // TODO: Get from build config - if ((minVersion && ModelManager::compareVersions(clientVersion, *minVersion) < 0) || - (maxVersion && ModelManager::compareVersions(clientVersion, *maxVersion) > 0)) - { - spdlog::error("Version mismatch: client {} not compatible with model requirements (min: {}, max: {})", - clientVersion, minVersion?*minVersion:"none", maxVersion?*maxVersion:"none"); - return false; - } - } - - std::filesystem::path filePath(filePathStr); - if(std::filesystem::exists(filePath)) - { - if(fileHash&&m_fileVerifier->verifyFile(filePath.string(), *fileHash)) - { - spdlog::info("Model already exists and is verified: {}", filePath.string()); - return true; - } - } - - spdlog::info("Downloading model from {} to {}", downloadUrl, filePath.string()); - cpr::Response r=cpr::Get(cpr::Url{ downloadUrl }); - if(r.status_code!=200) - { - spdlog::error("Failed to download model. Status code: {}", r.status_code); - return false; - } - - try - { - std::filesystem::create_directories(filePath.parent_path()); - std::ofstream outFile(filePath, std::ios::binary); - outFile.write(r.text.c_str(), r.text.length()); - outFile.close(); - } - catch(const std::filesystem::filesystem_error &e) - { - spdlog::error("Failed to write model to file: {}. Error: {}", filePath.string(), e.what()); - return false; - } - - if(fileHash) - { - if(m_fileVerifier->verifyFile(filePath.string(), *fileHash)) - { - spdlog::info("Model downloaded and verified successfully: {}", filePath.string()); - return true; - } - else - { - spdlog::error("SHA256 verification failed for: {}", filePath.string()); - return false; - } - } - - return true; - }); + // Delegate to downloadModelWithProgress with no callback or tracking name + return downloadModelWithProgress(downloadUrl, filePathStr, fileHash, nullptr, "", ""); } @@ -188,31 +128,31 @@ std::future ModelDownloader::downloadModelWithProgress( const std::string &variant) { // Create tracking state - auto downloadState = std::make_shared(); - downloadState->modelName = modelName.empty() ? filePathStr : modelName; - downloadState->variant = variant; - downloadState->status = DownloadStatus::Pending; - downloadState->startTime = std::chrono::steady_clock::now(); + auto downloadState=std::make_shared(); + downloadState->modelName=modelName.empty()?filePathStr:modelName; + downloadState->variant=variant; + downloadState->status=DownloadStatus::Pending; + downloadState->startTime=std::chrono::steady_clock::now(); { std::lock_guard lock(m_downloadsMutex); - m_activeDownloads[downloadState->modelName] = downloadState; + m_activeDownloads[downloadState->modelName]=downloadState; } return std::async(std::launch::async, [this, downloadUrl, filePathStr, fileHash, progressCallback, downloadState]() { - downloadState->status = DownloadStatus::InProgress; + downloadState->status=DownloadStatus::InProgress; std::filesystem::path filePath(filePathStr); // Check if file already exists and is valid - if (std::filesystem::exists(filePath)) + if(std::filesystem::exists(filePath)) { - if (fileHash && m_fileVerifier->verifyFile(filePath.string(), *fileHash)) + if(fileHash&&m_fileVerifier->verifyFile(filePath.string(), *fileHash)) { spdlog::info("Model already exists and is verified: {}", filePath.string()); - downloadState->status = DownloadStatus::Completed; - downloadState->percentComplete = 100.0f; - if (progressCallback) + downloadState->status=DownloadStatus::Completed; + downloadState->percentComplete=100.0f; + if(progressCallback) { progressCallback(0, 0, 100.0f); } @@ -220,120 +160,155 @@ std::future ModelDownloader::downloadModelWithProgress( } } - // Check for partial download (resume support) - std::string partialPath = filePathStr + ".partial"; - int64_t existingBytes = 0; - if (std::filesystem::exists(partialPath)) - { - existingBytes = std::filesystem::file_size(partialPath); - spdlog::info("Found partial download with {} bytes", existingBytes); - } - spdlog::info("Downloading model from {} to {}", downloadUrl, filePath.string()); try { std::filesystem::create_directories(filePath.parent_path()); } - catch (const std::filesystem::filesystem_error &e) + catch(const std::filesystem::filesystem_error &e) { spdlog::error("Failed to create directory: {}", e.what()); - downloadState->status = DownloadStatus::Failed; - downloadState->error = e.what(); + downloadState->status=DownloadStatus::Failed; + downloadState->error=e.what(); return false; } - // Use CPR with progress callback - cpr::Response r = cpr::Get( + // Stream directly to a .partial file on disk to avoid buffering the + // entire response body in RAM. A 20 GB model download would otherwise + // require 20+ GB of heap, which caused OOM / heap corruption (SEGV). + std::string partialPath=filePathStr+".partial"; + + // Remove stale partial file so we start fresh + { + std::error_code ec; + std::filesystem::remove(partialPath, ec); + } + + std::ofstream outFile(partialPath, std::ios::binary|std::ios::trunc); + if(!outFile.is_open()) + { + spdlog::error("Failed to open partial file for writing: {}", partialPath); + downloadState->status=DownloadStatus::Failed; + downloadState->error="Failed to open "+partialPath; + return false; + } + + bool writeError=false; + + cpr::Response r=cpr::Get( cpr::Url{downloadUrl}, - cpr::ProgressCallback([&downloadState, &progressCallback](cpr::cpr_off_t downloadTotal, - cpr::cpr_off_t downloadNow, - cpr::cpr_off_t uploadTotal, - cpr::cpr_off_t uploadNow, - intptr_t userdata) -> bool + cpr::WriteCallback([&outFile, &writeError](const std::string_view &data, intptr_t) -> bool { - downloadState->bytesDownloaded = downloadNow; - downloadState->totalBytes = downloadTotal; - - float percent = 0.0f; - if (downloadTotal > 0) + outFile.write(data.data(), static_cast(data.size())); + if(!outFile.good()) + { + writeError=true; + return false; // abort transfer + } + return true; + }), + cpr::ProgressCallback([&downloadState, &progressCallback](cpr::cpr_off_t downloadTotal, + cpr::cpr_off_t downloadNow, + cpr::cpr_off_t uploadTotal, + cpr::cpr_off_t uploadNow, + intptr_t userdata) -> bool + { + (void)uploadTotal; + (void)uploadNow; + (void)userdata; + + downloadState->bytesDownloaded=downloadNow; + downloadState->totalBytes=downloadTotal; + + float percent=0.0f; + if(downloadTotal>0) { - percent = (static_cast(downloadNow) / downloadTotal) * 100.0f; + percent=(static_cast(downloadNow)/downloadTotal)*100.0f; } - downloadState->percentComplete = percent; + downloadState->percentComplete=percent; // Record speed sample { std::lock_guard lock(downloadState->speedMutex); - auto now = std::chrono::steady_clock::now(); + std::chrono::steady_clock::time_point now=std::chrono::steady_clock::now(); downloadState->speedSamples.push_back({now, downloadNow}); // Keep only last 10 seconds of samples - auto cutoff = now - std::chrono::seconds(10); - while(!downloadState->speedSamples.empty() && downloadState->speedSamples.front().first < cutoff) + std::chrono::steady_clock::time_point cutoff=now-std::chrono::seconds(10); + while(!downloadState->speedSamples.empty()&&downloadState->speedSamples.front().firstspeedSamples.pop_front(); } } - - if (progressCallback) + + if(progressCallback) { progressCallback(downloadNow, downloadTotal, percent); } - - return true; // Continue downloading + + return true; }) ); - if (r.status_code != 200) + outFile.close(); + + if(writeError) + { + spdlog::error("Write error during download to {}", partialPath); + std::error_code ec; + std::filesystem::remove(partialPath, ec); + downloadState->status=DownloadStatus::Failed; + downloadState->error="Disk write error"; + return false; + } + + if(r.status_code!=200) { spdlog::error("Failed to download model. Status code: {}", r.status_code); - downloadState->status = DownloadStatus::Failed; - downloadState->error = "HTTP error: " + std::to_string(r.status_code); + std::error_code ec; + std::filesystem::remove(partialPath, ec); + downloadState->status=DownloadStatus::Failed; + downloadState->error="HTTP error: "+std::to_string(r.status_code); return false; } - try + // Rename .partial -> final path atomically { - std::ofstream outFile(filePath, std::ios::binary); - outFile.write(r.text.c_str(), r.text.length()); - outFile.close(); - - // Remove partial file if it exists - if (std::filesystem::exists(partialPath)) + std::error_code ec; + std::filesystem::rename(partialPath, filePath, ec); + if(ec) { - std::filesystem::remove(partialPath); + spdlog::error("Failed to rename {} -> {}: {}", partialPath, filePath.string(), ec.message()); + std::filesystem::remove(partialPath, ec); + downloadState->status=DownloadStatus::Failed; + downloadState->error="Failed to rename partial file"; + return false; } } - catch (const std::filesystem::filesystem_error &e) - { - spdlog::error("Failed to write model to file: {}. Error: {}", filePath.string(), e.what()); - downloadState->status = DownloadStatus::Failed; - downloadState->error = e.what(); - return false; - } - if (fileHash) + if(fileHash) { - if (m_fileVerifier->verifyFile(filePath.string(), *fileHash)) + if(m_fileVerifier->verifyFile(filePath.string(), *fileHash)) { spdlog::info("Model downloaded and verified successfully: {}", filePath.string()); - downloadState->status = DownloadStatus::Completed; - downloadState->percentComplete = 100.0f; + downloadState->status=DownloadStatus::Completed; + downloadState->percentComplete=100.0f; return true; } else { spdlog::error("SHA256 verification failed for: {}", filePath.string()); - downloadState->status = DownloadStatus::Failed; - downloadState->error = "SHA256 verification failed"; + downloadState->status=DownloadStatus::Failed; + downloadState->error="SHA256 verification failed"; return false; } } - downloadState->status = DownloadStatus::Completed; - downloadState->percentComplete = 100.0f; + spdlog::info("Model downloaded successfully: {}", filePath.string()); + downloadState->status=DownloadStatus::Completed; + downloadState->percentComplete=100.0f; return true; }); } diff --git a/src/arbiterAI/modelManager.cpp b/src/arbiterAI/modelManager.cpp index c02d529..1523cd2 100644 --- a/src/arbiterAI/modelManager.cpp +++ b/src/arbiterAI/modelManager.cpp @@ -35,6 +35,33 @@ bool ModelInfo::isSchemaCompatible(const std::string &schemaVersion) const ModelManager::compareVersions(schemaVersion, configVersion) <= 0; } +std::vector ModelVariant::getAllFiles() const +{ + if(!files.empty()) + { + return files; + } + if(!download.filename.empty()) + { + return {download}; + } + return {}; +} + +std::string ModelVariant::getPrimaryFilename() const +{ + if(!files.empty()) + { + return files.front().filename; + } + return download.filename; +} + +bool ModelVariant::isSplit() const +{ + return files.size()>1; +} + ModelManager &ModelManager::instance() { static ModelManager instance; @@ -299,6 +326,26 @@ bool ModelManager::parseModelInfo(const nlohmann::json &modelJson, ModelInfo &in variant.download.filename=dl["filename"].get(); } } + if(variantJson.contains("files")&&variantJson["files"].is_array()) + { + for(const auto &fileJson:variantJson["files"]) + { + VariantDownload vd; + if(fileJson.contains("url")) + { + vd.url=fileJson["url"].get(); + } + if(fileJson.contains("sha256")) + { + vd.sha256=fileJson["sha256"].get(); + } + if(fileJson.contains("filename")) + { + vd.filename=fileJson["filename"].get(); + } + variant.files.push_back(vd); + } + } info.variants.push_back(variant); } } @@ -737,6 +784,19 @@ nlohmann::json ModelManager::modelInfoToJson(const ModelInfo &info) {"filename", v.download.filename} }; } + if(!v.files.empty()) + { + nlohmann::json filesArr=nlohmann::json::array(); + for(const VariantDownload &f:v.files) + { + nlohmann::json fj; + fj["url"]=f.url; + fj["sha256"]=f.sha256; + fj["filename"]=f.filename; + filesArr.push_back(fj); + } + vj["files"]=filesArr; + } variants.push_back(vj); } j["variants"]=variants; diff --git a/src/arbiterAI/modelManager.h b/src/arbiterAI/modelManager.h index 5e5648d..703cb0f 100644 --- a/src/arbiterAI/modelManager.h +++ b/src/arbiterAI/modelManager.h @@ -45,7 +45,19 @@ struct ModelVariant { int fileSizeMb=0; int minVramMb=0; int recommendedVramMb=0; - VariantDownload download; + VariantDownload download; // Primary / single-file download (backward compat) + std::vector files; // All shard files (split GGUF). Empty = use download field. + + /// Get the complete list of files to download for this variant. + /// Returns files if non-empty, otherwise a 1-element vector from download (if non-empty). + std::vector getAllFiles() const; + + /// Get the primary filename (first shard / single file) used as the llama.cpp load path. + /// Returns empty string if no download info is configured. + std::string getPrimaryFilename() const; + + /// Check whether this variant is a split (multi-file) GGUF. + bool isSplit() const; }; struct Pricing diff --git a/src/arbiterAI/modelRuntime.cpp b/src/arbiterAI/modelRuntime.cpp index e915a9f..f4f80dd 100644 --- a/src/arbiterAI/modelRuntime.cpp +++ b/src/arbiterAI/modelRuntime.cpp @@ -4,10 +4,12 @@ #include "arbiterAI/storageManager.h" #include +#include #include #include #include #include +#include namespace arbiterAI { @@ -21,8 +23,29 @@ ModelRuntime &ModelRuntime::instance() void ModelRuntime::reset() { ModelRuntime &rt=instance(); + + // Signal all background download threads to stop waiting + { + std::lock_guard lock(rt.m_mutex); + rt.m_shuttingDown=true; + rt.m_downloadCv.notify_all(); + } + + for(std::thread &t:rt.m_downloadThreads) + { + if(t.joinable()) + { + t.join(); + } + } + std::lock_guard lock(rt.m_mutex); + rt.m_downloadThreads.clear(); + rt.m_activeDownloadCount=0; + rt.m_maxConcurrentDownloads=2; + rt.m_shuttingDown=false; + // Free all llama.cpp resources for(auto &pair:rt.m_models) { @@ -49,6 +72,230 @@ ModelRuntime::ModelRuntime() m_readyRamBudgetMb=hw.totalRamMb/2; } +// ---- llama.cpp log capture ------------------------------------------------ + +// Thread-local pointer to the active ModelRuntime capturing logs. +// Only one load happens at a time (under m_mutex), so this is safe. +static ModelRuntime *s_capturingInstance=nullptr; + +static void llamaLogCallback(enum ggml_log_level level, const char *text, void *userData) +{ + (void)userData; + ModelRuntime *rt=s_capturingInstance; + if(rt) + { + rt->appendLlamaLog(text); + } + + // Also forward to spdlog so logs still appear in the server log + if(text) + { + std::string msg(text); + // Strip trailing newline for spdlog + while(!msg.empty()&&(msg.back()=='\n'||msg.back()=='\r')) + { + msg.pop_back(); + } + if(msg.empty()) return; + + switch(level) + { + case GGML_LOG_LEVEL_ERROR: + spdlog::error("[llama] {}", msg); + break; + case GGML_LOG_LEVEL_WARN: + spdlog::warn("[llama] {}", msg); + break; + case GGML_LOG_LEVEL_DEBUG: + spdlog::debug("[llama] {}", msg); + break; + default: + spdlog::info("[llama] {}", msg); + break; + } + } +} + +void ModelRuntime::appendLlamaLog(const char *text) +{ + if(m_capturingLlamaLog&&text) + { + m_llamaLogCapture< lock(m_mutex); + return m_lastLoadError; +} + +void ModelRuntime::setMaxConcurrentDownloads(int max) +{ + std::lock_guard lock(m_mutex); + m_maxConcurrentDownloads=std::max(1, max); + m_downloadCv.notify_all(); +} + +int ModelRuntime::getMaxConcurrentDownloads() const +{ + std::lock_guard lock(m_mutex); + return m_maxConcurrentDownloads; +} + ErrorCode ModelRuntime::loadModel( const std::string &model, const std::string &variant, @@ -56,6 +303,9 @@ ErrorCode ModelRuntime::loadModel( { std::lock_guard lock(m_mutex); + // Clear previous load error + m_lastLoadError=LoadErrorDetail{}; + // Check if already loaded auto it=m_models.find(model); if(it!=m_models.end()) @@ -110,8 +360,13 @@ ErrorCode ModelRuntime::loadModel( } // Determine context size + // For llama provider models, contextSize=0 means "use model's native + // training context from GGUF metadata" — resolved in loadLlamaModel after + // loading model weights. For cloud models, fall back to the config value. int resolvedContext=contextSize; - if(resolvedContext<=0) + bool useNativeContext=(resolvedContext<=0&&modelInfo->provider=="llama"); + + if(resolvedContext<=0&&!useNativeContext) { resolvedContext=modelInfo->contextWindow; } @@ -137,6 +392,16 @@ ErrorCode ModelRuntime::loadModel( ModelFit fit=ModelFitCalculator::calculateModelFit(modelInfo.value(), *selectedVar, hw); if(!fit.canRun) { + m_lastLoadError.reason=(fit.limitingFactor=="ram") + ?LoadFailureReason::InsufficientRam + :LoadFailureReason::InsufficientVram; + m_lastLoadError.summary="Model '"+model+"' variant '"+selectedVariant+ + "' cannot run on this hardware: "+fit.limitingFactor; + m_lastLoadError.suggestion=(fit.limitingFactor=="ram") + ?"Insufficient system RAM. Try a smaller quantization variant or close other applications." + :"Insufficient VRAM. Try a smaller quantization variant, reduce context size, or unload other models."; + m_lastLoadError.action="use_smaller_variant"; + m_lastLoadError.recoverable=true; spdlog::error("Model '{}' variant '{}' cannot run: {}", model, selectedVariant, fit.limitingFactor); return ErrorCode::ModelLoadError; } @@ -144,19 +409,31 @@ ErrorCode ModelRuntime::loadModel( // Evict if needed to make room evictIfNeeded(selectedVar->minVramMb); - // Check if model file exists, initiate download if needed - if(!selectedVar->download.filename.empty()) + // Check if all model files exist, initiate async download for any missing ones + std::vector allFiles=selectedVar->getAllFiles(); + std::string primaryFilename=selectedVar->getPrimaryFilename(); + + if(!allFiles.empty()) { - std::string filePath="/models/"+selectedVar->download.filename; - if(!std::filesystem::exists(filePath)&&!selectedVar->download.url.empty()) + bool anyMissing=false; + for(const VariantDownload &file:allFiles) + { + std::string filePath="/models/"+file.filename; + if(!std::filesystem::exists(filePath)&&!file.url.empty()) + { + anyMissing=true; + break; + } + } + + if(anyMissing) { - // Check storage quota before downloading - int64_t fileSizeBytes=static_cast(selectedVar->fileSizeMb)*1024*1024; - if(!StorageManager::instance().canDownload(fileSizeBytes)) + // Check storage quota for total missing file size + int64_t totalDownloadBytes=static_cast(selectedVar->fileSizeMb)*1024*1024; + if(!StorageManager::instance().canDownload(totalDownloadBytes)) { - // Try cleanup first StorageManager::instance().runCleanup(); - if(!StorageManager::instance().canDownload(fileSizeBytes)) + if(!StorageManager::instance().canDownload(totalDownloadBytes)) { spdlog::error("Insufficient storage to download '{}' variant '{}' ({} MB)", model, selectedVariant, selectedVar->fileSizeMb); @@ -164,47 +441,29 @@ ErrorCode ModelRuntime::loadModel( } } - // Mark as downloading + // Mark as downloading and launch async background download LoadedModel &dlEntry=m_models[model]; dlEntry.modelName=model; dlEntry.variant=selectedVariant; dlEntry.state=ModelState::Downloading; dlEntry.lastUsed=std::chrono::steady_clock::now(); - spdlog::info("Downloading model '{}' variant '{}'", model, selectedVariant); - - // Release the lock during download (can be very large) - m_mutex.unlock(); - bool downloadOk=downloadModelFile( - selectedVar->download.url, - filePath, - selectedVar->download.sha256, - model, - selectedVariant); - m_mutex.lock(); - - if(!downloadOk) - { - m_models.erase(model); - return ErrorCode::ModelDownloadFailed; - } - // Register the download with StorageManager - int64_t actualSize=0; - std::error_code ec; - if(std::filesystem::exists(filePath, ec)) - { - actualSize=static_cast(std::filesystem::file_size(filePath, ec)); - } - StorageManager::instance().registerDownload( - model, selectedVariant, selectedVar->download.filename, actualSize); + spdlog::info("Model '{}' variant '{}' needs download — launching async download", + model, selectedVariant); + + m_downloadThreads.emplace_back( + &ModelRuntime::runBackgroundDownload, this, + model, selectedVariant, modelInfo.value()); + + return ErrorCode::ModelDownloading; } } - // Create/update loaded model entry + // All files present — create/update loaded model entry LoadedModel &entry=m_models[model]; entry.modelName=model; entry.variant=selectedVariant; - entry.contextSize=std::min(resolvedContext, fit.maxContextSize); + entry.contextSize=useNativeContext?0:std::min(resolvedContext, fit.maxContextSize); entry.estimatedVramUsageMb=fit.estimatedVramUsageMb; entry.gpuIndices=fit.gpuIndices; entry.lastUsed=std::chrono::steady_clock::now(); @@ -212,8 +471,9 @@ ErrorCode ModelRuntime::loadModel( // Actually load llama.cpp model for local providers if(modelInfo->provider=="llama") { - std::string filePath="/models/"+selectedVar->download.filename; - ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices); + std::string filePath="/models/"+primaryFilename; + ErrorCode loadResult=loadLlamaModel(model, filePath, entry.contextSize, entry.gpuIndices, + fit.maxContextSize); if(loadResult!=ErrorCode::Success) { m_models.erase(model); @@ -253,6 +513,246 @@ ErrorCode ModelRuntime::loadModel( return ErrorCode::Success; } +ErrorCode ModelRuntime::downloadModel( + const std::string &model, + const std::string &variant) +{ + std::lock_guard lock(m_mutex); + + // Check if already downloading + auto it=m_models.find(model); + if(it!=m_models.end()) + { + if(it->second.state==ModelState::Downloading) + { + return ErrorCode::ModelDownloading; + } + if(it->second.state==ModelState::Loaded||it->second.state==ModelState::Ready) + { + return ErrorCode::Success; + } + } + + // Look up model info from ModelManager + std::optional modelInfo=ModelManager::instance().getModelInfo(model); + if(!modelInfo.has_value()) + { + spdlog::error("downloadModel: model '{}' not found in ModelManager", model); + return ErrorCode::ModelNotFound; + } + + if(modelInfo->variants.empty()) + { + // Cloud model — nothing to download + return ErrorCode::Success; + } + + // Determine which variant to use + std::string selectedVariant=variant; + if(selectedVariant.empty()) + { + selectedVariant=selectBestVariant(modelInfo.value()); + } + + // Find the selected variant + const ModelVariant *selectedVar=nullptr; + for(const ModelVariant &v:modelInfo->variants) + { + if(v.quantization==selectedVariant) + { + selectedVar=&v; + break; + } + } + + if(!selectedVar) + { + spdlog::error("downloadModel: variant '{}' not found for model '{}'", selectedVariant, model); + return ErrorCode::ModelNotFound; + } + + // Check if all files are already present + std::vector allFiles=selectedVar->getAllFiles(); + bool anyMissing=false; + for(const VariantDownload &file:allFiles) + { + std::string filePath="/models/"+file.filename; + if(!std::filesystem::exists(filePath)&&!file.url.empty()) + { + anyMissing=true; + break; + } + } + + if(!anyMissing) + { + return ErrorCode::Success; + } + + // Check storage quota + int64_t totalDownloadBytes=static_cast(selectedVar->fileSizeMb)*1024*1024; + if(!StorageManager::instance().canDownload(totalDownloadBytes)) + { + StorageManager::instance().runCleanup(); + if(!StorageManager::instance().canDownload(totalDownloadBytes)) + { + spdlog::error("Insufficient storage to download '{}' variant '{}' ({} MB)", + model, selectedVariant, selectedVar->fileSizeMb); + return ErrorCode::InsufficientStorage; + } + } + + // Mark as downloading and launch async background download + LoadedModel &dlEntry=m_models[model]; + dlEntry.modelName=model; + dlEntry.variant=selectedVariant; + dlEntry.state=ModelState::Downloading; + dlEntry.lastUsed=std::chrono::steady_clock::now(); + + spdlog::info("downloadModel: launching async download for '{}' variant '{}'", + model, selectedVariant); + + m_downloadThreads.emplace_back( + &ModelRuntime::runBackgroundDownload, this, + model, selectedVariant, modelInfo.value()); + + return ErrorCode::ModelDownloading; +} + +void ModelRuntime::runBackgroundDownload( + const std::string &model, + const std::string &variant, + const ModelInfo &info) +{ + // Wait for a download slot (respects concurrent download limit) + { + std::unique_lock lock(m_mutex); + m_downloadCv.wait(lock, [this]() + { + return m_shuttingDown||m_activeDownloadCount lock(m_mutex); + m_models.erase(model); + --m_activeDownloadCount; + m_downloadCv.notify_one(); + return; + } + + std::vector allFiles=selectedVar->getAllFiles(); + std::string primaryFilename=selectedVar->getPrimaryFilename(); + + // Collect files that need downloading + std::vector missingFiles; + for(const VariantDownload &file:allFiles) + { + std::string filePath="/models/"+file.filename; + if(!std::filesystem::exists(filePath)&&!file.url.empty()) + { + missingFiles.push_back(&file); + } + } + + if(missingFiles.size()>1||allFiles.size()>1) + { + spdlog::info("Downloading model '{}' variant '{}' ({} files, {} missing)", + model, variant, allFiles.size(), missingFiles.size()); + } + else + { + spdlog::info("Downloading model '{}' variant '{}'", model, variant); + } + + // Download each missing file (no mutex held) + bool allDownloadsOk=true; + for(const VariantDownload *file:missingFiles) + { + std::string filePath="/models/"+file->filename; + bool downloadOk=downloadModelFile( + file->url, + filePath, + file->sha256, + model, + variant); + + if(!downloadOk) + { + allDownloadsOk=false; + spdlog::error("Failed to download shard '{}' for model '{}' variant '{}'", + file->filename, model, variant); + break; + } + } + + // Release the download slot + { + std::lock_guard lock(m_mutex); + --m_activeDownloadCount; + m_downloadCv.notify_one(); + } + + // Update model state under lock + std::lock_guard lock(m_mutex); + + if(!allDownloadsOk) + { + m_models.erase(model); + spdlog::error("Download failed for model '{}' variant '{}'", model, variant); + return; + } + + // Register all files with StorageManager + int64_t totalActualSize=0; + std::vector extraFiles; + for(size_t i=0; i(std::filesystem::file_size(filePath, ec)); + } + totalActualSize+=actualSize; + if(i>0) + { + extraFiles.push_back(allFiles[i].filename); + } + } + StorageManager::instance().registerDownload( + model, variant, primaryFilename, totalActualSize, extraFiles); + + // Transition to Unloaded (downloaded, not yet loaded into VRAM) + auto it=m_models.find(model); + if(it!=m_models.end()&&it->second.state==ModelState::Downloading) + { + it->second.state=ModelState::Unloaded; + spdlog::info("Download complete for model '{}' variant '{}' — state is now Unloaded", model, variant); + } +} + ErrorCode ModelRuntime::unloadModel(const std::string &model) { std::lock_guard lock(m_mutex); @@ -693,38 +1193,89 @@ ErrorCode ModelRuntime::loadLlamaModel( const std::string &model, const std::string &filePath, int contextSize, - const std::vector &gpuIndices) + const std::vector &gpuIndices, + int maxHardwareContext) { initLlamaBackend(); + // Start capturing llama.cpp log output for diagnostics + beginLlamaLogCapture(); + llama_model_params mparams=llama_model_default_params(); mparams.n_gpu_layers=99; // offload all layers to GPU by default llama_model *llamaModel=llama_model_load_from_file(filePath.c_str(), mparams); if(!llamaModel) { - spdlog::error("Failed to load llama model from: {}", filePath); + std::string captured=m_llamaLogCapture.str(); + endLlamaLogCapture(); + + m_lastLoadError=classifyLoadFailure(captured, model, filePath, contextSize); + spdlog::error("Failed to load llama model from: {} — {}", filePath, m_lastLoadError.summary); return ErrorCode::ModelLoadError; } + // Query native training context from GGUF metadata + int nativeContext=llama_model_n_ctx_train(llamaModel); + + // Resolve actual context to allocate: + // contextSize > 0 → user/config requested explicit size + // contextSize == 0 → use model's native training context + // In both cases, cap by the hardware-fit maximum. + int actualContext=contextSize; + if(actualContext<=0) + { + actualContext=nativeContext; + } + if(maxHardwareContext>0&&actualContext>maxHardwareContext) + { + spdlog::info("Capping context from {} to {} (hardware limit) for model '{}'", + actualContext, maxHardwareContext, model); + actualContext=maxHardwareContext; + } + llama_context_params cparams=llama_context_default_params(); - cparams.n_ctx=contextSize; + cparams.n_ctx=static_cast(actualContext); cparams.n_threads=std::thread::hardware_concurrency(); cparams.n_threads_batch=std::thread::hardware_concurrency(); llama_context *llamaCtx=llama_init_from_model(llamaModel, cparams); if(!llamaCtx) { - spdlog::error("Failed to create llama context for model: {}", model); + std::string captured=m_llamaLogCapture.str(); + endLlamaLogCapture(); + + m_lastLoadError=classifyLoadFailure(captured, model, filePath, actualContext); + + // If classification didn't catch a specific VRAM/context issue, + // context creation failure is almost always a memory issue + if(m_lastLoadError.reason==LoadFailureReason::Unknown|| + m_lastLoadError.reason==LoadFailureReason::BackendError) + { + m_lastLoadError.reason=LoadFailureReason::InsufficientVram; + m_lastLoadError.summary="Failed to create context (size="+std::to_string(actualContext)+ + ") — likely insufficient GPU memory"; + m_lastLoadError.suggestion="Try a smaller context size or use a smaller quantization variant. " + "You can also unload other models to free VRAM."; + m_lastLoadError.action="reduce_context"; + m_lastLoadError.recoverable=true; + } + + spdlog::error("Failed to create llama context for model: {} — {}", model, m_lastLoadError.summary); llama_model_free(llamaModel); return ErrorCode::ModelLoadError; } + endLlamaLogCapture(); + LoadedModel &entry=m_models[model]; entry.llamaModel=llamaModel; entry.llamaCtx=llamaCtx; + entry.maxContextSize=nativeContext; + entry.contextSize=static_cast(llama_n_ctx(llamaCtx)); - spdlog::info("llama.cpp model loaded: {} (context={})", model, contextSize); + spdlog::info("llama.cpp model loaded: {} (context={}, maxContext={})", + model, entry.contextSize, entry.maxContextSize); return ErrorCode::Success; } diff --git a/src/arbiterAI/modelRuntime.h b/src/arbiterAI/modelRuntime.h index 288ceac..d8c07e2 100644 --- a/src/arbiterAI/modelRuntime.h +++ b/src/arbiterAI/modelRuntime.h @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include // Forward declarations for llama.cpp types struct llama_model; @@ -30,6 +33,31 @@ enum class ModelState { Unloading }; +/// Classification of why a model load failed, with actionable guidance. +enum class LoadFailureReason { + Unknown, // could not determine + FileNotFound, // GGUF file does not exist at expected path + FileCorrupt, // GGUF header invalid or file truncated + InsufficientVram, // not enough VRAM / failed to allocate GPU buffers + InsufficientRam, // not enough system RAM + ContextTooLarge, // requested context exceeds model or hardware limits + UnsupportedArch, // model architecture not supported by this llama.cpp build + BackendError // llama.cpp internal error +}; + +/// Convert a LoadFailureReason to a stable, snake_case string for API responses. +const char *loadFailureReasonToString(LoadFailureReason reason); + +/// Detailed information about the last model load failure. +struct LoadErrorDetail { + LoadFailureReason reason=LoadFailureReason::Unknown; + std::string summary; // one-line human-readable summary + std::string suggestion; // actionable fix for a human operator + std::string action; // machine-readable recovery action (e.g. "redownload", "reduce_context") + bool recoverable=false; // true if the caller can take an automated action to fix the problem + std::string llamaLog; // raw llama.cpp log output captured during the load attempt +}; + struct LoadedModel { std::string modelName; std::string variant; @@ -38,6 +66,7 @@ struct LoadedModel { int ramUsageMb=0; int estimatedVramUsageMb=0; int contextSize=0; + int maxContextSize=0; // model's native/training context from GGUF metadata std::vector gpuIndices; std::chrono::steady_clock::time_point lastUsed; bool pinned=false; @@ -51,6 +80,9 @@ class ModelRuntime { static void reset(); // For testing /// Load a model into VRAM for inference. + /// If files are not yet downloaded, triggers an async download and returns + /// ModelDownloading immediately. If a download is already in progress for + /// this model the call also returns ModelDownloading. /// @param model Model name from ModelManager. /// @param variant Quantization variant (empty = auto-select best fitting). /// @param contextSize Context size (0 = use model default). @@ -60,6 +92,20 @@ class ModelRuntime { const std::string &variant="", int contextSize=0); + /// Download model files without loading into VRAM. + /// Launches an async background download that respects the concurrent + /// download limit. Returns ModelDownloading on success, Success if + /// files are already present, or an error code. + ErrorCode downloadModel( + const std::string &model, + const std::string &variant=""); + + /// Set the maximum number of concurrent model downloads (default: 2). + void setMaxConcurrentDownloads(int max); + + /// Get the current concurrent download limit. + int getMaxConcurrentDownloads() const; + /// Unload a model. Pinned models move to Ready; others to Unloaded. ErrorCode unloadModel(const std::string &model); @@ -122,6 +168,14 @@ class ModelRuntime { /// Get the ModelInfo for a loaded model. std::optional getLoadedModelInfo(const std::string &model) const; + /// Get detailed information about the most recent model load failure. + /// Cleared at the start of each loadModel() call. + LoadErrorDetail getLastLoadError() const; + + /// Called from the llama.cpp log callback to append captured text. + /// Public so the C-style callback can reach it; not intended for external use. + void appendLlamaLog(const char *text); + private: ModelRuntime(); @@ -144,11 +198,14 @@ class ModelRuntime { void initLlamaBackend(); /// Load a GGUF file into llama.cpp. + /// @param contextSize Requested context (0 = use model's native training context). + /// @param maxHardwareContext Hardware-fit limit (0 = no limit). ErrorCode loadLlamaModel( const std::string &model, const std::string &filePath, int contextSize, - const std::vector &gpuIndices); + const std::vector &gpuIndices, + int maxHardwareContext=0); /// Free llama.cpp resources for a model. void freeLlamaModel(LoadedModel &entry); @@ -177,6 +234,39 @@ class ModelRuntime { std::queue m_pendingSwaps; ModelDownloader m_downloader; + + /// Background download concurrency management. + int m_maxConcurrentDownloads=2; + int m_activeDownloadCount=0; + bool m_shuttingDown=false; + std::condition_variable m_downloadCv; + std::vector m_downloadThreads; + + /// Internal: run a background download for a model. Respects the + /// concurrent download semaphore and registers files with StorageManager + /// on success. Called on a detached background thread. + void runBackgroundDownload( + const std::string &model, + const std::string &variant, + const ModelInfo &info); + + /// Last load error detail (set in loadLlamaModel, cleared in loadModel). + LoadErrorDetail m_lastLoadError; + + /// Buffer for capturing llama.cpp log output during model load. + std::ostringstream m_llamaLogCapture; + bool m_capturingLlamaLog=false; + + /// Install/remove the llama.cpp log callback that routes to m_llamaLogCapture. + void beginLlamaLogCapture(); + void endLlamaLogCapture(); + + /// Analyze captured llama.cpp log to classify the failure reason. + LoadErrorDetail classifyLoadFailure( + const std::string &llamaLog, + const std::string &model, + const std::string &filePath, + int contextSize) const; }; } // namespace arbiterAI diff --git a/src/arbiterAI/providers/llama.cpp b/src/arbiterAI/providers/llama.cpp index 163219b..c3b1187 100644 --- a/src/arbiterAI/providers/llama.cpp +++ b/src/arbiterAI/providers/llama.cpp @@ -51,9 +51,11 @@ ErrorCode Llama::completion(const CompletionRequest &request, std::string resultText; int promptTokens=0; int completionTokens=0; + double promptTimeMs=0.0; + double generationTimeMs=0.0; ErrorCode code=runInference(llamaModel, llamaCtx, request, model, - resultText, promptTokens, completionTokens, nullptr); + resultText, promptTokens, completionTokens, promptTimeMs, generationTimeMs, nullptr); std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now(); double totalTimeMs=std::chrono::duration(endTime-startTime).count(); @@ -79,7 +81,11 @@ ErrorCode Llama::completion(const CompletionRequest &request, stats.promptTokens=promptTokens; stats.completionTokens=completionTokens; stats.totalTimeMs=totalTimeMs; + stats.promptTimeMs=promptTimeMs; + stats.generationTimeMs=generationTimeMs; stats.tokensPerSecond=totalTimeMs>0.0?(completionTokens/(totalTimeMs/1000.0)):0.0; + stats.promptTokensPerSecond=promptTimeMs>0.0?(promptTokens/(promptTimeMs/1000.0)):0.0; + stats.generationTokensPerSecond=generationTimeMs>0.0?(completionTokens/(generationTimeMs/1000.0)):0.0; stats.timestamp=std::chrono::system_clock::now(); TelemetryCollector::instance().recordInference(stats); } @@ -120,9 +126,11 @@ ErrorCode Llama::streamingCompletion(const CompletionRequest &request, std::string resultText; int promptTokens=0; int completionTokens=0; + double promptTimeMs=0.0; + double generationTimeMs=0.0; ErrorCode code=runInference(llamaModel, llamaCtx, request, *modelInfo, - resultText, promptTokens, completionTokens, callback); + resultText, promptTokens, completionTokens, promptTimeMs, generationTimeMs, callback); std::chrono::steady_clock::time_point endTime=std::chrono::steady_clock::now(); double totalTimeMs=std::chrono::duration(endTime-startTime).count(); @@ -139,7 +147,11 @@ ErrorCode Llama::streamingCompletion(const CompletionRequest &request, stats.promptTokens=promptTokens; stats.completionTokens=completionTokens; stats.totalTimeMs=totalTimeMs; + stats.promptTimeMs=promptTimeMs; + stats.generationTimeMs=generationTimeMs; stats.tokensPerSecond=totalTimeMs>0.0?(completionTokens/(totalTimeMs/1000.0)):0.0; + stats.promptTokensPerSecond=promptTimeMs>0.0?(promptTokens/(promptTimeMs/1000.0)):0.0; + stats.generationTokensPerSecond=generationTimeMs>0.0?(completionTokens/(generationTimeMs/1000.0)):0.0; stats.timestamp=std::chrono::system_clock::now(); TelemetryCollector::instance().recordInference(stats); } @@ -331,6 +343,7 @@ std::string Llama::applyTemplate(llama_model *model, ErrorCode Llama::runInference(llama_model *model, llama_context *ctx, const CompletionRequest &request, const ModelInfo &modelInfo, std::string &result, int &promptTokens, int &completionTokens, + double &promptTimeMs, double &generationTimeMs, std::function streamCallback) { const llama_vocab *vocab=llama_model_get_vocab(model); @@ -358,7 +371,7 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx, promptTokens=nTokens; // Clear KV cache for fresh inference - llama_kv_cache_clear(ctx); + llama_memory_clear(llama_get_memory(ctx), true); llama_batch batch=llama_batch_init(std::max(nTokens, 512), 0, 1); @@ -374,7 +387,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx, } batch.logits[batch.n_tokens-1]=1; - // Process prompt + // Process prompt (timed) + std::chrono::steady_clock::time_point promptStart=std::chrono::steady_clock::now(); + if(llama_decode(ctx, batch)!=0) { spdlog::error("llama_decode failed during prompt processing"); @@ -382,6 +397,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx, return ErrorCode::GenerationError; } + std::chrono::steady_clock::time_point promptEnd=std::chrono::steady_clock::now(); + promptTimeMs=std::chrono::duration(promptEnd-promptStart).count(); + int maxOutputTokens=request.max_tokens.value_or(modelInfo.maxOutputTokens); int nCur=nTokens; completionTokens=0; @@ -412,7 +430,9 @@ ErrorCode Llama::runInference(llama_model *model, llama_context *ctx, llama_sampler_accept(samplerChain, token); } - // Generation loop + // Generation loop (timed) + std::chrono::steady_clock::time_point genStart=std::chrono::steady_clock::now(); + for(int i=0; i(genEnd-genStart).count(); + llama_sampler_free(samplerChain); llama_batch_free(batch); diff --git a/src/arbiterAI/providers/llama.h b/src/arbiterAI/providers/llama.h index c683ba1..6366271 100644 --- a/src/arbiterAI/providers/llama.h +++ b/src/arbiterAI/providers/llama.h @@ -43,6 +43,7 @@ class Llama : public BaseProvider { ErrorCode runInference(llama_model *model, llama_context *ctx, const CompletionRequest &request, const ModelInfo &modelInfo, std::string &result, int &promptTokens, int &completionTokens, + double &promptTimeMs, double &generationTimeMs, std::function streamCallback); }; diff --git a/src/arbiterAI/storageManager.cpp b/src/arbiterAI/storageManager.cpp index 5d64b8a..fe9f5e0 100644 --- a/src/arbiterAI/storageManager.cpp +++ b/src/arbiterAI/storageManager.cpp @@ -77,6 +77,10 @@ void StorageManager::initialize(const std::filesystem::path &modelsDir) { std::lock_guard lock(m_mutex); + // Clear any existing state from a previous initialize() call + m_entries.clear(); + m_dirty=false; + m_modelsDir=modelsDir; if(!std::filesystem::exists(m_modelsDir)) @@ -187,7 +191,8 @@ std::vector StorageManager::getDownloadedModels() const void StorageManager::registerDownload(const std::string &modelName, const std::string &variant, const std::string &filename, - int64_t fileSizeBytes) + int64_t fileSizeBytes, + const std::vector &additionalFiles) { std::lock_guard lock(m_mutex); @@ -196,6 +201,7 @@ void StorageManager::registerDownload(const std::string &modelName, if(existing) { existing->filename=filename; + existing->additionalFiles=additionalFiles; existing->fileSizeBytes=fileSizeBytes; existing->downloadedAt=std::chrono::system_clock::now(); m_dirty=true; @@ -206,6 +212,7 @@ void StorageManager::registerDownload(const std::string &modelName, entry.modelName=modelName; entry.variant=variant; entry.filename=filename; + entry.additionalFiles=additionalFiles; entry.fileSizeBytes=fileSizeBytes; entry.downloadedAt=std::chrono::system_clock::now(); entry.lastUsedAt=std::chrono::system_clock::now(); @@ -216,8 +223,16 @@ void StorageManager::registerDownload(const std::string &modelName, m_entries.push_back(entry); m_dirty=true; - spdlog::info("StorageManager: registered download {} variant {} ({})", - modelName, variant, formatBytes(fileSizeBytes)); + if(additionalFiles.empty()) + { + spdlog::info("StorageManager: registered download {} variant {} ({})", + modelName, variant, formatBytes(fileSizeBytes)); + } + else + { + spdlog::info("StorageManager: registered download {} variant {} ({}, {} files)", + modelName, variant, formatBytes(fileSizeBytes), 1+additionalFiles.size()); + } } void StorageManager::recordUsage(const std::string &modelName, const std::string &variant) @@ -267,7 +282,10 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st // Delete files and remove entries (in reverse order to preserve indices) for(auto it=toRemove.rbegin(); it!=toRemove.rend(); ++it) { - std::filesystem::path filePath=m_modelsDir/m_entries[*it].filename; + ModelFileEntry &entry=m_entries[*it]; + + // Delete primary file + std::filesystem::path filePath=m_modelsDir/entry.filename; std::error_code ec; if(std::filesystem::exists(filePath, ec)) @@ -280,9 +298,23 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st } } - freedBytes+=m_entries[*it].fileSizeBytes; + // Delete additional shard files + for(const std::string &extraFile:entry.additionalFiles) + { + std::filesystem::path extraPath=m_modelsDir/extraFile; + if(std::filesystem::exists(extraPath, ec)) + { + std::filesystem::remove(extraPath, ec); + if(ec) + { + spdlog::warn("StorageManager: failed to delete shard {}: {}", extraPath.string(), ec.message()); + } + } + } + + freedBytes+=entry.fileSizeBytes; spdlog::info("StorageManager: deleted {} variant {} ({})", - m_entries[*it].modelName, m_entries[*it].variant, formatBytes(m_entries[*it].fileSizeBytes)); + entry.modelName, entry.variant, formatBytes(entry.fileSizeBytes)); m_entries.erase(m_entries.begin()+static_cast(*it)); } } @@ -299,6 +331,7 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st return false; } + // Delete primary file std::filesystem::path filePath=m_modelsDir/it->filename; std::error_code ec; @@ -312,6 +345,20 @@ bool StorageManager::deleteModelFile(const std::string &modelName, const std::st } } + // Delete additional shard files + for(const std::string &extraFile:it->additionalFiles) + { + std::filesystem::path extraPath=m_modelsDir/extraFile; + if(std::filesystem::exists(extraPath, ec)) + { + std::filesystem::remove(extraPath, ec); + if(ec) + { + spdlog::warn("StorageManager: failed to delete shard {}: {}", extraPath.string(), ec.message()); + } + } + } + freedBytes=it->fileSizeBytes; spdlog::info("StorageManager: deleted {} variant {} ({})", modelName, variant, formatBytes(it->fileSizeBytes)); @@ -439,7 +486,7 @@ void StorageManager::scanModelsDirectory() continue; } - // Check if already tracked + // Check if already tracked (as primary or additional file) bool found=false; for(const ModelFileEntry &entry:m_entries) { @@ -448,6 +495,15 @@ void StorageManager::scanModelsDirectory() found=true; break; } + for(const std::string &extra:entry.additionalFiles) + { + if(extra==filename) + { + found=true; + break; + } + } + if(found) break; } if(!found) @@ -653,6 +709,10 @@ void StorageManager::loadUsageData() entry.modelName=m.value("model", ""); entry.variant=m.value("variant", ""); entry.filename=m.value("filename", ""); + if(m.contains("additional_files")&&m["additional_files"].is_array()) + { + entry.additionalFiles=m["additional_files"].get>(); + } entry.fileSizeBytes=m.value("file_size_bytes", int64_t(0)); entry.downloadedAt=isoToTimePoint(m.value("downloaded_at", "")); entry.lastUsedAt=isoToTimePoint(m.value("last_used_at", "")); @@ -660,10 +720,29 @@ void StorageManager::loadUsageData() entry.hotReady=m.value("hot_ready", false); entry.isProtected=m.value("protected", false); - if(!entry.filename.empty()) + if(entry.filename.empty()) { - m_entries.push_back(entry); + continue; } + + // Deduplicate: skip if we already have an entry with the same filename + bool duplicate=false; + for(const ModelFileEntry &existing:m_entries) + { + if(existing.filename==entry.filename) + { + duplicate=true; + break; + } + } + + if(duplicate) + { + spdlog::warn("StorageManager: skipping duplicate entry for file: {}", entry.filename); + continue; + } + + m_entries.push_back(entry); } } @@ -704,6 +783,10 @@ void StorageManager::saveUsageData() const m["model"]=entry.modelName; m["variant"]=entry.variant; m["filename"]=entry.filename; + if(!entry.additionalFiles.empty()) + { + m["additional_files"]=entry.additionalFiles; + } m["file_size_bytes"]=entry.fileSizeBytes; m["downloaded_at"]=timePointToIso(entry.downloadedAt); m["last_used_at"]=timePointToIso(entry.lastUsedAt); @@ -740,13 +823,13 @@ std::vector StorageManager::collectCleanupCandidates() const if(m_cleanupPolicy.respectHotReady&&entry.hotReady) continue; if(m_cleanupPolicy.respectProtected&&entry.isProtected) continue; - // Skip entries that are currently Loaded or Ready in ModelRuntime + // Skip entries that are currently Loaded, Ready, or Downloading in ModelRuntime // Note: we don't hold ModelRuntime's lock here, so this is a best-effort check std::optional runtimeState=ModelRuntime::instance().getModelState(entry.modelName); if(runtimeState.has_value()) { ModelState state=runtimeState->state; - if(state==ModelState::Loaded||state==ModelState::Ready) + if(state==ModelState::Loaded||state==ModelState::Ready||state==ModelState::Downloading) { continue; } @@ -785,6 +868,7 @@ DownloadedModelFile StorageManager::entryToPublic(const ModelFileEntry &entry) c f.modelName=entry.modelName; f.variant=entry.variant; f.filename=entry.filename; + f.additionalFiles=entry.additionalFiles; f.filePath=m_modelsDir/entry.filename; f.fileSizeBytes=entry.fileSizeBytes; f.downloadedAt=entry.downloadedAt; diff --git a/src/arbiterAI/storageManager.h b/src/arbiterAI/storageManager.h index dcb9fea..6718549 100644 --- a/src/arbiterAI/storageManager.h +++ b/src/arbiterAI/storageManager.h @@ -27,7 +27,8 @@ struct StorageInfo { struct DownloadedModelFile { std::string modelName; std::string variant; // quantization (e.g., "Q4_K_M") - std::string filename; + std::string filename; // primary filename (first shard / single file) + std::vector additionalFiles; // extra shard filenames for split GGUF models std::filesystem::path filePath; int64_t fileSizeBytes=0; std::chrono::system_clock::time_point downloadedAt; @@ -84,10 +85,12 @@ class StorageManager { std::vector getDownloadedModels() const; /// Register a completed download (updates inventory). + /// @param additionalFiles Extra shard filenames for split GGUF models (empty for single-file). void registerDownload(const std::string &modelName, const std::string &variant, const std::string &filename, - int64_t fileSizeBytes); + int64_t fileSizeBytes, + const std::vector &additionalFiles={}); /// Record a model usage event (inference served). void recordUsage(const std::string &modelName, const std::string &variant); @@ -152,6 +155,7 @@ class StorageManager { std::string modelName; std::string variant; std::string filename; + std::vector additionalFiles; // extra shard filenames for split GGUF int64_t fileSizeBytes=0; std::chrono::system_clock::time_point downloadedAt; std::chrono::system_clock::time_point lastUsedAt; diff --git a/src/arbiterAI/telemetryCollector.cpp b/src/arbiterAI/telemetryCollector.cpp index 989cddd..cd5266e 100644 --- a/src/arbiterAI/telemetryCollector.cpp +++ b/src/arbiterAI/telemetryCollector.cpp @@ -70,6 +70,32 @@ SystemSnapshot TelemetryCollector::getSnapshot() const snapshot.avgTokensPerSecond=getAvgTokensPerSecond(); snapshot.activeRequests=ModelRuntime::instance().isInferenceActive()?1:0; + // Calculate average prompt/generation speeds over last 5 minutes + std::chrono::system_clock::time_point cutoff= + std::chrono::system_clock::now()-std::chrono::minutes(5); + double promptSum=0.0, genSum=0.0; + int promptCount=0, genCount=0; + + for(const InferenceStats &stat:m_inferenceHistory) + { + if(stat.timestamp>=cutoff) + { + if(stat.promptTokensPerSecond>0.0) + { + promptSum+=stat.promptTokensPerSecond; + promptCount++; + } + if(stat.generationTokensPerSecond>0.0) + { + genSum+=stat.generationTokensPerSecond; + genCount++; + } + } + } + + snapshot.avgPromptTokensPerSecond=promptCount>0?(promptSum/promptCount):0.0; + snapshot.avgGenerationTokensPerSecond=genCount>0?(genSum/genCount):0.0; + return snapshot; } diff --git a/src/arbiterAI/telemetryCollector.h b/src/arbiterAI/telemetryCollector.h index 350bf17..625edcd 100644 --- a/src/arbiterAI/telemetryCollector.h +++ b/src/arbiterAI/telemetryCollector.h @@ -17,10 +17,14 @@ struct InferenceStats { std::string model; std::string variant; double tokensPerSecond=0.0; + double promptTokensPerSecond=0.0; // prompt processing speed (tokens in / sec) + double generationTokensPerSecond=0.0; // generation speed (tokens out / sec) int promptTokens=0; int completionTokens=0; double latencyMs=0.0; // time to first token double totalTimeMs=0.0; // total request time + double promptTimeMs=0.0; // time spent processing prompt + double generationTimeMs=0.0; // time spent generating tokens std::chrono::system_clock::time_point timestamp; }; @@ -35,6 +39,8 @@ struct SystemSnapshot { SystemInfo hardware; std::vector models; double avgTokensPerSecond=0.0; + double avgPromptTokensPerSecond=0.0; + double avgGenerationTokensPerSecond=0.0; int activeRequests=0; }; diff --git a/src/arbiterAI/version.h.in b/src/arbiterAI/version.h.in index 88ec987..3887fc9 100644 --- a/src/arbiterAI/version.h.in +++ b/src/arbiterAI/version.h.in @@ -15,4 +15,6 @@ #define ARBITERAI_VERSION_PATCH @PROJECT_VERSION_PATCH@ #define ARBITERAI_VERSION_STRING "@PROJECT_VERSION@" +#define ARBITERAI_LLAMACPP_BUILD "@ARBITERAI_LLAMACPP_BUILD@" + #endif//_arbiterAI_version_h_ diff --git a/src/server/dashboard.h b/src/server/dashboard.h index fc9b3e6..d68ed32 100644 --- a/src/server/dashboard.h +++ b/src/server/dashboard.h @@ -314,10 +314,42 @@ td { background: linear-gradient(90deg, #4caf50, #66cc66); } +.cpu-bar-fill +{ + background: linear-gradient(90deg, #f0a030, #f0c040); +} .gpu-row { margin-bottom: 8px; } +.heap-tooltip-wrap +{ + position: relative; + display: inline; + cursor: help; + border-bottom: 1px dotted #666; +} +.heap-tooltip-wrap .heap-tooltip +{ + display: none; + position: absolute; + bottom: 120%; + right: 0; + background: #1a1d2e; + border: 1px solid #2a2d3e; + border-radius: 6px; + padding: 8px 10px; + white-space: nowrap; + font-size: 11px; + color: #ccc; + z-index: 100; + box-shadow: 0 4px 12px rgba(0,0,0,0.4); + line-height: 1.6; +} +.heap-tooltip-wrap:hover .heap-tooltip +{ + display: block; +} .gpu-label { display: flex; @@ -332,6 +364,18 @@ td fill: none; stroke-width: 2; } +#promptChart +{ + stroke: #4caf50; + fill: none; + stroke-width: 2; +} +#genChart +{ + stroke: #f0c040; + fill: none; + stroke-width: 2; +} .chart-grid line { stroke: #1f2230; @@ -342,19 +386,138 @@ td fill: #666; font-size: 10px; } +.chart-legend +{ + display: flex; + gap: 16px; + margin-top: 6px; + font-size: 11px; + color: #999; +} +.chart-legend-dot +{ + display: inline-block; + width: 10px; + height: 10px; + border-radius: 2px; + margin-right: 4px; + vertical-align: middle; +} +.log-panel +{ + background: #0d0f14; + border: 1px solid #2a2d3a; + border-radius: 6px; + font-family: 'Menlo', 'Consolas', 'Courier New', monospace; + font-size: 12px; + line-height: 1.5; + max-height: 400px; + overflow-y: auto; + padding: 8px 0; +} +.log-panel.collapsed +{ + max-height: 0; + padding: 0; + border: none; + overflow: hidden; +} +.log-entry +{ + padding: 1px 12px; + white-space: pre-wrap; + word-break: break-all; +} +.log-entry:hover +{ + background: #161922; +} +.log-ts +{ + color: #555; + margin-right: 8px; +} +.log-level +{ + display: inline-block; + width: 52px; + font-weight: 600; + margin-right: 6px; +} +.log-level-trace { color: #666; } +.log-level-debug { color: #888; } +.log-level-info { color: #4caf50; } +.log-level-warning { color: #f0c040; } +.log-level-error { color: #ff4444; } +.log-level-critical { color: #ff2222; font-weight: 800; } +.log-msg +{ + color: #ccc; +} +.log-toolbar +{ + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 8px; +} +.log-toolbar select +{ + background: #1a1d27; + color: #ccc; + border: 1px solid #2a2d3a; + border-radius: 4px; + padding: 4px 8px; + font-size: 12px; +} +.log-toolbar label +{ + color: #888; + font-size: 12px; + display: flex; + align-items: center; + gap: 4px; +} +.log-toolbar input[type="checkbox"] +{ + accent-color: #7c8aff; +} +.card-header-row +{ + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 12px; + cursor: pointer; +} +.card-header-row h2 +{ + margin-bottom: 0; +} +.collapse-icon +{ + color: #888; + font-size: 16px; + transition: transform 0.2s; + user-select: none; +} +.collapse-icon.open +{ + transform: rotate(180deg); +}
-

ArbiterAI Dashboard

+

ArbiterAI Dashboard

Connected

Performance

-
0.0
-
tokens/sec (avg)
+
Prompt Speed0.0 t/s
+
Generation Speed0.0 t/s
Active Requests0
Total Inferences0
Model Swaps0
@@ -364,16 +527,24 @@ td
Total RAM-
Free RAM-
CPU Cores-
-
CPU Usage-
+
CPU Usage-
+
-

Tokens/sec Over Time

+

Processing Speed Over Time

- + - + + + +
+
+ Prompt (tokens in/s) + Generation (tokens out/s) +
@@ -391,6 +562,7 @@ td Variant State Context + Max Context VRAM (MB) RAM (MB) Pinned @@ -443,9 +615,9 @@ td

Recent Inferences

- + - +
ModelTPSPromptCompletionLatency
ModelPrompt t/sGen t/sPromptCompletionLatency
No recent inferences
No recent inferences
@@ -459,13 +631,118 @@ td
+
+
+

Server Log

+ +
+
+ + +
+
+
Loading logs...
+
+
)HTML"; diff --git a/src/server/logBuffer.h b/src/server/logBuffer.h new file mode 100644 index 0000000..45dd4fc --- /dev/null +++ b/src/server/logBuffer.h @@ -0,0 +1,87 @@ +#ifndef _ARBITERAI_SERVER_LOGBUFFER_H_ +#define _ARBITERAI_SERVER_LOGBUFFER_H_ + +#include +#include + +#include +#include +#include +#include + +namespace arbiterAI +{ +namespace server +{ + +struct LogEntry { + std::chrono::system_clock::time_point timestamp; + std::string level; + std::string message; +}; + +/// Thread-safe ring-buffer spdlog sink that retains the most recent N log +/// entries in memory so the dashboard / REST API can serve them. +class LogBufferSink : public spdlog::sinks::base_sink { +public: + explicit LogBufferSink(size_t maxEntries=500) + :m_maxEntries(maxEntries) {} + + /// Return a snapshot of the buffered entries (newest last). + std::deque getEntries() const + { + std::lock_guard lock(m_readMutex); + return m_entries; + } + + /// Return only the last `count` entries. + std::deque getEntries(size_t count) const + { + std::lock_guard lock(m_readMutex); + if(count>=m_entries.size()) return m_entries; + + return std::deque(m_entries.end()-static_cast(count), m_entries.end()); + } + + void clear() + { + std::lock_guard lock(m_readMutex); + m_entries.clear(); + } + +protected: + void sink_it_(const spdlog::details::log_msg &msg) override + { + LogEntry entry; + entry.timestamp=msg.time; + entry.level=std::string(spdlog::level::to_string_view(msg.level).data(), + spdlog::level::to_string_view(msg.level).size()); + entry.message=std::string(msg.payload.data(), msg.payload.size()); + + std::lock_guard lock(m_readMutex); + m_entries.push_back(std::move(entry)); + while(m_entries.size()>m_maxEntries) + { + m_entries.pop_front(); + } + } + + void flush_() override {} + +private: + size_t m_maxEntries; + mutable std::mutex m_readMutex; + std::deque m_entries; +}; + +/// Global accessor so routes.cpp can read the buffer without coupling to main. +inline std::shared_ptr &logBufferSinkInstance() +{ + static std::shared_ptr instance; + return instance; +} + +} // namespace server +} // namespace arbiterAI + +#endif//_ARBITERAI_SERVER_LOGBUFFER_H_ diff --git a/src/server/main.cpp b/src/server/main.cpp index 1f59cb8..5e156fc 100644 --- a/src/server/main.cpp +++ b/src/server/main.cpp @@ -1,59 +1,168 @@ #include "routes.h" +#include "logBuffer.h" #include "arbiterAI/arbiterAI.h" +#include "arbiterAI/hardwareDetector.h" #include "arbiterAI/modelRuntime.h" #include "arbiterAI/storageManager.h" #include -#include +#include #include +#include +#include #include +#include #include #include #include +namespace +{ + +int64_t parseStorageLimit(const std::string &str) +{ + if(str.empty()||str=="0") return 0; + + char suffix=str.back(); + std::string numStr=str; + + if(suffix=='G'||suffix=='g') + { + numStr.pop_back(); + return static_cast(std::stod(numStr)*1073741824); + } + if(suffix=='M'||suffix=='m') + { + numStr.pop_back(); + return static_cast(std::stod(numStr)*1048576); + } + if(suffix=='K'||suffix=='k') + { + numStr.pop_back(); + return static_cast(std::stod(numStr)*1024); + } + return std::stoll(str); +} + +void printUsage() +{ + std::cout<<"Usage: arbiterAI-server [options]\n" + "\n" + "Options:\n" + " -c, --config Path to server configuration JSON file (required)\n" + " -h, --help Print this help message\n" + "\n" + "See examples/server_config.json for the configuration file format.\n"; +} + +} // anonymous namespace + int main(int argc, char *argv[]) { - cxxopts::Options options("arbiterAI-server", "ArbiterAI standalone server with OpenAI-compatible API"); - - options.add_options() - ("p,port", "HTTP port", cxxopts::value()->default_value("8080")) - ("H,host", "Bind address", cxxopts::value()->default_value("0.0.0.0")) - ("c,config", "Model config path(s)", cxxopts::value>()->default_value("config")) - ("m,model", "Default model to load on startup", cxxopts::value()->default_value("")) - ("v,variant", "Default variant (e.g., Q4_K_M)", cxxopts::value()->default_value("")) - ("override-path", "Path to write runtime model config overrides (enables persistence)", cxxopts::value()->default_value("")) - ("ram-budget", "Ready model RAM budget in MB (0 = auto 50%)", cxxopts::value()->default_value("0")) - ("models-dir", "Path to directory for downloaded model files", cxxopts::value()->default_value("/models")) - ("storage-limit", "Maximum bytes for model storage (0 = all free, supports suffixes: 10G, 500M)", cxxopts::value()->default_value("0")) - ("cleanup-enabled", "Enable automated storage cleanup", cxxopts::value()->default_value("true")) - ("cleanup-max-age", "Maximum days since last use before cleanup candidacy", cxxopts::value()->default_value("30")) - ("cleanup-interval", "Hours between automated cleanup runs", cxxopts::value()->default_value("24")) - ("log-level", "Log level (trace, debug, info, warn, error)", cxxopts::value()->default_value("info")) - ("h,help", "Print usage"); - - cxxopts::ParseResult result; + // ── Parse CLI — only --config and --help ────────────────────── + std::string configPath; + + for(int i=1; i is required.\n\n"; + printUsage(); + return 1; + } + + // ── Load config file ───────────────────────────────────────── + nlohmann::json cfg; try { - result=options.parse(argc, argv); + std::ifstream file(configPath); + + if(!file.is_open()) + { + std::cerr<<"Error: cannot open config file: "< modelConfigPaths; + if(cfg.contains("model_config_paths")&&cfg["model_config_paths"].is_array()) + { + for(const nlohmann::json &p:cfg["model_config_paths"]) + { + modelConfigPaths.push_back(p.get()); + } + } + if(modelConfigPaths.empty()) { - std::cout<(); + std::string modelsDir=cfg.value("models_dir", "/models"); + std::string defaultModel=cfg.value("default_model", ""); + std::string defaultVariant=cfg.value("default_variant", ""); + std::string overridePath=cfg.value("override_path", ""); + int ramBudget=cfg.value("ram_budget_mb", 0); + int maxDownloads=cfg.value("max_concurrent_downloads", 2); + + // Storage + nlohmann::json storageCfg=cfg.value("storage", nlohmann::json::object()); + std::string storageLimitStr=storageCfg.value("limit", "0"); + bool cleanupEnabled=storageCfg.value("cleanup_enabled", true); + int cleanupMaxAgeDays=storageCfg.value("cleanup_max_age_days", 30); + int cleanupIntervalHours=storageCfg.value("cleanup_interval_hours", 24); + + // Hardware + nlohmann::json hwCfg=cfg.value("hardware", nlohmann::json::object()); + nlohmann::json vramOverrides=hwCfg.value("vram_overrides", nlohmann::json::object()); + + // Logging + nlohmann::json logCfg=cfg.value("logging", nlohmann::json::object()); + std::string logLevel=logCfg.value("level", "info"); + std::string logDir=logCfg.value("directory", ""); + int logRotateHour=logCfg.value("rotate_hour", 0); + int logRetainDays=logCfg.value("retain_days", 7); + + if(logRotateHour<0||logRotateHour>23) logRotateHour=0; + if(logRetainDays<1) logRetainDays=1; + + // ── Configure logging ──────────────────────────────────────── if(logLevel=="trace") spdlog::set_level(spdlog::level::trace); else if(logLevel=="debug") spdlog::set_level(spdlog::level::debug); else if(logLevel=="info") spdlog::set_level(spdlog::level::info); @@ -61,59 +170,47 @@ int main(int argc, char *argv[]) else if(logLevel=="error") spdlog::set_level(spdlog::level::err); else spdlog::set_level(spdlog::level::info); - int port=result["port"].as(); - std::string host=result["host"].as(); - std::vector configStrs=result["config"].as>(); - std::string defaultModel=result["model"].as(); - std::string defaultVariant=result["variant"].as(); - std::string overridePath=result["override-path"].as(); - int ramBudget=result["ram-budget"].as(); - std::string modelsDir=result["models-dir"].as(); - std::string storageLimitStr=result["storage-limit"].as(); - bool cleanupEnabled=result["cleanup-enabled"].as(); - int cleanupMaxAgeDays=result["cleanup-max-age"].as(); - int cleanupIntervalHours=result["cleanup-interval"].as(); - - // Parse storage limit (supports suffixes: G, M, K) - int64_t storageLimitBytes=0; - if(!storageLimitStr.empty()&&storageLimitStr!="0") - { - char suffix=storageLimitStr.back(); - std::string numStr=storageLimitStr; - - if(suffix=='G'||suffix=='g') - { - numStr.pop_back(); - storageLimitBytes=static_cast(std::stod(numStr)*1073741824); - } - else if(suffix=='M'||suffix=='m') - { - numStr.pop_back(); - storageLimitBytes=static_cast(std::stod(numStr)*1048576); - } - else if(suffix=='K'||suffix=='k') - { - numStr.pop_back(); - storageLimitBytes=static_cast(std::stod(numStr)*1024); - } - else - { - storageLimitBytes=std::stoll(storageLimitStr); - } + auto consoleSink=std::make_shared(); + auto bufferSink=std::make_shared(1000); + arbiterAI::server::logBufferSinkInstance()=bufferSink; + + std::vector sinks{consoleSink, bufferSink}; + + if(!logDir.empty()) + { + std::filesystem::create_directories(logDir); + std::string logPath=logDir+"/arbiterAI-server.log"; + + auto fileSink=std::make_shared( + logPath, logRotateHour, 0, false, + static_cast(logRetainDays)); + sinks.push_back(fileSink); + } + + auto logger=std::make_shared("arbiterAI", + sinks.begin(), sinks.end()); + logger->set_level(spdlog::get_level()); + spdlog::set_default_logger(logger); + + spdlog::info("Loaded config from: {}", configPath); + + if(!logDir.empty()) + { + spdlog::info("File logging enabled: dir={}, rotate at {:02d}:00, retain {} days", + logDir, logRotateHour, logRetainDays); } - // Convert config paths - std::vector configPaths; - for(const std::string &s:configStrs) + // ── Initialize ArbiterAI ───────────────────────────────────── + std::vector fsPaths; + for(const std::string &s:modelConfigPaths) { - configPaths.push_back(s); + fsPaths.push_back(s); } spdlog::info("Initializing ArbiterAI..."); - // Initialize ArbiterAI arbiterAI::ArbiterAI &ai=arbiterAI::ArbiterAI::instance(); - arbiterAI::ErrorCode err=ai.initialize(configPaths); + arbiterAI::ErrorCode err=ai.initialize(fsPaths); if(err!=arbiterAI::ErrorCode::Success) { @@ -123,7 +220,22 @@ int main(int argc, char *argv[]) spdlog::info("ArbiterAI initialized successfully"); - // Configure StorageManager + // ── Apply VRAM overrides ───────────────────────────────────── + for(auto it=vramOverrides.begin(); it!=vramOverrides.end(); ++it) + { + int gpuIndex=std::stoi(it.key()); + int vramMb=it.value().get(); + + if(vramMb>0) + { + arbiterAI::HardwareDetector::instance().setVramOverride(gpuIndex, vramMb); + spdlog::info("VRAM override set for GPU {}: {} MB", gpuIndex, vramMb); + } + } + + // ── Configure StorageManager ───────────────────────────────── + int64_t storageLimitBytes=parseStorageLimit(storageLimitStr); + arbiterAI::StorageManager &storage=arbiterAI::StorageManager::instance(); storage.initialize(modelsDir); @@ -141,14 +253,21 @@ int main(int argc, char *argv[]) spdlog::info("Cleanup policy: enabled={}, maxAge={}d, interval={}h", cleanupEnabled, cleanupMaxAgeDays, cleanupIntervalHours); - // Set RAM budget if specified + // ── RAM budget ─────────────────────────────────────────────── if(ramBudget>0) { arbiterAI::ModelRuntime::instance().setReadyRamBudget(ramBudget); spdlog::info("Ready model RAM budget set to {} MB", ramBudget); } - // Load default model if specified + // ── Concurrent download limit ──────────────────────────────── + if(maxDownloads>0) + { + arbiterAI::ModelRuntime::instance().setMaxConcurrentDownloads(maxDownloads); + spdlog::info("Max concurrent downloads set to {}", maxDownloads); + } + + // ── Load default model ─────────────────────────────────────── if(!defaultModel.empty()) { spdlog::info("Loading default model: {} (variant: {})", defaultModel, defaultVariant.empty()?"auto":defaultVariant); @@ -168,20 +287,17 @@ int main(int argc, char *argv[]) } } - // Create HTTP server + // ── HTTP server ────────────────────────────────────────────── httplib::Server server; - // Register routes arbiterAI::server::registerRoutes(server); - // Set override path for runtime model config persistence if(!overridePath.empty()) { arbiterAI::server::setOverridePath(overridePath); spdlog::info("Runtime model config overrides will be saved to: {}", overridePath); } - // Log available endpoints spdlog::info("Server endpoints:"); spdlog::info(" GET /health - Health check"); spdlog::info(" POST /v1/chat/completions - Chat completions (OpenAI-compatible)"); @@ -202,6 +318,8 @@ int main(int argc, char *argv[]) spdlog::info(" GET /api/stats/history - Inference history"); spdlog::info(" GET /api/stats/swaps - Swap history"); spdlog::info(" GET /api/hardware - Hardware info"); + spdlog::info(" POST /api/hardware/vram-override - Set VRAM override"); + spdlog::info(" DEL /api/hardware/vram-override/:idx - Clear VRAM override"); spdlog::info(" GET /api/storage - Storage overview"); spdlog::info(" GET /api/storage/models - Downloaded models"); spdlog::info(" GET /api/storage/models/:n - Model storage stats"); diff --git a/src/server/routes.cpp b/src/server/routes.cpp index 527a819..ef178e6 100644 --- a/src/server/routes.cpp +++ b/src/server/routes.cpp @@ -1,5 +1,6 @@ #include "routes.h" #include "dashboard.h" +#include "logBuffer.h" #include "arbiterAI/arbiterAI.h" #include "arbiterAI/modelManager.h" @@ -71,7 +72,8 @@ nlohmann::json gpuInfoToJson(const GpuInfo &gpu) {"vram_free_mb", gpu.vramFreeMb}, {"compute_capability", gpu.computeCapability}, {"utilization_percent", gpu.utilizationPercent}, - {"unified_memory", gpu.unifiedMemory} + {"unified_memory", gpu.unifiedMemory}, + {"vram_overridden", gpu.vramOverridden} }; if(gpu.unifiedMemory&&gpu.gpuAccessibleRamMb>0) @@ -80,6 +82,27 @@ nlohmann::json gpuInfoToJson(const GpuInfo &gpu) j["gpu_accessible_ram_free_mb"]=gpu.gpuAccessibleRamFreeMb; } + if(!gpu.memoryHeaps.empty()) + { + nlohmann::json heaps=nlohmann::json::array(); + for(const MemoryHeapInfo &heap:gpu.memoryHeaps) + { + nlohmann::json h={ + {"index", heap.index}, + {"device_local", heap.deviceLocal}, + {"size_mb", heap.sizeMb} + }; + if(gpu.hasMemoryBudget) + { + h["budget_mb"]=heap.budgetMb; + h["usage_mb"]=heap.usageMb; + } + heaps.push_back(h); + } + j["memory_heaps"]=heaps; + j["has_memory_budget"]=gpu.hasMemoryBudget; + } + return j; } @@ -129,6 +152,7 @@ nlohmann::json loadedModelToJson(const LoadedModel &m) {"ram_usage_mb", m.ramUsageMb}, {"estimated_vram_mb", m.estimatedVramUsageMb}, {"context_size", m.contextSize}, + {"max_context_size", m.maxContextSize}, {"gpu_indices", gpuIndices}, {"pinned", m.pinned} }; @@ -140,10 +164,14 @@ nlohmann::json inferenceStatsToJson(const InferenceStats &s) {"model", s.model}, {"variant", s.variant}, {"tokens_per_second", s.tokensPerSecond}, + {"prompt_tokens_per_second", s.promptTokensPerSecond}, + {"generation_tokens_per_second", s.generationTokensPerSecond}, {"prompt_tokens", s.promptTokens}, {"completion_tokens", s.completionTokens}, {"latency_ms", s.latencyMs}, - {"total_time_ms", s.totalTimeMs} + {"total_time_ms", s.totalTimeMs}, + {"prompt_time_ms", s.promptTimeMs}, + {"generation_time_ms", s.generationTimeMs} }; } @@ -278,6 +306,11 @@ void registerRoutes(httplib::Server &server) server.Get("/api/stats/history", handleGetStatsHistory); server.Get("/api/stats/swaps", handleGetStatsSwaps); server.Get("/api/hardware", handleGetHardware); + server.Post("/api/hardware/vram-override", handleSetVramOverride); + server.Delete(R"(/api/hardware/vram-override/(\d+))", handleClearVramOverride); + + // Logs + server.Get("/api/logs", handleGetLogs); // Storage management server.Get("/api/storage", handleGetStorage); @@ -778,7 +811,8 @@ void handleGetVersion(const httplib::Request &, httplib::Response &res) {"version", ver.toString()}, {"major", ver.major}, {"minor", ver.minor}, - {"patch", ver.patch} + {"patch", ver.patch}, + {"llamaCppBuild", ver.llamaCppBuild} }; res.set_content(j.dump(), "application/json"); } @@ -846,16 +880,48 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) std::string variant; int contextSize=0; + // Accept parameters from query string if(req.has_param("variant")) variant=req.get_param_value("variant"); if(req.has_param("context")) contextSize=std::stoi(req.get_param_value("context")); + // Also accept from JSON body (body takes precedence) + if(!req.body.empty()) + { + try + { + nlohmann::json body=nlohmann::json::parse(req.body); + if(body.contains("variant")&&body["variant"].is_string()) + variant=body["variant"].get(); + if(body.contains("context")&&body["context"].is_number_integer()) + contextSize=body["context"].get(); + if(body.contains("context_size")&&body["context_size"].is_number_integer()) + contextSize=body["context_size"].get(); + } + catch(const nlohmann::json::parse_error &) + { + // Not JSON body — ignore, use query params + } + } + + spdlog::info("Load request: model='{}' variant='{}' context={}", modelName, variant, contextSize); + ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize); if(err==ErrorCode::Success) { - res.set_content(nlohmann::json{{"status", "loaded"}, {"model", modelName}}.dump(), "application/json"); + // Include the actual context info in the response + nlohmann::json response={{"status", "loaded"}, {"model", modelName}}; + + std::optional state=ModelRuntime::instance().getModelState(modelName); + if(state.has_value()) + { + response["context_size"]=state->contextSize; + response["max_context_size"]=state->maxContextSize; + } + + res.set_content(response.dump(), "application/json"); } else if(err==ErrorCode::ModelDownloading) { @@ -882,8 +948,88 @@ void handleLoadModel(const httplib::Request &req, httplib::Response &res) } else { + std::string errCode=errorCodeToString(err); + std::string reason; + + switch(err) + { + case ErrorCode::ModelNotFound: + reason="Model '"+modelName+"' is not defined in any loaded configuration file. " + "Check that the model name matches a config entry and that the config path is correct."; + break; + case ErrorCode::ModelLoadError: + { + LoadErrorDetail detail=ModelRuntime::instance().getLastLoadError(); + reason=detail.summary; + if(reason.empty()) + { + reason="Failed to load model '"+modelName+"'"; + if(!variant.empty()) reason+=" variant '"+variant+"'"; + reason+=". Check the server log for details."; + } + break; + } + case ErrorCode::ModelDownloadFailed: + reason="Download failed for model '"+modelName+"'"; + if(!variant.empty()) reason+=" variant '"+variant+"'"; + reason+=". The download URL may be unreachable or the SHA256 hash did not match."; + break; + case ErrorCode::InvalidRequest: + reason="Invalid request for model '"+modelName+"'. " + "The model may be a local provider without variants defined."; + break; + case ErrorCode::UnsupportedProvider: + reason="The provider for model '"+modelName+"' is not supported or not enabled in this build."; + break; + default: + reason="Unexpected error loading model '"+modelName+"': "+errCode; + break; + } + + spdlog::warn("Load failed: model='{}' variant='{}' error={} — {}", modelName, variant, errCode, reason); + + // Build detail payload; include structured error fields for programmatic handling + LoadErrorDetail loadDetail=ModelRuntime::instance().getLastLoadError(); + + nlohmann::json details={ + {"model", modelName}, + {"variant", variant.empty()?nlohmann::json(nullptr):nlohmann::json(variant)}, + {"context_requested", contextSize}, + {"error_code", errCode}, + {"reason", loadFailureReasonToString(loadDetail.reason)}, + {"recoverable", loadDetail.recoverable} + }; + + if(!loadDetail.action.empty()) + { + details["action"]=loadDetail.action; + } + if(!loadDetail.suggestion.empty()) + { + details["suggestion"]=loadDetail.suggestion; + } + if(!loadDetail.llamaLog.empty()) + { + // Trim to a reasonable size for the response + std::string logSnippet=loadDetail.llamaLog; + if(logSnippet.size()>2000) + { + logSnippet=logSnippet.substr(logSnippet.size()-2000); + } + details["llama_log"]=logSnippet; + } + + nlohmann::json body={ + {"error", { + {"message", reason}, + {"type", "invalid_request_error"}, + {"code", errCode}, + {"param", "model"}, + {"details", details} + }} + }; res.status=400; - res.set_content(errorJson("Failed to load model: "+errorCodeToString(err), "invalid_request_error", "model", errorCodeToString(err)).dump(), "application/json"); + res.set_content(body.dump(), "application/json"); } } catch(const std::exception &e) @@ -949,12 +1095,11 @@ void handleDownloadModel(const httplib::Request &req, httplib::Response &res) { std::string modelName=req.matches[1]; - // Initiate download via loadModel (which triggers download if file not present) std::string variant; if(req.has_param("variant")) variant=req.get_param_value("variant"); - ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant); + ErrorCode err=ArbiterAI::instance().downloadModel(modelName, variant); if(err==ErrorCode::ModelDownloading) { @@ -1238,6 +1383,8 @@ void handleGetStats(const httplib::Request &, httplib::Response &res) {"hardware", systemInfoToJson(snapshot.hardware)}, {"models", models}, {"avg_tokens_per_second", snapshot.avgTokensPerSecond}, + {"avg_prompt_tokens_per_second", snapshot.avgPromptTokensPerSecond}, + {"avg_generation_tokens_per_second", snapshot.avgGenerationTokensPerSecond}, {"active_requests", snapshot.activeRequests} }; @@ -1286,6 +1433,66 @@ void handleGetHardware(const httplib::Request &, httplib::Response &res) res.set_content(systemInfoToJson(hw).dump(), "application/json"); } +void handleSetVramOverride(const httplib::Request &req, httplib::Response &res) +{ + nlohmann::json body; + + try + { + body=nlohmann::json::parse(req.body); + } + catch(const std::exception &) + { + res.status=400; + res.set_content(R"({"error":"Invalid JSON body"})", "application/json"); + return; + } + + if(!body.contains("gpu_index")||!body.contains("vram_mb")) + { + res.status=400; + res.set_content(R"({"error":"Missing required fields: gpu_index, vram_mb"})", "application/json"); + return; + } + + int gpuIndex=body["gpu_index"].get(); + int vramMb=body["vram_mb"].get(); + + if(vramMb<=0) + { + res.status=400; + res.set_content(R"({"error":"vram_mb must be positive"})", "application/json"); + return; + } + + HardwareDetector::instance().setVramOverride(gpuIndex, vramMb); + + nlohmann::json result={ + {"status", "ok"}, + {"gpu_index", gpuIndex}, + {"vram_mb", vramMb} + }; + + res.set_content(result.dump(), "application/json"); +} + +void handleClearVramOverride(const httplib::Request &req, httplib::Response &res) +{ + int gpuIndex=std::stoi(req.matches[1]); + + HardwareDetector::instance().clearVramOverride(gpuIndex); + + // Refresh to restore detected values + HardwareDetector::instance().refresh(); + + nlohmann::json result={ + {"status", "ok"}, + {"gpu_index", gpuIndex} + }; + + res.set_content(result.dump(), "application/json"); +} + // ========== Storage Management ========== namespace @@ -1323,7 +1530,7 @@ std::string timePointToIsoStr(const std::chrono::system_clock::time_point &tp) nlohmann::json downloadedModelToJson(const DownloadedModelFile &f) { - return { + nlohmann::json j={ {"model", f.modelName}, {"variant", f.variant}, {"filename", f.filename}, @@ -1334,8 +1541,14 @@ nlohmann::json downloadedModelToJson(const DownloadedModelFile &f) {"usage_count", f.usageCount}, {"hot_ready", f.hotReady}, {"protected", f.isProtected}, - {"runtime_state", f.runtimeState} + {"runtime_state", f.runtimeState}, + {"file_count", 1+static_cast(f.additionalFiles.size())} }; + if(!f.additionalFiles.empty()) + { + j["additional_files"]=f.additionalFiles; + } + return j; } } // anonymous namespace @@ -1366,15 +1579,18 @@ void handleGetStorageModels(const httplib::Request &req, httplib::Response &res) std::string sortField=req.has_param("sort")?req.get_param_value("sort"):"last_used"; std::string sortOrder=req.has_param("order")?req.get_param_value("order"):"desc"; + bool ascending=(sortOrder=="asc"); + auto compare=[&](const DownloadedModelFile &a, const DownloadedModelFile &b) -> bool { - bool result=false; - if(sortField=="name") result=a.modelName state=ModelRuntime::instance().getModelState(modelName); + if(state.has_value()&&state->state==ModelState::Downloading) + { + if(variant.empty()||state->variant==variant) + { + res.status=409; + res.set_content(nlohmann::json{ + {"error", { + {"message", "Cannot delete model '"+modelName+"': download is in progress"}, + {"type", "invalid_request_error"} + }} + }.dump(), "application/json"); + return; + } + } + + // Unload from ModelRuntime if loaded if(state.has_value()&&(state->state==ModelState::Loaded||state->state==ModelState::Ready)) { if(variant.empty()||state->variant==variant) @@ -1757,6 +1989,64 @@ void handleGetActiveDownloads(const httplib::Request &, httplib::Response &res) res.set_content(nlohmann::json{{"downloads", downloads}}.dump(), "application/json"); } +// ========== Logs ========== + +void handleGetLogs(const httplib::Request &req, httplib::Response &res) +{ + auto &sink=logBufferSinkInstance(); + if(!sink) + { + res.set_content(nlohmann::json{{"logs", nlohmann::json::array()}}.dump(), "application/json"); + return; + } + + size_t count=200; + std::string levelFilter; + + if(req.has_param("count")) + { + count=static_cast(std::stoi(req.get_param_value("count"))); + if(count>1000) count=1000; + } + if(req.has_param("level")) + { + levelFilter=req.get_param_value("level"); + } + + std::deque entries=sink->getEntries(count); + + nlohmann::json logs=nlohmann::json::array(); + for(const LogEntry &entry:entries) + { + if(!levelFilter.empty()&&entry.level!=levelFilter) + continue; + + auto epochMs=std::chrono::duration_cast( + entry.timestamp.time_since_epoch()).count(); + + // Format ISO timestamp + std::time_t t=std::chrono::system_clock::to_time_t(entry.timestamp); + std::tm tm{}; + gmtime_r(&t, &tm); + + auto ms=epochMs%1000; + char buf[32]; + std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", &tm); + + std::ostringstream ts; + ts< gpus=hw.getGpus(); + + if(gpus.empty()) + { + GTEST_SKIP()<<"No GPUs detected, cannot test override persistence"; + } + + int gpuIdx=gpus[0].index; + int overrideValue=12345; + + hw.setVramOverride(gpuIdx, overrideValue); + hw.refresh(); + + SystemInfo info=hw.getSystemInfo(); + ASSERT_FALSE(info.gpus.empty()); + + bool found=false; + for(const GpuInfo &gpu:info.gpus) + { + if(gpu.index==gpuIdx) + { + EXPECT_EQ(gpu.vramTotalMb, overrideValue); + EXPECT_LE(gpu.vramFreeMb, overrideValue); + EXPECT_TRUE(gpu.vramOverridden); + found=true; + break; + } + } + EXPECT_TRUE(found); +} + +TEST_F(VramOverrideTest, OverrideAppliedToGpuInfo) +{ + HardwareDetector &hw=HardwareDetector::instance(); + std::vector gpus=hw.getGpus(); + + if(gpus.empty()) + { + GTEST_SKIP()<<"No GPUs detected, cannot test override application"; + } + + int gpuIdx=gpus[0].index; + int originalTotal=gpus[0].vramTotalMb; + int overrideValue=originalTotal/2; + + if(overrideValue<=0) overrideValue=1024; + + hw.setVramOverride(gpuIdx, overrideValue); + + SystemInfo info=hw.getSystemInfo(); + + for(const GpuInfo &gpu:info.gpus) + { + if(gpu.index==gpuIdx) + { + EXPECT_EQ(gpu.vramTotalMb, overrideValue); + EXPECT_TRUE(gpu.vramOverridden); + break; + } + } +} + +TEST_F(VramOverrideTest, NonOverriddenGpuUnaffected) +{ + HardwareDetector &hw=HardwareDetector::instance(); + std::vector gpus=hw.getGpus(); + + if(gpus.empty()) + { + GTEST_SKIP()<<"No GPUs detected"; + } + + int gpuIdx=gpus[0].index; + int originalTotal=gpus[0].vramTotalMb; + + // Override a non-existent GPU index + hw.setVramOverride(999, 99999); + hw.refresh(); + + SystemInfo info=hw.getSystemInfo(); + for(const GpuInfo &gpu:info.gpus) + { + if(gpu.index==gpuIdx) + { + // Original GPU should be unaffected + EXPECT_FALSE(gpu.vramOverridden); + break; + } + } +} + +TEST_F(VramOverrideTest, OverrideAffectsModelFitCalculation) +{ + GpuInfo gpu; + gpu.index=0; + gpu.name="Test GPU"; + gpu.backend=GpuBackend::CUDA; + gpu.vramTotalMb=2000; + gpu.vramFreeMb=2000; + + SystemInfo hw; + hw.totalRamMb=32000; + hw.freeRamMb=24000; + hw.cpuCores=8; + hw.gpus={gpu}; + + ModelInfo model; + model.model="test-model"; + model.provider="llama"; + + HardwareRequirements hwReqs; + hwReqs.minSystemRamMb=8192; + hwReqs.parameterCount="7B"; + model.hardwareRequirements=hwReqs; + + ContextScaling scaling; + scaling.baseContext=4096; + scaling.maxContext=131072; + scaling.vramPer1kContextMb=64; + model.contextScaling=scaling; + + ModelVariant variant; + variant.quantization="Q4_K_M"; + variant.fileSizeMb=4370; + variant.minVramMb=4096; + variant.recommendedVramMb=8192; + + // Without override: 2000 MB VRAM cannot fit 4096 minVram, CPU fallback gives base context + ModelFit fitBefore=ModelFitCalculator::calculateModelFit(model, variant, hw); + EXPECT_EQ(fitBefore.maxContextSize, 4096); + + // Apply override: 16000 MB VRAM now fits model + allows extra context + hw.gpus[0].vramTotalMb=16000; + hw.gpus[0].vramFreeMb=16000; + hw.gpus[0].vramOverridden=true; + + ModelFit fitAfter=ModelFitCalculator::calculateModelFit(model, variant, hw); + + EXPECT_TRUE(fitAfter.canRun); + EXPECT_GT(fitAfter.maxContextSize, fitBefore.maxContextSize); +} + } // namespace arbiterAI diff --git a/tests/modelManagerTests.cpp b/tests/modelManagerTests.cpp index 75d06a4..8b31798 100644 --- a/tests/modelManagerTests.cpp +++ b/tests/modelManagerTests.cpp @@ -628,4 +628,178 @@ TEST_F(ModelManagerConfigInjectionTest, AddModelFromJson_AllOptionalFields) EXPECT_DOUBLE_EQ(info->pricing.completion_token_cost, 0.0006); } +TEST_F(ModelManagerTest, ParseSplitGGUFVariant) +{ + std::filesystem::create_directory("config_split"); + std::filesystem::create_directory("config_split/models"); + std::ofstream outfile("config_split/models/split_model.json"); + outfile << R"({ + "models": [ + { + "model": "big-model-120b", + "provider": "llama", + "variants": [ + { + "quantization": "Q4_K_M", + "file_size_mb": 70000, + "min_vram_mb": 40000, + "recommended_vram_mb": 48000, + "download": { + "url": "https://example.com/big-model-Q4_K_M-00001-of-00002.gguf", + "sha256": "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1", + "filename": "big-model-Q4_K_M-00001-of-00002.gguf" + }, + "files": [ + { + "url": "https://example.com/big-model-Q4_K_M-00001-of-00002.gguf", + "sha256": "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1", + "filename": "big-model-Q4_K_M-00001-of-00002.gguf" + }, + { + "url": "https://example.com/big-model-Q4_K_M-00002-of-00002.gguf", + "sha256": "bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb2", + "filename": "big-model-Q4_K_M-00002-of-00002.gguf" + } + ] + } + ] + } + ] + })"; + outfile.close(); + + ModelManager &mm=ModelManager::instance(); + mm.initialize({"config_split"}); + + auto model=mm.getModelInfo("big-model-120b"); + ASSERT_TRUE(model.has_value()); + ASSERT_EQ(model->variants.size(), 1u); + + const ModelVariant &v=model->variants[0]; + EXPECT_EQ(v.quantization, "Q4_K_M"); + EXPECT_EQ(v.fileSizeMb, 70000); + + // Single-file download field (backward compat) + EXPECT_EQ(v.download.url, "https://example.com/big-model-Q4_K_M-00001-of-00002.gguf"); + EXPECT_EQ(v.download.filename, "big-model-Q4_K_M-00001-of-00002.gguf"); + + // Multi-file files array + ASSERT_EQ(v.files.size(), 2u); + EXPECT_EQ(v.files[0].filename, "big-model-Q4_K_M-00001-of-00002.gguf"); + EXPECT_EQ(v.files[1].filename, "big-model-Q4_K_M-00002-of-00002.gguf"); + EXPECT_EQ(v.files[1].sha256, "bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb2"); + + // Helper methods + EXPECT_TRUE(v.isSplit()); + EXPECT_EQ(v.getPrimaryFilename(), "big-model-Q4_K_M-00001-of-00002.gguf"); + + std::vector allFiles=v.getAllFiles(); + ASSERT_EQ(allFiles.size(), 2u); + EXPECT_EQ(allFiles[0].filename, "big-model-Q4_K_M-00001-of-00002.gguf"); + EXPECT_EQ(allFiles[1].filename, "big-model-Q4_K_M-00002-of-00002.gguf"); + + std::filesystem::remove_all("config_split"); +} + +TEST_F(ModelManagerTest, SingleFileVariantBackwardCompat) +{ + // Single-file variant without "files" array should still work + ModelManager &mm=ModelManager::instance(); + mm.initialize({"config1"}); + + auto model=mm.getModelInfo("model1"); + ASSERT_TRUE(model.has_value()); + + // Build a single-file variant via JSON injection + nlohmann::json modelJson={ + {"model", "single-file-model"}, + {"provider", "llama"}, + {"variants", {{ + {"quantization", "Q8_0"}, + {"file_size_mb", 8100}, + {"min_vram_mb", 8192}, + {"download", { + {"url", "https://example.com/single.gguf"}, + {"sha256", "abc123def456abc123def456abc123def456abc123def456abc123def456abcd"}, + {"filename", "single.gguf"} + }} + }}} + }; + + std::string error; + ASSERT_TRUE(mm.addModelFromJson(modelJson, error))<variants.size(), 1u); + + const ModelVariant &v=info->variants[0]; + + // files array should be empty + EXPECT_TRUE(v.files.empty()); + EXPECT_FALSE(v.isSplit()); + + // getAllFiles() should fall back to the single download field + std::vector allFiles=v.getAllFiles(); + ASSERT_EQ(allFiles.size(), 1u); + EXPECT_EQ(allFiles[0].filename, "single.gguf"); + EXPECT_EQ(allFiles[0].url, "https://example.com/single.gguf"); + + // getPrimaryFilename() should return the single download filename + EXPECT_EQ(v.getPrimaryFilename(), "single.gguf"); +} + +TEST_F(ModelManagerTest, SplitVariantSerializationRoundTrip) +{ + nlohmann::json modelJson={ + {"model", "roundtrip-split"}, + {"provider", "llama"}, + {"variants", {{ + {"quantization", "Q4_K_M"}, + {"file_size_mb", 70000}, + {"min_vram_mb", 40000}, + {"recommended_vram_mb", 48000}, + {"download", { + {"url", "https://example.com/shard-00001-of-00003.gguf"}, + {"sha256", "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1"}, + {"filename", "shard-00001-of-00003.gguf"} + }}, + {"files", { + {{"url", "https://example.com/shard-00001-of-00003.gguf"}, + {"sha256", "aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa111aaa1"}, + {"filename", "shard-00001-of-00003.gguf"}}, + {{"url", "https://example.com/shard-00002-of-00003.gguf"}, + {"sha256", "bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb222bbb2"}, + {"filename", "shard-00002-of-00003.gguf"}}, + {{"url", "https://example.com/shard-00003-of-00003.gguf"}, + {"sha256", "ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc333ccc3"}, + {"filename", "shard-00003-of-00003.gguf"}} + }} + }}} + }; + + ModelManager &mm=ModelManager::instance(); + mm.initialize({"config1"}); + + std::string error; + ASSERT_TRUE(mm.addModelFromJson(modelJson, error))<(), "shard-00003-of-00003.gguf"); + + // Verify download field is also present (backward compat) + ASSERT_TRUE(outputJson["variants"][0].contains("download")); + EXPECT_EQ(outputJson["variants"][0]["download"]["filename"].get(), "shard-00001-of-00003.gguf"); +} + } // namespace arbiterAI \ No newline at end of file diff --git a/tests/modelRuntimeTests.cpp b/tests/modelRuntimeTests.cpp index 03162ef..86ee084 100644 --- a/tests/modelRuntimeTests.cpp +++ b/tests/modelRuntimeTests.cpp @@ -1,6 +1,7 @@ #include "arbiterAI/modelRuntime.h" #include "arbiterAI/modelManager.h" #include "arbiterAI/hardwareDetector.h" +#include "arbiterAI/storageManager.h" #include #include #include @@ -484,4 +485,271 @@ TEST_F(ModelRuntimeTest, LoadLocalModelInvalidVariantFails) EXPECT_EQ(result, ErrorCode::ModelNotFound); } +// --- Download concurrency settings --- + +TEST_F(ModelRuntimeTest, DefaultMaxConcurrentDownloadsIsTwo) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + EXPECT_EQ(rt.getMaxConcurrentDownloads(), 2); +} + +TEST_F(ModelRuntimeTest, SetMaxConcurrentDownloads) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + rt.setMaxConcurrentDownloads(4); + EXPECT_EQ(rt.getMaxConcurrentDownloads(), 4); + + rt.setMaxConcurrentDownloads(1); + EXPECT_EQ(rt.getMaxConcurrentDownloads(), 1); +} + +TEST_F(ModelRuntimeTest, SetMaxConcurrentDownloadsClampsToMinimumOne) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + rt.setMaxConcurrentDownloads(0); + EXPECT_EQ(rt.getMaxConcurrentDownloads(), 1); + + rt.setMaxConcurrentDownloads(-5); + EXPECT_EQ(rt.getMaxConcurrentDownloads(), 1); +} + +// --- downloadModel --- + +TEST_F(ModelRuntimeTest, DownloadModelNotFoundFails) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + ErrorCode result=rt.downloadModel("nonexistent-model"); + + EXPECT_EQ(result, ErrorCode::ModelNotFound); +} + +TEST_F(ModelRuntimeTest, DownloadModelCloudModelReturnsSuccess) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + // Cloud models (mock) have no variants — nothing to download + ErrorCode result=rt.downloadModel("mock-model"); + + EXPECT_EQ(result, ErrorCode::Success); +} + +TEST_F(ModelRuntimeTest, DownloadModelInvalidVariantFails) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + ErrorCode result=rt.downloadModel("test-local-7b", "INVALID_QUANT"); + + EXPECT_EQ(result, ErrorCode::ModelNotFound); +} + +TEST_F(ModelRuntimeTest, DownloadModelAlreadyDownloadingReturnDownloading) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + // First call should initiate download (or fail fast if no URL, but will + // set state to Downloading if URL is empty — the background thread will + // finish immediately since there are no missing files with valid URLs). + // For this test, we rely on the test config having empty URLs: + // the variant URLs are empty, so no files are actually missing (they have + // no URL to download from). That means downloadModel returns Success. + // Instead, let's verify the state tracking for an already-loaded model. + rt.loadModel("mock-model"); + auto state=rt.getModelState("mock-model"); + ASSERT_TRUE(state.has_value()); + EXPECT_EQ(state->state, ModelState::Loaded); + + // Calling download on a loaded model returns Success + ErrorCode result=rt.downloadModel("mock-model"); + EXPECT_EQ(result, ErrorCode::Success); +} + +TEST_F(ModelRuntimeTest, LoadModelMissingFilesReturnsDownloading) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + // Create a config with a downloadable URL (fake but non-empty) + std::ofstream dlModel("rt_config/models/download_model.json"); + dlModel << R"({ + "models": [ + { + "model": "test-download-7b", + "provider": "llama", + "ranking": 40, + "context_window": 4096, + "max_tokens": 2048, + "hardware_requirements": { + "min_system_ram_mb": 4096, + "parameter_count": "7B" + }, + "context_scaling": { + "base_context": 4096, + "max_context": 32768, + "vram_per_1k_context_mb": 64 + }, + "variants": [ + { + "quantization": "Q4_K_M", + "file_size_mb": 1, + "min_vram_mb": 128, + "recommended_vram_mb": 256, + "download": { + "url": "http://localhost:99999/fake.gguf", + "sha256": "abc123", + "filename": "test-dl-7b-q4.gguf" + } + } + ] + } + ] + })"; + dlModel.close(); + + // Re-initialize ModelManager to pick up the new config + ModelManager::reset(); + ModelManager::instance().initialize({"rt_config"}); + ModelRuntime::reset(); + + // Initialize StorageManager so canDownload() succeeds + std::filesystem::create_directories("rt_test_models"); + StorageManager::instance().initialize("rt_test_models"); + StorageManager::instance().setStorageLimit(static_cast(10)*1024*1024*1024); + + // loadModel should detect missing file and return ModelDownloading + ErrorCode result=rt.loadModel("test-download-7b", "Q4_K_M"); + + EXPECT_EQ(result, ErrorCode::ModelDownloading); + + auto state=rt.getModelState("test-download-7b"); + ASSERT_TRUE(state.has_value()); + EXPECT_EQ(state->state, ModelState::Downloading); + + std::filesystem::remove_all("rt_test_models"); +} + +TEST_F(ModelRuntimeTest, DownloadModelMissingFilesReturnsDownloading) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + // Create a config with a downloadable URL + std::ofstream dlModel("rt_config/models/download_model2.json"); + dlModel << R"({ + "models": [ + { + "model": "test-download-7b-v2", + "provider": "llama", + "ranking": 40, + "context_window": 4096, + "max_tokens": 2048, + "hardware_requirements": { + "min_system_ram_mb": 4096, + "parameter_count": "7B" + }, + "context_scaling": { + "base_context": 4096, + "max_context": 32768, + "vram_per_1k_context_mb": 64 + }, + "variants": [ + { + "quantization": "Q4_K_M", + "file_size_mb": 1, + "min_vram_mb": 128, + "recommended_vram_mb": 256, + "download": { + "url": "http://localhost:99999/fake2.gguf", + "sha256": "def456", + "filename": "test-dl-7b-v2-q4.gguf" + } + } + ] + } + ] + })"; + dlModel.close(); + + ModelManager::reset(); + ModelManager::instance().initialize({"rt_config"}); + ModelRuntime::reset(); + + // Initialize StorageManager so canDownload() succeeds + std::filesystem::create_directories("rt_test_models"); + StorageManager::instance().initialize("rt_test_models"); + StorageManager::instance().setStorageLimit(static_cast(10)*1024*1024*1024); + + ErrorCode result=rt.downloadModel("test-download-7b-v2", "Q4_K_M"); + + EXPECT_EQ(result, ErrorCode::ModelDownloading); + + auto state=rt.getModelState("test-download-7b-v2"); + ASSERT_TRUE(state.has_value()); + EXPECT_EQ(state->state, ModelState::Downloading); + + std::filesystem::remove_all("rt_test_models"); +} + +TEST_F(ModelRuntimeTest, ResetClearsDownloadState) +{ + ModelRuntime &rt=ModelRuntime::instance(); + + // Create a config with a downloadable URL + std::ofstream dlModel("rt_config/models/download_model3.json"); + dlModel << R"({ + "models": [ + { + "model": "test-download-reset", + "provider": "llama", + "ranking": 40, + "context_window": 4096, + "max_tokens": 2048, + "hardware_requirements": { + "min_system_ram_mb": 4096, + "parameter_count": "7B" + }, + "context_scaling": { + "base_context": 4096, + "max_context": 32768, + "vram_per_1k_context_mb": 64 + }, + "variants": [ + { + "quantization": "Q4_K_M", + "file_size_mb": 1, + "min_vram_mb": 128, + "recommended_vram_mb": 256, + "download": { + "url": "http://localhost:99999/fake3.gguf", + "sha256": "ghi789", + "filename": "test-dl-reset-q4.gguf" + } + } + ] + } + ] + })"; + dlModel.close(); + + ModelManager::reset(); + ModelManager::instance().initialize({"rt_config"}); + ModelRuntime::reset(); + + // Initialize StorageManager so canDownload() succeeds + std::filesystem::create_directories("rt_test_models"); + StorageManager::instance().initialize("rt_test_models"); + StorageManager::instance().setStorageLimit(static_cast(10)*1024*1024*1024); + + rt.downloadModel("test-download-reset", "Q4_K_M"); + + // Reset should clear all state and join threads + ModelRuntime::reset(); + + EXPECT_TRUE(rt.getModelStates().empty()); + EXPECT_EQ(rt.getMaxConcurrentDownloads(), 2); + + std::filesystem::remove_all("rt_test_models"); +} + } // namespace arbiterAI diff --git a/tests/serverConnectTests.cpp b/tests/serverConnectTests.cpp new file mode 100644 index 0000000..dd60539 --- /dev/null +++ b/tests/serverConnectTests.cpp @@ -0,0 +1,330 @@ +#include + +#include "arbiterAI/arbiterAI.h" +#include "arbiterAI/chatClient.h" +#include "arbiterAI/modelManager.h" +#include "arbiterAI/providers/openai.h" + +#include +#include +#include +#include + +namespace arbiterAI +{ + +// Remote arbiterAI server endpoint +static const std::string SERVER_URL="http://192.168.2.114:8080"; +static const std::string SERVER_API_URL=SERVER_URL+"/v1"; + +// Helper: check whether the remote server is reachable. +static bool isServerReachable() +{ + auto response=cpr::Get( + cpr::Url{SERVER_URL+"/health"}, + cpr::Timeout{3000}); + + if(response.error || response.status_code!=200) + return false; + + try + { + auto j=nlohmann::json::parse(response.text); + return j.value("status", "")=="ok"; + } + catch(...) + { + return false; + } +} + +// Helper: query /v1/models on the remote server and return the first model id. +static std::string getFirstAvailableModel() +{ + auto response=cpr::Get( + cpr::Url{SERVER_API_URL+"/models"}, + cpr::Timeout{5000}); + + if(response.error || response.status_code!=200) + return ""; + + try + { + auto j=nlohmann::json::parse(response.text); + if(j.contains("data") && j["data"].is_array() && !j["data"].empty()) + { + return j["data"][0].value("id", ""); + } + } + catch(...) + { + } + return ""; +} + +// Helper: find a model that is currently loaded and ready for inference +// on the remote server. Returns empty string if none. +static std::string getLoadedModel() +{ + auto response=cpr::Get( + cpr::Url{SERVER_URL+"/api/models/loaded"}, + cpr::Timeout{5000}); + + if(response.error || response.status_code!=200) + return ""; + + try + { + auto j=nlohmann::json::parse(response.text); + if(!j.contains("models") || !j["models"].is_array()) + return ""; + + for(const auto &m : j["models"]) + { + std::string state=m.value("state", ""); + if(state=="Loaded") + { + return m.value("model", ""); + } + } + } + catch(...) + { + } + return ""; +} + +class ServerConnectTest : public ::testing::Test { +protected: + void SetUp() override + { + if(!isServerReachable()) + { + GTEST_SKIP()<<"arbiterAI server not reachable at "<(); + + auto provider=std::make_unique(); + provider->setApiUrl(SERVER_API_URL); + provider->setApiKey("not-needed"); + provider->initialize(ModelManager::instance().getModels("openai")); + m_ai->providers["openai"]=std::move(provider); + m_ai->initialized=true; + } + + void TearDown() override + { + m_ai.reset(); + ModelManager::reset(); + } + + std::string m_remoteModel; + std::unique_ptr m_ai; +}; + +// ── Health ──────────────────────────────────────────────────── + +TEST_F(ServerConnectTest, HealthEndpointReturnsOk) +{ + auto response=cpr::Get( + cpr::Url{SERVER_URL+"/health"}, + cpr::Timeout{5000}); + + ASSERT_EQ(response.status_code, 200); + + auto j=nlohmann::json::parse(response.text); + EXPECT_EQ(j.value("status", ""), "ok"); + EXPECT_FALSE(j.value("version", "").empty()); +} + +// ── Model Listing ──────────────────────────────────────────── + +TEST_F(ServerConnectTest, ListModelsReturnsNonEmpty) +{ + auto response=cpr::Get( + cpr::Url{SERVER_API_URL+"/models"}, + cpr::Timeout{5000}); + + ASSERT_EQ(response.status_code, 200); + + auto j=nlohmann::json::parse(response.text); + ASSERT_TRUE(j.contains("data")); + ASSERT_TRUE(j["data"].is_array()); + EXPECT_GT(j["data"].size(), 0u); + + // Each entry should have an "id" + for(const auto &model : j["data"]) + { + EXPECT_TRUE(model.contains("id")); + EXPECT_FALSE(model["id"].get().empty()); + } +} + +// ── Library Completion via OpenAI Provider ──────────────────── + +TEST_F(ServerCompletionTest, CompletionThroughLibrary) +{ + CompletionRequest request; + request.model=m_remoteModel; + request.messages={{"user", "Say hello in exactly one word."}}; + request.max_tokens=32; + request.temperature=0.0; + request.api_key="not-needed"; + + CompletionResponse response; + ErrorCode result=m_ai->completion(request, response); + + ASSERT_EQ(result, ErrorCode::Success) + <<"Completion failed for model: "<streamingCompletion(request, callback); + + ASSERT_EQ(result, ErrorCode::Success) + <<"Streaming completion failed for model: "<createChatClient(config); + ASSERT_NE(client, nullptr) + <<"Failed to create ChatClient for model: "<completion(request, response); + + ASSERT_EQ(result, ErrorCode::Success); + EXPECT_FALSE(response.text.empty()); + + // Verify the history was updated + auto history=client->getHistory(); + EXPECT_GE(history.size(), 2u) + <<"History should contain at least the user message and assistant response"; + + // Verify usage stats + UsageStats stats; + client->getUsageStats(stats); + EXPECT_GT(stats.totalTokens, 0); + EXPECT_EQ(stats.completionCount, 1); +} + +// ── Multi-turn Conversation ─────────────────────────────────── + +TEST_F(ServerCompletionTest, MultiTurnConversation) +{ + ChatConfig config; + config.model=m_remoteModel; + config.maxTokens=64; + config.temperature=0.0; + config.apiKey="not-needed"; + + auto client=m_ai->createChatClient(config); + ASSERT_NE(client, nullptr); + + // Turn 1 + CompletionRequest req1; + req1.messages={{"user", "Remember the number 42."}}; + req1.max_tokens=64; + req1.temperature=0.0; + + CompletionResponse resp1; + ASSERT_EQ(client->completion(req1, resp1), ErrorCode::Success); + EXPECT_FALSE(resp1.text.empty()); + + // Turn 2 — references turn 1 + CompletionRequest req2; + req2.messages={{"user", "What number did I ask you to remember?"}}; + req2.max_tokens=32; + req2.temperature=0.0; + + CompletionResponse resp2; + ASSERT_EQ(client->completion(req2, resp2), ErrorCode::Success); + EXPECT_FALSE(resp2.text.empty()); + + // History should contain system (if any) + 4 messages (2 user + 2 assistant) + auto history=client->getHistory(); + EXPECT_GE(history.size(), 4u); + + // Usage should reflect both completions + UsageStats stats; + client->getUsageStats(stats); + EXPECT_EQ(stats.completionCount, 2); +} + +} // namespace arbiterAI diff --git a/vcpkg/custom_ports/llama-cpp/0001-external-ggml.patch b/vcpkg/custom_ports/llama-cpp/0001-external-ggml.patch deleted file mode 100644 index 3a745b4..0000000 --- a/vcpkg/custom_ports/llama-cpp/0001-external-ggml.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -145,6 +145,8 @@ endif() - # 3rd-party - # - -+find_package(ggml CONFIG REQUIRED) -+add_library(ggml ALIAS ggml::ggml) - if (NOT TARGET ggml) - add_subdirectory(ggml) - # ... otherwise assume ggml is added by a parent CMakeLists.txt diff --git a/vcpkg/custom_ports/llama-cpp/portfile.cmake b/vcpkg/custom_ports/llama-cpp/portfile.cmake index dc01956..7007666 100644 --- a/vcpkg/custom_ports/llama-cpp/portfile.cmake +++ b/vcpkg/custom_ports/llama-cpp/portfile.cmake @@ -2,26 +2,26 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO ggml-org/llama.cpp REF b${VERSION} - SHA512 e093f4c7d4b2de425932bb4960683527a8a3bba242132c2f5e5bfed8480f0e336a06f97baf2d20ee591c6deee7535e159d40884a5e3f7caf0ae0967b8a046850 + SHA512 b05f130a2052d3c2cec483c3b098f71585fe7d00fa1971786c0a646717f82320211801780625b9aabc9fc1e1797f8995381e40661f3e8a115c72710f147083cd HEAD_REF master - PATCHES - 0001-external-ggml.patch ) -file(REMOVE_RECURSE "${SOURCE_PATH}/ggml") - vcpkg_cmake_configure( SOURCE_PATH "${SOURCE_PATH}" OPTIONS -DGGML_CCACHE=OFF + -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF + -DLLAMA_BUILD_TOOLS=OFF + -DLLAMA_BUILD_SERVER=OFF -DLLAMA_ALL_WARNINGS=OFF ${FEATURE_OPTIONS} ) vcpkg_cmake_install() -vcpkg_cmake_config_fixup(PACKAGE_NAME llama CONFIG_PATH "lib/cmake/llama") +vcpkg_cmake_config_fixup(PACKAGE_NAME llama CONFIG_PATH "lib/cmake/llama" DO_NOT_DELETE_PARENT_CONFIG_PATH) +vcpkg_cmake_config_fixup(PACKAGE_NAME ggml CONFIG_PATH "lib/cmake/ggml") vcpkg_copy_pdbs() vcpkg_fixup_pkgconfig() diff --git a/vcpkg/custom_ports/llama-cpp/vcpkg.json b/vcpkg/custom_ports/llama-cpp/vcpkg.json index e20d77f..f6565d7 100644 --- a/vcpkg/custom_ports/llama-cpp/vcpkg.json +++ b/vcpkg/custom_ports/llama-cpp/vcpkg.json @@ -1,12 +1,11 @@ { "name": "llama-cpp", - "version": "4743", + "version": "8573", "port-version": 1, "description": "LLM inference in C/C++", "homepage": "https://github.com/ggml-org/llama.cpp", "license": "MIT", "dependencies": [ - "ggml", { "name": "vcpkg-cmake", "host": true