From fecff1ebb525c6ab0b0cf015a086cf97ba51e065 Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Tue, 7 Oct 2025 21:55:49 +0100 Subject: [PATCH 01/13] html file to demo multimodal Demos llama.cpp multimodal support with a webcam. Signed-off-by: Eric Curtin --- assets/camera-demo.html | 265 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 assets/camera-demo.html diff --git a/assets/camera-demo.html b/assets/camera-demo.html new file mode 100644 index 000000000..072de8d56 --- /dev/null +++ b/assets/camera-demo.html @@ -0,0 +1,265 @@ + + + + + + Camera Interaction App + + + + +

Camera Interaction App

+ + + + +
+
+
+ +
+
+
+ +
+
+
+ +
+
+ +
+ + + +
+ + + + From 940282c2fa4e298345a9ac514263b74daf738972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 16:16:34 +0200 Subject: [PATCH 02/13] chore: rename camera-demo.html for improved accessibility --- {assets => demos/multimodal}/camera-demo.html | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {assets => demos/multimodal}/camera-demo.html (100%) diff --git a/assets/camera-demo.html b/demos/multimodal/camera-demo.html similarity index 100% rename from assets/camera-demo.html rename to demos/multimodal/camera-demo.html From 5a0263dab83dfb036d0fc6fe7cc17ca8805640d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 16:47:33 +0200 Subject: [PATCH 03/13] add readme and include model in the request --- demos/multimodal/README.md | 91 ++++++++++ demos/multimodal/camera-demo.html | 265 ---------------------------- demos/multimodal/demo.html | 281 ++++++++++++++++++++++++++++++ 3 files changed, 372 insertions(+), 265 deletions(-) create mode 100644 demos/multimodal/README.md delete mode 100644 demos/multimodal/camera-demo.html create mode 100644 demos/multimodal/demo.html diff --git a/demos/multimodal/README.md b/demos/multimodal/README.md new file mode 100644 index 000000000..eda968261 --- /dev/null +++ b/demos/multimodal/README.md @@ -0,0 +1,91 @@ +# Real-time Webcam Vision Model Demo + +This demo allows you to interact with a vision model in real-time using your webcam. The model can analyze the video feed and answer questions about what it sees. + +## Credits + +This demo is based on the excellent work by [ngxson/smolvlm-realtime-webcam](https://github.com/ngxson/smolvlm-realtime-webcam). Thank you for creating this impressive demonstration! + +## Prerequisites + +Before running this demo, you need: + +1. **Docker Model Runner** - Either through Docker Desktop or standalone installation +2. **The SmolVLM model** - Specifically `ai/smolvlm:500M-Q8_0` + +## Setup Instructions + +You have two options for setting up Docker Model Runner: + +### Option A: Using Docker Desktop (Easiest) + +This is the recommended approach for most users. + +1. **Enable Docker Model Runner** + - Open Docker Desktop settings + - Go to the **AI** tab + - Select **Enable Docker Model Runner** + +2. **Enable TCP Support and CORS** + - In the same settings page, select **Enable host-side TCP support** + - Set the **Port** to `12434` (default) + - In **CORS Allows Origins**, add `*` or the specific origin where you'll open the HTML file + + For detailed instructions, see the [Docker Model Runner documentation](https://docs.docker.com/ai/model-runner/get-started/#enable-docker-model-runner). + +3. **Pull the Model** + - Open Docker Desktop + - Go to the **Models** tab → **Docker Hub** + - Search for `ai/smolvlm:500M-Q8_0` and click **Pull** + + Or use the CLI: + ```bash + docker model pull ai/smolvlm:500M-Q8_0 + ``` + +### Option B: Using Standalone Docker Model Runner + +If you prefer not to use Docker Desktop, you can run Docker Model Runner directly: + +1. **Install Docker Model Runner** + + Follow the installation instructions in the [main README](../../README.md) for your platform. + +2. **Pull the Model** + ```bash + docker model pull ai/smolvlm:500M-Q8_0 + ``` + +> **Note:** TCP support is enabled by default on port `12434` when using Docker Engine. + +## Running the Demo + +1. **Open the Demo** + - Simply open `demo.html` in your web browser + - You can open it directly from your file system or serve it with a local web server + +2. **Grant Camera Permission** + - Your browser will ask for camera access + - Click "Allow" to grant permission + +3. **Configure the Demo** + - **Base API**: By default set to `http://localhost:12434/engines/llama.cpp` + - Change the port if you configured Docker Model Runner on a different port + - **Instruction**: Enter what you want the model to analyze (default: "What do you see?") + - Examples: "Describe the scene", "What objects can you see?", "What is the person doing?" + - **Interval**: Choose how often to send requests to the model (default: 500ms) + - Shorter intervals = more responsive but higher resource usage + - Longer intervals = lower resource usage but less real-time feel + +4. **Start the Interaction** + - Click the **Start** button + - The model will begin analyzing your webcam feed + - Responses will appear in the **Response** text area + - Click **Stop** when you're done + +## Learn More + +- [Community Slack Channel](https://app.slack.com/client/T0JK1PCN6/C09H9P5E57B) +- [Docker Model Runner Documentation](https://docs.docker.com/ai/model-runner/) +- [Original Demo by ngxson](https://github.com/ngxson/smolvlm-realtime-webcam) +- [SmolVLM Model Information](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) diff --git a/demos/multimodal/camera-demo.html b/demos/multimodal/camera-demo.html deleted file mode 100644 index 072de8d56..000000000 --- a/demos/multimodal/camera-demo.html +++ /dev/null @@ -1,265 +0,0 @@ - - - - - - Camera Interaction App - - - - -

Camera Interaction App

- - - - -
-
-
- -
-
-
- -
-
-
- -
-
- -
- - - -
- - - - diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html new file mode 100644 index 000000000..02603b0ca --- /dev/null +++ b/demos/multimodal/demo.html @@ -0,0 +1,281 @@ + + + + + + Camera Interaction App + + + + +

Camera Interaction App

+ + + + +
+
+
+ +
+
+
+ +
+
+
+ +
+
+ +
+ + + +
+ + + + \ No newline at end of file From 94e8efff75f498e516c517e94e8e701762006c3c Mon Sep 17 00:00:00 2001 From: Ignasi Date: Wed, 8 Oct 2025 16:50:07 +0200 Subject: [PATCH 04/13] Update demos/multimodal/demo.html Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- demos/multimodal/demo.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 02603b0ca..2b80ad261 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -187,7 +187,7 @@

Camera Interaction App

} async function sendData() { - if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval + if (isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval const instruction = instructionText.value; const imageBase64URL = captureImage(); From 0aea05beeb08ff488285b4e2432710d78d6f73f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 17:00:49 +0200 Subject: [PATCH 05/13] revert wrong fix --- demos/multimodal/demo.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 2b80ad261..02603b0ca 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -187,7 +187,7 @@

Camera Interaction App

} async function sendData() { - if (isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval + if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval const instruction = instructionText.value; const imageBase64URL = captureImage(); From 4e98f994fc558f23ae2817094ced85ba2834f3e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 17:16:17 +0200 Subject: [PATCH 06/13] feat: add model selection and fetch functionality to demo --- demos/multimodal/README.md | 4 ++ demos/multimodal/demo.html | 109 +++++++++++++++++++++++++++++++------ 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/demos/multimodal/README.md b/demos/multimodal/README.md index eda968261..d42b6177a 100644 --- a/demos/multimodal/README.md +++ b/demos/multimodal/README.md @@ -71,6 +71,10 @@ If you prefer not to use Docker Desktop, you can run Docker Model Runner directl 3. **Configure the Demo** - **Base API**: By default set to `http://localhost:12434/engines/llama.cpp` - Change the port if you configured Docker Model Runner on a different port + - **Model**: Select from available models pulled to your Docker Model Runner + - The demo automatically fetches and displays all available models + - SmolVLM models will be auto-selected if available + - If model fetching fails, it falls back to `ai/smolvlm:500M-Q8_0` - **Instruction**: Enter what you want the model to analyze (default: "What do you see?") - Examples: "Describe the scene", "What objects can you see?", "What is the person doing?" - **Interval**: Choose how often to send requests to the model (default: 500ms) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 02603b0ca..607cdd6af 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -89,7 +89,13 @@

Camera Interaction App


- + +
+
+
+

@@ -118,6 +124,7 @@

Camera Interaction App

const video = document.getElementById('videoFeed'); const canvas = document.getElementById('canvas'); const baseURL = document.getElementById('baseURL'); + const modelSelect = document.getElementById('modelSelect'); const instructionText = document.getElementById('instructionText'); const responseText = document.getElementById('responseText'); const intervalSelect = document.getElementById('intervalSelect'); @@ -129,44 +136,110 @@

Camera Interaction App

let intervalId; let isProcessing = false; + // Fetch available models from the API + async function fetchModels() { + try { + const response = await fetch(`${baseURL.value}/v1/models`); + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + const data = await response.json(); + + // Clear the loading option + modelSelect.innerHTML = ''; + + if (data && data.length > 0) { + let totalTags = 0; + // Populate dropdown with available models using their tags + data.forEach(model => { + if (model.tags && model.tags.length > 0) { + model.tags.forEach(tag => { + const option = document.createElement('option'); + option.value = tag; + option.textContent = tag; + modelSelect.appendChild(option); + totalTags++; + }); + } + }); + + if (totalTags > 0) { + // Try to select smolvlm model by default, or use the first option + const options = Array.from(modelSelect.options); + const smolvlmOption = options.find(opt => opt.value.toLowerCase().includes('smolvlm')); + if (smolvlmOption) { + modelSelect.value = smolvlmOption.value; + } else { + modelSelect.value = options[0].value; + } + + responseText.value = `Found ${totalTags} model(s). Ready to start.`; + } else { + modelSelect.innerHTML = ''; + responseText.value = "No tagged models found. Please pull a model first."; + } + } else { + modelSelect.innerHTML = ''; + responseText.value = "No models found. Please pull a model first."; + } + } catch (error) { + console.error('Error fetching models:', error); + modelSelect.innerHTML = ''; + responseText.value = `Could not fetch models: ${error.message}. Using fallback model.`; + } + } + // Returns response text (string) async function sendChatCompletionRequest(instruction, imageBase64URL) { + const selectedModel = modelSelect.value; + if (!selectedModel) { + return "Error: No model selected"; + } + const response = await fetch(`${baseURL.value}/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ + model: selectedModel, max_tokens: 100, - model: "ai/smolvlm:500M-Q8_0", messages: [ - { - role: 'user', content: [ - {type: 'text', text: instruction}, - { - type: 'image_url', image_url: { + { role: 'user', content: [ + { type: 'text', text: instruction }, + { type: 'image_url', image_url: { url: imageBase64URL, - } - } - ] - }, + } } + ] }, ] }) }); if (!response.ok) { - const errorData = await response.text(); - return `Server error: ${response.status} - ${errorData}`; + const errorText = await response.text(); + try { + const errorData = JSON.parse(errorText); + // Check if error message indicates no multimodal support + if (errorData.error && errorData.error.message && + errorData.error.message.includes('image input is not supported')) { + return "Error: The selected model does not support image input. Please select a vision model (e.g., SmolVLM)."; + } + return `Server error: ${response.status} - ${errorData.error?.message || errorText}`; + } catch (e) { + // If JSON parse fails, use the raw text + return `Server error: ${response.status} - ${errorText}`; + } } const data = await response.json(); return data.choices[0].message.content; } - // 1. Ask for camera permission on load + // 1. Ask for camera permission and fetch models on load async function initCamera() { try { - stream = await navigator.mediaDevices.getUserMedia({video: true, audio: false}); + stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false }); video.srcObject = stream; - responseText.value = "Camera access granted. Ready to start."; + responseText.value = "Camera access granted. Loading models..."; + await fetchModels(); } catch (err) { console.error("Error accessing camera:", err); responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`; @@ -226,6 +299,7 @@

Camera Interaction App

instructionText.disabled = true; intervalSelect.disabled = true; + modelSelect.disabled = true; responseText.value = "Processing started..."; @@ -250,6 +324,7 @@

Camera Interaction App

instructionText.disabled = false; intervalSelect.disabled = false; + modelSelect.disabled = false; if (responseText.value.startsWith("Processing started...")) { responseText.value = "Processing stopped."; } @@ -278,4 +353,4 @@

Camera Interaction App

- \ No newline at end of file + From bd91ddff3badf1ce8f79c4421f851fb0dbd8cfe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 18:02:12 +0200 Subject: [PATCH 07/13] use /models to list models --- demos/multimodal/demo.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 607cdd6af..c713812b1 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -139,7 +139,8 @@

Camera Interaction App

// Fetch available models from the API async function fetchModels() { try { - const response = await fetch(`${baseURL.value}/v1/models`); + const base = new URL(baseURL.value); + const response = await fetch(`${base.protocol}//${base.host}/models`); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } From 1d8c7e8109a288461e47cc0cb7e304b55e18eb6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 20:16:55 +0200 Subject: [PATCH 08/13] Revert to a single model. We have only a few models that support multimodal input, so most models will throw an error if used here. --- demos/multimodal/README.md | 6 +-- demos/multimodal/demo.html | 107 +++---------------------------------- 2 files changed, 9 insertions(+), 104 deletions(-) diff --git a/demos/multimodal/README.md b/demos/multimodal/README.md index d42b6177a..7dea071a7 100644 --- a/demos/multimodal/README.md +++ b/demos/multimodal/README.md @@ -69,12 +69,8 @@ If you prefer not to use Docker Desktop, you can run Docker Model Runner directl - Click "Allow" to grant permission 3. **Configure the Demo** - - **Base API**: By default set to `http://localhost:12434/engines/llama.cpp` + - **Base API**: By default set to `http://127.0.0.1:12434/engines/llama.cpp` - Change the port if you configured Docker Model Runner on a different port - - **Model**: Select from available models pulled to your Docker Model Runner - - The demo automatically fetches and displays all available models - - SmolVLM models will be auto-selected if available - - If model fetching fails, it falls back to `ai/smolvlm:500M-Q8_0` - **Instruction**: Enter what you want the model to analyze (default: "What do you see?") - Examples: "Describe the scene", "What objects can you see?", "What is the person doing?" - **Interval**: Choose how often to send requests to the model (default: 500ms) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index c713812b1..f8d6878f0 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -14,7 +14,6 @@ padding: 20px; background-color: #f0f0f0; } - .controls, .io-areas { display: flex; gap: 10px; @@ -22,14 +21,12 @@ background-color: #fff; padding: 15px; border-radius: 8px; - box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); + box-shadow: 0 2px 5px rgba(0,0,0,0.1); } - .io-areas { flex-direction: column; align-items: stretch; } - textarea { width: 300px; height: 80px; @@ -38,7 +35,6 @@ border-radius: 4px; font-size: 14px; } - #videoFeed { width: 480px; height: 360px; @@ -46,7 +42,6 @@ background-color: #000; border-radius: 8px; } - #startButton { padding: 10px 20px; font-size: 16px; @@ -55,25 +50,20 @@ border-radius: 4px; color: white; } - #startButton.start { background-color: #28a745; /* Green */ } - #startButton.stop { background-color: #dc3545; /* Red */ } - label { font-weight: bold; } - select { padding: 8px; border-radius: 4px; border: 1px solid #ccc; } - .hidden { display: none; } @@ -91,20 +81,13 @@

Camera Interaction App


-
-
- -


- +
@@ -124,7 +107,6 @@

Camera Interaction App

const video = document.getElementById('videoFeed'); const canvas = document.getElementById('canvas'); const baseURL = document.getElementById('baseURL'); - const modelSelect = document.getElementById('modelSelect'); const instructionText = document.getElementById('instructionText'); const responseText = document.getElementById('responseText'); const intervalSelect = document.getElementById('intervalSelect'); @@ -136,74 +118,15 @@

Camera Interaction App

let intervalId; let isProcessing = false; - // Fetch available models from the API - async function fetchModels() { - try { - const base = new URL(baseURL.value); - const response = await fetch(`${base.protocol}//${base.host}/models`); - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const data = await response.json(); - - // Clear the loading option - modelSelect.innerHTML = ''; - - if (data && data.length > 0) { - let totalTags = 0; - // Populate dropdown with available models using their tags - data.forEach(model => { - if (model.tags && model.tags.length > 0) { - model.tags.forEach(tag => { - const option = document.createElement('option'); - option.value = tag; - option.textContent = tag; - modelSelect.appendChild(option); - totalTags++; - }); - } - }); - - if (totalTags > 0) { - // Try to select smolvlm model by default, or use the first option - const options = Array.from(modelSelect.options); - const smolvlmOption = options.find(opt => opt.value.toLowerCase().includes('smolvlm')); - if (smolvlmOption) { - modelSelect.value = smolvlmOption.value; - } else { - modelSelect.value = options[0].value; - } - - responseText.value = `Found ${totalTags} model(s). Ready to start.`; - } else { - modelSelect.innerHTML = ''; - responseText.value = "No tagged models found. Please pull a model first."; - } - } else { - modelSelect.innerHTML = ''; - responseText.value = "No models found. Please pull a model first."; - } - } catch (error) { - console.error('Error fetching models:', error); - modelSelect.innerHTML = ''; - responseText.value = `Could not fetch models: ${error.message}. Using fallback model.`; - } - } - // Returns response text (string) async function sendChatCompletionRequest(instruction, imageBase64URL) { - const selectedModel = modelSelect.value; - if (!selectedModel) { - return "Error: No model selected"; - } - const response = await fetch(`${baseURL.value}/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ - model: selectedModel, + model: 'ai/smolvlm:500M-Q8_0', max_tokens: 100, messages: [ { role: 'user', content: [ @@ -216,31 +139,19 @@

Camera Interaction App

}) }); if (!response.ok) { - const errorText = await response.text(); - try { - const errorData = JSON.parse(errorText); - // Check if error message indicates no multimodal support - if (errorData.error && errorData.error.message && - errorData.error.message.includes('image input is not supported')) { - return "Error: The selected model does not support image input. Please select a vision model (e.g., SmolVLM)."; - } - return `Server error: ${response.status} - ${errorData.error?.message || errorText}`; - } catch (e) { - // If JSON parse fails, use the raw text - return `Server error: ${response.status} - ${errorText}`; - } + const errorData = await response.text(); + return `Server error: ${response.status} - ${errorData}`; } const data = await response.json(); return data.choices[0].message.content; } - // 1. Ask for camera permission and fetch models on load + // 1. Ask for camera permission on load async function initCamera() { try { stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false }); video.srcObject = stream; - responseText.value = "Camera access granted. Loading models..."; - await fetchModels(); + responseText.value = "Camera access granted. Ready to start."; } catch (err) { console.error("Error accessing camera:", err); responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`; @@ -300,7 +211,6 @@

Camera Interaction App

instructionText.disabled = true; intervalSelect.disabled = true; - modelSelect.disabled = true; responseText.value = "Processing started..."; @@ -325,7 +235,6 @@

Camera Interaction App

instructionText.disabled = false; intervalSelect.disabled = false; - modelSelect.disabled = false; if (responseText.value.startsWith("Processing started...")) { responseText.value = "Processing stopped."; } @@ -354,4 +263,4 @@

Camera Interaction App

- + \ No newline at end of file From 3f1bc79adc9b0ae6d0f5424fac07589435003097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Wed, 8 Oct 2025 20:25:22 +0200 Subject: [PATCH 09/13] feat: prevent overlapping requests by managing response state --- demos/multimodal/demo.html | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index f8d6878f0..fb28f9714 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -117,9 +117,11 @@

Camera Interaction App

let stream; let intervalId; let isProcessing = false; + let isWaitingForResponse = false; // Returns response text (string) async function sendChatCompletionRequest(instruction, imageBase64URL) { + isWaitingForResponse = true; const response = await fetch(`${baseURL.value}/v1/chat/completions`, { method: 'POST', headers: { @@ -138,6 +140,7 @@

Camera Interaction App

] }) }); + isWaitingForResponse = false; if (!response.ok) { const errorData = await response.text(); return `Server error: ${response.status} - ${errorData}`; @@ -173,6 +176,7 @@

Camera Interaction App

async function sendData() { if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval + if (isWaitingForResponse) return; // Skip if waiting for previous response const instruction = instructionText.value; const imageBase64URL = captureImage(); From a623618e7152c20ec3b126b0a3a8e72c3028f90c Mon Sep 17 00:00:00 2001 From: Ignasi Date: Wed, 8 Oct 2025 20:27:20 +0200 Subject: [PATCH 10/13] Update demos/multimodal/demo.html Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- demos/multimodal/demo.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index fb28f9714..24dec3ea7 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -79,7 +79,7 @@

Camera Interaction App


- +

From 87c8274dae925f79fd08be22e066be72460d317a Mon Sep 17 00:00:00 2001 From: Ignasi Date: Wed, 8 Oct 2025 20:28:06 +0200 Subject: [PATCH 11/13] Update demos/multimodal/demo.html Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- demos/multimodal/demo.html | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 24dec3ea7..1f88d41cb 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -188,13 +188,9 @@

Camera Interaction App

return; } - const payload = { - instruction: instruction, - imageBase64URL: imageBase64URL - }; try { - const response = await sendChatCompletionRequest(payload.instruction, payload.imageBase64URL); + const response = await sendChatCompletionRequest(instruction, imageBase64URL); responseText.value = response; } catch (error) { console.error('Error sending data:', error); From e77747808e71037c4f1af9788e281d763ed48424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Thu, 9 Oct 2025 14:39:31 +0200 Subject: [PATCH 12/13] feat(demo): enhance model selection with warnings and info messages --- demos/multimodal/demo.html | 123 +++++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 5 deletions(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 1f88d41cb..234f8d281 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -67,6 +67,40 @@ .hidden { display: none; } + #modelWarning { + background-color: #fff3cd; + color: #856404; + border: 1px solid #ffeaa7; + border-radius: 4px; + padding: 10px; + margin-top: 5px; + font-size: 14px; + width: 100%; + box-sizing: border-box; + display: none; + } + #modelWarning.show { + display: block; + } + #modelWarning a { + color: #856404; + text-decoration: underline; + } + #modelInfo { + background-color: #d1ecf1; + color: #0c5460; + border: 1px solid #bee5eb; + border-radius: 4px; + padding: 10px; + margin-top: 5px; + font-size: 14px; + width: 100%; + box-sizing: border-box; + } + #modelInfo a { + color: #0c5460; + text-decoration: underline; + } @@ -79,7 +113,15 @@

Camera Interaction App


- + +
+
+
+ +
+

@@ -107,6 +149,9 @@

Camera Interaction App

const video = document.getElementById('videoFeed'); const canvas = document.getElementById('canvas'); const baseURL = document.getElementById('baseURL'); + const modelSelect = document.getElementById('modelSelect'); + const modelWarning = document.getElementById('modelWarning'); + const modelInfo = document.getElementById('modelInfo'); const instructionText = document.getElementById('instructionText'); const responseText = document.getElementById('responseText'); const intervalSelect = document.getElementById('intervalSelect'); @@ -119,6 +164,62 @@

Camera Interaction App

let isProcessing = false; let isWaitingForResponse = false; + const RECOMMENDED_MODEL = 'ai/smolvlm:500M-Q8_0'; + + // Fetch available models from the API + async function fetchModels() { + // Base info message - always shown in modelInfo + modelInfo.innerHTML = `ℹ️ To pull a model, run: docker model pull <model-name>
Find more models at: https://hub.docker.com/r/ai`; + + try { + const response = await fetch(`${baseURL.value}/v1/models`); + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + const data = await response.json(); + const models = data.data.map(model => model.id); + + // Clear and populate the model selector + modelSelect.innerHTML = ''; + + if (models.length === 0) { + modelSelect.innerHTML = ''; + modelWarning.innerHTML = `⚠️ No models found. Please ensure the model runner is active and models are loaded.`; + modelWarning.classList.add('show'); + return; + } + + // Add all models to the selector + models.forEach(modelId => { + const option = document.createElement('option'); + option.value = modelId; + option.textContent = modelId; + modelSelect.appendChild(option); + }); + + // Check if the recommended model exists + const recommendedModelExists = models.includes(RECOMMENDED_MODEL); + + if (recommendedModelExists) { + modelSelect.value = RECOMMENDED_MODEL; + // Hide warning - everything is good + modelWarning.classList.remove('show'); + } else { + // Select the first model + modelSelect.value = models[0]; + // Show suggestion about recommended model + modelWarning.innerHTML = `💡 We recommend to run this demo with ${RECOMMENDED_MODEL}`; + modelWarning.classList.add('show'); + } + + } catch (error) { + console.error('Error fetching models:', error); + modelSelect.innerHTML = ''; + modelWarning.innerHTML = `⚠️ Error loading models: ${error.message}
Please check that the API is accessible at ${baseURL.value}/v1/models`; + modelWarning.classList.add('show'); + } + } + // Returns response text (string) async function sendChatCompletionRequest(instruction, imageBase64URL) { isWaitingForResponse = true; @@ -128,7 +229,7 @@

Camera Interaction App

'Content-Type': 'application/json' }, body: JSON.stringify({ - model: 'ai/smolvlm:500M-Q8_0', + model: modelSelect.value, max_tokens: 100, messages: [ { role: 'user', content: [ @@ -143,6 +244,15 @@

Camera Interaction App

isWaitingForResponse = false; if (!response.ok) { const errorData = await response.text(); + try { + const errorJson = JSON.parse(errorData); + if (errorJson.error && errorJson.error.message && + errorJson.error.message.includes('image input is not supported')) { + return `⚠️ This model doesn't support vision. Please select a vision-capable model like '${RECOMMENDED_MODEL}'.`; + } + } catch (e) { + // If parsing fails, fall through to generic error + } return `Server error: ${response.status} - ${errorData}`; } const data = await response.json(); @@ -248,8 +358,11 @@

Camera Interaction App

} }); - // Initialize camera when the page loads - window.addEventListener('DOMContentLoaded', initCamera); + // Initialize camera and fetch models when the page loads + window.addEventListener('DOMContentLoaded', () => { + initCamera(); + fetchModels(); + }); // Optional: Stop stream when page is closed/navigated away to release camera window.addEventListener('beforeunload', () => { @@ -263,4 +376,4 @@

Camera Interaction App

- \ No newline at end of file + From 4b33c578eafbbc9c11868ea875150c98c1224dde Mon Sep 17 00:00:00 2001 From: Ignasi Date: Thu, 9 Oct 2025 15:33:52 +0200 Subject: [PATCH 13/13] Update demos/multimodal/demo.html Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com> --- demos/multimodal/demo.html | 2 -- 1 file changed, 2 deletions(-) diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html index 234f8d281..3e5b75e20 100644 --- a/demos/multimodal/demo.html +++ b/demos/multimodal/demo.html @@ -293,8 +293,6 @@

Camera Interaction App

if (!imageBase64URL) { responseText.value = "Failed to capture image. Stream might not be active."; - // Optionally stop processing if image capture fails consistently - // handleStop(); return; }