From fecff1ebb525c6ab0b0cf015a086cf97ba51e065 Mon Sep 17 00:00:00 2001
From: Eric Curtin <eric.curtin@docker.com>
Date: Tue, 7 Oct 2025 21:55:49 +0100
Subject: [PATCH 01/13] html file to demo multimodal

Demos llama.cpp multimodal support with a webcam.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
---
 assets/camera-demo.html | 265 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 assets/camera-demo.html
diff --git a/assets/camera-demo.html b/assets/camera-demo.html
new file mode 100644
index 000000000..072de8d56
--- /dev/null
+++ b/assets/camera-demo.html
@@ -0,0 +1,265 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Camera Interaction App</title>
+    <style>
+	body {
+	    font-family: sans-serif;
+	    display: flex;
+	    flex-direction: column;
+	    align-items: center;
+	    gap: 20px;
+	    padding: 20px;
+	    background-color: #f0f0f0;
+	}
+	.controls, .io-areas {
+	    display: flex;
+	    gap: 10px;
+	    align-items: center;
+	    background-color: #fff;
+	    padding: 15px;
+	    border-radius: 8px;
+	    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
+	}
+	.io-areas {
+	    flex-direction: column;
+	    align-items: stretch;
+	}
+	textarea {
+	    width: 300px;
+	    height: 80px;
+	    padding: 8px;
+	    border: 1px solid #ccc;
+	    border-radius: 4px;
+	    font-size: 14px;
+	}
+	#videoFeed {
+	    width: 480px;
+	    height: 360px;
+	    border: 2px solid #333;
+	    background-color: #000;
+	    border-radius: 8px;
+	}
+	#startButton {
+	    padding: 10px 20px;
+	    font-size: 16px;
+	    cursor: pointer;
+	    border: none;
+	    border-radius: 4px;
+	    color: white;
+	}
+	#startButton.start {
+	    background-color: #28a745; /* Green */
+	}
+	#startButton.stop {
+	    background-color: #dc3545; /* Red */
+	}
+	label {
+	    font-weight: bold;
+	}
+	select {
+	    padding: 8px;
+	    border-radius: 4px;
+	    border: 1px solid #ccc;
+	}
+	.hidden {
+	    display: none;
+	}
+    </style>
+</head>
+<body>
+
+    <h1>Camera Interaction App</h1>
+
+    <video id="videoFeed" autoplay playsinline></video>
+    <canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->
+
+    <div class="io-areas">
+	<div>
+	    <label for="baseURL">Base API:</label><br>
+	    <input id="baseURL" name="Instruction" value="http://127.0.0.1:12434/engines/llama.cpp">
+	</div>
+	<div>
+	    <label for="instructionText">Instruction:</label><br>
+	    <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
+	</div>
+	<div>
+	    <label for="responseText">Response:</label><br>
+	    <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
+	</div>
+    </div>
+
+    <div class="controls">
+	<label for="intervalSelect">Interval between 2 requests:</label>
+	<select id="intervalSelect" name="Interval between 2 requests">
+	    <option value="100">100ms</option>
+	    <option value="250">250ms</option>
+	    <option value="500" selected>500ms</option>
+	    <option value="1000">1s</option>
+	    <option value="2000">2s</option>
+	</select>
+	<button id="startButton" class="start">Start</button>
+    </div>
+
+    <script>
+	const video = document.getElementById('videoFeed');
+	const canvas = document.getElementById('canvas');
+	const baseURL = document.getElementById('baseURL');
+	const instructionText = document.getElementById('instructionText');
+	const responseText = document.getElementById('responseText');
+	const intervalSelect = document.getElementById('intervalSelect');
+	const startButton = document.getElementById('startButton');
+
+	instructionText.value = "What do you see?"; // default instruction
+
+	let stream;
+	let intervalId;
+	let isProcessing = false;
+
+	// Returns response text (string)
+	async function sendChatCompletionRequest(instruction, imageBase64URL) {
+	    const response = await fetch(`${baseURL.value}/v1/chat/completions`, {
+		method: 'POST',
+		headers: {
+		    'Content-Type': 'application/json'
+		},
+		body: JSON.stringify({
+		    max_tokens: 100,
+		    messages: [
+			{ role: 'user', content: [
+			    { type: 'text', text: instruction },
+			    { type: 'image_url', image_url: {
+				url: imageBase64URL,
+			    } }
+			] },
+		    ]
+		})
+	    });
+	    if (!response.ok) {
+		const errorData = await response.text();
+		return `Server error: ${response.status} - ${errorData}`;
+	    }
+	    const data = await response.json();
+	    return data.choices[0].message.content;
+	}
+
+	// 1. Ask for camera permission on load
+	async function initCamera() {
+	    try {
+		stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
+		video.srcObject = stream;
+		responseText.value = "Camera access granted. Ready to start.";
+	    } catch (err) {
+		console.error("Error accessing camera:", err);
+		responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
+		alert(`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
+	    }
+	}
+
+	function captureImage() {
+	    if (!stream || !video.videoWidth) {
+		console.warn("Video stream not ready for capture.");
+		return null;
+	    }
+	    canvas.width = video.videoWidth;
+	    canvas.height = video.videoHeight;
+	    const context = canvas.getContext('2d');
+	    context.drawImage(video, 0, 0, canvas.width, canvas.height);
+	    return canvas.toDataURL('image/jpeg', 0.8); // Use JPEG for smaller size, 0.8 quality
+	}
+
+	async function sendData() {
+	    if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
+
+	    const instruction = instructionText.value;
+	    const imageBase64URL = captureImage();
+
+	    if (!imageBase64URL) {
+		responseText.value = "Failed to capture image. Stream might not be active.";
+		// Optionally stop processing if image capture fails consistently
+		// handleStop();
+		return;
+	    }
+
+	    const payload = {
+		instruction: instruction,
+		imageBase64URL: imageBase64URL
+	    };
+
+	    try {
+		const response = await sendChatCompletionRequest(payload.instruction, payload.imageBase64URL);
+		responseText.value = response;
+	    } catch (error) {
+		console.error('Error sending data:', error);
+		responseText.value = `Error: ${error.message}`;
+	    }
+	}
+
+	function handleStart() {
+	    if (!stream) {
+		responseText.value = "Camera not available. Cannot start.";
+		alert("Camera not available. Please grant permission first.");
+		return;
+	    }
+	    isProcessing = true;
+	    startButton.textContent = "Stop";
+	    startButton.classList.remove('start');
+	    startButton.classList.add('stop');
+
+	    instructionText.disabled = true;
+	    intervalSelect.disabled = true;
+
+	    responseText.value = "Processing started...";
+
+	    const intervalMs = parseInt(intervalSelect.value, 10);
+
+	    // Initial immediate call
+	    sendData();
+
+	    // Then set interval
+	    intervalId = setInterval(sendData, intervalMs);
+	}
+
+	function handleStop() {
+	    isProcessing = false;
+	    if (intervalId) {
+		clearInterval(intervalId);
+		intervalId = null;
+	    }
+	    startButton.textContent = "Start";
+	    startButton.classList.remove('stop');
+	    startButton.classList.add('start');
+
+	    instructionText.disabled = false;
+	    intervalSelect.disabled = false;
+	    if (responseText.value.startsWith("Processing started...")) {
+		responseText.value = "Processing stopped.";
+	    }
+	}
+
+	startButton.addEventListener('click', () => {
+	    if (isProcessing) {
+		handleStop();
+	    } else {
+		handleStart();
+	    }
+	});
+
+	// Initialize camera when the page loads
+	window.addEventListener('DOMContentLoaded', initCamera);
+
+	// Optional: Stop stream when page is closed/navigated away to release camera
+	window.addEventListener('beforeunload', () => {
+	    if (stream) {
+		stream.getTracks().forEach(track => track.stop());
+	    }
+	    if (intervalId) {
+		clearInterval(intervalId);
+	    }
+	});
+
+    </script>
+</body>
+</html>

From 940282c2fa4e298345a9ac514263b74daf738972 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 16:16:34 +0200
Subject: [PATCH 02/13] chore: rename camera-demo.html for improved
 accessibility

---
 {assets => demos/multimodal}/camera-demo.html | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {assets => demos/multimodal}/camera-demo.html (100%)

diff --git a/assets/camera-demo.html b/demos/multimodal/camera-demo.html
similarity index 100%
rename from assets/camera-demo.html
rename to demos/multimodal/camera-demo.html

From 5a0263dab83dfb036d0fc6fe7cc17ca8805640d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 16:47:33 +0200
Subject: [PATCH 03/13] add readme and include model in the request

---
 demos/multimodal/README.md        |  91 ++++++++++
 demos/multimodal/camera-demo.html | 265 ----------------------------
 demos/multimodal/demo.html        | 281 ++++++++++++++++++++++++++++++
 3 files changed, 372 insertions(+), 265 deletions(-)
 create mode 100644 demos/multimodal/README.md
 delete mode 100644 demos/multimodal/camera-demo.html
 create mode 100644 demos/multimodal/demo.html

diff --git a/demos/multimodal/README.md b/demos/multimodal/README.md
new file mode 100644
index 000000000..eda968261
--- /dev/null
+++ b/demos/multimodal/README.md
@@ -0,0 +1,91 @@
+# Real-time Webcam Vision Model Demo
+
+This demo allows you to interact with a vision model in real-time using your webcam. The model can analyze the video feed and answer questions about what it sees.
+
+## Credits
+
+This demo is based on the excellent work by [ngxson/smolvlm-realtime-webcam](https://github.com/ngxson/smolvlm-realtime-webcam). Thank you for creating this impressive demonstration!
+
+## Prerequisites
+
+Before running this demo, you need:
+
+1. **Docker Model Runner** - Either through Docker Desktop or standalone installation
+2. **The SmolVLM model** - Specifically `ai/smolvlm:500M-Q8_0`
+
+## Setup Instructions
+
+You have two options for setting up Docker Model Runner:
+
+### Option A: Using Docker Desktop (Easiest)
+
+This is the recommended approach for most users.
+
+1. **Enable Docker Model Runner**
+   - Open Docker Desktop settings
+   - Go to the **AI** tab
+   - Select **Enable Docker Model Runner**
+
+2. **Enable TCP Support and CORS**
+   - In the same settings page, select **Enable host-side TCP support**
+   - Set the **Port** to `12434` (default)
+   - In **CORS Allows Origins**, add `*` or the specific origin where you'll open the HTML file
+   
+   For detailed instructions, see the [Docker Model Runner documentation](https://docs.docker.com/ai/model-runner/get-started/#enable-docker-model-runner).
+
+3. **Pull the Model**
+   - Open Docker Desktop
+   - Go to the **Models** tab → **Docker Hub**
+   - Search for `ai/smolvlm:500M-Q8_0` and click **Pull**
+   
+   Or use the CLI:
+   ```bash
+   docker model pull ai/smolvlm:500M-Q8_0
+   ```
+
+### Option B: Using Standalone Docker Model Runner
+
+If you prefer not to use Docker Desktop, you can run Docker Model Runner directly:
+
+1. **Install Docker Model Runner**
+   
+   Follow the installation instructions in the [main README](../../README.md) for your platform.
+
+2. **Pull the Model**
+   ```bash
+   docker model pull ai/smolvlm:500M-Q8_0
+   ```
+
+> **Note:** TCP support is enabled by default on port `12434` when using Docker Engine.
+
+## Running the Demo
+
+1. **Open the Demo**
+   - Simply open `demo.html` in your web browser
+   - You can open it directly from your file system or serve it with a local web server
+
+2. **Grant Camera Permission**
+   - Your browser will ask for camera access
+   - Click "Allow" to grant permission
+
+3. **Configure the Demo**
+   - **Base API**: By default set to `http://localhost:12434/engines/llama.cpp`
+     - Change the port if you configured Docker Model Runner on a different port
+   - **Instruction**: Enter what you want the model to analyze (default: "What do you see?")
+     - Examples: "Describe the scene", "What objects can you see?", "What is the person doing?"
+   - **Interval**: Choose how often to send requests to the model (default: 500ms)
+     - Shorter intervals = more responsive but higher resource usage
+     - Longer intervals = lower resource usage but less real-time feel
+
+4. **Start the Interaction**
+   - Click the **Start** button
+   - The model will begin analyzing your webcam feed
+   - Responses will appear in the **Response** text area
+   - Click **Stop** when you're done
+
+## Learn More
+
+- [Community Slack Channel](https://app.slack.com/client/T0JK1PCN6/C09H9P5E57B)
+- [Docker Model Runner Documentation](https://docs.docker.com/ai/model-runner/)
+- [Original Demo by ngxson](https://github.com/ngxson/smolvlm-realtime-webcam)
+- [SmolVLM Model Information](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct)
diff --git a/demos/multimodal/camera-demo.html b/demos/multimodal/camera-demo.html
deleted file mode 100644
index 072de8d56..000000000
--- a/demos/multimodal/camera-demo.html
+++ /dev/null
@@ -1,265 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Camera Interaction App</title>
-    <style>
-	body {
-	    font-family: sans-serif;
-	    display: flex;
-	    flex-direction: column;
-	    align-items: center;
-	    gap: 20px;
-	    padding: 20px;
-	    background-color: #f0f0f0;
-	}
-	.controls, .io-areas {
-	    display: flex;
-	    gap: 10px;
-	    align-items: center;
-	    background-color: #fff;
-	    padding: 15px;
-	    border-radius: 8px;
-	    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-	}
-	.io-areas {
-	    flex-direction: column;
-	    align-items: stretch;
-	}
-	textarea {
-	    width: 300px;
-	    height: 80px;
-	    padding: 8px;
-	    border: 1px solid #ccc;
-	    border-radius: 4px;
-	    font-size: 14px;
-	}
-	#videoFeed {
-	    width: 480px;
-	    height: 360px;
-	    border: 2px solid #333;
-	    background-color: #000;
-	    border-radius: 8px;
-	}
-	#startButton {
-	    padding: 10px 20px;
-	    font-size: 16px;
-	    cursor: pointer;
-	    border: none;
-	    border-radius: 4px;
-	    color: white;
-	}
-	#startButton.start {
-	    background-color: #28a745; /* Green */
-	}
-	#startButton.stop {
-	    background-color: #dc3545; /* Red */
-	}
-	label {
-	    font-weight: bold;
-	}
-	select {
-	    padding: 8px;
-	    border-radius: 4px;
-	    border: 1px solid #ccc;
-	}
-	.hidden {
-	    display: none;
-	}
-    </style>
-</head>
-<body>
-
-    <h1>Camera Interaction App</h1>
-
-    <video id="videoFeed" autoplay playsinline></video>
-    <canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->
-
-    <div class="io-areas">
-	<div>
-	    <label for="baseURL">Base API:</label><br>
-	    <input id="baseURL" name="Instruction" value="http://127.0.0.1:12434/engines/llama.cpp">
-	</div>
-	<div>
-	    <label for="instructionText">Instruction:</label><br>
-	    <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
-	</div>
-	<div>
-	    <label for="responseText">Response:</label><br>
-	    <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
-	</div>
-    </div>
-
-    <div class="controls">
-	<label for="intervalSelect">Interval between 2 requests:</label>
-	<select id="intervalSelect" name="Interval between 2 requests">
-	    <option value="100">100ms</option>
-	    <option value="250">250ms</option>
-	    <option value="500" selected>500ms</option>
-	    <option value="1000">1s</option>
-	    <option value="2000">2s</option>
-	</select>
-	<button id="startButton" class="start">Start</button>
-    </div>
-
-    <script>
-	const video = document.getElementById('videoFeed');
-	const canvas = document.getElementById('canvas');
-	const baseURL = document.getElementById('baseURL');
-	const instructionText = document.getElementById('instructionText');
-	const responseText = document.getElementById('responseText');
-	const intervalSelect = document.getElementById('intervalSelect');
-	const startButton = document.getElementById('startButton');
-
-	instructionText.value = "What do you see?"; // default instruction
-
-	let stream;
-	let intervalId;
-	let isProcessing = false;
-
-	// Returns response text (string)
-	async function sendChatCompletionRequest(instruction, imageBase64URL) {
-	    const response = await fetch(`${baseURL.value}/v1/chat/completions`, {
-		method: 'POST',
-		headers: {
-		    'Content-Type': 'application/json'
-		},
-		body: JSON.stringify({
-		    max_tokens: 100,
-		    messages: [
-			{ role: 'user', content: [
-			    { type: 'text', text: instruction },
-			    { type: 'image_url', image_url: {
-				url: imageBase64URL,
-			    } }
-			] },
-		    ]
-		})
-	    });
-	    if (!response.ok) {
-		const errorData = await response.text();
-		return `Server error: ${response.status} - ${errorData}`;
-	    }
-	    const data = await response.json();
-	    return data.choices[0].message.content;
-	}
-
-	// 1. Ask for camera permission on load
-	async function initCamera() {
-	    try {
-		stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
-		video.srcObject = stream;
-		responseText.value = "Camera access granted. Ready to start.";
-	    } catch (err) {
-		console.error("Error accessing camera:", err);
-		responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
-		alert(`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
-	    }
-	}
-
-	function captureImage() {
-	    if (!stream || !video.videoWidth) {
-		console.warn("Video stream not ready for capture.");
-		return null;
-	    }
-	    canvas.width = video.videoWidth;
-	    canvas.height = video.videoHeight;
-	    const context = canvas.getContext('2d');
-	    context.drawImage(video, 0, 0, canvas.width, canvas.height);
-	    return canvas.toDataURL('image/jpeg', 0.8); // Use JPEG for smaller size, 0.8 quality
-	}
-
-	async function sendData() {
-	    if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
-
-	    const instruction = instructionText.value;
-	    const imageBase64URL = captureImage();
-
-	    if (!imageBase64URL) {
-		responseText.value = "Failed to capture image. Stream might not be active.";
-		// Optionally stop processing if image capture fails consistently
-		// handleStop();
-		return;
-	    }
-
-	    const payload = {
-		instruction: instruction,
-		imageBase64URL: imageBase64URL
-	    };
-
-	    try {
-		const response = await sendChatCompletionRequest(payload.instruction, payload.imageBase64URL);
-		responseText.value = response;
-	    } catch (error) {
-		console.error('Error sending data:', error);
-		responseText.value = `Error: ${error.message}`;
-	    }
-	}
-
-	function handleStart() {
-	    if (!stream) {
-		responseText.value = "Camera not available. Cannot start.";
-		alert("Camera not available. Please grant permission first.");
-		return;
-	    }
-	    isProcessing = true;
-	    startButton.textContent = "Stop";
-	    startButton.classList.remove('start');
-	    startButton.classList.add('stop');
-
-	    instructionText.disabled = true;
-	    intervalSelect.disabled = true;
-
-	    responseText.value = "Processing started...";
-
-	    const intervalMs = parseInt(intervalSelect.value, 10);
-
-	    // Initial immediate call
-	    sendData();
-
-	    // Then set interval
-	    intervalId = setInterval(sendData, intervalMs);
-	}
-
-	function handleStop() {
-	    isProcessing = false;
-	    if (intervalId) {
-		clearInterval(intervalId);
-		intervalId = null;
-	    }
-	    startButton.textContent = "Start";
-	    startButton.classList.remove('stop');
-	    startButton.classList.add('start');
-
-	    instructionText.disabled = false;
-	    intervalSelect.disabled = false;
-	    if (responseText.value.startsWith("Processing started...")) {
-		responseText.value = "Processing stopped.";
-	    }
-	}
-
-	startButton.addEventListener('click', () => {
-	    if (isProcessing) {
-		handleStop();
-	    } else {
-		handleStart();
-	    }
-	});
-
-	// Initialize camera when the page loads
-	window.addEventListener('DOMContentLoaded', initCamera);
-
-	// Optional: Stop stream when page is closed/navigated away to release camera
-	window.addEventListener('beforeunload', () => {
-	    if (stream) {
-		stream.getTracks().forEach(track => track.stop());
-	    }
-	    if (intervalId) {
-		clearInterval(intervalId);
-	    }
-	});
-
-    </script>
-</body>
-</html>
diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
new file mode 100644
index 000000000..02603b0ca
--- /dev/null
+++ b/demos/multimodal/demo.html
@@ -0,0 +1,281 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Camera Interaction App</title>
+    <style>
+        body {
+            font-family: sans-serif;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            gap: 20px;
+            padding: 20px;
+            background-color: #f0f0f0;
+        }
+
+        .controls, .io-areas {
+            display: flex;
+            gap: 10px;
+            align-items: center;
+            background-color: #fff;
+            padding: 15px;
+            border-radius: 8px;
+            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
+        }
+
+        .io-areas {
+            flex-direction: column;
+            align-items: stretch;
+        }
+
+        textarea {
+            width: 300px;
+            height: 80px;
+            padding: 8px;
+            border: 1px solid #ccc;
+            border-radius: 4px;
+            font-size: 14px;
+        }
+
+        #videoFeed {
+            width: 480px;
+            height: 360px;
+            border: 2px solid #333;
+            background-color: #000;
+            border-radius: 8px;
+        }
+
+        #startButton {
+            padding: 10px 20px;
+            font-size: 16px;
+            cursor: pointer;
+            border: none;
+            border-radius: 4px;
+            color: white;
+        }
+
+        #startButton.start {
+            background-color: #28a745; /* Green */
+        }
+
+        #startButton.stop {
+            background-color: #dc3545; /* Red */
+        }
+
+        label {
+            font-weight: bold;
+        }
+
+        select {
+            padding: 8px;
+            border-radius: 4px;
+            border: 1px solid #ccc;
+        }
+
+        .hidden {
+            display: none;
+        }
+    </style>
+</head>
+<body>
+
+<h1>Camera Interaction App</h1>
+
+<video id="videoFeed" autoplay playsinline></video>
+<canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->
+
+<div class="io-areas">
+    <div>
+        <label for="baseURL">Base API:</label><br>
+        <input id="baseURL" name="Instruction" value="http://localhost:12434/engines/llama.cpp">
+    </div>
+    <div>
+        <label for="instructionText">Instruction:</label><br>
+        <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
+    </div>
+    <div>
+        <label for="responseText">Response:</label><br>
+        <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly
+                  placeholder="Server response will appear here..."></textarea>
+    </div>
+</div>
+
+<div class="controls">
+    <label for="intervalSelect">Interval between 2 requests:</label>
+    <select id="intervalSelect" name="Interval between 2 requests">
+        <option value="100">100ms</option>
+        <option value="250">250ms</option>
+        <option value="500" selected>500ms</option>
+        <option value="1000">1s</option>
+        <option value="2000">2s</option>
+    </select>
+    <button id="startButton" class="start">Start</button>
+</div>
+
+<script>
+    const video = document.getElementById('videoFeed');
+    const canvas = document.getElementById('canvas');
+    const baseURL = document.getElementById('baseURL');
+    const instructionText = document.getElementById('instructionText');
+    const responseText = document.getElementById('responseText');
+    const intervalSelect = document.getElementById('intervalSelect');
+    const startButton = document.getElementById('startButton');
+
+    instructionText.value = "What do you see?"; // default instruction
+
+    let stream;
+    let intervalId;
+    let isProcessing = false;
+
+    // Returns response text (string)
+    async function sendChatCompletionRequest(instruction, imageBase64URL) {
+        const response = await fetch(`${baseURL.value}/v1/chat/completions`, {
+            method: 'POST',
+            headers: {
+                'Content-Type': 'application/json'
+            },
+            body: JSON.stringify({
+                max_tokens: 100,
+                model: "ai/smolvlm:500M-Q8_0",
+                messages: [
+                    {
+                        role: 'user', content: [
+                            {type: 'text', text: instruction},
+                            {
+                                type: 'image_url', image_url: {
+                                    url: imageBase64URL,
+                                }
+                            }
+                        ]
+                    },
+                ]
+            })
+        });
+        if (!response.ok) {
+            const errorData = await response.text();
+            return `Server error: ${response.status} - ${errorData}`;
+        }
+        const data = await response.json();
+        return data.choices[0].message.content;
+    }
+
+    // 1. Ask for camera permission on load
+    async function initCamera() {
+        try {
+            stream = await navigator.mediaDevices.getUserMedia({video: true, audio: false});
+            video.srcObject = stream;
+            responseText.value = "Camera access granted. Ready to start.";
+        } catch (err) {
+            console.error("Error accessing camera:", err);
+            responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
+            alert(`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
+        }
+    }
+
+    function captureImage() {
+        if (!stream || !video.videoWidth) {
+            console.warn("Video stream not ready for capture.");
+            return null;
+        }
+        canvas.width = video.videoWidth;
+        canvas.height = video.videoHeight;
+        const context = canvas.getContext('2d');
+        context.drawImage(video, 0, 0, canvas.width, canvas.height);
+        return canvas.toDataURL('image/jpeg', 0.8); // Use JPEG for smaller size, 0.8 quality
+    }
+
+    async function sendData() {
+        if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
+
+        const instruction = instructionText.value;
+        const imageBase64URL = captureImage();
+
+        if (!imageBase64URL) {
+            responseText.value = "Failed to capture image. Stream might not be active.";
+            // Optionally stop processing if image capture fails consistently
+            // handleStop();
+            return;
+        }
+
+        const payload = {
+            instruction: instruction,
+            imageBase64URL: imageBase64URL
+        };
+
+        try {
+            const response = await sendChatCompletionRequest(payload.instruction, payload.imageBase64URL);
+            responseText.value = response;
+        } catch (error) {
+            console.error('Error sending data:', error);
+            responseText.value = `Error: ${error.message}`;
+        }
+    }
+
+    function handleStart() {
+        if (!stream) {
+            responseText.value = "Camera not available. Cannot start.";
+            alert("Camera not available. Please grant permission first.");
+            return;
+        }
+        isProcessing = true;
+        startButton.textContent = "Stop";
+        startButton.classList.remove('start');
+        startButton.classList.add('stop');
+
+        instructionText.disabled = true;
+        intervalSelect.disabled = true;
+
+        responseText.value = "Processing started...";
+
+        const intervalMs = parseInt(intervalSelect.value, 10);
+
+        // Initial immediate call
+        sendData();
+
+        // Then set interval
+        intervalId = setInterval(sendData, intervalMs);
+    }
+
+    function handleStop() {
+        isProcessing = false;
+        if (intervalId) {
+            clearInterval(intervalId);
+            intervalId = null;
+        }
+        startButton.textContent = "Start";
+        startButton.classList.remove('stop');
+        startButton.classList.add('start');
+
+        instructionText.disabled = false;
+        intervalSelect.disabled = false;
+        if (responseText.value.startsWith("Processing started...")) {
+            responseText.value = "Processing stopped.";
+        }
+    }
+
+    startButton.addEventListener('click', () => {
+        if (isProcessing) {
+            handleStop();
+        } else {
+            handleStart();
+        }
+    });
+
+    // Initialize camera when the page loads
+    window.addEventListener('DOMContentLoaded', initCamera);
+
+    // Optional: Stop stream when page is closed/navigated away to release camera
+    window.addEventListener('beforeunload', () => {
+        if (stream) {
+            stream.getTracks().forEach(track => track.stop());
+        }
+        if (intervalId) {
+            clearInterval(intervalId);
+        }
+    });
+
+</script>
+</body>
+</html>
\ No newline at end of file

From 94e8efff75f498e516c517e94e8e701762006c3c Mon Sep 17 00:00:00 2001
From: Ignasi <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 16:50:07 +0200
Subject: [PATCH 04/13] Update demos/multimodal/demo.html

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 demos/multimodal/demo.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 02603b0ca..2b80ad261 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -187,7 +187,7 @@ <h1>Camera Interaction App</h1>
     }
 
     async function sendData() {
-        if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
+        if (isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
 
         const instruction = instructionText.value;
         const imageBase64URL = captureImage();

From 0aea05beeb08ff488285b4e2432710d78d6f73f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 17:00:49 +0200
Subject: [PATCH 05/13] revert wrong fix

---
 demos/multimodal/demo.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 2b80ad261..02603b0ca 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -187,7 +187,7 @@ <h1>Camera Interaction App</h1>
     }
 
     async function sendData() {
-        if (isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
+        if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
 
         const instruction = instructionText.value;
         const imageBase64URL = captureImage();

From 4e98f994fc558f23ae2817094ced85ba2834f3e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 17:16:17 +0200
Subject: [PATCH 06/13] feat: add model selection and fetch functionality to
 demo

---
 demos/multimodal/README.md |   4 ++
 demos/multimodal/demo.html | 109 +++++++++++++++++++++++++++++++------
 2 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/demos/multimodal/README.md b/demos/multimodal/README.md
index eda968261..d42b6177a 100644
--- a/demos/multimodal/README.md
+++ b/demos/multimodal/README.md
@@ -71,6 +71,10 @@ If you prefer not to use Docker Desktop, you can run Docker Model Runner directl
 3. **Configure the Demo**
    - **Base API**: By default set to `http://localhost:12434/engines/llama.cpp`
      - Change the port if you configured Docker Model Runner on a different port
+   - **Model**: Select from available models pulled to your Docker Model Runner
+     - The demo automatically fetches and displays all available models
+     - SmolVLM models will be auto-selected if available
+     - If model fetching fails, it falls back to `ai/smolvlm:500M-Q8_0`
    - **Instruction**: Enter what you want the model to analyze (default: "What do you see?")
      - Examples: "Describe the scene", "What objects can you see?", "What is the person doing?"
    - **Interval**: Choose how often to send requests to the model (default: 500ms)
diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 02603b0ca..607cdd6af 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -89,7 +89,13 @@ <h1>Camera Interaction App</h1>
 <div class="io-areas">
     <div>
         <label for="baseURL">Base API:</label><br>
-        <input id="baseURL" name="Instruction" value="http://localhost:12434/engines/llama.cpp"></textarea>
+        <input id="baseURL" name="Instruction" value="http://127.0.0.1:12434/engines/llama.cpp">
+    </div>
+    <div>
+        <label for="modelSelect">Model:</label><br>
+        <select id="modelSelect" name="Model" style="width: 40em; padding: 8px;">
+            <option value="">Loading models...</option>
+        </select>
     </div>
     <div>
         <label for="instructionText">Instruction:</label><br>
@@ -118,6 +124,7 @@ <h1>Camera Interaction App</h1>
     const video = document.getElementById('videoFeed');
     const canvas = document.getElementById('canvas');
     const baseURL = document.getElementById('baseURL');
+    const modelSelect = document.getElementById('modelSelect');
     const instructionText = document.getElementById('instructionText');
     const responseText = document.getElementById('responseText');
     const intervalSelect = document.getElementById('intervalSelect');
@@ -129,44 +136,110 @@ <h1>Camera Interaction App</h1>
     let intervalId;
     let isProcessing = false;
 
+    // Fetch available models from the API
+    async function fetchModels() {
+        try {
+            const response = await fetch(`${baseURL.value}/v1/models`);
+            if (!response.ok) {
+                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+            }
+            const data = await response.json();
+            
+            // Clear the loading option
+            modelSelect.innerHTML = '';
+            
+            if (data && data.length > 0) {
+                let totalTags = 0;
+                // Populate dropdown with available models using their tags
+                data.forEach(model => {
+                    if (model.tags && model.tags.length > 0) {
+                        model.tags.forEach(tag => {
+                            const option = document.createElement('option');
+                            option.value = tag;
+                            option.textContent = tag;
+                            modelSelect.appendChild(option);
+                            totalTags++;
+                        });
+                    }
+                });
+                
+                if (totalTags > 0) {
+                    // Try to select smolvlm model by default, or use the first option
+                    const options = Array.from(modelSelect.options);
+                    const smolvlmOption = options.find(opt => opt.value.toLowerCase().includes('smolvlm'));
+                    if (smolvlmOption) {
+                        modelSelect.value = smolvlmOption.value;
+                    } else {
+                        modelSelect.value = options[0].value;
+                    }
+                    
+                    responseText.value = `Found ${totalTags} model(s). Ready to start.`;
+                } else {
+                    modelSelect.innerHTML = '<option value="">No tagged models available</option>';
+                    responseText.value = "No tagged models found. Please pull a model first.";
+                }
+            } else {
+                modelSelect.innerHTML = '<option value="">No models available</option>';
+                responseText.value = "No models found. Please pull a model first.";
+            }
+        } catch (error) {
+            console.error('Error fetching models:', error);
+            modelSelect.innerHTML = '<option value="ai/smolvlm:500M-Q8_0">ai/smolvlm:500M-Q8_0 (fallback)</option>';
+            responseText.value = `Could not fetch models: ${error.message}. Using fallback model.`;
+        }
+    }
+
     // Returns response text (string)
     async function sendChatCompletionRequest(instruction, imageBase64URL) {
+        const selectedModel = modelSelect.value;
+        if (!selectedModel) {
+            return "Error: No model selected";
+        }
+        
         const response = await fetch(`${baseURL.value}/v1/chat/completions`, {
             method: 'POST',
             headers: {
                 'Content-Type': 'application/json'
             },
             body: JSON.stringify({
+                model: selectedModel,
                 max_tokens: 100,
-                model: "ai/smolvlm:500M-Q8_0",
                 messages: [
-                    {
-                        role: 'user', content: [
-                            {type: 'text', text: instruction},
-                            {
-                                type: 'image_url', image_url: {
+                    { role: 'user', content: [
+                            { type: 'text', text: instruction },
+                            { type: 'image_url', image_url: {
                                     url: imageBase64URL,
-                                }
-                            }
-                        ]
-                    },
+                                } }
+                        ] },
                 ]
             })
         });
         if (!response.ok) {
-            const errorData = await response.text();
-            return `Server error: ${response.status} - ${errorData}`;
+            const errorText = await response.text();
+            try {
+                const errorData = JSON.parse(errorText);
+                // Check if error message indicates no multimodal support
+                if (errorData.error && errorData.error.message && 
+                    errorData.error.message.includes('image input is not supported')) {
+                    return "Error: The selected model does not support image input. Please select a vision model (e.g., SmolVLM).";
+                }
+                return `Server error: ${response.status} - ${errorData.error?.message || errorText}`;
+            } catch (e) {
+                // If JSON parse fails, use the raw text
+                return `Server error: ${response.status} - ${errorText}`;
+            }
         }
         const data = await response.json();
         return data.choices[0].message.content;
     }
 
-    // 1. Ask for camera permission on load
+    // 1. Ask for camera permission and fetch models on load
     async function initCamera() {
         try {
-            stream = await navigator.mediaDevices.getUserMedia({video: true, audio: false});
+            stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
             video.srcObject = stream;
-            responseText.value = "Camera access granted. Ready to start.";
+            responseText.value = "Camera access granted. Loading models...";
+            await fetchModels();
         } catch (err) {
             console.error("Error accessing camera:", err);
             responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
@@ -226,6 +299,7 @@ <h1>Camera Interaction App</h1>
 
         instructionText.disabled = true;
         intervalSelect.disabled = true;
+        modelSelect.disabled = true;
 
         responseText.value = "Processing started...";
 
@@ -250,6 +324,7 @@ <h1>Camera Interaction App</h1>
 
         instructionText.disabled = false;
         intervalSelect.disabled = false;
+        modelSelect.disabled = false;
         if (responseText.value.startsWith("Processing started...")) {
             responseText.value = "Processing stopped.";
         }
@@ -278,4 +353,4 @@ <h1>Camera Interaction App</h1>
 
 </script>
 </body>
-</html>
\ No newline at end of file
+</html>

From bd91ddff3badf1ce8f79c4421f851fb0dbd8cfe9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 18:02:12 +0200
Subject: [PATCH 07/13] use /models to list models

---
 demos/multimodal/demo.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 607cdd6af..c713812b1 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -139,7 +139,8 @@ <h1>Camera Interaction App</h1>
     // Fetch available models from the API
     async function fetchModels() {
         try {
-            const response = await fetch(`${baseURL.value}/v1/models`);
+            const base = new URL(baseURL.value);
+            const response = await fetch(`${base.protocol}//${base.host}/models`);
             if (!response.ok) {
                 throw new Error(`HTTP ${response.status}: ${response.statusText}`);
             }

From 1d8c7e8109a288461e47cc0cb7e304b55e18eb6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 20:16:55 +0200
Subject: [PATCH 08/13] Revert to a single model. We have only a few models
 that support multimodal input, so most models will throw an error if used
 here.

---
 demos/multimodal/README.md |   6 +--
 demos/multimodal/demo.html | 107 +++----------------------------------
 2 files changed, 9 insertions(+), 104 deletions(-)

diff --git a/demos/multimodal/README.md b/demos/multimodal/README.md
index d42b6177a..7dea071a7 100644
--- a/demos/multimodal/README.md
+++ b/demos/multimodal/README.md
@@ -69,12 +69,8 @@ If you prefer not to use Docker Desktop, you can run Docker Model Runner directl
    - Click "Allow" to grant permission
 
 3. **Configure the Demo**
-   - **Base API**: By default set to `http://localhost:12434/engines/llama.cpp`
+   - **Base API**: By default set to `http://127.0.0.1:12434/engines/llama.cpp`
      - Change the port if you configured Docker Model Runner on a different port
-   - **Model**: Select from available models pulled to your Docker Model Runner
-     - The demo automatically fetches and displays all available models
-     - SmolVLM models will be auto-selected if available
-     - If model fetching fails, it falls back to `ai/smolvlm:500M-Q8_0`
    - **Instruction**: Enter what you want the model to analyze (default: "What do you see?")
      - Examples: "Describe the scene", "What objects can you see?", "What is the person doing?"
    - **Interval**: Choose how often to send requests to the model (default: 500ms)
diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index c713812b1..f8d6878f0 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -14,7 +14,6 @@
             padding: 20px;
             background-color: #f0f0f0;
         }
-
         .controls, .io-areas {
             display: flex;
             gap: 10px;
@@ -22,14 +21,12 @@
             background-color: #fff;
             padding: 15px;
             border-radius: 8px;
-            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
+            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
         }
-
         .io-areas {
             flex-direction: column;
             align-items: stretch;
         }
-
         textarea {
             width: 300px;
             height: 80px;
@@ -38,7 +35,6 @@
             border-radius: 4px;
             font-size: 14px;
         }
-
         #videoFeed {
             width: 480px;
             height: 360px;
@@ -46,7 +42,6 @@
             background-color: #000;
             border-radius: 8px;
         }
-
         #startButton {
             padding: 10px 20px;
             font-size: 16px;
@@ -55,25 +50,20 @@
             border-radius: 4px;
             color: white;
         }
-
         #startButton.start {
             background-color: #28a745; /* Green */
         }
-
         #startButton.stop {
             background-color: #dc3545; /* Red */
         }
-
         label {
             font-weight: bold;
         }
-
         select {
             padding: 8px;
             border-radius: 4px;
             border: 1px solid #ccc;
         }
-
         .hidden {
             display: none;
         }
@@ -91,20 +81,13 @@ <h1>Camera Interaction App</h1>
         <label for="baseURL">Base API:</label><br>
         <input id="baseURL" name="Instruction" value="http://127.0.0.1:12434/engines/llama.cpp">
     </div>
-    <div>
-        <label for="modelSelect">Model:</label><br>
-        <select id="modelSelect" name="Model" style="width: 40em; padding: 8px;">
-            <option value="">Loading models...</option>
-        </select>
-    </div>
     <div>
         <label for="instructionText">Instruction:</label><br>
         <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
     </div>
     <div>
         <label for="responseText">Response:</label><br>
-        <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly
-                  placeholder="Server response will appear here..."></textarea>
+        <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
     </div>
 </div>
 
@@ -124,7 +107,6 @@ <h1>Camera Interaction App</h1>
     const video = document.getElementById('videoFeed');
     const canvas = document.getElementById('canvas');
     const baseURL = document.getElementById('baseURL');
-    const modelSelect = document.getElementById('modelSelect');
     const instructionText = document.getElementById('instructionText');
     const responseText = document.getElementById('responseText');
     const intervalSelect = document.getElementById('intervalSelect');
@@ -136,74 +118,15 @@ <h1>Camera Interaction App</h1>
     let intervalId;
     let isProcessing = false;
 
-    // Fetch available models from the API
-    async function fetchModels() {
-        try {
-            const base = new URL(baseURL.value);
-            const response = await fetch(`${base.protocol}//${base.host}/models`);
-            if (!response.ok) {
-                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-            }
-            const data = await response.json();
-            
-            // Clear the loading option
-            modelSelect.innerHTML = '';
-            
-            if (data && data.length > 0) {
-                let totalTags = 0;
-                // Populate dropdown with available models using their tags
-                data.forEach(model => {
-                    if (model.tags && model.tags.length > 0) {
-                        model.tags.forEach(tag => {
-                            const option = document.createElement('option');
-                            option.value = tag;
-                            option.textContent = tag;
-                            modelSelect.appendChild(option);
-                            totalTags++;
-                        });
-                    }
-                });
-                
-                if (totalTags > 0) {
-                    // Try to select smolvlm model by default, or use the first option
-                    const options = Array.from(modelSelect.options);
-                    const smolvlmOption = options.find(opt => opt.value.toLowerCase().includes('smolvlm'));
-                    if (smolvlmOption) {
-                        modelSelect.value = smolvlmOption.value;
-                    } else {
-                        modelSelect.value = options[0].value;
-                    }
-                    
-                    responseText.value = `Found ${totalTags} model(s). Ready to start.`;
-                } else {
-                    modelSelect.innerHTML = '<option value="">No tagged models available</option>';
-                    responseText.value = "No tagged models found. Please pull a model first.";
-                }
-            } else {
-                modelSelect.innerHTML = '<option value="">No models available</option>';
-                responseText.value = "No models found. Please pull a model first.";
-            }
-        } catch (error) {
-            console.error('Error fetching models:', error);
-            modelSelect.innerHTML = '<option value="ai/smolvlm:500M-Q8_0">ai/smolvlm:500M-Q8_0 (fallback)</option>';
-            responseText.value = `Could not fetch models: ${error.message}. Using fallback model.`;
-        }
-    }
-
     // Returns response text (string)
     async function sendChatCompletionRequest(instruction, imageBase64URL) {
-        const selectedModel = modelSelect.value;
-        if (!selectedModel) {
-            return "Error: No model selected";
-        }
-        
         const response = await fetch(`${baseURL.value}/v1/chat/completions`, {
             method: 'POST',
             headers: {
                 'Content-Type': 'application/json'
             },
             body: JSON.stringify({
-                model: selectedModel,
+                model: 'ai/smolvlm:500M-Q8_0',
                 max_tokens: 100,
                 messages: [
                     { role: 'user', content: [
@@ -216,31 +139,19 @@ <h1>Camera Interaction App</h1>
             })
         });
         if (!response.ok) {
-            const errorText = await response.text();
-            try {
-                const errorData = JSON.parse(errorText);
-                // Check if error message indicates no multimodal support
-                if (errorData.error && errorData.error.message && 
-                    errorData.error.message.includes('image input is not supported')) {
-                    return "Error: The selected model does not support image input. Please select a vision model (e.g., SmolVLM).";
-                }
-                return `Server error: ${response.status} - ${errorData.error?.message || errorText}`;
-            } catch (e) {
-                // If JSON parse fails, use the raw text
-                return `Server error: ${response.status} - ${errorText}`;
-            }
+            const errorData = await response.text();
+            return `Server error: ${response.status} - ${errorData}`;
         }
         const data = await response.json();
         return data.choices[0].message.content;
     }
 
-    // 1. Ask for camera permission and fetch models on load
+    // 1. Ask for camera permission on load
     async function initCamera() {
         try {
             stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
             video.srcObject = stream;
-            responseText.value = "Camera access granted. Loading models...";
-            await fetchModels();
+            responseText.value = "Camera access granted. Ready to start.";
         } catch (err) {
             console.error("Error accessing camera:", err);
             responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
@@ -300,7 +211,6 @@ <h1>Camera Interaction App</h1>
 
         instructionText.disabled = true;
         intervalSelect.disabled = true;
-        modelSelect.disabled = true;
 
         responseText.value = "Processing started...";
 
@@ -325,7 +235,6 @@ <h1>Camera Interaction App</h1>
 
         instructionText.disabled = false;
         intervalSelect.disabled = false;
-        modelSelect.disabled = false;
         if (responseText.value.startsWith("Processing started...")) {
             responseText.value = "Processing stopped.";
         }
@@ -354,4 +263,4 @@ <h1>Camera Interaction App</h1>
 
 </script>
 </body>
-</html>
+</html>
\ No newline at end of file

From 3f1bc79adc9b0ae6d0f5424fac07589435003097 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 20:25:22 +0200
Subject: [PATCH 09/13] feat: prevent overlapping requests by managing response
 state

---
 demos/multimodal/demo.html | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index f8d6878f0..fb28f9714 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -117,9 +117,11 @@ <h1>Camera Interaction App</h1>
     let stream;
     let intervalId;
     let isProcessing = false;
+    let isWaitingForResponse = false;
 
     // Returns response text (string)
     async function sendChatCompletionRequest(instruction, imageBase64URL) {
+        isWaitingForResponse = true;
         const response = await fetch(`${baseURL.value}/v1/chat/completions`, {
             method: 'POST',
             headers: {
@@ -138,6 +140,7 @@ <h1>Camera Interaction App</h1>
                 ]
             })
         });
+        isWaitingForResponse = false;
         if (!response.ok) {
             const errorData = await response.text();
             return `Server error: ${response.status} - ${errorData}`;
@@ -173,6 +176,7 @@ <h1>Camera Interaction App</h1>
 
     async function sendData() {
         if (!isProcessing) return; // Ensure we don't have overlapping requests if processing takes longer than interval
+        if (isWaitingForResponse) return; // Skip if waiting for previous response
 
         const instruction = instructionText.value;
         const imageBase64URL = captureImage();

From a623618e7152c20ec3b126b0a3a8e72c3028f90c Mon Sep 17 00:00:00 2001
From: Ignasi <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 20:27:20 +0200
Subject: [PATCH 10/13] Update demos/multimodal/demo.html

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 demos/multimodal/demo.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index fb28f9714..24dec3ea7 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -79,7 +79,7 @@ <h1>Camera Interaction App</h1>
 <div class="io-areas">
     <div>
         <label for="baseURL">Base API:</label><br>
-        <input id="baseURL" name="Instruction" value="http://127.0.0.1:12434/engines/llama.cpp">
+        <input id="baseURL" name="baseURL" value="http://127.0.0.1:12434/engines/llama.cpp">
     </div>
     <div>
         <label for="instructionText">Instruction:</label><br>

From 87c8274dae925f79fd08be22e066be72460d317a Mon Sep 17 00:00:00 2001
From: Ignasi <ignasi.lopez.luna@gmail.com>
Date: Wed, 8 Oct 2025 20:28:06 +0200
Subject: [PATCH 11/13] Update demos/multimodal/demo.html

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 demos/multimodal/demo.html | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 24dec3ea7..1f88d41cb 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -188,13 +188,9 @@ <h1>Camera Interaction App</h1>
             return;
         }
 
-        const payload = {
-            instruction: instruction,
-            imageBase64URL: imageBase64URL
-        };
 
         try {
-            const response = await sendChatCompletionRequest(payload.instruction, payload.imageBase64URL);
+            const response = await sendChatCompletionRequest(instruction, imageBase64URL);
             responseText.value = response;
         } catch (error) {
             console.error('Error sending data:', error);

From e77747808e71037c4f1af9788e281d763ed48424 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= <ignasi.lopez.luna@gmail.com>
Date: Thu, 9 Oct 2025 14:39:31 +0200
Subject: [PATCH 12/13] feat(demo): enhance model selection with warnings and
 info messages

---
 demos/multimodal/demo.html | 123 +++++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 5 deletions(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 1f88d41cb..234f8d281 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -67,6 +67,40 @@
         .hidden {
             display: none;
         }
+        #modelWarning {
+            background-color: #fff3cd;
+            color: #856404;
+            border: 1px solid #ffeaa7;
+            border-radius: 4px;
+            padding: 10px;
+            margin-top: 5px;
+            font-size: 14px;
+            width: 100%;
+            box-sizing: border-box;
+            display: none;
+        }
+        #modelWarning.show {
+            display: block;
+        }
+        #modelWarning a {
+            color: #856404;
+            text-decoration: underline;
+        }
+        #modelInfo {
+            background-color: #d1ecf1;
+            color: #0c5460;
+            border: 1px solid #bee5eb;
+            border-radius: 4px;
+            padding: 10px;
+            margin-top: 5px;
+            font-size: 14px;
+            width: 100%;
+            box-sizing: border-box;
+        }
+        #modelInfo a {
+            color: #0c5460;
+            text-decoration: underline;
+        }
     </style>
 </head>
 <body>
@@ -79,7 +113,15 @@ <h1>Camera Interaction App</h1>
 <div class="io-areas">
     <div>
         <label for="baseURL">Base API:</label><br>
-        <input id="baseURL" name="baseURL" value="http://127.0.0.1:12434/engines/llama.cpp">
+        <input id="baseURL" name="baseURL" value="http://127.0.0.1:12434/engines/llama.cpp" style="width: 20em;">
+    </div>
+    <div>
+        <label for="modelSelect">Model:</label><br>
+        <select id="modelSelect" name="Model" style="width: 40em; padding: 8px;">
+            <option value="">Loading models...</option>
+        </select>
+        <div id="modelWarning"></div>
+        <div id="modelInfo"></div>
     </div>
     <div>
         <label for="instructionText">Instruction:</label><br>
@@ -107,6 +149,9 @@ <h1>Camera Interaction App</h1>
     const video = document.getElementById('videoFeed');
     const canvas = document.getElementById('canvas');
     const baseURL = document.getElementById('baseURL');
+    const modelSelect = document.getElementById('modelSelect');
+    const modelWarning = document.getElementById('modelWarning');
+    const modelInfo = document.getElementById('modelInfo');
     const instructionText = document.getElementById('instructionText');
     const responseText = document.getElementById('responseText');
     const intervalSelect = document.getElementById('intervalSelect');
@@ -119,6 +164,62 @@ <h1>Camera Interaction App</h1>
     let isProcessing = false;
     let isWaitingForResponse = false;
 
+    const RECOMMENDED_MODEL = 'ai/smolvlm:500M-Q8_0';
+
+    // Fetch model ids from `${baseURL}/v1/models`, fill #modelSelect, and report any problem in #modelWarning.
+    async function fetchModels() {
+        // Static help text (no interpolated values), so assigning it through innerHTML is safe.
+        modelInfo.innerHTML = `ℹ️ To pull a model, run: <code>docker model pull &lt;model-name&gt;</code><br>Find more models at: <a href="https://hub.docker.com/r/ai" target="_blank">https://hub.docker.com/r/ai</a>`;
+        const modelsURL = `${baseURL.value}/v1/models`;
+        try {
+            const response = await fetch(modelsURL);
+            if (!response.ok) {
+                throw new Error(`HTTP error! status: ${response.status}`);
+            }
+            const data = await response.json();
+            // Reject payloads without a `data` array instead of failing later with an opaque TypeError.
+            if (!data || !Array.isArray(data.data)) {
+                throw new Error('Unexpected response format: missing "data" array');
+            }
+            const models = data.data.map(model => model.id);
+
+            // Clear and populate the model selector
+            modelSelect.innerHTML = '';
+            if (models.length === 0) {
+                modelSelect.innerHTML = '<option value="">No models available</option>';
+                modelWarning.textContent = '⚠️ No models found. Please ensure the model runner is active and models are loaded.';
+                modelWarning.classList.add('show');
+                return;
+            }
+            // Model ids come from the server: insert them as text, never as markup.
+            models.forEach(modelId => {
+                const option = document.createElement('option');
+                option.value = modelId;
+                option.textContent = modelId;
+                modelSelect.appendChild(option);
+            });
+
+            if (models.includes(RECOMMENDED_MODEL)) {
+                modelSelect.value = RECOMMENDED_MODEL;
+                // Hide warning - everything is good
+                modelWarning.classList.remove('show');
+            } else {
+                // Fall back to the first model and suggest the recommended one
+                modelSelect.value = models[0];
+                modelWarning.innerHTML = `💡 We recommend to run this demo with <code>${RECOMMENDED_MODEL}</code>`;
+                modelWarning.classList.add('show');
+            }
+        } catch (error) {
+            console.error('Error fetching models:', error);
+            modelSelect.innerHTML = '<option value="">Error loading models</option>';
+            // error.message and the user-editable URL are untrusted: render them as text, not HTML.
+            modelWarning.textContent = `⚠️ Error loading models: ${error.message}`;
+            modelWarning.appendChild(document.createElement('br'));
+            modelWarning.append(`Please check that the API is accessible at ${modelsURL}`);
+            modelWarning.classList.add('show');
+        }
+    }
+
     // Returns response text (string)
     async function sendChatCompletionRequest(instruction, imageBase64URL) {
         isWaitingForResponse = true;
@@ -128,7 +229,7 @@ <h1>Camera Interaction App</h1>
                 'Content-Type': 'application/json'
             },
             body: JSON.stringify({
-                model: 'ai/smolvlm:500M-Q8_0',
+                model: modelSelect.value,
                 max_tokens: 100,
                 messages: [
                     { role: 'user', content: [
@@ -143,6 +244,15 @@ <h1>Camera Interaction App</h1>
         isWaitingForResponse = false;
         if (!response.ok) {
             const errorData = await response.text();
+            try {
+                const errorJson = JSON.parse(errorData);
+                if (errorJson.error && errorJson.error.message && 
+                    errorJson.error.message.includes('image input is not supported')) {
+                    return `⚠️ This model doesn't support vision. Please select a vision-capable model like '${RECOMMENDED_MODEL}'.`;
+                }
+            } catch (e) {
+                // If parsing fails, fall through to generic error
+            }
             return `Server error: ${response.status} - ${errorData}`;
         }
         const data = await response.json();
@@ -248,8 +358,11 @@ <h1>Camera Interaction App</h1>
         }
     });
 
-    // Initialize camera when the page loads
-    window.addEventListener('DOMContentLoaded', initCamera);
+    // Once the DOM is ready, initialize the camera and populate the model selector; fetchModels catches its own errors and reports them via modelWarning
+    window.addEventListener('DOMContentLoaded', () => {
+        initCamera();
+        fetchModels();
+    });
 
     // Optional: Stop stream when page is closed/navigated away to release camera
     window.addEventListener('beforeunload', () => {
@@ -263,4 +376,4 @@ <h1>Camera Interaction App</h1>
 
 </script>
 </body>
-</html>
\ No newline at end of file
+</html>

From 4b33c578eafbbc9c11868ea875150c98c1224dde Mon Sep 17 00:00:00 2001
From: Ignasi <ignasi.lopez.luna@gmail.com>
Date: Thu, 9 Oct 2025 15:33:52 +0200
Subject: [PATCH 13/13] Update demos/multimodal/demo.html

Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
---
 demos/multimodal/demo.html | 2 --
 1 file changed, 2 deletions(-)

diff --git a/demos/multimodal/demo.html b/demos/multimodal/demo.html
index 234f8d281..3e5b75e20 100644
--- a/demos/multimodal/demo.html
+++ b/demos/multimodal/demo.html
@@ -293,8 +293,6 @@ <h1>Camera Interaction App</h1>
 
         if (!imageBase64URL) {
             responseText.value = "Failed to capture image. Stream might not be active.";
-            // Optionally stop processing if image capture fails consistently
-            // handleStop();
             return;
         }