Originally asked in #461

When trying to package a model with a context-size on a Linux machine, it cannot find the model:

Versions:

When doing the exact same on a MacBook (M3), it does work:

Versions on MacBook:

Docker-wise, the only difference between the machines is that the Linux machine uses the docker-model-runner image docker/model-runner:latest-cuda:
~$ docker image inspect docker/model-runner:latest-cuda
[
    {
        "Id": "sha256:e1f97bbd775da35eace40d334de8986cafdc2fef9a0070334126ad24765db231",
        "RepoTags": [
            "docker/model-runner:latest-cuda"
        ],
        "RepoDigests": [
            "docker/model-runner@sha256:8b1b1cbfe23bf253a81ef5cae84929245232a6ab1d4630b347834e91b2f68d48"
        ],
        "Comment": "buildkit.dockerfile.v0",
        "Created": "2026-02-05T10:20:27.927412661Z",
        "Config": {
            "User": "modelrunner",
            "Env": [
                "PATH=/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
                "NVARCH=x86_64",
                "NVIDIA_REQUIRE_CUDA=cuda>=12.9 brand=unknown,driver>=535,driver<536 brand=grid,driver>=535,driver<536 brand=tesla,driver>=535,driver<536 brand=nvidia,driver>=535,driver<536 brand=quadro,driver>=535,driver<536 brand=quadrortx,driver>=535,driver<536 brand=nvidiartx,driver>=535,driver<536 brand=vapps,driver>=535,driver<536 brand=vpc,driver>=535,driver<536 brand=vcs,driver>=535,driver<536 brand=vws,driver>=535,driver<536 brand=cloudgaming,driver>=535,driver<536 brand=unknown,driver>=550,driver<551 brand=grid,driver>=550,driver<551 brand=tesla,driver>=550,driver<551 brand=nvidia,driver>=550,driver<551 brand=quadro,driver>=550,driver<551 brand=quadrortx,driver>=550,driver<551 brand=nvidiartx,driver>=550,driver<551 brand=vapps,driver>=550,driver<551 brand=vpc,driver>=550,driver<551 brand=vcs,driver>=550,driver<551 brand=vws,driver>=550,driver<551 brand=cloudgaming,driver>=550,driver<551 brand=unknown,driver>=560,driver<561 brand=grid,driver>=560,driver<561 brand=tesla,driver>=560,driver<561 brand=nvidia,driver>=560,driver<561 brand=quadro,driver>=560,driver<561 brand=quadrortx,driver>=560,driver<561 brand=nvidiartx,driver>=560,driver<561 brand=vapps,driver>=560,driver<561 brand=vpc,driver>=560,driver<561 brand=vcs,driver>=560,driver<561 brand=vws,driver>=560,driver<561 brand=cloudgaming,driver>=560,driver<561 brand=unknown,driver>=565,driver<566 brand=grid,driver>=565,driver<566 brand=tesla,driver>=565,driver<566 brand=nvidia,driver>=565,driver<566 brand=quadro,driver>=565,driver<566 brand=quadrortx,driver>=565,driver<566 brand=nvidiartx,driver>=565,driver<566 brand=vapps,driver>=565,driver<566 brand=vpc,driver>=565,driver<566 brand=vcs,driver>=565,driver<566 brand=vws,driver>=565,driver<566 brand=cloudgaming,driver>=565,driver<566 brand=unknown,driver>=570,driver<571 brand=grid,driver>=570,driver<571 brand=tesla,driver>=570,driver<571 brand=nvidia,driver>=570,driver<571 brand=quadro,driver>=570,driver<571 brand=quadrortx,driver>=570,driver<571 brand=nvidiartx,driver>=570,driver<571 brand=vapps,driver>=570,driver<571 brand=vpc,driver>=570,driver<571 brand=vcs,driver>=570,driver<571 brand=vws,driver>=570,driver<571 brand=cloudgaming,driver>=570,driver<571",
                "NV_CUDA_CUDART_VERSION=12.9.37-1",
                "CUDA_VERSION=12.9.0",
                "LD_LIBRARY_PATH=/app/lib",
                "NVIDIA_VISIBLE_DEVICES=all",
                "NVIDIA_DRIVER_CAPABILITIES=compute,utility",
                "NV_CUDA_LIB_VERSION=12.9.0-1",
                "NV_NVTX_VERSION=12.9.19-1",
                "NV_LIBNPP_VERSION=12.4.0.27-1",
                "NV_LIBNPP_PACKAGE=libnpp-12-9=12.4.0.27-1",
                "NV_LIBCUSPARSE_VERSION=12.5.9.5-1",
                "NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-9",
                "NV_LIBCUBLAS_VERSION=12.9.0.13-1",
                "NV_LIBCUBLAS_PACKAGE=libcublas-12-9=12.9.0.13-1",
                "NV_LIBNCCL_PACKAGE_NAME=libnccl2",
                "NV_LIBNCCL_PACKAGE_VERSION=2.26.5-1",
                "NCCL_VERSION=2.26.5-1",
                "NV_LIBNCCL_PACKAGE=libnccl2=2.26.5-1+cuda12.9",
                "NVIDIA_PRODUCT_NAME=CUDA",
                "MODEL_RUNNER_SOCK=/var/run/model-runner/model-runner.sock",
                "MODEL_RUNNER_PORT=12434",
                "LLAMA_SERVER_PATH=/app/bin",
                "HOME=/home/modelrunner",
                "MODELS_PATH=/models"
            ],
            "Entrypoint": [
                "/app/model-runner"
            ],
            "WorkingDir": "/app",
            "Labels": {
                "com.docker.desktop.service": "model-runner",
                "maintainer": "NVIDIA CORPORATION <cudatools@nvidia.com>",
                "org.opencontainers.image.ref.name": "ubuntu",
                "org.opencontainers.image.version": "24.04"
            }
        },
        "Architecture": "amd64",
        "Os": "linux",
        "Size": 4004833467,
        "GraphDriver": {
            "Data": {
                "LowerDir": "/var/lib/docker/overlay2/084bbc115961a54dffea0770c268190848c3d2518f5957a64b1805e81d191cad/diff:/var/lib/docker/overlay2/08f165b6075f2dfdb5f6b7ace6f5d5488a8fad7c523a99b253de0ca78f7915b3/diff:/var/lib/docker/overlay2/c6e76c731440603bbae47a462de1f62e9838f627fe2a980ae99f6d8f9bf69d0c/diff:/var/lib/docker/overlay2/600864c90492b3bdd0753f0b69cbc528c5bf8a49f93abe878c67f9632a1ddb81/diff:/var/lib/docker/overlay2/2529a5330d84cbace21f4b6b23a2e9cff5a14477254ae34325f0e9f5e90ec7fa/diff:/var/lib/docker/overlay2/ab2283e6291e177c1812c2c1e8bec43d63fbbebc14136dba08adb18014a2eaa0/diff:/var/lib/docker/overlay2/ad38604147ca98fc853f04ed865ce490be54ea9084bf5e06ed3bad3550d46d0f/diff:/var/lib/docker/overlay2/54e723159b9262e4e5c844a4a69940c57900d308dcc8b2530e1151416edd8a2a/diff:/var/lib/docker/overlay2/f878ee887a52289e357a227d6fe45e4182420eab83d9247be0f88177e3be7d88/diff:/var/lib/docker/overlay2/3e4fb66b334720360049560b876919643716f0f5ded8abe58a1050240a13f9ad/diff:/var/lib/docker/overlay2/32e7031bd744e899903eebbb581db0b4c015f266b26e0880ab2ad08f82400a53/diff:/var/lib/docker/overlay2/8e5154092bd6c513394bd00ce5953730221a48eacd782d2035d8a7697a23bba7/diff:/var/lib/docker/overlay2/1948e805edc35bb513e1e06b0926ba681bfd1a0ef0565c208e6fc283bbd8ae38/diff:/var/lib/docker/overlay2/1614701efefc3e1e97ac01e54175e06dc1d57230e02988a30b48e93366884e5d/diff:/var/lib/docker/overlay2/9284cd2edccc9e7de0fc04479eb42a5f344a68f9a1de32b987b2ee85b1bebe6f/diff:/var/lib/docker/overlay2/4e9aa1d3d0f84e6d7dfcb18fea10283edaf0a36c0a179ded85576b8b69384ebc/diff",
                "MergedDir": "/var/lib/docker/overlay2/f8753b7b8a834d76cbc259e4baafc097c270a6b5fa68999aba739d28a02e95c5/merged",
                "UpperDir": "/var/lib/docker/overlay2/f8753b7b8a834d76cbc259e4baafc097c270a6b5fa68999aba739d28a02e95c5/diff",
                "WorkDir": "/var/lib/docker/overlay2/f8753b7b8a834d76cbc259e4baafc097c270a6b5fa68999aba739d28a02e95c5/work"
            },
            "Name": "overlay2"
        },
        "RootFS": {
            "Type": "layers",
            "Layers": [
                "sha256:8901a649dd5a9284fa6206a08f3ba3b5a12fddbfd2f82c880e68cdb699d98bfb",
                "sha256:82c81020a72d6e7ffeb0302c9afb209694b5f41062f7b32b7dda0e6fb57ab4da",
                "sha256:ea24cf18cd9223532f67c87e42d9b31979ba47e34c167f6d24abf590153a4cbc",
                "sha256:ca36f6779ed63543cb06897fa2c9cb9a97e380e1e5210888350d3abfcf123948",
                "sha256:a850df6ba29add00c1fadc58c015688a0fcf0f838adb70c30987e9b77525ceec",
                "sha256:47d54ffdddd2f1a9619e84e34358ce693087c7ce1f47ca7e64e1a975ada70d9c",
                "sha256:cb1fe51dbd1b4b113cbc420d3cc6b46181f4f8ea2f08a0f0711dc503e013b3c9",
                "sha256:c087f59da42b692908ce70d0ae36d48a2e359643bd4a2d8152b12288e2876d78",
                "sha256:90b7163d2189c488282c1dc13c7ed7e3f795be30cbd2821f6ad6120dee759fae",
                "sha256:68ebc998fb82762909d04f8326f0e8461eb68a9ec1876467b3516de96d5875b9",
                "sha256:9a0979cf681d83617f29718eecf798695b52162642cadab100ec206ca337b82e",
                "sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef",
                "sha256:79677acf435c75ca5683132ef6cb274a8a474af0ad7ddfd1bb6702e012fa1687",
                "sha256:fbd7a6f6ccc055ce6a49c0a9531da1210bae2200740753bc8873738620c0bca5",
                "sha256:c568b5af7de806e1ff1b91983dd87986f685054fa089c9edde4497763aa16fa9",
                "sha256:fd0c7e7cc148b316f06ca96f6daa88585dd61388f0113a6789a1fee83617cc90",
                "sha256:5b824cba9d4fe83b20642ecc3bf28247364ecdf601f2cc7a539a84562b6c43f8"
            ]
        },
        "Metadata": {
            "LastTagTime": "0001-01-01T00:00:00Z"
        }
    }
]
The reason I want to package the context size is that, when using Docker Compose with ai/qwen3-vl:8B for example, the context_size seems to be reset to 4096 automatically once the model unloads after 5 minutes. As a workaround I set up a cronjob that executes this command every 5 minutes, but that's not ideal: docker model configure ai/qwen3-vl:8B --context-size 16384 && docker model run --detach ai/qwen3-vl:8B. It correctly sets the context size, but the runtime_flags from Docker Compose are ignored. Here is the models section in docker-compose.yml:
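As a minimal sketch of what such a section can look like, using the Compose top-level models element with the context_size and runtime_flags attributes the issue refers to (the qwen3-vl key, the app service, and the flag values are illustrative assumptions, not the exact file from the issue):

models:
  qwen3-vl:
    model: ai/qwen3-vl:8B
    context_size: 16384      # context window the runner should load the model with
    runtime_flags:           # extra flags passed to the inference backend (assumed values)
      - "--ctx-size"
      - "16384"

services:
  app:
    image: my-app:latest     # hypothetical service consuming the model
    models:
      - qwen3-vl

For reference, the cron workaround described above presumably amounts to a crontab entry along these lines (the command is as quoted; the schedule line is assumed from "every 5 minutes"):

*/5 * * * * docker model configure ai/qwen3-vl:8B --context-size 16384 && docker model run --detach ai/qwen3-vl:8B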