From 2b72119aaf53110433c4c0a627bab2a88f6277e5 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Tue, 12 Aug 2025 11:17:33 +0100 Subject: [PATCH 1/2] CI: Cannot set `--env CUDA_VISIBLE_DEVICES` for `docker run` when using `--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"` - dropping --env arg. To see why, suppose the host has `export CUDA_VISIBLE_DEVICES=1`. Passing `--env CUDA_VISIBLE_DEVICES` to `docker run` means that the docker runtime env will contain CUDA_VISIBLE_DEVICES=1; however, when you also pass `--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"` to `docker run`, the docker runtime will only use GPU 1 but renumbers it as device 0. Therefore, when you run CUDA code inside docker, the runtime only sees a single GPU device with device ID 0, while CUDA_VISIBLE_DEVICES is still set to device ID 1, and you get an (uncaught) exception. --- .github/workflows/docker-devito.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 0d0c7bd73e..87503a9334 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -23,7 +23,7 @@ jobs: tag: 'nvidia-nvc' # Respect CUDA_VISIBLE_DEVICES set by the runner and hard-limit docker to that device.
# (--env without value forwards host var; --gpus maps only that device) - flag: --init --env CUDA_VISIBLE_DEVICES --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" + flag: --init --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" test: 'tests/test_gpu_openacc.py tests/test_gpu_common.py' runner: ["self-hosted", "nvidiagpu"] From b0d3c601c6d4a3dd4031433501621fe585629c8b Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Tue, 12 Aug 2025 11:22:48 +0100 Subject: [PATCH 2/2] Corrected comment in .github/workflows/docker-devito.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/docker-devito.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 87503a9334..126caccd5e 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -22,7 +22,7 @@ jobs: - base: 'bases:nvidia-nvc' tag: 'nvidia-nvc' # Respect CUDA_VISIBLE_DEVICES set by the runner and hard-limit docker to that device. - # (--env without value forwards host var; --gpus maps only that device) + # (--gpus maps only the selected device from CUDA_VISIBLE_DEVICES) flag: --init --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" test: 'tests/test_gpu_openacc.py tests/test_gpu_common.py' runner: ["self-hosted", "nvidiagpu"]