From adf3dd178d04064dc508fe13212e2085a6a5b7d7 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Fri, 25 Jul 2025 11:36:12 +0100 Subject: [PATCH 1/4] ci: Remove dangling layers from CI docker builds. --- .github/workflows/docker-devito.yml | 1 - .github/workflows/pytest-gpu.yml | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 95b70386c3..0d0c7bd73e 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -109,7 +109,6 @@ jobs: build-args: base=devitocodes/${{ matrix.base }} - name: Remove dangling layers - if: ${{ !contains(matrix.runner, 'nvidiagpu') }} run: docker system prune -f - name: Run tests diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 2b8b25cf2c..3db9d9343e 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -35,13 +35,6 @@ jobs: - self-hosted - ${{ matrix.runner_label }} - # Job-level env (includes per-runner image/container tags) - env: - DEVITO_ARCH: ${{ matrix.arch }} - DEVITO_PLATFORM: ${{ matrix.platform }} - DEVITO_LANGUAGE: ${{ matrix.language }} - OMPI_CC: ${{ matrix.arch }} - strategy: fail-fast: false matrix: @@ -110,11 +103,11 @@ jobs: - name: Test with pytest run: | - # Run a simple driver cmd first (nvidia-smi / rocm-smi) ci_env=$(bash <(curl -s https://codecov.io/env)) docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \ - pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} + pytest -vv -ra -l -s --full-trace --maxfail=1 \ + --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} - name: Test examples run: | @@ -124,3 +117,10 @@ jobs: run: | docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \ mpiexec -n 2 pytest ${{ matrix.test_examples }} + + - name: Clean up test image + if: always() + run: | + docker rmi -f "${DOCKER_IMAGE}" || true # remove the image + docker system prune -f # free dangling layers + From 7d2023487bd6bf3d79e0c554005168c3c57c70ba Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Sat, 26 Jul 2025 01:24:01 +0200 Subject: [PATCH 2/4] ci: bug fixes --- .github/workflows/pytest-gpu.yml | 50 +++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 3db9d9343e..a0f90dcf57 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -70,7 +70,7 @@ jobs: --init --network=host --device=/dev/kfd --device=/dev/dri --ipc=host - --group-add video --group-add $(getent group render | cut -d: -f3) + --group-add video --group-add "$(getent group render | cut -d: -f3)" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --rm -t --name ${CONTAINER_BASENAME} @@ -81,13 +81,15 @@ jobs: - name: Set per-runner tags run: | - echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV - name: Build docker image run: | + docker buildx create --use --name $RUNNER_NAME || true docker build . \ + --builder $RUNNER_NAME \ + --label ci-run=$GITHUB_RUN_ID \ --rm --pull \ --file docker/Dockerfile.devito \ --tag "${DOCKER_IMAGE}" \ @@ -98,16 +100,47 @@ jobs: - name: Probe gpu run: | - # Run a simple driver cmd first (nvidia-smi / rocm-smi) + # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA + # runners; fall back to "all" so the driver probe does not fail. 
+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then + echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV + fi + + # Run a simple driver-probe command (nvidia-smi / rocm-smi) docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }} - name: Test with pytest + env: + # Exported earlier in the job; needed inside the container for codecov + CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }} run: | + # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.) ci_env=$(bash <(curl -s https://codecov.io/env)) - docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \ + # 2. For NVIDIA, make sure the runner gave us a GPU mask + # Default to empty + NVIDIA_ENV_FLAGS="" + # For NVIDIA, check for device mask AND set specific env vars + if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then + : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}" + NVIDIA_ENV_FLAGS=" \ + -e CUDA_LAUNCH_BLOCKING=1 \ + -e NV_ACC_NOTIFY=3 \ + -e NV_ACC_DEBUG=1" + fi + + # 3. Run the test suite using the matrix-defined flags + docker run ${{ matrix.flags }} \ + ${ci_env} \ + -e CI=true \ + -e PYTHONFAULTHANDLER=1 \ + -e DEVITO_LOGGING=DEBUG \ + -e CODECOV_TOKEN \ + ${NVIDIA_ENV_FLAGS} \ + "${DOCKER_IMAGE}" \ pytest -vv -ra -l -s --full-trace --maxfail=1 \ - --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} + --cov --cov-config=.coveragerc --cov-report=xml \ + ${{ matrix.test_files }} - name: Test examples run: | @@ -121,6 +154,9 @@ jobs: - name: Clean up test image if: always() run: | - docker rmi -f "${DOCKER_IMAGE}" || true # remove the image - docker system prune -f # free dangling layers + # Remove only the image we just built + docker rmi -f "${DOCKER_IMAGE}" || true + # Guard global prune the layers we created + # (label ci-run=$GITHUB_RUN_ID) + docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID From d0491029e5206e0eb8c628f9393f77fe70216005 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Thu, 31 Jul 2025 12:13:29 +0200 Subject: [PATCH 3/4] CI: fix GPU-mask crash by keeping CUDA_VISIBLE_DEVICES out of the container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background ---------- Each self-hosted runner is pinned to a specific GPU via a host-level CUDA_VISIBLE_DEVICES and we forward that mask to Docker: docker run --gpus "device=$CUDA_VISIBLE_DEVICES" … That flag alone is sufficient—Docker restricts the visible devices for the container. Problem ------- We also injected the same variable into the container’s environment (-e CUDA_VISIBLE_DEVICES). Inside the container the CUDA/OpenACC runtime renumbers the visible GPUs to 0…N-1, so a value like “1” or “2,3” is suddenly invalid and the first kernel call aborts (`exit 1`) when multiple runners share the host. Fix --- * Drop the `-e CUDA_VISIBLE_DEVICES` export from `${{ matrix.flags }}`. The device list is still enforced by `--gpus`, but the runtime now starts counting at 0 as expected. 
Verified on: * Two concurrent nvidiagpu runners on a 4-V100 host – full test suite passes. * amdgpu runner – unchanged. --- .github/workflows/pytest-gpu.yml | 115 +++++++++++++++---------------- 1 file changed, 55 insertions(+), 60 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index a0f90dcf57..35da07cc21 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -15,18 +15,17 @@ concurrency: cancel-in-progress: true env: - OUTPUT_PATH: ${{ github.workspace }} RESOURCE_GROUP: CI-gpu on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] workflow_dispatch: inputs: tags: - description: 'Run GPU tests' + description: "Run GPU tests" jobs: build: @@ -38,42 +37,38 @@ jobs: strategy: fail-fast: false matrix: - name: [ - pytest-gpu-acc-nvidia, - pytest-gpu-omp-amd - ] + name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd] test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"] include: - # -------------------- NVIDIA job -------------------- - - name: pytest-gpu-acc-nvidia - test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py" - base: "devitocodes/bases:nvidia-nvc" - runner_label: nvidiagpu - test_drive_cmd: "nvidia-smi" - # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. - # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). 
- flags: >- - --init --rm -t - --name ${CONTAINER_BASENAME} - --env CUDA_VISIBLE_DEVICES - --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" - - # -------------------- AMD job ----------------------- - - name: pytest-gpu-omp-amd - test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py" - runner_label: amdgpu - base: "devitocodes/bases:amd" - test_drive_cmd: "rocm-smi" - # Unchanged, still passes through required /dev nodes etc. - flags: >- - --init --network=host - --device=/dev/kfd --device=/dev/dri - --ipc=host - --group-add video --group-add "$(getent group render | cut -d: -f3)" - --cap-add=SYS_PTRACE --security-opt seccomp=unconfined - --rm -t - --name ${CONTAINER_BASENAME} + # -------------------- NVIDIA job -------------------- + - name: pytest-gpu-acc-nvidia + test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py" + base: "devitocodes/bases:nvidia-nvc" + runner_label: nvidiagpu + test_drive_cmd: "nvidia-smi" + # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. + # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). + flags: >- + --init --rm -t + --name ${CONTAINER_BASENAME} + --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" + + # -------------------- AMD job ----------------------- + - name: pytest-gpu-omp-amd + test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py" + runner_label: amdgpu + base: "devitocodes/bases:amd" + test_drive_cmd: "rocm-smi" + # Unchanged, still passes through required /dev nodes etc. 
+ flags: >- + --init --network=host + --device=/dev/kfd --device=/dev/dri + --ipc=host + --group-add video --group-add "$(getent group render | cut -d: -f3)" + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + --rm -t + --name ${CONTAINER_BASENAME} steps: - name: Checkout devito @@ -84,11 +79,17 @@ jobs: echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV + - name: Ensure buildx builder + run: | + docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \ + docker buildx create --name "$RUNNER_NAME" --driver docker-container + docker buildx use "$RUNNER_NAME" + - name: Build docker image run: | - docker buildx create --use --name $RUNNER_NAME || true - docker build . \ - --builder $RUNNER_NAME \ + docker buildx build . \ + --builder "$RUNNER_NAME" \ + --load \ --label ci-run=$GITHUB_RUN_ID \ --rm --pull \ --file docker/Dockerfile.devito \ @@ -107,6 +108,7 @@ jobs: fi # Run a simple driver-probe command (nvidia-smi / rocm-smi) + docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }} - name: Test with pytest env: # Exported earlier in the job; needed inside the container for codecov CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }} run: | - # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.) + # Add Codecov’s environment variables (GITHUB_SHA, etc.) ci_env=$(bash <(curl -s https://codecov.io/env)) - # 2. For NVIDIA, make sure the runner gave us a GPU mask - # Default to empty - NVIDIA_ENV_FLAGS="" - # For NVIDIA, check for device mask AND set specific env vars - if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then - : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}" - NVIDIA_ENV_FLAGS=" \ - -e CUDA_LAUNCH_BLOCKING=1 \ - -e NV_ACC_NOTIFY=3 \ - -e NV_ACC_DEBUG=1" - fi - - # 3. Run the test suite using the matrix-defined flags docker run ${{ matrix.flags }} \ ${ci_env} \ -e CI=true \ -e PYTHONFAULTHANDLER=1 \ -e DEVITO_LOGGING=DEBUG \ -e CODECOV_TOKEN \ - ${NVIDIA_ENV_FLAGS} \ "${DOCKER_IMAGE}" \ - pytest -vv -ra -l -s --full-trace --maxfail=1 \ + pytest -vvv --capture=no --showlocals \ + --log-cli-level=DEBUG -o log_cli=true \ + --full-trace --durations=10 \ --cov --cov-config=.coveragerc --cov-report=xml \ ${{ matrix.test_files }} - name: Test examples run: | @@ -151,12 +142,16 @@ jobs: docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \ mpiexec -n 2 pytest ${{ matrix.test_examples }} - - name: Clean up test image + - name: Builder & image cleanup (keep 3 days of cache) if: always() run: | - # Remove only the image we just built + # Remove only the test image we built docker rmi -f "${DOCKER_IMAGE}" || true - # Guard global prune the layers we created - # (label ci-run=$GITHUB_RUN_ID) + # Classic image layers created in this job docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID + + # BuildKit cache: target the per-runner builder explicitly + docker builder prune --builder "$RUNNER_NAME" \ + -f \ + --filter "until=72h" From 6e3ad7e135f60ce2fca5f1570433389ba436603c Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Fri, 1 Aug 2025 16:14:01 +0200 Subject: [PATCH 4/4] CI: Change ${RUNNER_NAME} to ${RUNNER_NAME// /_} because runner name may contain spaces. 
--- .github/workflows/pytest-gpu.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 35da07cc21..2edb501dc3 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -81,14 +81,14 @@ jobs: - name: Ensure buildx builder run: | - docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \ - docker buildx create --name "$RUNNER_NAME" --driver docker-container - docker buildx use "$RUNNER_NAME" + docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \ + docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container + docker buildx use "${RUNNER_NAME// /_}" - name: Build docker image run: | docker buildx build . \ - --builder "$RUNNER_NAME" \ + --builder "${RUNNER_NAME// /_}" \ --load \ --label ci-run=$GITHUB_RUN_ID \ --rm --pull \ @@ -152,6 +152,6 @@ jobs: docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID # BuildKit cache: target the per-runner builder explicitly - docker builder prune --builder "$RUNNER_NAME" \ + docker builder prune --builder "${RUNNER_NAME// /_}" \ -f \ --filter "until=72h"