7070 --init --network=host
7171 --device=/dev/kfd --device=/dev/dri
7272 --ipc=host
73- --group-add video --group-add $(getent group render | cut -d: -f3)
73+ --group-add video --group-add " $(getent group render | cut -d: -f3)"
7474 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
7575 --rm -t
7676 --name ${CONTAINER_BASENAME}
@@ -81,13 +81,15 @@ jobs:
8181
8282 - name : Set per-runner tags
8383 run : |
84- echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
# Image/container names embed the runner name with every space replaced by "_"; both are appended to $GITHUB_ENV.
8584 echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8685 echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8786
8887 - name : Build docker image
8988 run : |
89+ docker buildx create --use --name "${RUNNER_NAME// /_}" || true  # quoted + space-free, like DOCKER_IMAGE; "|| true" tolerates an existing builder
9090 docker build . \
91+ --builder "${RUNNER_NAME// /_}" \
92+ --label "ci-run=$GITHUB_RUN_ID" \
9193 --rm --pull \
9294 --file docker/Dockerfile.devito \
9395 --tag "${DOCKER_IMAGE}" \
@@ -98,16 +100,47 @@ jobs:
98100
99101 - name : Probe gpu
100102 run : |
101- # Run a simple driver cmd first (nvidia-smi / rocm-smi)
103+ # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA runners; fall back to "all".
104+ # $GITHUB_ENV only affects later steps, so also export it here for the probe below.
105+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
106+ export CUDA_VISIBLE_DEVICES=all; echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
107+ fi
108+
109+ # Run a simple driver-probe command (nvidia-smi / rocm-smi)
102110 docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
103111
104112 - name : Test with pytest
113+ env :
114+ # Exported earlier in the job; needed inside the container for codecov
115+ CODECOV_TOKEN : ${{ env.CODECOV_TOKEN }}
105116 run : |
117+ # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.)
106118 ci_env=$(bash <(curl -s https://codecov.io/env))
107119
108- docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \
120+ # 2. Build the extra "-e" flags that only NVIDIA runners need
121+ # They stay empty on every other runner
122+ NVIDIA_ENV_FLAGS=""
123+ # On NVIDIA: abort if CUDA_VISIBLE_DEVICES is unset or empty, then add the debug env vars
124+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
125+ : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
126+ NVIDIA_ENV_FLAGS=" \
127+ -e CUDA_LAUNCH_BLOCKING=1 \
128+ -e NV_ACC_NOTIFY=3 \
129+ -e NV_ACC_DEBUG=1"
130+ fi
131+
132+ # 3. Run the test suite using the matrix-defined flags
133+ docker run ${{ matrix.flags }} \
134+ ${ci_env} \
135+ -e CI=true \
136+ -e PYTHONFAULTHANDLER=1 \
137+ -e DEVITO_LOGGING=DEBUG \
138+ -e CODECOV_TOKEN \
139+ ${NVIDIA_ENV_FLAGS} \
140+ "${DOCKER_IMAGE}" \
109141 pytest -vv -ra -l -s --full-trace --maxfail=1 \
110- --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }}
142+ --cov --cov-config=.coveragerc --cov-report=xml \
143+ ${{ matrix.test_files }}
111144
112145 - name : Test examples
113146 run : |
@@ -121,6 +154,9 @@ jobs:
121154 - name : Clean up test image
122155 if : always()
123156 run : |
124- docker rmi -f "${DOCKER_IMAGE}" || true # remove the image
125- docker system prune -f # free dangling layers
157+ # Remove only the image we just built
158+ docker rmi -f "${DOCKER_IMAGE}" || true
126159
160+ # Instead of a global prune, prune only the images we created
161+ # (label ci-run=$GITHUB_RUN_ID)
162+ docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID
0 commit comments