1- # Runner information:
1+ # Workflow information:
22# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
33# - OpenMP on AMD runs on runners labeled `amdgpu`
4- #
5- # Changes vs original:
6- # * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
7- # * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
8- # * Remove docker prune / global container deletes (we assume disk space is fine)
9- # * Add comments throughout
4+ # - Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
5+ # - Tag images and container names to avoid cross-runner races and maximize cache reuse
106
117name : CI-gpu
128
@@ -31,149 +27,111 @@ on:
3127 description : " Run GPU tests"
3228
3329jobs :
34- build :
30+ test-gpu-docker :
3531 name : ${{ matrix.name }}
3632 runs-on :
3733 - self-hosted
3834 - ${{ matrix.runner_label }}
3935
40- outputs :
41- unique : ${{ steps.uniquetag.outputs.unique }}
42-
4336 strategy :
4437 fail-fast : false
4538 matrix :
4639 name : [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
4740 test_examples : ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
4841
4942 include :
50- # -------------------- NVIDIA job --------------------
51- - name : pytest-gpu-acc-nvidia
52- test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py tests/test_operator.py::TestEstimateMemory"
53- base : " devitocodes/bases:nvidia-nvc12"
54- runner_label : nvidiagpu
55- test_drive_cmd : " nvidia-smi"
56- # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
57- # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
58- dockerflags : >-
59- --init --rm -t
60- --name ${CONTAINER_BASENAME}
61- --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
62-
63- # -------------------- AMD job -----------------------
64- - name : pytest-gpu-omp-amd
65- test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py tests/test_operator.py::TestEstimateMemory"
66- runner_label : amdgpu
67- base : " devitocodes/bases:amd"
68- test_drive_cmd : " rocm-smi"
69- # Unchanged, still passes through required /dev nodes etc.
70- dockerflags : >-
71- --init --network=host
72- --device=/dev/kfd --device=/dev/dri
73- --ipc=host
74- --group-add video --group-add "$(getent group render | cut -d: -f3)"
75- --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
76- --rm -t
77- --name ${CONTAINER_BASENAME}
43+ # -------------------- NVIDIA job --------------------
44+ - name : pytest-gpu-acc-nvidia
45+ test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py tests/test_operator.py::TestEstimateMemory"
46+ base : " devitocodes/bases:nvidia-nvc12"
47+ runner_label : nvidiagpu
48+ test_drive_cmd : " nvidia-smi"
49+ # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
50+ # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
51+ dockerflags : --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
52+
53+ # -------------------- AMD job -----------------------
54+ - name : pytest-gpu-omp-amd
55+ test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py tests/test_operator.py::TestEstimateMemory"
56+ runner_label : amdgpu
57+ base : " devitocodes/bases:amd"
58+ test_drive_cmd : " rocm-smi"
59+ # Unchanged, still passes through required /dev nodes etc.
60+ dockerflags : >-
61+ --network=host
62+ --device=/dev/kfd --device=/dev/dri
63+ --ipc=host
64+ --group-add video --group-add "$(getent group render | cut -d: -f3)"
65+ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
7866
7967 steps :
80- - name : Checkout devito
81- uses : actions/checkout@v6
82-
83- - name : Generate unique CI tag
84- id : uniquetag
85- run : |
86- UNIQUE=$(echo "${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}" | cksum | cut -f 1 -d " ")
87- echo "Unique ID: ${UNIQUE}"
88- echo "unique=${UNIQUE}" >> "$GITHUB_OUTPUT"
89-
90- - name : Set per-runner tags
91- env :
92- UNIQUE : ${{ steps.uniquetag.outputs.unique }}
93- run : |
94- echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}-${UNIQUE}" >> "$GITHUB_ENV"
95- echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}-${{ github.sha }}" >> "$GITHUB_ENV"
96-
97- - name : Ensure buildx builder
98- run : |
99- docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
100- docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
101- docker buildx use "${RUNNER_NAME// /_}"
102-
103- - name : Build docker image
104- run : |
105- docker buildx build . \
106- --builder "${RUNNER_NAME// /_}" \
107- --load \
108- --label ci-run="$GITHUB_RUN_ID" \
109- --rm --pull \
110- --file docker/Dockerfile.devito \
111- --tag "${DOCKER_IMAGE}" \
112- --build-arg base="${{ matrix.base }}"
113-
114- - name : Export CODECOV token
115- run : echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV"
116-
117- - name : Probe gpu
118- run : |
119- # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
120- # runners; fall back to "all" so the driver probe does not fail.
121- if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
122- echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
123- fi
124-
125- # Run a simple driver-probe command (nvidia-smi / rocm-smi)
126- docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
127- docker run ${{ matrix.dockerflags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
128-
129- - name : Test with pytest
130- env :
131- # Exported earlier in the job; needed inside the container for codecov
132- CODECOV_TOKEN : ${{ env.CODECOV_TOKEN }}
133- run : |
134- # Add Codecov’s environment variables (GITHUB_SHA, etc.)
135- ci_env=$(bash <(curl -s https://codecov.io/env))
136-
137- # Run the test suite using the matrix-defined flags
138- docker run \
139- ${{ matrix.dockerflags }} \
140- "${ci_env}" \
141- --env CI=true \
142- --env PYTHONFAULTHANDLER=1 \
143- --env DEVITO_LOGGING=DEBUG \
144- --env CODECOV_TOKEN \
145- "${DOCKER_IMAGE}" \
146- pytest -vvv --capture=no --showlocals \
147- --log-cli-level=DEBUG -o log_cli=true \
148- --full-trace --durations=10 \
149- --cov --cov-config=.coveragerc --cov-report=xml \
150- ${{ matrix.test_files }}
151-
152- - name : Test examples
153- run : |
154- docker run \
155- ${{ matrix.dockerflags }} \
156- "${DOCKER_IMAGE}" \
157- pytest ${{ matrix.test_examples }}
158-
159- - name : Test examples with MPI
160- run : |
161- docker run \
162- ${{ matrix.dockerflags }} \
163- --env DEVITO_MPI=1 \
164- "${DOCKER_IMAGE}" \
165- mpiexec -n 2 pytest ${{ matrix.test_examples }}
166-
167- - name : Builder & image cleanup (keep 3 days of cache)
168- if : always()
169- run : |
170- # Remove only the test image we built
171- docker rmi -f "${DOCKER_IMAGE}" || true
172-
173- # Classic image layers created in this job
174- docker image prune -f --filter label=ci-run="$GITHUB_RUN_ID"
175-
176- # BuildKit cache: target the per-runner builder explicitly
177- docker builder prune --builder "${RUNNER_NAME// /_}" \
178- -f \
179- --filter "until=72h"
68+ - name : Checkout devito
69+ uses : actions/checkout@v6
70+
71+ - id : build
72+ name : Build docker image
73+ uses : ./.github/actions/docker-build
74+ with :
75+ file : docker/Dockerfile.devito
76+ tag : ${{ matrix.name }}
77+ base : ${{ matrix.base }}
78+
79+ - name : Probe GPU
80+ uses : ./.github/actions/docker-run
81+ with :
82+ uid : ${{ steps.build.outputs.unique }}
83+ tag : ${{ matrix.name }}
84+ args : ${{ matrix.dockerflags }}
85+ command : ${{ matrix.test_drive_cmd }}
86+
87+ - name : Test with pytest
88+ uses : ./.github/actions/docker-run
89+ with :
90+ uid : ${{ steps.build.outputs.unique }}
91+ tag : ${{ matrix.name }}
92+ args : ${{ matrix.dockerflags }}
93+ env : |
94+ CI=true
95+ CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}
96+ DEVITO_LOGGING=DEBUG
97+ PYTHONFAULTHANDLER=1
98+ command : |
99+ pytest \
100+ -vvv \
101+ --capture=no \
102+ --showlocals \
103+ --log-cli-level=DEBUG \
104+ -o log_cli=true \
105+ --full-trace \
106+ --durations=10 \
107+ --cov \
108+ --cov-config=.coveragerc \
109+ --cov-report=xml \
110+ ${{ matrix.test_files }}
111+
112+ - name : Test examples
113+ uses : ./.github/actions/docker-run
114+ with :
115+ uid : ${{ steps.build.outputs.unique }}
116+ tag : ${{ matrix.name }}
117+ args : ${{ matrix.dockerflags }}
118+ env : |
119+ command : pytest ${{ matrix.test_examples }}
120+
121+ - name : Test examples with MPI
122+ uses : ./.github/actions/docker-run
123+ with :
124+ uid : ${{ steps.build.outputs.unique }}
125+ tag : ${{ matrix.name }}
126+ args : ${{ matrix.dockerflags }}
127+ env : |
128+ DEVITO_LOGGING=DEBUG
129+ DEVITO_MPI=1
130+ command : mpiexec -n 2 pytest ${{ matrix.test_examples }}
131+
132+ - name : Cleanup docker image
133+ if : always()
134+ uses : ./.github/actions/docker-clean
135+ with :
136+ uid : ${{ steps.build.outputs.unique }}
137+ tag : ${{ matrix.name }}
0 commit comments