Skip to content

Commit b291607

Browse files
committed
ci: Use composite actions for GPU tests
1 parent e079057 commit b291607

2 files changed

Lines changed: 98 additions & 140 deletions

File tree

.github/actions/docker-run/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ inputs:
1414
env:
1515
description: "Environment variables to set inside the docker container, one environment variable per line"
1616
required: true
17-
default: Dockerfile
17+
default: ""
1818
tag:
1919
description: "Tag of the built image to use"
2020
required: true

.github/workflows/pytest-gpu.yaml

Lines changed: 97 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
1-
# Runner information:
1+
# Workflow information:
22
# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
33
# - OpenMP on AMD runs on runners labeled `amdgpu`
4-
#
5-
# Changes vs original:
6-
# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
7-
# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
8-
# * Remove docker prune / global container deletes (we assume disk space is fine)
9-
# * Add comments throughout
4+
# - Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
5+
# - Tag images and container names to avoid cross-runner races and maximize cache reuse
106

117
name: CI-gpu
128

@@ -31,149 +27,111 @@ on:
3127
description: "Run GPU tests"
3228

3329
jobs:
34-
build:
30+
test-gpu-docker:
3531
name: ${{ matrix.name }}
3632
runs-on:
3733
- self-hosted
3834
- ${{ matrix.runner_label }}
3935

40-
outputs:
41-
unique : ${{ steps.uniquetag.outputs.unique }}
42-
4336
strategy:
4437
fail-fast: false
4538
matrix:
4639
name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
4740
test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
4841

4942
include:
50-
# -------------------- NVIDIA job --------------------
51-
- name: pytest-gpu-acc-nvidia
52-
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py tests/test_operator.py::TestEstimateMemory"
53-
base: "devitocodes/bases:nvidia-nvc12"
54-
runner_label: nvidiagpu
55-
test_drive_cmd: "nvidia-smi"
56-
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
57-
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
58-
dockerflags: >-
59-
--init --rm -t
60-
--name ${CONTAINER_BASENAME}
61-
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
62-
63-
# -------------------- AMD job -----------------------
64-
- name: pytest-gpu-omp-amd
65-
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py tests/test_operator.py::TestEstimateMemory"
66-
runner_label: amdgpu
67-
base: "devitocodes/bases:amd"
68-
test_drive_cmd: "rocm-smi"
69-
# Unchanged, still passes through required /dev nodes etc.
70-
dockerflags: >-
71-
--init --network=host
72-
--device=/dev/kfd --device=/dev/dri
73-
--ipc=host
74-
--group-add video --group-add "$(getent group render | cut -d: -f3)"
75-
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
76-
--rm -t
77-
--name ${CONTAINER_BASENAME}
43+
# -------------------- NVIDIA job --------------------
44+
- name: pytest-gpu-acc-nvidia
45+
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py tests/test_operator.py::TestEstimateMemory"
46+
base: "devitocodes/bases:nvidia-nvc12"
47+
runner_label: nvidiagpu
48+
test_drive_cmd: "nvidia-smi"
49+
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
50+
# NOTE: CUDA_VISIBLE_DEVICES should be set by the runner (e.g. via a systemd drop-in); if it is unset or empty, the flag below falls back to "all".
51+
dockerflags: --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
52+
53+
# -------------------- AMD job -----------------------
54+
- name: pytest-gpu-omp-amd
55+
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py tests/test_operator.py::TestEstimateMemory"
56+
runner_label: amdgpu
57+
base: "devitocodes/bases:amd"
58+
test_drive_cmd: "rocm-smi"
59+
# Pass through the /dev/kfd and /dev/dri device nodes, add the video and render groups, and set the extra capabilities for the AMD job.
60+
dockerflags: >-
61+
--network=host
62+
--device=/dev/kfd --device=/dev/dri
63+
--ipc=host
64+
--group-add video --group-add "$(getent group render | cut -d: -f3)"
65+
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
7866
7967
steps:
80-
- name: Checkout devito
81-
uses: actions/checkout@v6
82-
83-
- name: Generate unique CI tag
84-
id: uniquetag
85-
run: |
86-
UNIQUE=$(echo "${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}" | cksum | cut -f 1 -d " ")
87-
echo "Unique ID: ${UNIQUE}"
88-
echo "unique=${UNIQUE}" >> "$GITHUB_OUTPUT"
89-
90-
- name: Set per-runner tags
91-
env:
92-
UNIQUE: ${{ steps.uniquetag.outputs.unique }}
93-
run: |
94-
echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}-${UNIQUE}" >> "$GITHUB_ENV"
95-
echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}-${{ github.sha }}" >> "$GITHUB_ENV"
96-
97-
- name: Ensure buildx builder
98-
run: |
99-
docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
100-
docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
101-
docker buildx use "${RUNNER_NAME// /_}"
102-
103-
- name: Build docker image
104-
run: |
105-
docker buildx build . \
106-
--builder "${RUNNER_NAME// /_}" \
107-
--load \
108-
--label ci-run="$GITHUB_RUN_ID" \
109-
--rm --pull \
110-
--file docker/Dockerfile.devito \
111-
--tag "${DOCKER_IMAGE}" \
112-
--build-arg base="${{ matrix.base }}"
113-
114-
- name: Export CODECOV token
115-
run: echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV"
116-
117-
- name: Probe gpu
118-
run: |
119-
# Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
120-
# runners; fall back to "all" so the driver probe does not fail.
121-
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
122-
echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
123-
fi
124-
125-
# Run a simple driver-probe command (nvidia-smi / rocm-smi)
126-
docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
127-
docker run ${{ matrix.dockerflags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
128-
129-
- name: Test with pytest
130-
env:
131-
# Exported earlier in the job; needed inside the container for codecov
132-
CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
133-
run: |
134-
# Add Codecov’s environment variables (GITHUB_SHA, etc.)
135-
ci_env=$(bash <(curl -s https://codecov.io/env))
136-
137-
# Run the test suite using the matrix-defined flags
138-
docker run \
139-
${{ matrix.dockerflags }} \
140-
"${ci_env}" \
141-
--env CI=true \
142-
--env PYTHONFAULTHANDLER=1 \
143-
--env DEVITO_LOGGING=DEBUG \
144-
--env CODECOV_TOKEN \
145-
"${DOCKER_IMAGE}" \
146-
pytest -vvv --capture=no --showlocals \
147-
--log-cli-level=DEBUG -o log_cli=true \
148-
--full-trace --durations=10 \
149-
--cov --cov-config=.coveragerc --cov-report=xml \
150-
${{ matrix.test_files }}
151-
152-
- name: Test examples
153-
run: |
154-
docker run \
155-
${{ matrix.dockerflags }} \
156-
"${DOCKER_IMAGE}" \
157-
pytest ${{ matrix.test_examples }}
158-
159-
- name: Test examples with MPI
160-
run: |
161-
docker run \
162-
${{ matrix.dockerflags }} \
163-
--env DEVITO_MPI=1 \
164-
"${DOCKER_IMAGE}" \
165-
mpiexec -n 2 pytest ${{ matrix.test_examples }}
166-
167-
- name: Builder & image cleanup (keep 3 days of cache)
168-
if: always()
169-
run: |
170-
# Remove only the test image we built
171-
docker rmi -f "${DOCKER_IMAGE}" || true
172-
173-
# Classic image layers created in this job
174-
docker image prune -f --filter label=ci-run="$GITHUB_RUN_ID"
175-
176-
# BuildKit cache: target the per-runner builder explicitly
177-
docker builder prune --builder "${RUNNER_NAME// /_}" \
178-
-f \
179-
--filter "until=72h"
68+
- name: Checkout devito
69+
uses: actions/checkout@v6
70+
71+
- id: build
72+
name: Build docker image
73+
uses: ./.github/actions/docker-build
74+
with:
75+
file: docker/Dockerfile.devito
76+
tag: ${{ matrix.name }}
77+
base: ${{ matrix.base }}
78+
79+
- name: Probe GPU
80+
uses: ./.github/actions/docker-run
81+
with:
82+
uid: ${{ steps.build.outputs.unique }}
83+
tag: ${{ matrix.name }}
84+
args: ${{ matrix.dockerflags }}
85+
command: ${{ matrix.test_drive_cmd }}
86+
87+
- name: Test with pytest
88+
uses: ./.github/actions/docker-run
89+
with:
90+
uid: ${{ steps.build.outputs.unique }}
91+
tag: ${{ matrix.name }}
92+
args: ${{ matrix.dockerflags }}
93+
env: |
94+
CI=true
95+
CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}
96+
DEVITO_LOGGING=DEBUG
97+
PYTHONFAULTHANDLER=1
98+
command: |
99+
pytest \
100+
-vvv \
101+
--capture=no \
102+
--showlocals \
103+
--log-cli-level=DEBUG \
104+
-o log_cli=true \
105+
--full-trace \
106+
--durations=10 \
107+
--cov \
108+
--cov-config=.coveragerc \
109+
--cov-report=xml \
110+
${{ matrix.test_files }}
111+
112+
- name: Test examples
113+
uses: ./.github/actions/docker-run
114+
with:
115+
uid: ${{ steps.build.outputs.unique }}
116+
tag: ${{ matrix.name }}
117+
args: ${{ matrix.dockerflags }}
118+
env: |
119+
command: pytest ${{ matrix.test_examples }}
120+
121+
- name: Test examples with MPI
122+
uses: ./.github/actions/docker-run
123+
with:
124+
uid: ${{ steps.build.outputs.unique }}
125+
tag: ${{ matrix.name }}
126+
args: ${{ matrix.dockerflags }}
127+
env: |
128+
DEVITO_LOGGING=DEBUG
129+
DEVITO_MPI=1
130+
command: mpiexec -n 2 pytest ${{ matrix.test_examples }}
131+
132+
- name: Cleanup docker image
133+
if: always()
134+
uses: ./.github/actions/docker-clean
135+
with:
136+
uid: ${{ steps.build.outputs.unique }}
137+
tag: ${{ matrix.name }}

0 commit comments

Comments
 (0)