@@ -15,18 +15,17 @@ concurrency:
1515 cancel-in-progress : true
1616
1717env :
18- OUTPUT_PATH : ${{ github.workspace }}
1918 RESOURCE_GROUP : CI-gpu
2019
2120on :
2221 push :
23- branches : [ main ]
22+ branches : [main]
2423 pull_request :
25- branches : [ main ]
24+ branches : [main]
2625 workflow_dispatch :
2726 inputs :
2827 tags :
29- description : ' Run GPU tests'
28+ description : " Run GPU tests"
3029
3130jobs :
3231 build :
@@ -38,42 +37,38 @@ jobs:
3837 strategy :
3938 fail-fast : false
4039 matrix :
41- name : [
42- pytest-gpu-acc-nvidia,
43- pytest-gpu-omp-amd
44- ]
40+ name : [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
4541 test_examples : ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
4642
4743 include :
48- # -------------------- NVIDIA job --------------------
49- - name : pytest-gpu-acc-nvidia
50- test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
51- base : " devitocodes/bases:nvidia-nvc"
52- runner_label : nvidiagpu
53- test_drive_cmd : " nvidia-smi"
54- # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
55- # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
56- flags : >-
57- --init --rm -t
58- --name ${CONTAINER_BASENAME}
59- --env CUDA_VISIBLE_DEVICES
60- --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
61-
62- # -------------------- AMD job -----------------------
63- - name : pytest-gpu-omp-amd
64- test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
65- runner_label : amdgpu
66- base : " devitocodes/bases:amd"
67- test_drive_cmd : " rocm-smi"
68- # Unchanged, still passes through required /dev nodes etc.
69- flags : >-
70- --init --network=host
71- --device=/dev/kfd --device=/dev/dri
72- --ipc=host
73- --group-add video --group-add "$(getent group render | cut -d: -f3)"
74- --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
75- --rm -t
76- --name ${CONTAINER_BASENAME}
44+ # -------------------- NVIDIA job --------------------
45+ - name : pytest-gpu-acc-nvidia
46+ test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
47+ base : " devitocodes/bases:nvidia-nvc"
48+ runner_label : nvidiagpu
49+ test_drive_cmd : " nvidia-smi"
50+ # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
51+ # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
52+ flags : >-
53+ --init --rm -t
54+ --name ${CONTAINER_BASENAME}
55+ --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
56+
57+ # -------------------- AMD job -----------------------
58+ - name : pytest-gpu-omp-amd
59+ test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
60+ runner_label : amdgpu
61+ base : " devitocodes/bases:amd"
62+ test_drive_cmd : " rocm-smi"
63+ # Unchanged, still passes through required /dev nodes etc.
64+ flags : >-
65+ --init --network=host
66+ --device=/dev/kfd --device=/dev/dri
67+ --ipc=host
68+ --group-add video --group-add "$(getent group render | cut -d: -f3)"
69+ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
70+ --rm -t
71+ --name ${CONTAINER_BASENAME}
7772
7873 steps :
7974 - name : Checkout devito
@@ -84,11 +79,17 @@ jobs:
8479 echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8580 echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8681
82+ - name : Ensure buildx builder
83+ run : |
84+ docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \
85+ docker buildx create --name "$RUNNER_NAME" --driver docker-container
86+ docker buildx use "$RUNNER_NAME"
87+
8788 - name : Build docker image
8889 run : |
89- docker buildx create --use --name $RUNNER_NAME || true
90- docker build . \
91- --builder $RUNNER_NAME \
90+ docker buildx build . \
91+ --builder "$RUNNER_NAME" \
92+ --load \
9293 --label ci-run=$GITHUB_RUN_ID \
9394 --rm --pull \
9495 --file docker/Dockerfile.devito \
@@ -107,38 +108,28 @@ jobs:
107108 fi
108109
109110 # Run a simple driver-probe command (nvidia-smi / rocm-smi)
111+ docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
110112 docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
111113
112114 - name : Test with pytest
113115 env :
114116 # Exported earlier in the job; needed inside the container for codecov
115117 CODECOV_TOKEN : ${{ env.CODECOV_TOKEN }}
116118 run : |
117- # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.)
119+ # Add Codecov’s environment variables (GITHUB_SHA, etc.)
118120 ci_env=$(bash <(curl -s https://codecov.io/env))
119121
120- # 2. For NVIDIA, make sure the runner gave us a GPU mask
121- # Default to empty
122- NVIDIA_ENV_FLAGS=""
123- # For NVIDIA, check for device mask AND set specific env vars
124- if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
125- : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
126- NVIDIA_ENV_FLAGS=" \
127- -e CUDA_LAUNCH_BLOCKING=1 \
128- -e NV_ACC_NOTIFY=3 \
129- -e NV_ACC_DEBUG=1"
130- fi
131-
132- # 3. Run the test suite using the matrix-defined flags
122+ # Run the test suite using the matrix-defined flags
133123 docker run ${{ matrix.flags }} \
134124 ${ci_env} \
135125 -e CI=true \
136126 -e PYTHONFAULTHANDLER=1 \
137127 -e DEVITO_LOGGING=DEBUG \
138128 -e CODECOV_TOKEN \
139- ${NVIDIA_ENV_FLAGS} \
140129 "${DOCKER_IMAGE}" \
141- pytest -vv -ra -l -s --full-trace --maxfail=1 \
130+ pytest -vvv --capture=no --showlocals \
131+ --log-cli-level=DEBUG -o log_cli=true \
132+ --full-trace --durations=10 \
142133 --cov --cov-config=.coveragerc --cov-report=xml \
143134 ${{ matrix.test_files }}
144135
@@ -151,12 +142,16 @@ jobs:
151142 docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
152143 mpiexec -n 2 pytest ${{ matrix.test_examples }}
153144
154- - name : Clean up test image
145+ - name : Builder & image cleanup (keep 3 days of cache)
155146 if : always()
156147 run : |
157- # Remove only the image we just built
148+ # Remove only the test image we built
158149 docker rmi -f "${DOCKER_IMAGE}" || true
159150
160- # Guard global prune the layers we created
161- # (label ci-run=$GITHUB_RUN_ID)
151+ # Classic image layers created in this job
162152 docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID
153+
154+ # BuildKit cache: target the per-runner builder explicitly
155+ docker builder prune --builder "$RUNNER_NAME" \
156+ -f \
157+ --filter "until=72h"
0 commit comments