
Commit d049102

ggorman authored and mloubout committed
CI: fix GPU-mask crash by keeping CUDA_VISIBLE_DEVICES out of the container
Background
----------
Each self-hosted runner is pinned to a specific GPU via a host-level
CUDA_VISIBLE_DEVICES, and we forward that mask to Docker:

    docker run --gpus "device=$CUDA_VISIBLE_DEVICES" …

That flag alone is sufficient: Docker restricts the visible devices for the
container.

Problem
-------
We also injected the same variable into the container's environment
(`-e CUDA_VISIBLE_DEVICES`). Inside the container the CUDA/OpenACC runtime
renumbers the visible GPUs to 0…N-1, so a value like "1" or "2,3" is suddenly
invalid and the first kernel call aborts (`exit 1`) when multiple runners
share the host.

Fix
---
* Drop the `-e CUDA_VISIBLE_DEVICES` export from `${{ matrix.flags }}`. The
  device list is still enforced by `--gpus`, but the runtime now starts
  counting at 0 as expected.

Verified on:
* Two concurrent nvidiagpu runners on a 4-V100 host: full test suite passes.
* amdgpu runner: unchanged.
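Roughly what this means on a shared host, as a hedged sketch: the device index, the `${DOCKER_IMAGE}` placeholder and the sample `nvidia-smi` output below are assumptions for illustration, not taken from the CI run.

```sh
# Illustrative host-side state: this runner has been pinned to physical GPU 2
# by whatever provisioned it (systemd drop-in, runner env file, ...).
export CUDA_VISIBLE_DEVICES=2

# --gpus alone is enough: Docker exposes only that GPU, and the container-side
# runtime re-enumerates it as device 0.
docker run --rm --gpus "device=${CUDA_VISIBLE_DEVICES}" "${DOCKER_IMAGE}" nvidia-smi -L
#   GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-...)   <- one visible device, index 0

# The old flags additionally exported the host mask into the container:
docker run --rm --gpus "device=${CUDA_VISIBLE_DEVICES}" \
  -e CUDA_VISIBLE_DEVICES "${DOCKER_IMAGE}" nvidia-smi -L
# nvidia-smi still lists the GPU (NVML ignores the variable), but the
# CUDA/OpenACC runtime applies the mask "2" to a world whose only device is 0,
# so the first kernel launch aborts and the job exits 1.
```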
1 parent 7d20234 commit d049102

1 file changed: .github/workflows/pytest-gpu.yml (55 additions & 60 deletions)
@@ -15,18 +15,17 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  OUTPUT_PATH: ${{ github.workspace }}
   RESOURCE_GROUP: CI-gpu
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
   workflow_dispatch:
     inputs:
       tags:
-        description: 'Run GPU tests'
+        description: "Run GPU tests"
 
 jobs:
   build:
@@ -38,42 +37,38 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        name: [
-          pytest-gpu-acc-nvidia,
-          pytest-gpu-omp-amd
-        ]
+        name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
         test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
 
         include:
-          # -------------------- NVIDIA job --------------------
-          - name: pytest-gpu-acc-nvidia
-            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
-            base: "devitocodes/bases:nvidia-nvc"
-            runner_label: nvidiagpu
-            test_drive_cmd: "nvidia-smi"
-            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
-            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
-            flags: >-
-              --init --rm -t
-              --name ${CONTAINER_BASENAME}
-              --env CUDA_VISIBLE_DEVICES
-              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
-
-          # -------------------- AMD job -----------------------
-          - name: pytest-gpu-omp-amd
-            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
-            runner_label: amdgpu
-            base: "devitocodes/bases:amd"
-            test_drive_cmd: "rocm-smi"
-            # Unchanged, still passes through required /dev nodes etc.
-            flags: >-
-              --init --network=host
-              --device=/dev/kfd --device=/dev/dri
-              --ipc=host
-              --group-add video --group-add "$(getent group render | cut -d: -f3)"
-              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
-              --rm -t
-              --name ${CONTAINER_BASENAME}
+          # -------------------- NVIDIA job --------------------
+          - name: pytest-gpu-acc-nvidia
+            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
+            base: "devitocodes/bases:nvidia-nvc"
+            runner_label: nvidiagpu
+            test_drive_cmd: "nvidia-smi"
+            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
+            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
+            flags: >-
+              --init --rm -t
+              --name ${CONTAINER_BASENAME}
+              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
+
+          # -------------------- AMD job -----------------------
+          - name: pytest-gpu-omp-amd
+            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
+            runner_label: amdgpu
+            base: "devitocodes/bases:amd"
+            test_drive_cmd: "rocm-smi"
+            # Unchanged, still passes through required /dev nodes etc.
+            flags: >-
+              --init --network=host
+              --device=/dev/kfd --device=/dev/dri
+              --ipc=host
+              --group-add video --group-add "$(getent group render | cut -d: -f3)"
+              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
+              --rm -t
+              --name ${CONTAINER_BASENAME}
 
     steps:
       - name: Checkout devito
@@ -84,11 +79,17 @@ jobs:
           echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
           echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
 
+      - name: Ensure buildx builder
+        run: |
+          docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \
+            docker buildx create --name "$RUNNER_NAME" --driver docker-container
+          docker buildx use "$RUNNER_NAME"
+
       - name: Build docker image
         run: |
-          docker buildx create --use --name $RUNNER_NAME || true
-          docker build . \
-            --builder $RUNNER_NAME \
+          docker buildx build . \
+            --builder "$RUNNER_NAME" \
+            --load \
             --label ci-run=$GITHUB_RUN_ID \
             --rm --pull \
             --file docker/Dockerfile.devito \
@@ -107,38 +108,28 @@ jobs:
           fi
 
           # Run a simple driver-probe command (nvidia-smi / rocm-smi)
+          docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
           docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
 
       - name: Test with pytest
         env:
           # Exported earlier in the job; needed inside the container for codecov
           CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
         run: |
-          # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.)
+          # Add Codecov’s environment variables (GITHUB_SHA, etc.)
           ci_env=$(bash <(curl -s https://codecov.io/env))
 
-          # 2. For NVIDIA, make sure the runner gave us a GPU mask
-          # Default to empty
-          NVIDIA_ENV_FLAGS=""
-          # For NVIDIA, check for device mask AND set specific env vars
-          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
-            : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
-            NVIDIA_ENV_FLAGS=" \
-              -e CUDA_LAUNCH_BLOCKING=1 \
-              -e NV_ACC_NOTIFY=3 \
-              -e NV_ACC_DEBUG=1"
-          fi
-
-          # 3. Run the test suite using the matrix-defined flags
+          # Run the test suite using the matrix-defined flags
          docker run ${{ matrix.flags }} \
            ${ci_env} \
            -e CI=true \
            -e PYTHONFAULTHANDLER=1 \
            -e DEVITO_LOGGING=DEBUG \
            -e CODECOV_TOKEN \
-           ${NVIDIA_ENV_FLAGS} \
            "${DOCKER_IMAGE}" \
-           pytest -vv -ra -l -s --full-trace --maxfail=1 \
+           pytest -vvv --capture=no --showlocals \
+             --log-cli-level=DEBUG -o log_cli=true \
+             --full-trace --durations=10 \
            --cov --cov-config=.coveragerc --cov-report=xml \
            ${{ matrix.test_files }}
 
@@ -151,12 +142,16 @@ jobs:
           docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
             mpiexec -n 2 pytest ${{ matrix.test_examples }}
 
-      - name: Clean up test image
+      - name: Builder & image cleanup (keep 3 days of cache)
         if: always()
         run: |
-          # Remove only the image we just built
+          # Remove only the test image we built
          docker rmi -f "${DOCKER_IMAGE}" || true
 
-          # Guard global prune the layers we created
-          # (label ci-run=$GITHUB_RUN_ID)
+          # Classic image layers created in this job
          docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID
+
+          # BuildKit cache: target the per-runner builder explicitly
+          docker builder prune --builder "$RUNNER_NAME" \
+            -f \
+            --filter "until=72h"
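The NOTE kept in the NVIDIA matrix entry above relies on each self-hosted runner carrying its GPU mask in its own host-side environment. One way such a pin might look, purely as an illustrative sketch: the unit name, drop-in path and device index below are assumptions, not part of this commit.

```sh
# Hypothetical: pin one of the runner services on this host to physical GPU 1
# via a systemd drop-in, so the workflow's --gpus flag picks it up.
sudo mkdir -p /etc/systemd/system/actions.runner.devito-gpu-2.service.d
sudo tee /etc/systemd/system/actions.runner.devito-gpu-2.service.d/gpu.conf <<'EOF'
[Service]
Environment="CUDA_VISIBLE_DEVICES=1"
EOF
sudo systemctl daemon-reload
sudo systemctl restart actions.runner.devito-gpu-2.service
```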
