Skip to content

Commit 7d20234

Browse files
ggorman authored and mloubout committed
ci: bug fixes
1 parent adf3dd1 commit 7d20234

1 file changed

Lines changed: 43 additions & 7 deletions

File tree

.github/workflows/pytest-gpu.yml

Lines changed: 43 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ jobs:
7070
--init --network=host
7171
--device=/dev/kfd --device=/dev/dri
7272
--ipc=host
73-
--group-add video --group-add $(getent group render | cut -d: -f3)
73+
--group-add video --group-add "$(getent group render | cut -d: -f3)"
7474
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
7575
--rm -t
7676
--name ${CONTAINER_BASENAME}
@@ -81,13 +81,15 @@ jobs:
8181

8282
- name: Set per-runner tags
8383
run: |
84-
echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
8584
echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8685
echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8786
8887
- name: Build docker image
8988
run: |
89+
docker buildx create --use --name $RUNNER_NAME || true
9090
docker build . \
91+
--builder $RUNNER_NAME \
92+
--label ci-run=$GITHUB_RUN_ID \
9193
--rm --pull \
9294
--file docker/Dockerfile.devito \
9395
--tag "${DOCKER_IMAGE}" \
@@ -98,16 +100,47 @@ jobs:
98100

99101
- name: Probe gpu
100102
run: |
101-
# Run a simple driver cmd first (nvidia-smi / rocm-smi)
103+
# Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
104+
# runners; fall back to "all" so the driver probe does not fail.
105+
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
106+
echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
107+
fi
108+
109+
# Run a simple driver-probe command (nvidia-smi / rocm-smi)
102110
docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
103111
104112
- name: Test with pytest
113+
env:
114+
# Exported earlier in the job; needed inside the container for codecov
115+
CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
105116
run: |
117+
# 1. Add Codecov’s environment variables (GITHUB_SHA, etc.)
106118
ci_env=$(bash <(curl -s https://codecov.io/env))
107119
108-
docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \
120+
# 2. For NVIDIA, make sure the runner gave us a GPU mask
121+
# Default to empty
122+
NVIDIA_ENV_FLAGS=""
123+
# For NVIDIA, check for device mask AND set specific env vars
124+
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
125+
: "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
126+
NVIDIA_ENV_FLAGS=" \
127+
-e CUDA_LAUNCH_BLOCKING=1 \
128+
-e NV_ACC_NOTIFY=3 \
129+
-e NV_ACC_DEBUG=1"
130+
fi
131+
132+
# 3. Run the test suite using the matrix-defined flags
133+
docker run ${{ matrix.flags }} \
134+
${ci_env} \
135+
-e CI=true \
136+
-e PYTHONFAULTHANDLER=1 \
137+
-e DEVITO_LOGGING=DEBUG \
138+
-e CODECOV_TOKEN \
139+
${NVIDIA_ENV_FLAGS} \
140+
"${DOCKER_IMAGE}" \
109141
pytest -vv -ra -l -s --full-trace --maxfail=1 \
110-
--cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }}
142+
--cov --cov-config=.coveragerc --cov-report=xml \
143+
${{ matrix.test_files }}
111144
112145
- name: Test examples
113146
run: |
@@ -121,6 +154,9 @@ jobs:
121154
- name: Clean up test image
122155
if: always()
123156
run: |
124-
docker rmi -f "${DOCKER_IMAGE}" || true # remove the image
125-
docker system prune -f # free dangling layers
157+
# Remove only the image we just built
158+
docker rmi -f "${DOCKER_IMAGE}" || true
126159
160+
# Restrict the prune to the layers we created
161+
# (label ci-run=$GITHUB_RUN_ID)
162+
docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID

0 commit comments

Comments
 (0)