From adf3dd178d04064dc508fe13212e2085a6a5b7d7 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Fri, 25 Jul 2025 11:36:12 +0100 Subject: [PATCH 1/4] ci: Remove dangling layers from CI docker builds. --- .github/workflows/docker-devito.yml | 1 - .github/workflows/pytest-gpu.yml | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 95b70386c3..0d0c7bd73e 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -109,7 +109,6 @@ jobs: build-args: base=devitocodes/${{ matrix.base }} - name: Remove dangling layers - if: ${{ !contains(matrix.runner, 'nvidiagpu') }} run: docker system prune -f - name: Run tests diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 2b8b25cf2c..3db9d9343e 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -35,13 +35,6 @@ jobs: - self-hosted - ${{ matrix.runner_label }} - # Job-level env (includes per-runner image/container tags) - env: - DEVITO_ARCH: ${{ matrix.arch }} - DEVITO_PLATFORM: ${{ matrix.platform }} - DEVITO_LANGUAGE: ${{ matrix.language }} - OMPI_CC: ${{ matrix.arch }} - strategy: fail-fast: false matrix: @@ -110,11 +103,11 @@ jobs: - name: Test with pytest run: | - # Run a simple driver cmd first (nvidia-smi / rocm-smi) ci_env=$(bash <(curl -s https://codecov.io/env)) docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \ - pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} + pytest -vv -ra -l -s --full-trace --maxfail=1 \ + --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} - name: Test examples run: | @@ -124,3 +117,10 @@ jobs: run: | docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \ mpiexec -n 2 pytest ${{ matrix.test_examples }} + + - name: Clean up test image + if: always() + run: | + docker rmi -f "${DOCKER_IMAGE}" || true # remove the image + docker system prune -f # free dangling layers + From 7d2023487bd6bf3d79e0c554005168c3c57c70ba Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Sat, 26 Jul 2025 01:24:01 +0200 Subject: [PATCH 2/4] ci: bug fixes --- .github/workflows/pytest-gpu.yml | 50 +++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 3db9d9343e..a0f90dcf57 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -70,7 +70,7 @@ jobs: --init --network=host --device=/dev/kfd --device=/dev/dri --ipc=host - --group-add video --group-add $(getent group render | cut -d: -f3) + --group-add video --group-add "$(getent group render | cut -d: -f3)" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --rm -t --name ${CONTAINER_BASENAME} @@ -81,13 +81,15 @@ jobs: - name: Set per-runner tags run: | - echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV - name: Build docker image run: | + docker buildx create --use --name $RUNNER_NAME || true docker build . \ + --builder $RUNNER_NAME \ + --label ci-run=$GITHUB_RUN_ID \ --rm --pull \ --file docker/Dockerfile.devito \ --tag "${DOCKER_IMAGE}" \ @@ -98,16 +100,47 @@ jobs: - name: Probe gpu run: | - # Run a simple driver cmd first (nvidia-smi / rocm-smi) + # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA + # runners; fall back to "all" so the driver probe does not fail. 
+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then + echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV + fi + + # Run a simple driver-probe command (nvidia-smi / rocm-smi) docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }} - name: Test with pytest + env: + # Exported earlier in the job; needed inside the container for codecov + CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }} run: | + # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.) ci_env=$(bash <(curl -s https://codecov.io/env)) - docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \ + # 2. For NVIDIA, make sure the runner gave us a GPU mask + # Default to empty + NVIDIA_ENV_FLAGS="" + # For NVIDIA, check for device mask AND set specific env vars + if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then + : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}" + NVIDIA_ENV_FLAGS=" \ + -e CUDA_LAUNCH_BLOCKING=1 \ + -e NV_ACC_NOTIFY=3 \ + -e NV_ACC_DEBUG=1" + fi + + # 3. Run the test suite using the matrix-defined flags + docker run ${{ matrix.flags }} \ + ${ci_env} \ + -e CI=true \ + -e PYTHONFAULTHANDLER=1 \ + -e DEVITO_LOGGING=DEBUG \ + -e CODECOV_TOKEN \ + ${NVIDIA_ENV_FLAGS} \ + "${DOCKER_IMAGE}" \ pytest -vv -ra -l -s --full-trace --maxfail=1 \ - --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }} + --cov --cov-config=.coveragerc --cov-report=xml \ + ${{ matrix.test_files }} - name: Test examples run: | @@ -121,6 +154,9 @@ jobs: - name: Clean up test image if: always() run: | - docker rmi -f "${DOCKER_IMAGE}" || true # remove the image - docker system prune -f # free dangling layers + # Remove only the image we just built + docker rmi -f "${DOCKER_IMAGE}" || true + # Guard global prune the layers we created + # (label ci-run=$GITHUB_RUN_ID) + docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID From d0491029e5206e0eb8c628f9393f77fe70216005 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Thu, 31 Jul 2025 12:13:29 +0200 Subject: [PATCH 3/4] CI: fix GPU-mask crash by keeping CUDA_VISIBLE_DEVICES out of the container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background ---------- Each self-hosted runner is pinned to a specific GPU via a host-level CUDA_VISIBLE_DEVICES and we forward that mask to Docker: docker run --gpus "device=$CUDA_VISIBLE_DEVICES" … That flag alone is sufficient—Docker restricts the visible devices for the container. Problem ------- We also injected the same variable into the container’s environment (-e CUDA_VISIBLE_DEVICES). Inside the container the CUDA/OpenACC runtime renumbers the visible GPUs to 0…N-1, so a value like “1” or “2,3” is suddenly invalid and the first kernel call aborts (`exit 1`) when multiple runners share the host. Fix --- * Drop the `-e CUDA_VISIBLE_DEVICES` export from `${{ matrix.flags }}`. The device list is still enforced by `--gpus`, but the runtime now starts counting at 0 as expected. 
Verified on: * Two concurrent nvidiagpu runners on a 4-V100 host – full test suite passes. * amdgpu runner – unchanged. --- .github/workflows/pytest-gpu.yml | 115 +++++++++++++++---------------- 1 file changed, 55 insertions(+), 60 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index a0f90dcf57..35da07cc21 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -15,18 +15,17 @@ concurrency: cancel-in-progress: true env: - OUTPUT_PATH: ${{ github.workspace }} RESOURCE_GROUP: CI-gpu on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] workflow_dispatch: inputs: tags: - description: 'Run GPU tests' + description: "Run GPU tests" jobs: build: @@ -38,42 +37,38 @@ jobs: strategy: fail-fast: false matrix: - name: [ - pytest-gpu-acc-nvidia, - pytest-gpu-omp-amd - ] + name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd] test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"] include: - # -------------------- NVIDIA job -------------------- - - name: pytest-gpu-acc-nvidia - test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py" - base: "devitocodes/bases:nvidia-nvc" - runner_label: nvidiagpu - test_drive_cmd: "nvidia-smi" - # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. - # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). 
- flags: >- - --init --rm -t - --name ${CONTAINER_BASENAME} - --env CUDA_VISIBLE_DEVICES - --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" - - # -------------------- AMD job ----------------------- - - name: pytest-gpu-omp-amd - test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py" - runner_label: amdgpu - base: "devitocodes/bases:amd" - test_drive_cmd: "rocm-smi" - # Unchanged, still passes through required /dev nodes etc. - flags: >- - --init --network=host - --device=/dev/kfd --device=/dev/dri - --ipc=host - --group-add video --group-add "$(getent group render | cut -d: -f3)" - --cap-add=SYS_PTRACE --security-opt seccomp=unconfined - --rm -t - --name ${CONTAINER_BASENAME} + # -------------------- NVIDIA job -------------------- + - name: pytest-gpu-acc-nvidia + test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py" + base: "devitocodes/bases:nvidia-nvc" + runner_label: nvidiagpu + test_drive_cmd: "nvidia-smi" + # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. + # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). + flags: >- + --init --rm -t + --name ${CONTAINER_BASENAME} + --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" + + # -------------------- AMD job ----------------------- + - name: pytest-gpu-omp-amd + test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py" + runner_label: amdgpu + base: "devitocodes/bases:amd" + test_drive_cmd: "rocm-smi" + # Unchanged, still passes through required /dev nodes etc. 
+ flags: >- + --init --network=host + --device=/dev/kfd --device=/dev/dri + --ipc=host + --group-add video --group-add "$(getent group render | cut -d: -f3)" + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + --rm -t + --name ${CONTAINER_BASENAME} steps: - name: Checkout devito @@ -84,11 +79,17 @@ jobs: echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV + - name: Ensure buildx builder + run: | + docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \ + docker buildx create --name "$RUNNER_NAME" --driver docker-container + docker buildx use "$RUNNER_NAME" + - name: Build docker image run: | - docker buildx create --use --name $RUNNER_NAME || true - docker build . \ - --builder $RUNNER_NAME \ + docker buildx build . \ + --builder "$RUNNER_NAME" \ + --load \ --label ci-run=$GITHUB_RUN_ID \ --rm --pull \ --file docker/Dockerfile.devito \ @@ -107,6 +108,7 @@ jobs: fi # Run a simple driver-probe command (nvidia-smi / rocm-smi) + docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }} - name: Test with pytest env: # Exported earlier in the job; needed inside the container for codecov CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }} run: | - # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.) + # Add Codecov’s environment variables (GITHUB_SHA, etc.) ci_env=$(bash <(curl -s https://codecov.io/env)) - # 2. For NVIDIA, make sure the runner gave us a GPU mask - # Default to empty - NVIDIA_ENV_FLAGS="" - # For NVIDIA, check for device mask AND set specific env vars - if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then - : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}" - NVIDIA_ENV_FLAGS=" \ - -e CUDA_LAUNCH_BLOCKING=1 \ - -e NV_ACC_NOTIFY=3 \ - -e NV_ACC_DEBUG=1" - fi - - # 3. Run the test suite using the matrix-defined flags docker run ${{ matrix.flags }} \ ${ci_env} \ -e CI=true \ -e PYTHONFAULTHANDLER=1 \ -e DEVITO_LOGGING=DEBUG \ -e CODECOV_TOKEN \ - ${NVIDIA_ENV_FLAGS} \ "${DOCKER_IMAGE}" \ - pytest -vv -ra -l -s --full-trace --maxfail=1 \ + pytest -vvv --capture=no --showlocals \ + --log-cli-level=DEBUG -o log_cli=true \ + --full-trace --durations=10 \ --cov --cov-config=.coveragerc --cov-report=xml \ ${{ matrix.test_files }} - name: Test examples run: | @@ -151,12 +142,16 @@ jobs: docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \ mpiexec -n 2 pytest ${{ matrix.test_examples }} - - name: Clean up test image + - name: Builder & image cleanup (keep 3 days of cache) if: always() run: | - # Remove only the image we just built + # Remove only the test image we built docker rmi -f "${DOCKER_IMAGE}" || true - # Guard global prune the layers we created - # (label ci-run=$GITHUB_RUN_ID) + # Classic image layers created in this job docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID + + # BuildKit cache: target the per-runner builder explicitly + docker builder prune --builder "$RUNNER_NAME" \ + -f \ + --filter "until=72h" From 6e3ad7e135f60ce2fca5f1570433389ba436603c Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Fri, 1 Aug 2025 16:14:01 +0200 Subject: [PATCH 4/4] CI: Change ${RUNNER_NAME} to ${RUNNER_NAME// /_} because runner name may contain spaces. 
--- .github/workflows/pytest-gpu.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml index 35da07cc21..2edb501dc3 100644 --- a/.github/workflows/pytest-gpu.yml +++ b/.github/workflows/pytest-gpu.yml @@ -81,14 +81,14 @@ jobs: - name: Ensure buildx builder run: | - docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \ - docker buildx create --name "$RUNNER_NAME" --driver docker-container - docker buildx use "$RUNNER_NAME" + docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \ + docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container + docker buildx use "${RUNNER_NAME// /_}" - name: Build docker image run: | docker buildx build . \ - --builder "$RUNNER_NAME" \ + --builder "${RUNNER_NAME// /_}" \ --load \ --label ci-run=$GITHUB_RUN_ID \ --rm --pull \ @@ -152,6 +152,6 @@ jobs: docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID # BuildKit cache: target the per-runner builder explicitly - docker builder prune --builder "$RUNNER_NAME" \ + docker builder prune --builder "${RUNNER_NAME// /_}" \ -f \ --filter "until=72h"