diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml
index 95b70386c3..0d0c7bd73e 100644
--- a/.github/workflows/docker-devito.yml
+++ b/.github/workflows/docker-devito.yml
@@ -109,7 +109,6 @@ jobs:
           build-args: base=devitocodes/${{ matrix.base }}
 
       - name: Remove dangling layers
-        if: ${{ !contains(matrix.runner, 'nvidiagpu') }}
         run: docker system prune -f
 
       - name: Run tests
diff --git a/.github/workflows/pytest-gpu.yml b/.github/workflows/pytest-gpu.yml
index 2b8b25cf2c..2edb501dc3 100644
--- a/.github/workflows/pytest-gpu.yml
+++ b/.github/workflows/pytest-gpu.yml
@@ -15,18 +15,17 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  OUTPUT_PATH: ${{ github.workspace }}
   RESOURCE_GROUP: CI-gpu
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
   workflow_dispatch:
     inputs:
       tags:
-        description: 'Run GPU tests'
+        description: "Run GPU tests"
 
 jobs:
   build:
@@ -35,52 +34,41 @@ jobs:
       - self-hosted
       - ${{ matrix.runner_label }}
 
-    # Job-level env (includes per-runner image/container tags)
-    env:
-      DEVITO_ARCH: ${{ matrix.arch }}
-      DEVITO_PLATFORM: ${{ matrix.platform }}
-      DEVITO_LANGUAGE: ${{ matrix.language }}
-      OMPI_CC: ${{ matrix.arch }}
-
     strategy:
       fail-fast: false
       matrix:
-        name: [
-          pytest-gpu-acc-nvidia,
-          pytest-gpu-omp-amd
-        ]
+        name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
         test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
 
         include:
-        # -------------------- NVIDIA job --------------------
-        - name: pytest-gpu-acc-nvidia
-          test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
-          base: "devitocodes/bases:nvidia-nvc"
-          runner_label: nvidiagpu
-          test_drive_cmd: "nvidia-smi"
-          # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
-          # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
-          flags: >-
-            --init --rm -t
-            --name ${CONTAINER_BASENAME}
-            --env CUDA_VISIBLE_DEVICES
-            --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
-
-        # -------------------- AMD job -----------------------
-        - name: pytest-gpu-omp-amd
-          test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
-          runner_label: amdgpu
-          base: "devitocodes/bases:amd"
-          test_drive_cmd: "rocm-smi"
-          # Unchanged, still passes through required /dev nodes etc.
-          flags: >-
-            --init --network=host
-            --device=/dev/kfd --device=/dev/dri
-            --ipc=host
-            --group-add video --group-add $(getent group render | cut -d: -f3)
-            --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
-            --rm -t
-            --name ${CONTAINER_BASENAME}
+          # -------------------- NVIDIA job --------------------
+          - name: pytest-gpu-acc-nvidia
+            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
+            base: "devitocodes/bases:nvidia-nvc"
+            runner_label: nvidiagpu
+            test_drive_cmd: "nvidia-smi"
+            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
+            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
+            flags: >-
+              --init --rm -t
+              --name ${CONTAINER_BASENAME}
+              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
+
+          # -------------------- AMD job -----------------------
+          - name: pytest-gpu-omp-amd
+            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
+            runner_label: amdgpu
+            base: "devitocodes/bases:amd"
+            test_drive_cmd: "rocm-smi"
+            # Unchanged, still passes through required /dev nodes etc.
+            flags: >-
+              --init --network=host
+              --device=/dev/kfd --device=/dev/dri
+              --ipc=host
+              --group-add video --group-add "$(getent group render | cut -d: -f3)"
+              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
+              --rm -t
+              --name ${CONTAINER_BASENAME}
 
     steps:
       - name: Checkout devito
@@ -88,13 +76,21 @@ jobs:
 
       - name: Set per-runner tags
         run: |
-          echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
           echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
           echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
 
+      - name: Ensure buildx builder
+        run: |
+          docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
+            docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
+          docker buildx use "${RUNNER_NAME// /_}"
+
       - name: Build docker image
         run: |
-          docker build . \
+          docker buildx build . \
+            --builder "${RUNNER_NAME// /_}" \
+            --load \
+            --label ci-run=$GITHUB_RUN_ID \
             --rm --pull \
             --file docker/Dockerfile.devito \
             --tag "${DOCKER_IMAGE}" \
@@ -105,16 +101,37 @@ jobs:
 
       - name: Probe gpu
         run: |
-          # Run a simple driver cmd first (nvidia-smi / rocm-smi)
+          # Persist CUDA_VISIBLE_DEVICES=all for *later* steps on NVIDIA runners when
+          # it is unset; this step itself relies on the ":-all" default in matrix.flags.
+          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
+            echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
+          fi
+
+          # Run a simple driver-probe command (nvidia-smi / rocm-smi)
+          docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
           docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
 
       - name: Test with pytest
+        env:
+          # Exported earlier in the job; needed inside the container for codecov
+          CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
         run: |
-          # Run a simple driver cmd first (nvidia-smi / rocm-smi)
+          # Add Codecov’s environment variables (GITHUB_SHA, etc.)
           ci_env=$(bash <(curl -s https://codecov.io/env))
 
-          docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \
-            pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }}
+          # Run the test suite using the matrix-defined flags
+          docker run ${{ matrix.flags }} \
+            ${ci_env} \
+            -e CI=true \
+            -e PYTHONFAULTHANDLER=1 \
+            -e DEVITO_LOGGING=DEBUG \
+            -e CODECOV_TOKEN \
+            "${DOCKER_IMAGE}" \
+            pytest -vvv --capture=no --showlocals \
+              --log-cli-level=DEBUG -o log_cli=true \
+              --full-trace --durations=10 \
+              --cov --cov-config=.coveragerc --cov-report=xml \
+              ${{ matrix.test_files }}
 
       - name: Test examples
         run: |
@@ -124,3 +141,17 @@ jobs:
         run: |
           docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
             mpiexec -n 2 pytest ${{ matrix.test_examples }}
+
+      - name: Builder & image cleanup (keep 3 days of cache)
+        if: always()
+        run: |
+          # Remove only the test image we built
+          docker rmi -f "${DOCKER_IMAGE}" || true
+
+          # Classic image layers created in this job
+          docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID
+
+          # BuildKit cache: target the per-runner builder explicitly
+          docker buildx prune --builder "${RUNNER_NAME// /_}" \
+            -f \
+            --filter "until=72h"