From bc7d93061c6044adf30b86bc79417e2970e74e5d Mon Sep 17 00:00:00 2001
From: neil-the-nowledgable <254185769+neil-the-nowledgable@users.noreply.github.com>
Date: Thu, 7 May 2026 11:01:21 -0400
Subject: [PATCH] Add sm87 to aarch64 CUDA wheel targets (Jetson Orin)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds sm_87 (NVIDIA Jetson Orin: Nano / NX / AGX) to the aarch64
build_capability list in .github/scripts/build-cuda.sh and documents
the addition in installation.mdx.

Why an explicit cubin is needed: the CMake arch logic at
CMakeLists.txt:226-230 emits PTX only for the latest capability.
sm87 hardware can't JIT from sm90+ PTX (forward compatibility is
upward-only), so aarch64 wheels currently targeting sm75/sm80/sm90
ship PTX for sm90 alone, and Jetson Orin users fall back to slow or
unsupported paths. This rebuts the "sm80 should cover sm87"
reasoning that closed #1781.

Wheel size impact (measured on Linux aarch64, CUDA 12.6.68, source
HEAD a57d8e2):

  baseline  (sm75;80;90):     5,710,520 bytes (5.45 MiB)
  with sm87 (sm75;80;87;90):  7,353,064 bytes (7.01 MiB)
  delta:                     +1,642,544 bytes (+1.57 MiB, +28.76%)

Adds tests/test_linear4bit_sm87_multishape_regression.py, a pytest
reproducer for the multi-shape Linear4bit cold-start fault on sm_87
(#1936). The test runs the historical failing recipe (NF4 + bf16
quant_storage + bf16 compute + double_quant + ABC shape order + no
hygiene + batch=1) at sm_87 cold state. The fault is
cold-start-specific; the test docstring documents the warm/cold
distinction so CI runners can configure accordingly.

Closes #1930
Closes #1218

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/scripts/build-cuda.sh                 |   9 +-
 docs/source/installation.mdx                  |   4 +-
 ...t_linear4bit_sm87_multishape_regression.py | 103 ++++++++++++++++++
 3 files changed, 111 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_linear4bit_sm87_multishape_regression.py
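
Reviewer note (this sits after the "---" cut line, so `git am` drops
it): to confirm what a built wheel actually embeds, `cuobjdump` from
the CUDA toolkit can list the cubin and PTX entries. The library
filename below is illustrative; it varies with the CUDA version the
wheel was built against.

    # Before this patch: no sm_87 entry in the ELF list, and PTX for
    # sm_90 only. After it: an sm_87 cubin should appear.
    cuobjdump --list-elf bitsandbytes/libbitsandbytes_cuda126.so | grep sm_87
    cuobjdump --list-ptx bitsandbytes/libbitsandbytes_cuda126.so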
build_capability="60;70;75;80;86;89;90" diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index b9d48603a..b0fc0f3c7 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -52,8 +52,8 @@ The currently distributed `bitsandbytes` packages are built with the following c | **Linux x86-64** | 11.8 - 12.6 | GCC 11.2 | sm60, sm70, sm75, sm80, sm86, sm89, sm90 | **Linux x86-64** | 12.8 - 12.9 | GCC 11.2 | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120 | **Linux x86-64** | 13.0 | GCC 11.2 | sm75, sm80, sm86, sm89, sm90, sm100, sm120 -| **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm90 -| **Linux aarch64** | 12.8 - 13.0 | GCC 11.2 | sm75, sm80, sm90, sm100, sm110, sm120, sm121 +| **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm87, sm90 +| **Linux aarch64** | 12.8 - 13.0 | GCC 11.2 | sm75, sm80, sm87, sm90, sm100, sm110, sm120, sm121 | **Windows x86-64** | 11.8 - 12.6 | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90 | **Windows x86-64** | 12.8 - 12.9 | MSVC 19.43+ (VS2022) | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120 | **Windows x86-64** | 13.0 | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120 diff --git a/tests/test_linear4bit_sm87_multishape_regression.py b/tests/test_linear4bit_sm87_multishape_regression.py new file mode 100644 index 000000000..14657da77 --- /dev/null +++ b/tests/test_linear4bit_sm87_multishape_regression.py @@ -0,0 +1,103 @@ +"""Regression test for #1936: sm_87 multi-shape Linear4bit reboot. + +Bug history +----------- +On bnb 0.46.1 against an NVIDIA Jetson Orin (sm_87) at nvpmodel +MAXN_SUPER, the host kernel reboots when three Linear4bit layers are +constructed and forwarded in sequence with the following recipe: + + - shape order: monotonically-increasing by output-feature product (A → B → C below) + - quant_type: NF4 + - quant_storage: torch.bfloat16 (FSDP-compatible) + - compute_dtype: torch.bfloat16 + - compress_statistics (double_quant): True + - no inter-layer memory hygiene (no `del layer` or `empty_cache`) + - batch size: 1 + +The fault is overwhelmingly cold-start-specific (~78% reboot rate +at cold-start, 0% at warm state across N=29+ warm-state samples). +bnb 0.49.2 also reboots at cold-start (N=1) — the original "fixed +in 0.49.2" framing was a warm-state artifact and was retracted in +a 2026-05-05 comment on the issue. A 256x256 NF4 forward executed +as the first GPU op after boot closes the cold-start race window +(N=3 verified). This test runs the original failing recipe at +sm_87 so any future regression surfaces in CI; CI runners targeting +sm_87 are expected to provide cold-state (e.g., reboot before the +test run) for it to fire reliably. + +References +---------- +- Issue: https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1936 +- Reproduced N=4 across two physically separate Jetson Orin Nano + Super 8GB units; fault travels with silicon + bnb build, not one + defective board. +- A 13-test orthogonal-axis bisection found six axes (shape order, + quant_type, quant_storage, compute_dtype, double_quant, hygiene) + each independently sufficient to prevent the fault. The recipe in + this test is the unique intersection that triggers it on 0.46.1. + +Caveats +------- +- On a *broken* bnb the test crashes the host (system reboot), not + just the test runner — failure mode is OS-level. pytest cannot + capture this; absence of test output IS the regression signal. 

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index b9d48603a..b0fc0f3c7 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -52,8 +52,8 @@ The currently distributed `bitsandbytes` packages are built with the following c
 | **Linux x86-64** | 11.8 - 12.6 | GCC 11.2 | sm60, sm70, sm75, sm80, sm86, sm89, sm90
 | **Linux x86-64** | 12.8 - 12.9 | GCC 11.2 | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120
 | **Linux x86-64** | 13.0 | GCC 11.2 | sm75, sm80, sm86, sm89, sm90, sm100, sm120
-| **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm90
-| **Linux aarch64** | 12.8 - 13.0 | GCC 11.2 | sm75, sm80, sm90, sm100, sm110, sm120, sm121
+| **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm87, sm90
+| **Linux aarch64** | 12.8 - 13.0 | GCC 11.2 | sm75, sm80, sm87, sm90, sm100, sm110, sm120, sm121
 | **Windows x86-64** | 11.8 - 12.6 | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90
 | **Windows x86-64** | 12.8 - 12.9 | MSVC 19.43+ (VS2022) | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120
 | **Windows x86-64** | 13.0 | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120
diff --git a/tests/test_linear4bit_sm87_multishape_regression.py b/tests/test_linear4bit_sm87_multishape_regression.py
new file mode 100644
index 000000000..14657da77
--- /dev/null
+++ b/tests/test_linear4bit_sm87_multishape_regression.py
@@ -0,0 +1,103 @@
+"""Regression test for #1936: sm_87 multi-shape Linear4bit reboot.
+
+Bug history
+-----------
+On bnb 0.46.1 against an NVIDIA Jetson Orin (sm_87) at nvpmodel
+MAXN_SUPER, the host reboots when three Linear4bit layers are
+constructed and forwarded in sequence with the following recipe:
+
+  - shape order: monotonically increasing by output-feature product (A → B → C below)
+  - quant_type: NF4
+  - quant_storage: torch.bfloat16 (FSDP-compatible)
+  - compute_dtype: torch.bfloat16
+  - compress_statistics (double_quant): True
+  - no inter-layer memory hygiene (no `del layer` or `empty_cache`)
+  - batch size: 1
+
+The fault is overwhelmingly cold-start-specific (~78% reboot rate
+at cold start, 0% at warm state across N=29+ warm-state samples).
+bnb 0.49.2 also reboots at cold start (N=1); the original "fixed
+in 0.49.2" framing was a warm-state artifact and was retracted in
+a 2026-05-05 comment on the issue. A 256x256 NF4 forward executed
+as the first GPU op after boot closes the cold-start race window
+(N=3 verified). This test runs the original failing recipe at
+sm_87 so any future regression surfaces in CI; CI runners targeting
+sm_87 are expected to provide cold state (e.g., reboot before the
+test run) for it to fire reliably.
+
+References
+----------
+- Issue: https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1936
+- Reproduced N=4 across two physically separate Jetson Orin Nano
+  Super 8GB units; the fault travels with the silicon + bnb build,
+  not one defective board.
+- A 13-test orthogonal-axis bisection found six axes (shape order,
+  quant_type, quant_storage, compute_dtype, double_quant, hygiene)
+  each independently sufficient to prevent the fault. The recipe in
+  this test is the unique intersection that triggers it on 0.46.1.
+
+Caveats
+-------
+- On a *broken* bnb the test crashes the host (system reboot), not
+  just the test runner; the failure mode is OS-level. pytest cannot
+  capture this; absence of test output IS the regression signal.
+- The bug is timing-sensitive (race condition); lower-power
+  nvpmodel modes (15W / 25W) prevent it. The test does not enforce
+  a power mode; sm_87 CI is expected to run at MAXN.
+- Skipped on all non-sm_87 hardware. sm_87 is exclusive to the
+  Jetson Orin family (Nano / NX / AGX).
+"""
+
+import pytest
+import torch
+
+import bitsandbytes as bnb
+
+pytestmark = pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.get_device_capability() != (8, 7),
+    reason="Regression test for sm_87 (NVIDIA Jetson Orin family) only",
+)
+
+
+# Real-world shapes from Llama-3 / Qwen-2.5 / Mistral lm_head dimensions,
+# selected to match the historical #1936 reproducer. Monotonically
+# increasing by output-feature product is load-bearing; bisection found
+# all five non-ABC permutations (ACB, BAC, BCA, CAB, CBA) pass cleanly.
+# Do not substitute toy shapes here without re-validating against the
+# historical repro.
+SHAPES_ABC = [
+    (4096, 32768),  # A
+    (4096, 128256),  # B
+    (3584, 152064),  # C
+]
+
+
+def test_linear4bit_multishape_bf16_storage_no_fault():
+    """Run the #1936 recipe; assert each forward completes without fault.
+
+    Cold-start race: at sm_87 cold state on a buggy bnb the host reboots
+    before the third forward returns. At warm state (any prior bnb NF4 op
+    in the same session) every bnb version tested passes; see the module
+    docstring for the cold/warm characterization.
+    """
+    for in_features, out_features in SHAPES_ABC:
+        layer = bnb.nn.Linear4bit(
+            in_features,
+            out_features,
+            bias=False,
+            compute_dtype=torch.bfloat16,
+            compress_statistics=True,  # double_quant
+            quant_type="nf4",
+            quant_storage=torch.bfloat16,  # FSDP-compatible storage path
+        ).to("cuda")
+
+        x = torch.randn(1, in_features, dtype=torch.bfloat16, device="cuda")
+        y = layer(x)
+
+        assert y.shape == (1, out_features)
+        assert y.dtype == torch.bfloat16
+
+        # Deliberately no `del layer`, `torch.cuda.empty_cache()`, or
+        # `torch.cuda.synchronize()` between iterations. Any of those
+        # individually prevents the historical fault and would mask a
+        # recurrence of it.
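
Reviewer note (trailing text after the last hunk; `git apply` ignores
it): a cold-state run on a Jetson Orin runner, per the test docstring.
The hostname is illustrative, and `nvpmodel -m 0` selects MAXN on
common Orin configurations; verify the mode index for the specific
board.

    # The fault only fires from cold state, so reboot first.
    ssh orin-runner 'sudo nvpmodel -m 0 && sudo reboot' || true
    # After the board comes back, run the reproducer as the first GPU
    # work since boot (no other CUDA op may run before it).
    ssh orin-runner 'cd bitsandbytes && python -m pytest \
        tests/test_linear4bit_sm87_multishape_regression.py -x'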