Skip to content

Commit 6727a74

Browse files
unamedkr and claude committed
refparity: --dtype option for memory-constrained HF reference runs
Adds FP32/BF16/FP16 selection to hf_reference.py; plumbs per-entry dtype through matrix.json → run_matrix.sh. Unblocks 4B-class models on 16 GB machines (BF16 halves memory so a ~4B model can sit next to a 4B GGUF engine run in the same 16 GB). Also adds `_disabled: true` entry filter for matrix.json and documents the intended (but currently oversized) DeltaNet-hybrid entry. Real 4B DeltaNet comparison target is TBD — none of Qwen's DeltaNet HF releases are <8B. Smoke-tested BF16 load: Qwen3-0.6B top1 matches FP32 exactly. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f612c57 commit 6727a74

3 files changed

Lines changed: 32 additions & 12 deletions

File tree

tools/refparity/hf_reference.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,27 @@
2020
from transformers import AutoModelForCausalLM, AutoTokenizer
2121

2222

23-
def run(model_name: str, prompt: str, out_path: str) -> int:
24-
print(f"[refparity/hf] model={model_name}", file=sys.stderr)
23+
DTYPE_MAP = {
24+
"float32": torch.float32, "fp32": torch.float32,
25+
"bfloat16": torch.bfloat16, "bf16": torch.bfloat16,
26+
"float16": torch.float16, "fp16": torch.float16,
27+
}
28+
29+
30+
def run(model_name: str, prompt: str, out_path: str,
31+
dtype: str = "float32") -> int:
32+
torch_dtype = DTYPE_MAP.get(dtype)
33+
if torch_dtype is None:
34+
print(f"error: unknown dtype {dtype!r}; valid: {list(DTYPE_MAP)}",
35+
file=sys.stderr)
36+
return 2
37+
print(f"[refparity/hf] model={model_name} dtype={dtype}", file=sys.stderr)
2538
print(f"[refparity/hf] prompt: {prompt[:80]!r}{'...' if len(prompt) > 80 else ''}",
2639
file=sys.stderr)
2740

2841
tok = AutoTokenizer.from_pretrained(model_name)
2942
model = AutoModelForCausalLM.from_pretrained(
30-
model_name, dtype=torch.float32, device_map="cpu")
43+
model_name, dtype=torch_dtype, device_map="cpu")
3144
model.eval()
3245

3346
ids = tok.encode(prompt, return_tensors="pt")
@@ -77,6 +90,10 @@ def main():
7790
group.add_argument("--prompt", help="literal prompt text")
7891
group.add_argument("--prompt-file", help="read prompt from file")
7992
ap.add_argument("--out", required=True, help="output .npz path")
93+
ap.add_argument("--dtype", default="float32",
94+
choices=list(DTYPE_MAP.keys()),
95+
help="HF model dtype (default: float32). Use bfloat16 "
96+
"for 4B+ models on 16 GB machines.")
8097
args = ap.parse_args()
8198

8299
if args.prompt:
@@ -85,7 +102,7 @@ def main():
85102
with open(args.prompt_file) as f:
86103
prompt = f.read().rstrip("\n")
87104

88-
return run(args.model, prompt, args.out)
105+
return run(args.model, prompt, args.out, dtype=args.dtype)
89106

90107

91108
if __name__ == "__main__":

tools/refparity/matrix.json

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,16 @@
1515
},
1616
{
1717
"name": "qwen3.5_4b_hybrid",
18-
"hf_model": "Qwen/Qwen3.5-4B-Thinking",
18+
"hf_model": "Qwen/Qwen3-Next-80B-A3B-Thinking",
1919
"engine_gguf": "Qwen3.5-4B-Q4_K_M.gguf",
20-
"_note": "Hybrid DeltaNet+self-attn reference. HF model may need download.",
20+
"_note": "Hybrid DeltaNet+self-attn. HF ref too big for 16 GB (80B). Use BF16 fallback or pick a smaller DeltaNet-class model. Currently DISABLED — kept for documentation.",
21+
"_disabled": true,
2122
"prompts": [
22-
"Hello",
23-
"def fibonacci(n):"
23+
"Hello"
2424
],
2525
"threshold_l2_rel": 0.05,
26-
"threshold_cosine": 0.90
26+
"threshold_cosine": 0.90,
27+
"dtype": "bfloat16"
2728
},
2829
{
2930
"name": "llama3.2_1b",

tools/refparity/run_matrix.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ with open("$MATRIX") as f:
3939
m = json.load(f)
4040
filt = "$FILTER"
4141
for t in m.get("tests", []):
42+
if t.get("_disabled"):
43+
continue
4244
if filt and filt not in t["name"]:
4345
continue
4446
gguf = t["engine_gguf"]
4547
for prompt in t["prompts"]:
4648
# bash-escaped prompt (base64 roundtrip to avoid quoting hell)
4749
import base64
4850
p64 = base64.b64encode(prompt.encode()).decode()
49-
print(f"{t['name']}|{t['hf_model']}|{gguf}|{p64}|{t.get('threshold_l2_rel', 0.05)}|{t.get('threshold_cosine', 0.90)}")
51+
print(f"{t['name']}|{t['hf_model']}|{gguf}|{p64}|{t.get('threshold_l2_rel', 0.05)}|{t.get('threshold_cosine', 0.90)}|{t.get('dtype', 'float32')}")
5052
PY
5153
)
5254

@@ -66,7 +68,7 @@ echo ""
6668

6769
PREV_NAME=""
6870
IDX=0
69-
while IFS='|' read -r NAME HF_MODEL GGUF P64 TH_L2 TH_COS; do
71+
while IFS='|' read -r NAME HF_MODEL GGUF P64 TH_L2 TH_COS DTYPE; do
7072
[[ -z "$NAME" ]] && continue
7173
PROMPT=$(echo "$P64" | base64 -d)
7274
TOTAL=$((TOTAL + 1))
@@ -93,7 +95,7 @@ while IFS='|' read -r NAME HF_MODEL GGUF P64 TH_L2 TH_COS; do
9395

9496
# HF reference dump (one per slot — different prompts produce different tokens)
9597
REF_NPZ="$WORK_DIR/$SLOT.npz"
96-
if ! python hf_reference.py --model "$HF_MODEL" --prompt "$PROMPT" --out "$REF_NPZ" 2>"$WORK_DIR/hf.err"; then
98+
if ! python hf_reference.py --model "$HF_MODEL" --prompt "$PROMPT" --out "$REF_NPZ" --dtype "${DTYPE:-float32}" 2>"$WORK_DIR/hf.err"; then
9799
echo " [ERROR] HF reference failed:"
98100
sed 's/^/ /' "$WORK_DIR/hf.err"
99101
FAILED_ENTRIES+=("$SLOT: hf_reference failed")

0 commit comments

Comments (0)