
Commit 5a56bf2

Fix snapshot: HF cache discovery, e2e benchmark on volume
- Fix _find_hf_cache_dir to check huggingface_hub constants and the HF_HUB_CACHE env var (not just HF_HOME) — fixes 0/339 tensor matching on RunPod, where HF uses /gpu-cli-workspaces/.cache/
- Move the e2e benchmark to /gpu-cli-workspaces/ for disk space
- Clear pip cache + HF cache between scenarios for honest numbers
- Uninstall system torchvision to avoid an operator conflict
- Add timing instrumentation to hydrate (python/tensors/reconstruct)
- Fix torch_dtype deprecation → dtype
- Include python/ in gpu.jsonc sync

Results (Qwen2.5-7B, RTX 4090):
- Snapshot: 0.2s (339/339 tensors matched to safetensors)
- Hydrate (warm runtime): 0.4s
- Hydrate (cold runtime): 11.6s (cloudpickle triggers the transformers import)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
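The headline fix is the first bullet: the hub library resolves its cache from HF_HUB_CACHE, which an HF_HOME-only lookup never sees. A minimal sketch of the mismatch, with the env value assumed from the RunPod setup above:

    import os

    # Assumed from the commit message: RunPod points the hub cache at the volume.
    os.environ.setdefault("HF_HUB_CACHE", "/gpu-cli-workspaces/.cache/huggingface/hub")

    # Old lookup, derived only from HF_HOME: resolves to a directory with no models.
    old = os.path.join(os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface")), "hub")
    # New lookup: HF_HUB_CACHE wins when set, matching where the hub actually wrote files.
    new = os.environ.get("HF_HUB_CACHE", old)
    print(old)  # ~/.cache/huggingface/hub (0/339 tensors matched here)
    print(new)  # /gpu-cli-workspaces/.cache/huggingface/hub (339/339 matched)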
1 parent 928fa74 commit 5a56bf2

3 files changed, 122 additions and 44 deletions


gpu.jsonc

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
     { "type": "RTX A6000" }
   ],
   "min_vram": 24,
-  "include": ["bin/"],
+  "include": ["bin/", "tests/", "python/"],
   "outputs": ["benches/results/"],
   "environment": {
     "system": {

python/zerostart/snapshot.py

Lines changed: 67 additions & 17 deletions
@@ -204,24 +204,57 @@ def _find_safetensors_for_model(module: Any) -> list[Path]:

 def _find_hf_cache_dir(model_id: str) -> Path | None:
     """Find the HF hub cache directory for a model."""
-    hf_home = Path(os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface")))
-    hub_dir = hf_home / "hub"
-
-    # HF cache structure: hub/models--org--name/snapshots/<hash>/
     safe_id = model_id.replace("/", "--")
-    model_dir = hub_dir / f"models--{safe_id}"
+    model_subdir = f"models--{safe_id}"
+
+    # Check multiple possible HF cache locations in order of priority:
+    # 1. huggingface_hub constants (most reliable — reads all HF env vars)
+    # 2. HF_HUB_CACHE env var
+    # 3. HF_HOME env var + /hub
+    # 4. Default ~/.cache/huggingface/hub
+    candidates: list[Path] = []
+
+    try:
+        from huggingface_hub import constants
+        candidates.append(Path(constants.HF_HUB_CACHE))
+    except ImportError:
+        pass
+
+    if hf_hub_cache := os.environ.get("HF_HUB_CACHE"):
+        candidates.append(Path(hf_hub_cache))
+
+    if hf_home := os.environ.get("HF_HOME"):
+        candidates.append(Path(hf_home) / "hub")

-    if not model_dir.is_dir():
-        return None
+    candidates.append(Path(os.path.expanduser("~/.cache/huggingface/hub")))

-    # Find the latest snapshot
-    snapshots = model_dir / "snapshots"
-    if not snapshots.is_dir():
-        return None
+    # Dedupe preserving order
+    seen: set[str] = set()
+    unique: list[Path] = []
+    for c in candidates:
+        key = str(c)
+        if key not in seen:
+            seen.add(key)
+            unique.append(c)

-    # Get the most recent snapshot directory
-    snap_dirs = sorted(snapshots.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
-    return snap_dirs[0] if snap_dirs else None
+    for hub_dir in unique:
+        model_dir = hub_dir / model_subdir
+        if not model_dir.is_dir():
+            continue
+
+        snapshots = model_dir / "snapshots"
+        if not snapshots.is_dir():
+            continue
+
+        snap_dirs = sorted(snapshots.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
+        if snap_dirs:
+            result = snap_dirs[0]
+            sf_count = len(list(result.glob("*.safetensors")))
+            log.info("Found HF cache for %s at %s (%d safetensors files)", model_id, result, sf_count)
+            return result
+
+    log.warning("Could not find HF cache for %s in %s", model_id, [str(c) for c in unique])
+    return None


 def _build_tensor_to_file_map(
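A quick way to confirm the top-priority candidate on a given box (a sketch; assumes huggingface_hub is installed, the same optional dependency the try/except above tolerates missing):

    from huggingface_hub import constants

    # On the RunPod pod from the commit message this resolves under
    # /gpu-cli-workspaces/.cache/ rather than ~/.cache/huggingface/hub.
    print(constants.HF_HUB_CACHE)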
@@ -552,6 +585,7 @@ def hydrate(
     log.info("Tensors loaded via mmap (%.3fs, %d tensors)", t_mmap, len(loaded_tensors))

     # 4. Reconstruct state: wire tensors back into Python objects
+    t_reconstruct_start = time.monotonic()
     restored_state: dict[str, Any] = {}

     for key, value in cleaned_state.items():
@@ -570,10 +604,11 @@ def hydrate(
         else:
             restored_state[key] = value

+    t_reconstruct = time.monotonic() - t_reconstruct_start
     elapsed = time.monotonic() - t0
     log.info(
-        "Hydration complete (%.3fs total: %.3fs python + %.3fs tensors)",
-        elapsed, t_python, t_mmap,
+        "Hydration complete (%.3fs total: %.3fs python + %.3fs tensors + %.3fs reconstruct)",
+        elapsed, t_python, t_mmap, t_reconstruct,
     )

     return restored_state
@@ -595,22 +630,29 @@ def _reconstruct_module(
     if config and config.get("_type") == "transformers":
         try:
             import importlib
+            t_import = time.monotonic()
             model_module = importlib.import_module(config["_module"])
             model_class = getattr(model_module, config["_class"])

             config_module = importlib.import_module(config["config_module"])
             config_class = getattr(config_module, config["config_class"])
+            t_import_done = time.monotonic()

             model_config = config_class.from_dict(config["config_dict"])

             # Create model on meta device (zero memory allocation) then
             # replace meta tensors with real data via load_state_dict(assign=True).
             # This matches how from_pretrained works: 0.4s instead of 80s.
+            t_meta = time.monotonic()
             with _no_init_weights():
                 with torch.device("meta"):
                     module = model_class(model_config)
+            t_meta_done = time.monotonic()

-            log.info("Reconstructed %s from config (meta device)", config["_class"])
+            log.info(
+                "Reconstructed %s (import=%.2fs, meta_init=%.2fs)",
+                config["_class"], t_import_done - t_import, t_meta_done - t_meta,
+            )
         except Exception as e:
             log.warning("Failed to reconstruct from config: %s", e)
@@ -634,21 +676,29 @@ def _reconstruct_module(
         state_dict[param_name] = tensor

     if state_dict:
+        t_load = time.monotonic()
         try:
             module.load_state_dict(state_dict, strict=False, assign=True)
         except TypeError:
             # older torch doesn't have assign=True
             module.load_state_dict(state_dict, strict=False)
+        t_load_done = time.monotonic()

         # Re-tie weights (e.g., lm_head.weight = wte.weight in GPT-2)
         if hasattr(module, "tie_weights"):
             module.tie_weights()

+        log.info("load_state_dict: %.2fs (%d tensors)", t_load_done - t_load, len(state_dict))
+
     # Materialize any remaining meta tensors (computed buffers like
     # rotary_emb.inv_freq that aren't in state_dict/safetensors).
     # These need to be recreated by calling the module's init logic
     # just for the specific submodules that still have meta tensors.
+    t_mat = time.monotonic()
     _materialize_meta_tensors(module)
+    t_mat_done = time.monotonic()
+    if t_mat_done - t_mat > 0.01:
+        log.info("materialize_meta_tensors: %.2fs", t_mat_done - t_mat)

     # Move to device if requested
     if device:
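_materialize_meta_tensors is the project's own helper; a generic way to list what it still has to fix up (a sketch, not the project's code) is to scan for tensors left on the meta device after load_state_dict:

    import torch.nn as nn

    def remaining_meta(module: nn.Module) -> list[str]:
        # Whatever load_state_dict did not cover stays on meta, e.g. computed
        # buffers like rotary_emb.inv_freq that never appear in safetensors.
        named = list(module.named_parameters()) + list(module.named_buffers())
        return [name for name, t in named if t.is_meta]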

tests/test_e2e_cold_start.sh

Lines changed: 54 additions & 26 deletions
@@ -4,7 +4,6 @@ set -uo pipefail
 echo "=== End-to-End Cold Start Benchmark ==="
 echo "Date: $(date -u)"
 echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'none')"
-df -h /tmp | tail -1 | awk '{print "Disk: " $4 " free"}'
 echo ""

 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
@@ -15,15 +14,30 @@ export PYTHONPATH="$PROJECT_DIR/python:${PYTHONPATH:-}"

 MODEL_ID="${SNAP_MODEL:-Qwen/Qwen2.5-7B}"

+# All temp data on the volume (more space than /tmp)
+BENCH_DIR="/gpu-cli-workspaces/.bench-e2e"
+rm -rf "$BENCH_DIR"
+mkdir -p "$BENCH_DIR"
+
+# Remove system torchvision that conflicts with fresh torch installs
+pip uninstall -y torchvision 2>/dev/null || true
+
+# Clear pip cache so scenario 1 is a true cold install
+pip cache purge 2>/dev/null || true
+
+# Clear HF cache so model download is truly cold
+export HF_HOME="$BENCH_DIR/hf-cache"
+
+df -h /gpu-cli-workspaces | tail -1 | awk '{print "Disk: " $4 " free on /gpu-cli-workspaces"}'
+echo ""
+
 # ============================================================
 # Scenario 1: pip install + from_pretrained (traditional)
 # ============================================================
 echo "--- Scenario 1: pip install + from_pretrained ---"
-# Clean slate
-rm -rf /tmp/.pip-bench-venv
 BENCH_START=$(date +%s%3N)

-cat > /tmp/bench_pip.py << PYEOF
+cat > "$BENCH_DIR/bench_pip.py" << PYEOF
 import time
 t_script = time.monotonic()
@@ -33,7 +47,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 t_import = time.monotonic()

 tokenizer = AutoTokenizer.from_pretrained("$MODEL_ID")
-model = AutoModelForCausalLM.from_pretrained("$MODEL_ID", torch_dtype=torch.bfloat16, device_map="cpu")
+model = AutoModelForCausalLM.from_pretrained("$MODEL_ID", dtype=torch.bfloat16, device_map="cpu")
 model.eval()
 t_model = time.monotonic()
@@ -47,27 +61,30 @@ print(f"RESULT: {result}")
 print(f"TIME import={t_import-t_script:.2f}s model={t_model-t_import:.2f}s inference={t_inf-t_model:.2f}s total={t_inf-t_script:.2f}s")
 PYEOF

-# Install into a fresh venv (simulates cold container)
-python3 -m venv /tmp/.pip-bench-venv
-/tmp/.pip-bench-venv/bin/pip install -q torch transformers accelerate 2>&1 | tail -3
+# Fresh venv, no pip cache — true cold install
+python3 -m venv "$BENCH_DIR/pip-venv"
+"$BENCH_DIR/pip-venv/bin/pip" install --no-cache-dir -q torch transformers accelerate 2>&1 | tail -3
 PIP_DONE=$(date +%s%3N)
 echo "  pip install: $(( PIP_DONE - BENCH_START ))ms"

-/tmp/.pip-bench-venv/bin/python /tmp/bench_pip.py 2>&1 | grep -E "^(RESULT|TIME)"
+"$BENCH_DIR/pip-venv/bin/python" "$BENCH_DIR/bench_pip.py" 2>&1 | tail -30
 BENCH_END=$(date +%s%3N)
 echo "  Total wall clock (install + load + inference): $(( BENCH_END - BENCH_START ))ms"
-rm -rf /tmp/.pip-bench-venv
+rm -rf "$BENCH_DIR/pip-venv"
 echo ""

+# Clear HF cache so scenario 2 also downloads fresh
+rm -rf "$HF_HOME"
+
 # ============================================================
 # Scenario 2: zerostart cold + from_pretrained
 # ============================================================
 echo "--- Scenario 2: zerostart cold + from_pretrained ---"
-export ZEROSTART_CACHE="/tmp/.zs-e2e-bench"
+export ZEROSTART_CACHE="$BENCH_DIR/zs-cache"
 export ZS_NO_SHARED_CACHE=1
 rm -rf "$ZEROSTART_CACHE"

-cat > /tmp/bench_zs_cold.py << PYEOF
+cat > "$BENCH_DIR/bench_zs_cold.py" << PYEOF
 import time
 t_script = time.monotonic()
@@ -77,7 +94,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 t_import = time.monotonic()

 tokenizer = AutoTokenizer.from_pretrained("$MODEL_ID")
-model = AutoModelForCausalLM.from_pretrained("$MODEL_ID", torch_dtype=torch.bfloat16, device_map="cpu")
+model = AutoModelForCausalLM.from_pretrained("$MODEL_ID", dtype=torch.bfloat16, device_map="cpu")
 model.eval()
 t_model = time.monotonic()
@@ -92,7 +109,7 @@ print(f"TIME import={t_import-t_script:.2f}s model={t_model-t_import:.2f}s infer
 PYEOF

 ZS_START=$(date +%s%3N)
-$ZS run -p torch -p transformers -p accelerate /tmp/bench_zs_cold.py 2>&1 | grep -E "^(RESULT|TIME|Resolved|Daemon|Environment|Cache)"
+$ZS run -p torch -p transformers -p accelerate "$BENCH_DIR/bench_zs_cold.py" 2>&1 | tail -30
 ZS_END=$(date +%s%3N)
 echo "  Total wall clock (zerostart cold + load + inference): $(( ZS_END - ZS_START ))ms"
 echo ""
@@ -101,10 +118,11 @@ echo ""
 # Scenario 3: zerostart warm + from_pretrained
 # ============================================================
 echo "--- Scenario 3: zerostart warm + from_pretrained ---"
-# Cache is now populated from Scenario 2
+# zerostart package cache is warm from Scenario 2
+# HF model cache is warm from Scenario 2

 ZS_WARM_START=$(date +%s%3N)
-$ZS run -p torch -p transformers -p accelerate /tmp/bench_zs_cold.py 2>&1 | grep -E "^(RESULT|TIME|Cache)"
+$ZS run -p torch -p transformers -p accelerate "$BENCH_DIR/bench_zs_cold.py" 2>&1 | tail -30
 ZS_WARM_END=$(date +%s%3N)
 echo "  Total wall clock (zerostart warm + load + inference): $(( ZS_WARM_END - ZS_WARM_START ))ms"
 echo ""
@@ -114,39 +132,46 @@ echo ""
 # ============================================================
 echo "--- Scenario 4: Create snapshot for hydrate ---"

-cat > /tmp/bench_create_snap.py << PYEOF
-import time
+cat > "$BENCH_DIR/bench_create_snap.py" << PYEOF
+import time, logging
+logging.basicConfig(level=logging.INFO, format="%(name)-20s %(message)s")
 t0 = time.monotonic()
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from zerostart.snapshot import snapshot

 tokenizer = AutoTokenizer.from_pretrained("$MODEL_ID")
-model = AutoModelForCausalLM.from_pretrained("$MODEL_ID", torch_dtype=torch.bfloat16, device_map="cpu")
+model = AutoModelForCausalLM.from_pretrained("$MODEL_ID", dtype=torch.bfloat16, device_map="cpu")
 model.eval()

 import shutil
-shutil.rmtree("/tmp/e2e-snapshot", ignore_errors=True)
-snapshot(state={"model": model, "tokenizer": tokenizer}, path="/tmp/e2e-snapshot")
+shutil.rmtree("$BENCH_DIR/e2e-snapshot", ignore_errors=True)
+snapshot(state={"model": model, "tokenizer": tokenizer}, path="$BENCH_DIR/e2e-snapshot")
 t1 = time.monotonic()
 print(f"Snapshot created in {t1-t0:.2f}s")
 PYEOF

-$ZS run -p torch -p transformers -p accelerate -p cloudpickle /tmp/bench_create_snap.py 2>&1 | grep -E "^(Snapshot|Cache)"
+$ZS run -p torch -p transformers -p accelerate -p cloudpickle "$BENCH_DIR/bench_create_snap.py" 2>&1 | tail -30

 echo ""
 echo "--- Scenario 4: zerostart warm + hydrate + inference ---"

-cat > /tmp/bench_hydrate.py << PYEOF
-import time
+# Reuse the warm zerostart package cache from scenarios 2/3 —
+# the comparison is model loading (hydrate vs from_pretrained),
+# not package installation.
+export ZEROSTART_CACHE="$BENCH_DIR/zs-cache"
+
+cat > "$BENCH_DIR/bench_hydrate.py" << PYEOF
+import time, logging
+logging.basicConfig(level=logging.INFO, format="%(name)-20s %(message)s")
 t_script = time.monotonic()

 import torch
 from zerostart.snapshot import hydrate

 t_import = time.monotonic()

-restored = hydrate("/tmp/e2e-snapshot")
+restored = hydrate("$BENCH_DIR/e2e-snapshot")
 model = restored["model"]
 model.eval()
 tokenizer = restored["tokenizer"]
@@ -163,7 +188,7 @@ print(f"TIME import={t_import-t_script:.2f}s hydrate={t_hydrate-t_import:.2f}s i
 PYEOF

 ZS_HYD_START=$(date +%s%3N)
-$ZS run -p torch -p transformers -p accelerate -p cloudpickle /tmp/bench_hydrate.py 2>&1 | grep -E "^(RESULT|TIME|Cache)"
+$ZS run -p torch -p transformers -p accelerate -p cloudpickle "$BENCH_DIR/bench_hydrate.py" 2>&1 | tail -30
 ZS_HYD_END=$(date +%s%3N)
 echo "  Total wall clock (zerostart warm + hydrate + inference): $(( ZS_HYD_END - ZS_HYD_START ))ms"
 echo ""
@@ -180,3 +205,6 @@ echo "  2. zerostart cold + from_pretrained: $(( ZS_END - ZS_START ))ms"
 echo "  3. zerostart warm + from_pretrained: $(( ZS_WARM_END - ZS_WARM_START ))ms"
 echo "  4. zerostart warm + hydrate (snapshot): $(( ZS_HYD_END - ZS_HYD_START ))ms"
 echo "============================================================"
+
+# Cleanup
+rm -rf "$BENCH_DIR"
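To rerun against a smaller model, SNAP_MODEL overrides the default at the top of the script, for example SNAP_MODEL=Qwen/Qwen2.5-0.5B bash tests/test_e2e_cold_start.sh (model choice illustrative). Everything a run touches now lives under $BENCH_DIR on the volume, so this final rm -rf is the only cleanup needed.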
