diff --git a/jit-diag.sh b/jit-diag.sh index 6934c34..fcdad3a 100755 --- a/jit-diag.sh +++ b/jit-diag.sh @@ -1,20 +1,57 @@ #!/bin/bash # JIT diagnostic launcher — runs emulator and captures output for analysis -# Usage: ./jit-diag.sh [mode] +# Usage: ./jit-diag.sh [-f|--features <list>] [mode] # mode: "jit" — JIT enabled (default) # "verify" — JIT with verification # "nojit" — interpreter only through JIT dispatch # "interp" — pure interpreter (no JIT feature, baseline) # "perf" — perf profile, interpreter only (text report for analysis) # "perf-jit" — perf profile with JIT enabled +# "smoke" — headless boot smoke test +# +# -f / --features <list> comma-separated extra features appended to the +# mode's base feature list (e.g. "developer" to +# enable dlog_dev! macros for IRIS_DEBUG_LOG tracing) # # All IRIS_JIT_* env vars are passed through automatically: # IRIS_JIT_MAX_TIER=0 ./jit-diag.sh jit # IRIS_JIT_PROBE=500 IRIS_JIT_PROBE_MIN=100 ./jit-diag.sh jit +# IRIS_DEBUG_LOG=mc ./jit-diag.sh -f developer interp + +EXTRA_FEATURES="" +while [[ $# -gt 0 ]]; do + case "$1" in + -f|--features) + EXTRA_FEATURES="$2" + shift 2 + ;; + -h|--help) + sed -n '2,20p' "$0" + exit 0 + ;; + *) + break + ;; + esac +done MODE="${1:-jit}" OUTFILE="jit-diag-$(date +%Y%m%d-%H%M%S)-${MODE}.log" +# Use the ncargo wrapper so we hit the rustup-pinned nightly toolchain +# (rust-toolchain.toml) regardless of any homebrew rust on PATH. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CARGO="${SCRIPT_DIR}/ncargo" + +features() { + local base="$1" + if [[ -n "$EXTRA_FEATURES" ]]; then + echo "${base},${EXTRA_FEATURES}" + else + echo "$base" + fi +} + # Collect all IRIS_JIT_* env vars for display and passthrough JIT_VARS=$(env | grep '^IRIS_JIT_' | tr '\n' ' ') @@ -22,31 +59,36 @@ echo "=== IRIS JIT Diagnostic ===" | tee "$OUTFILE" echo "Mode: $MODE" | tee -a "$OUTFILE" echo "Date: $(date)" | tee -a "$OUTFILE" echo "Host: $(uname -m) $(uname -s) $(uname -r)" | tee -a "$OUTFILE" -echo "Rust: $(rustc --version)" | tee -a "$OUTFILE" +RUSTC_BIN="$("$HOME/.cargo/bin/rustup" which rustc 2>/dev/null || command -v rustc)" +echo "Rust: $("$RUSTC_BIN" --version)" | tee -a "$OUTFILE" [ -n "$JIT_VARS" ] && echo "Env: $JIT_VARS" | tee -a "$OUTFILE" echo "" | tee -a "$OUTFILE" case "$MODE" in jit) - echo "Running: IRIS_JIT=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE" - IRIS_JIT=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + F="$(features jit,lightning)" + echo "Running: IRIS_JIT=1 ${JIT_VARS}${CARGO} run --release --features ${F}" | tee -a "$OUTFILE" + IRIS_JIT=1 "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE" ;; verify) - echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE" - IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + F="$(features jit,lightning)" + echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 ${JIT_VARS}${CARGO} run --release --features ${F}" | tee -a "$OUTFILE" + IRIS_JIT=1 IRIS_JIT_VERIFY=1 "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE" ;; nojit) - echo "Running: cargo run --release --features jit,lightning (no IRIS_JIT)" | tee -a "$OUTFILE" - cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + F="$(features jit,lightning)" + echo "Running: ${CARGO} run --release --features ${F} (no IRIS_JIT)" | tee -a "$OUTFILE" + "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE" ;; interp) - echo 
"Running: cargo run --release --features lightning (no jit feature)" | tee -a "$OUTFILE" - cargo run --release --features lightning 2>&1 | tee -a "$OUTFILE" + F="$(features lightning)" + echo "Running: ${CARGO} run --release --features ${F} (no jit feature)" | tee -a "$OUTFILE" + "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE" ;; perf) PERFREPORT="perf-report-$(date +%Y%m%d-%H%M%S).txt" echo "Building (profiling profile, no jit feature)..." | tee -a "$OUTFILE" - cargo build --profile profiling --features lightning 2>&1 | tee -a "$OUTFILE" + "$CARGO" build --profile profiling --features lightning 2>&1 | tee -a "$OUTFILE" echo "--- Press Ctrl-C when you have enough samples ---" perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris echo "Processing perf data..." | tee -a "$OUTFILE" @@ -56,16 +98,107 @@ case "$MODE" in perf-jit) PERFREPORT="perf-report-jit-$(date +%Y%m%d-%H%M%S).txt" echo "Building (profiling profile, jit feature)..." | tee -a "$OUTFILE" - cargo build --profile profiling --features jit,lightning 2>&1 | tee -a "$OUTFILE" - echo "--- Press Ctrl-C when you have enough samples ---" - IRIS_JIT=1 perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris - echo "Processing perf data..." | tee -a "$OUTFILE" - perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1 + "$CARGO" build --profile profiling --features jit,lightning 2>&1 | tee -a "$OUTFILE" + case "$(uname -s)" in + Linux) + echo "--- Press Ctrl-C when you have enough samples ---" + IRIS_JIT=1 perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris + echo "Processing perf data..." | tee -a "$OUTFILE" + perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1 + ;; + Darwin) + DURATION="${PERF_DURATION:-30}" + echo "--- Launching iris; will sample for ${DURATION}s after it starts ---" + IRIS_JIT=1 ./target/profiling/iris & + IRIS_PID=$! + sleep 2 + echo "Sampling PID $IRIS_PID for ${DURATION}s..." + sample "$IRIS_PID" "$DURATION" -f "$PERFREPORT" 2>&1 || true + kill "$IRIS_PID" 2>/dev/null + wait "$IRIS_PID" 2>/dev/null + ;; + *) + echo "Unsupported platform for profiling: $(uname -s)" + exit 1 + ;; + esac echo "Perf report saved to: $PERFREPORT" ;; + smoke) + # Headless boot smoke test: boots IRIX with JIT, checks milestones, exits. + # Uses COW overlay to protect disk image. Exits 0 if all milestones pass. + TIMEOUT="${IRIS_SMOKE_TIMEOUT:-120}" + echo "Running: headless smoke test (timeout=${TIMEOUT}s)" | tee -a "$OUTFILE" + + # Clean up stale overlays + rm -f scsi1.raw.overlay scsi2.raw.overlay + + # Build + "$CARGO" build --release --features jit,lightning 2>&1 | tee -a "$OUTFILE" + + # Run headless with JIT, capture output, kill after timeout + IRIS_JIT=1 timeout "$TIMEOUT" "$CARGO" run --release --features jit,lightning -- --headless 2>&1 | tee -a "$OUTFILE" & + EMUPID=$! + + # Wait for emulator to finish or timeout + wait $EMUPID 2>/dev/null + EXIT=$? 
+ + echo "" | tee -a "$OUTFILE" + echo "=== Smoke Test Results ===" | tee -a "$OUTFILE" + + PASS=0 + FAIL=0 + + check_milestone() { + local name="$1" + local pattern="$2" + if grep -q "$pattern" "$OUTFILE"; then + echo " PASS: $name" | tee -a "$OUTFILE" + PASS=$((PASS + 1)) + else + echo " FAIL: $name (pattern '$pattern' not found)" | tee -a "$OUTFILE" + FAIL=$((FAIL + 1)) + fi + } + + check_milestone "JIT initialized" "JIT: adaptive mode" + check_milestone "First compilation" "JIT: compiled #1" + check_milestone "Blocks compiled" "JIT:.*blocks," + + # Check for crashes + if grep -qiE "KERNEL FAULT|PANIC|TLBMISS.*KERNEL|SEGV" "$OUTFILE"; then + echo " FAIL: kernel panic detected" | tee -a "$OUTFILE" + FAIL=$((FAIL + 1)) + else + echo " PASS: no kernel panic" | tee -a "$OUTFILE" + PASS=$((PASS + 1)) + fi + + # Check instruction count reached a reasonable level + LAST_TOTAL=$(grep -oP 'JIT: \K[0-9]+(?= total)' "$OUTFILE" | tail -1) + if [ -n "$LAST_TOTAL" ] && [ "$LAST_TOTAL" -gt 100000000 ] 2>/dev/null; then + echo " PASS: reached ${LAST_TOTAL} instructions" | tee -a "$OUTFILE" + PASS=$((PASS + 1)) + else + echo " FAIL: instruction count too low (${LAST_TOTAL:-0})" | tee -a "$OUTFILE" + FAIL=$((FAIL + 1)) + fi + + echo "" | tee -a "$OUTFILE" + echo "Score: $PASS passed, $FAIL failed" | tee -a "$OUTFILE" + + # Clean up overlay + rm -f scsi1.raw.overlay scsi2.raw.overlay + + if [ "$FAIL" -gt 0 ]; then + exit 1 + fi + exit 0 + ;; *) echo "Unknown mode: $MODE" - echo "Usage: $0 [jit|verify|nojit|interp|perf|perf-jit]" + echo "Usage: $0 [jit|verify|nojit|interp|perf|perf-jit|smoke]" exit 1 ;; esac diff --git a/rules/jit/cranelift-opt-levelnone-is-the-right-trade-for-throughput-jits.md b/rules/jit/cranelift-opt-levelnone-is-the-right-trade-for-throughput-jits.md new file mode 100644 index 0000000..d17ba5b --- /dev/null +++ b/rules/jit/cranelift-opt-levelnone-is-the-right-trade-for-throughput-jits.md @@ -0,0 +1,28 @@ +# Cranelift opt_level=none is the right trade for throughput JITs + +**Keywords:** cranelift, opt_level, compilation speed, throughput, JIT overhead, perf +**Category:** jit + +# Cranelift opt_level=none for Interpreter-First JITs + +For a JIT where blocks compile frequently and run a few hundred times each, **`opt_level = "none"` beats `"speed"` for total throughput** by 2-3x. Generated native code is ~10-20% slower per instruction, but compile time drops ~3-5x. + +## Why +Profiling showed 66% of MIPS-CPU thread time was inside Cranelift compilation passes. Switching opt_level=none preserved the % but increased total emulator throughput 2.5x — same Cranelift work, more actual execution per second. The generated code slowdown is dwarfed by the compile speedup because every block gets compiled whether it runs 10 times or 10,000 times. + +## How to apply +In `BlockCompiler::new` (`src/jit/compiler.rs`): +```rust +flag_builder.set("opt_level", "none").unwrap(); +``` + +This is specifically correct when: +- The JIT is interpreter-first (blocks share execution time with interpreter) +- Most blocks are compiled once and run a moderate number of times +- Chain-compile-on-miss aggressively fills the cache with blocks that are used briefly + +It would be wrong if blocks ran billions of times each (classic hot-loop JIT scenario), where spending more compile time for better native code pays off. + +## Measurement +Use macOS `sample` (or Linux `perf`) on the running emulator. Count samples inside `cranelift_*` symbols vs total thread samples. 
Compare instructions/second before and after (log total count, divide by wall-clock). + diff --git a/rules/jit/cranelift-regalloc2-helper-diamond-limit-is-platform-dependent.md b/rules/jit/cranelift-regalloc2-helper-diamond-limit-is-platform-dependent.md new file mode 100644 index 0000000..d3d5b5a --- /dev/null +++ b/rules/jit/cranelift-regalloc2-helper-diamond-limit-is-platform-dependent.md @@ -0,0 +1,30 @@ +# Cranelift regalloc2 helper-diamond limit is platform-dependent + +**Keywords:** cranelift, regalloc2, aarch64, x86_64, helper call, ok_block, exc_block, diamond, Full tier, block length +**Category:** jit + +# Cranelift Helper-Diamond Limit Differs by Architecture + +Full-tier JIT blocks terminate after N load/store helper calls to avoid Cranelift regalloc2 miscompilations. The safe N is platform-specific: **aarch64 tolerates 3, x86_64 only 1**. Bumping above the threshold produces silent miscompilations (confirmed by IRIS_JIT_VERIFY catching real GPR mismatches). + +## Why +Each load or store helper call emits an `ok_block` / `exc_block` CFG diamond (helper can return an exception status, so we branch after). Multiple chained diamonds create complex control flow that stresses Cranelift's regalloc2 allocator. On x86_64 (15 GPRs), register pressure plus the CFG complexity hits an edge case and produces wrong code. On aarch64 (30 GPRs), more headroom — 3 diamonds tolerable, 4 starts failing. + +## How to apply +In `src/jit/dispatch.rs` `trace_block`: +```rust +let max_helpers: u32 = if cfg!(target_arch = "aarch64") { 3 } else { 1 }; +let mut helper_count: u32 = 0; +// ... inside loop ... +if tier == BlockTier::Full && (is_compilable_store(&d) || is_compilable_load(&d)) { + helper_count += 1; + instrs.push((raw, d)); + if helper_count >= max_helpers { break; } +} +``` + +Don't raise the aarch64 limit without running `IRIS_JIT_VERIFY=1` for 500M+ instructions to catch silent miscompilations. GPR mismatches at Full tier, len=3+ with off-by-small-number values (jit=0x97 interp=0x98) are the signature. + +## History +Original code had `if has_helper { break; }` unconditionally with a comment citing x86_64 regalloc2 issues. Binary-searched on aarch64: max_helpers=2 works, 3 works, 4 produces real codegen mismatches in verify mode. Kept at 3 for safety margin. + diff --git a/rules/jit/jit-block-chaining-needs-max-chain-instrs-cap-for-interrupt-timing.md b/rules/jit/jit-block-chaining-needs-max-chain-instrs-cap-for-interrupt-timing.md new file mode 100644 index 0000000..7047616 --- /dev/null +++ b/rules/jit/jit-block-chaining-needs-max-chain-instrs-cap-for-interrupt-timing.md @@ -0,0 +1,24 @@ +# JIT block chaining needs MAX_CHAIN_INSTRS cap for interrupt timing + +**Keywords:** jit, chaining, interrupt latency, MAX_CHAIN_INSTRS, interpreter burst, cp0_count, timing, ugly login +**Category:** jit + +# JIT Chain Length Affects Interrupt Delivery + +Block chaining (running one cached block after another without returning to the interpreter burst) must be capped by cumulative instruction count, NOT by chain block count. 32 instructions is safe; 64 causes "ugly login" / timing-dependent corruption. + +## Why +The interpreter checks pending interrupts before every single instruction. The JIT defers interrupt checking to post-block bookkeeping (cp0_count advance + merge IP bits into cp0_cause). Without chaining, a block exit returns to the interpreter burst which immediately sees the merged interrupts. + +With chaining, multiple blocks execute back-to-back. 
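+A minimal sketch of the cap, with hypothetical `run_block`/`cache` names
+standing in for the real `run_jit_dispatch` internals (see How to apply):
+
+```rust
+const MAX_CHAIN_INSTRS: u32 = 32; // empirically safe; 64 is not
+let mut chain_instrs: u32 = 0;
+while chain_instrs < MAX_CHAIN_INSTRS {
+    let Some(block) = cache.lookup(next_phys, next_pc) else { break };
+    let len = block.len_mips as u32;
+    // Hypothetical helper: runs the block plus its post-block bookkeeping
+    // (cp0_count advance, IP-bit merge) and yields the follow-on PCs.
+    (next_phys, next_pc) = run_block(block);
+    chain_instrs += len;
+}
+// Chain over: the interpreter burst resumes and dispatches any interrupts
+// merged into cp0_cause while the chain ran.
+```
+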
Each chained block does its own post-block cp0_count advance and IP-bit merge, so timer interrupts get set in cp0_cause correctly — BUT the actual interrupt dispatch is deferred until the chain ends and the interpreter runs again. Worst-case interrupt delivery latency = MAX_CHAIN_INSTRS. + +IRIX has code paths that depend on interrupt timing tight enough that 32 instructions is tolerable but 64 is not. Measured empirically: MAX_CHAIN_INSTRS=32 boots cleanly, =64 produces timing-dependent boot failures. + +## How to apply +In the chain loop in `run_jit_dispatch`, accumulate `chain_instrs += next_block_len` and `break` when it reaches 32. Don't check "is interrupt pending" inside the chain — IRIX's level-triggered device interrupts (IP2-IP6) are almost always asserted, which would break every chain after one block. + +If timing-related crashes reappear after touching chain code, **reduce MAX_CHAIN_INSTRS before debugging codegen**. The user's own heuristic, validated in practice. + +## History +Initial chaining implementation checked `interrupts_enabled() && (cp0_cause & cp0_status & IM) != 0` to break the chain. Broke every chain immediately because devices are constantly asserted → JIT% barely moved. Removing the check but keeping MAX_CHAIN_INSTRS=32 gave clean boots with 3-4x more JIT coverage. + diff --git a/rules/jit/jit-delay-slot-fault-handling-exclude-all-faulting-instructions.md b/rules/jit/jit-delay-slot-fault-handling-exclude-all-faulting-instructions.md new file mode 100644 index 0000000..e37e965 --- /dev/null +++ b/rules/jit/jit-delay-slot-fault-handling-exclude-all-faulting-instructions.md @@ -0,0 +1,32 @@ +# JIT delay slot fault handling — exclude ALL faulting instructions + +**Keywords:** jit, delay slot, loads, stores, cp0_epc, BD bit, branch, ERET, in_delay_slot, trace_block +**Category:** jit + +# JIT Delay Slot Fault Handling + +Any instruction that can fault (load, store, FPU op, COP0 side effect, etc.) MUST be excluded from JIT-compiled branch delay slots. + +## Why +If a delay slot instruction faults (TLB miss, bus error), the JIT exception path runs `sync_to_executor`, which explicitly clears `in_delay_slot` and `delay_slot_target` (context.rs, by design — compiled blocks normally handle their own delay slots). + +Then `exec.step()` re-executes at the faulting PC without delay-slot context. If it faults again, `handle_exception` sets `cp0_epc = faulting_PC` with **BD=0**. On ERET, the CPU returns to the faulting load/store, not to the branch. **The branch is permanently skipped** — execution diverges silently until something crashes. + +## Symptoms +Process crashes mid-boot, "ugly login screen", graphics corruption, TLB panics. Appears as silent state corruption, not a direct fault — the divergence accumulates before manifesting. + +## How to apply +In `src/jit/dispatch.rs` `trace_block`, when inspecting the delay slot instruction after a branch: +```rust +let delay_can_fault = is_compilable_load(&delay_d) || is_compilable_store(&delay_d); +if is_compilable_for_tier(&delay_d, tier) && !delay_can_fault { + // compile delay slot into block +} +// else: drop delay slot AND the branch (pop both; block ends before branch) +``` + +Whenever adding a new JIT-compilable instruction type that can fault (e.g., LWL/LWR, LL/SC, FPU loads/stores), extend `delay_can_fault` to exclude it from delay slots. + +## History +The codebase already excluded stores from delay slots with a detailed comment, but the same fix wasn't applied to loads. 
Adding Loads tier silently corrupted IRIX boot for weeks until binary-searching block length (max=1 works, max=3 fails) isolated it. + diff --git a/rules/jit/jit-profile-pre-compilation-at-startup-causes-prom-hang.md b/rules/jit/jit-profile-pre-compilation-at-startup-causes-prom-hang.md new file mode 100644 index 0000000..d63976c --- /dev/null +++ b/rules/jit/jit-profile-pre-compilation-at-startup-causes-prom-hang.md @@ -0,0 +1,23 @@ +# JIT profile pre-compilation at startup causes PROM hang + +**Keywords:** jit, profile, pre-compilation, PROM, debug_fetch_instr, CpuBusErrorDevice, MC CPU Error, boot hang +**Category:** jit + +# JIT Profile Pre-compilation Breaks Boot + +Pre-compiling above-Alu JIT blocks from a saved profile at startup causes the PROM to hang in a retry loop. Pre-compiling AFTER PROM exit causes IRIX kernel panics (UTLB miss). + +## Why (startup variant) +Profile entries contain kernel/userspace virtual PCs from a previous session. At startup, the kernel isn't loaded yet — those physical addresses are served by `CpuBusErrorDevice` (the bus error catcher for unmapped regions). Each `debug_fetch_instr` during pre-compilation triggers `mc.report_cpu_error()`, dirtying `REG_CPU_ERROR_ADDR` / `REG_CPU_ERROR_STAT` on the emulated Memory Controller. The PROM reads those registers during hardware init, sees errors, and retries forever. + +## Why (post-PROM variant, UTLB panic) +Even deferring pre-compilation until after PROM exit and compiling incrementally (64 entries per dispatch batch) triggered IRIX UTLB-miss panics shortly after kernel boot. Exact mechanism unknown — suspected that the bulk `debug_fetch_instr` calls evict L2 lines that the kernel's initial data structures depend on (L2 is inclusive of D-cache on emulated R4000), or nanotlb[Fetch] state interference. Unresolved. + +## How to apply +The `load_profile()` call in `run_jit_dispatch` should NOT feed into `compile_block` during boot. Blocks compile on-demand when first hit, which is safe because the kernel is already resident by then. + +If you want persistent compiled blocks across sessions, store the **compiled native code bytes** in the profile rather than re-tracing. That avoids `debug_fetch_instr` entirely. + +## History +Discovered when investigating why max_tier=1 (Loads) hung in PROM while max_tier=0 (Alu) booted fine. MAX_TIER=0 skipped all pre-compilation (entries get capped to Alu, then `continue`); MAX_TIER≥1 actually traced and compiled. The MC:CPU Error messages during pre-compilation were the smoking gun. + diff --git a/rules/jit/mips-jit-compile-on-miss-should-use-max-tier-not-alu.md b/rules/jit/mips-jit-compile-on-miss-should-use-max-tier-not-alu.md new file mode 100644 index 0000000..85e2696 --- /dev/null +++ b/rules/jit/mips-jit-compile-on-miss-should-use-max-tier-not-alu.md @@ -0,0 +1,38 @@ +# MIPS JIT compile-on-miss should use max_tier, not Alu + +**Keywords:** jit, compile on miss, chain, max_tier, Alu, cache miss, block length +**Category:** jit + +# Compile-on-Miss at max_tier (not Alu) for Chain Progression + +When a JIT block chain breaks due to cache miss, compile the next block at `max_tier` directly, not at Alu. This is the difference between 3.3% and 8.7% JIT coverage. + +## Why +The main dispatch path compiles new blocks at Alu tier and lets them be promoted over thousands of executions. That's fine for hot code. But chain misses happen at arbitrary PCs — often where the first instruction is a load/store, which Alu-tier can't compile. 
The `trace_block` returns empty, compile fails, the PC stays forever uncached, and every chain break at that PC hits the same miss again. + +Measured: with Alu-tier compile-on-miss, ~14K of 107M chain misses actually produced new blocks. With max_tier compile-on-miss, the hit rate on subsequent chains goes up dramatically. + +## How to apply +In the chain loop's miss path in `run_jit_dispatch`: +```rust +None => { + // Compile at max_tier, not Alu — chain targets often start with + // loads/stores that Alu can't trace past. + let instrs = trace_block(exec, next_pc, max_tier); + if !instrs.is_empty() { + if let Some(mut block) = compiler.compile_block(&instrs, next_pc, max_tier) { + block.phys_addr = next_phys; + cache.insert(next_phys, next_pc, block); + blocks_compiled += 1; + probe.set_cache_size(cache.len() as u32); + } + } + break; +} +``` + +Safe because Loads/Full tiers are proven stable (delay-slot fix in place, helper limits set). The "start at Alu and promote" progression is an artifact of older debugging needs. + +## History +This single change moved the JIT from 3.3% coverage to 8.7% — biggest single-change win in the optimization pass. + diff --git a/rules/jit/profile-persistence.md b/rules/jit/profile-persistence.md new file mode 100644 index 0000000..3283639 --- /dev/null +++ b/rules/jit/profile-persistence.md @@ -0,0 +1,56 @@ +# JIT Profile Persistence — Deferred Lazy Replay + +## Rule + +Profile replay must use two-phase deferred lazy replay. NEVER pre-compile +blocks from a saved profile at startup. + +## Phase 1 — Boot (interpreter only) + +Load profile into a VecDeque but do NOT compile any blocks. Wait for the first +userspace PC (pc32 < 0x80000000 = kuseg), then count 100 consecutive probes +before activating replay. PROM and early kernel init run in kseg0/kseg1 and +must not be disturbed by profile replay's debug_fetch_instr calls. + +## Phase 2 — Drip-feed (one block per probe, background) + +After boot settles, compile one profile entry per probe as BACKGROUND work. +Normal compilation runs first (compile current PC if cache miss), then +opportunistically pop one profile entry. This spreads L2 cache pressure +across normal execution. + +## Key constraints + +- **Kernel-only on save**: only persist blocks with virt_pc >= 0x80000000. + Userspace blocks are per-process and ephemeral — a saved VA may belong to + a different process next session. Saving userspace blocks caused unbounded + profile growth (27K → 114K → ...) and post-login corruption. + +- **Re-derive phys_pc on replay**: saved phys_pc is for diagnostics only. + TLB state differs between sessions. Call translate_pc() to get current + phys_pc; discard entry silently if translate fails (page not mapped). + +- **Content hash validation**: FNV-1a 32-bit hash of raw instruction words, + computed at compile time and stored in both CompiledBlock and ProfileEntry. + On replay, re-trace and compare hash. Mismatch = different code at same + VA (different DSO). Discard silently. + +- **Speculative re-entry**: replayed blocks use compile_block's standard + speculative flag (!block_has_stores). Load-only blocks are speculative + and re-prove stability via normal rollback/demotion path. + +- **Atomic save**: write to tmp file, rename. Prevents truncated profiles + from interrupted writes. + +- **Sort by hit_count**: on save, sort entries hottest-first. On load, the + queue drains hottest blocks first for fastest time-to-coverage. + +## Why not pre-compile + +Two attempts at bulk pre-compilation both broke IRIX boot: +1. 
Synchronous compile of 27K blocks → starved device threads → PROM hang +2. Incremental compile after PROM → bulk debug_fetch_instr evicts L2/D-cache + lines the kernel depends on → UTLB panic + +The drip-feed approach (one per probe) makes L2 pressure identical to +normal on-demand compilation. diff --git a/rules/jit/speculative-safety-net.md b/rules/jit/speculative-safety-net.md new file mode 100644 index 0000000..62951e3 --- /dev/null +++ b/rules/jit/speculative-safety-net.md @@ -0,0 +1,51 @@ +# JIT Speculative Execution Is a Safety Net, Not Just an Optimization + +## Rule + +All compiled blocks that do NOT contain store instructions MUST be speculative. +The speculative flag enables snapshot/rollback/demotion — without it, Cranelift +codegen errors persist permanently and silently corrupt emulator state. + +## Why + +Cranelift regalloc2 produces occasional miscompilations in blocks with multiple +helper-call diamonds (ok_block/exc_block CFG patterns from load helpers). These +are rare (perhaps 1 in 10,000 blocks) but fatal over billions of block +executions. + +The speculative execution path provides three-layer defense: +1. **Snapshot** before block entry captures correct pre-block state +2. **Rollback** on exception restores correct state, preventing propagation +3. **Demotion** after 3 exceptions replaces the bad block with a lower-tier + version that doesn't compile the problematic instruction + +Non-speculative blocks have NONE of these defenses. A codegen error that +produces a wrong GPR value persists in the executor state permanently, with +no demotion trigger and no escape. + +## Evidence (isolation matrix, 2026-04-16) + +| Configuration | Speculative? | Result | +|---|---|---| +| Loads tier (load-only blocks) | Yes | 3/3 clean | +| Full tier (load+store blocks) | No | 0/3 clean | +| Full tier (load-only, forced non-speculative) | No | 3/3 broken | +| Full tier (load-only, speculative) | Yes | 3/3 clean | + +The ONLY variable that correlated with success was the speculative flag. +Instruction mix, tier label, and block length were all irrelevant. + +## How to apply + +```rust +// In compile_block: +speculative: !block_has_stores(instrs), + +// In trace_block: terminate Full-tier blocks before stores +if tier == BlockTier::Full && is_compilable_store(&d) { + break; +} +``` + +This ensures ALL compiled blocks are load-only → always speculative → +always self-healing. Stores execute via interpreter. diff --git a/rules/jit/store-compilation.md b/rules/jit/store-compilation.md index 8dd9bf6..4812bf8 100644 --- a/rules/jit/store-compilation.md +++ b/rules/jit/store-compilation.md @@ -1,30 +1,43 @@ # JIT Store Compilation Rules -## Full-tier blocks must be non-speculative +## Full-tier blocks must terminate BEFORE the first store -Set `speculative: tier != BlockTier::Full` in the compiler. - -**Why:** Snapshot rollback restores CPU+TLB but NOT memory. If a store block -does read-modify-write (LW, ADDIU, SW) and then hits an exception, rollback -rewinds CPU to pre-block state but memory has the modified value. The -interpreter re-runs from block entry, reads the modified value, modifies it -again. Counters become N+2 instead of N+1. This corrupts kernel data structures. 
- -## Full-tier blocks must terminate at the first store - -In trace_block, break after pushing the first store instruction at Full tier: +In trace_block, break BEFORE pushing a store instruction at Full tier: ```rust if tier == BlockTier::Full && is_compilable_store(&d) { + record_termination(&d, tier); break; } ``` -**Why:** Long blocks with multiple load/store helper calls create complex CFG -(ok_block/exc_block diamond patterns per helper). This triggers Cranelift -regalloc2 codegen issues on x86_64 — rare but fatal corruption that manifests -after millions of block executions. Short blocks (~3-10 instructions) work -perfectly. Confirmed empirically: short blocks = stable with 5K+ Full -promotions; long blocks = crash at 780M instructions. +**Why:** This ensures all Full-tier blocks are load-only → always speculative → +self-healing via rollback+demotion. Cranelift regalloc2 produces occasional +codegen errors in blocks with multiple ok_block/exc_block helper-call diamonds. +At Loads tier, speculative rollback catches these errors and demotes bad blocks. +Non-speculative blocks (which store-containing blocks must be) have no safety +net — codegen errors persist permanently, corrupting state silently. + +By terminating before stores, all compiled blocks stay load-only, all get the +speculative safety net, and stores execute via the interpreter where they're +always correct. + +Confirmed via systematic isolation matrix (2026-04-16): +- Loads tier (speculative): 3/3 clean +- Full tier with stores compiled (non-speculative): 0/3 clean (hang, broken, broken) +- Full tier store-free but non-speculative: 3/3 broken +- Full tier store-free and speculative: 3/3 clean + +## Speculative flag must be based on store presence, not tier + +```rust +speculative: !block_has_stores(instrs) +``` + +**NOT** `speculative: tier != BlockTier::Full`. The old rule was overly broad — +it made ALL Full-tier blocks non-speculative, including load-only blocks that +are safe to roll back. The correct rule: only blocks containing stores +(SB/SH/SW/SD) need to be non-speculative, because rollback can't undo memory +writes. ## Write helpers must use status != EXEC_COMPLETE @@ -42,21 +55,22 @@ uncached writes (MMIO stores to device registers), causing slow corruption. Verify mode snapshots CPU/TLB but NOT memory. After a JIT block with stores modifies memory, the interpreter re-run reads the JIT-modified values. Read-modify-write sequences get double-applied. Verify mode is only valid -for ALU and Load tiers. +for ALU and Load tiers. Running VERIFY with Full-tier store blocks causes +kernel panics (confirmed 3/3 kernel panic in isolation matrix). ## Delay-slot stores should be excluded from compilation In trace_block, when checking the delay slot instruction for a branch, exclude -stores: +stores (and loads — any faulting instruction): ```rust -if is_compilable_for_tier(&delay_d, tier) && !is_compilable_store(&delay_d) { +let delay_can_fault = is_compilable_load(&delay_d) || is_compilable_store(&delay_d); +if is_compilable_for_tier(&delay_d, tier) && !delay_can_fault { instrs.push((delay_raw, delay_d)); delay_ok = true; } ``` -**Why:** If a delay-slot store faults, sync_to_executor clears in_delay_slot. -exec.step() re-executes the store as a non-delay-slot instruction. -handle_exception sets cp0_epc to the store PC (not the branch PC) and doesn't -set the BD bit. On ERET, the branch is permanently skipped, corrupting control -flow. This is defensive — the block length fix is the primary fix for stores. 
+**Why:** If a delay-slot instruction faults, sync_to_executor clears +in_delay_slot. exec.step() re-executes as a non-delay-slot instruction. +handle_exception sets cp0_epc to the instruction PC (not the branch PC) and +doesn't set the BD bit. On ERET, the branch is permanently skipped. diff --git a/src/devlog.rs b/src/devlog.rs index d547091..6ae1c43 100644 --- a/src/devlog.rs +++ b/src/devlog.rs @@ -179,12 +179,17 @@ impl ModuleLog { // ── DevLog ─────────────────────────────────────────────────────────────────── +/// Generic write+send sink used by DevLog. Holds either a TCP monitor connection +/// or a host stream like stderr (for diagnostic mode). +pub type DevLogWriter = Arc<Mutex<dyn Write + Send>>; + pub struct DevLog { /// Fast-path gate: true if any module is enabled or any file sink is open. /// Checked before taking any lock. When false, dlog! is a two-load no-op. any_active: AtomicBool, - /// All currently connected monitor clients. Entries are pruned on write error. - writers: Mutex<Vec<Arc<Mutex<BufWriter<TcpStream>>>>>, + /// All currently connected monitor clients (and any host sinks like stderr). + /// Entries are pruned on write error. + writers: Mutex<Vec<DevLogWriter>>, modules: [ModuleLog; LogModule::COUNT], } @@ -212,6 +217,11 @@ impl DevLog { // any_active is set when a module is enabled. } + /// Register a generic Write+Send sink (e.g. stderr for diagnostic runs). + pub fn add_sink(&self, sink: DevLogWriter) { + self.writers.lock().push(sink); + } + /// Enable a module. Output will go to all connected monitor clients. pub fn enable(&self, m: LogModule) { self.modules[m as usize].enabled.store(true, Ordering::Relaxed); diff --git a/src/hal2.rs b/src/hal2.rs index 55fe6b7..bbfda31 100644 --- a/src/hal2.rs +++ b/src/hal2.rs @@ -305,17 +305,36 @@ fn open_persistent_output() -> Option { buffer_size: cpal::BufferSize::Default, }; let ring_size = prebuf_samples(rate) * RING_BUF_MULTIPLIER; - let (producer, mut consumer) = RingBuffer::<i16>::new(ring_size); - let err_fn = |err| { eprintln!("HAL2: cpal stream error: {:?}", err); }; - let data_fn = move |data: &mut [i16], _: &cpal::OutputCallbackInfo| { - for sample in data.iter_mut() { - *sample = consumer.pop().unwrap_or(0); + let err_fn = |err: cpal::StreamError| { eprintln!("HAL2: cpal stream error: {:?}", err); }; + + // Try f32 first (macOS CoreAudio native), then i16 (Linux ALSA). + let (producer, stream) = { + let (p, mut c) = RingBuffer::<i16>::new(ring_size); + let data_fn = move |data: &mut [f32], _: &cpal::OutputCallbackInfo| { + for sample in data.iter_mut() { + *sample = c.pop().unwrap_or(0) as f32 / 32768.0; + } + }; + match device.build_output_stream(&config, data_fn, err_fn.clone(), None) { + Ok(s) => (p, s), + Err(_) => { + // f32 failed, try i16 + let (p, mut c) = RingBuffer::<i16>::new(ring_size); + let data_fn = move |data: &mut [i16], _: &cpal::OutputCallbackInfo| { + for sample in data.iter_mut() { + *sample = c.pop().unwrap_or(0); + } + }; + match device.build_output_stream(&config, data_fn, err_fn.clone(), None) { + Ok(s) => (p, s), + Err(e) => { + eprintln!("HAL2: cpal build_output_stream failed at {}Hz: {:?}", rate, e); + continue; + } + } + } } }; - let stream = match device.build_output_stream(&config, data_fn, err_fn, None) { - Ok(s) => s, - Err(_) => continue, - }; if stream.play().is_err() { continue; } println!("HAL2: audio output: {:?} via {:?} at {}Hz", device.name().unwrap_or_default(), host.id(), rate); @@ -417,10 +436,14 @@ impl Hal2 { let id = tm.add_recurring(Instant::now() + period, period, (), move |_| { let mut st = ca_state.lock(); - // No audio output — nothing to do. 
+ // No audio output — still drain DMA so the kernel doesn't hang + // waiting for PDMA_CTRL_ACT to clear. let stream_rate = match st.out.as_ref() { Some(o) => o.stream_rate, - None => return TimerReturn::Continue, + None => { + let _ = read_frame_from(&dma_client, mode); + return TimerReturn::Continue; + } }; // (Re)build resampler if codec rate changed. diff --git a/src/jit/cache.rs b/src/jit/cache.rs index 9f7eb23..6388fbb 100644 --- a/src/jit/cache.rs +++ b/src/jit/cache.rs @@ -74,14 +74,24 @@ pub struct CompiledBlock { pub stable_hits: u32, /// True when this block is in a trial period (not yet fully trusted at current tier). pub speculative: bool, + /// FNV-1a hash of the raw instruction words; used to detect stale profile + /// entries when a different DSO is loaded at the same virtual address. + pub content_hash: u32, } // Safety: CompiledBlock is only accessed from the CPU thread. unsafe impl Send for CompiledBlock {} -/// Code cache keyed by physical PC (aligned to 4 bytes). +/// Code cache keyed by (physical PC, virtual PC). +/// +/// Physical PC alone is insufficient: compiled blocks bake virtual PC constants +/// for exit PC and branch targets. When different virtual addresses map to the +/// same physical page (shared libraries, fork), a block compiled for virtual +/// address A would produce wrong exit PCs when executed at virtual address B. +/// Including the virtual PC in the key ensures each virtual mapping gets its +/// own correctly-compiled block. pub struct CodeCache { - blocks: HashMap<u64, CompiledBlock>, + blocks: HashMap<(u64, u64), CompiledBlock>, } impl CodeCache { @@ -91,26 +101,30 @@ } } - pub fn lookup(&self, phys_pc: u64) -> Option<&CompiledBlock> { - self.blocks.get(&phys_pc) + pub fn lookup(&self, phys_pc: u64, virt_pc: u64) -> Option<&CompiledBlock> { + self.blocks.get(&(phys_pc, virt_pc)) } - pub fn lookup_mut(&mut self, phys_pc: u64) -> Option<&mut CompiledBlock> { - self.blocks.get_mut(&phys_pc) + pub fn contains(&self, phys_pc: u64, virt_pc: u64) -> bool { + self.blocks.contains_key(&(phys_pc, virt_pc)) } - pub fn insert(&mut self, phys_pc: u64, block: CompiledBlock) { - self.blocks.insert(phys_pc, block); + pub fn lookup_mut(&mut self, phys_pc: u64, virt_pc: u64) -> Option<&mut CompiledBlock> { + self.blocks.get_mut(&(phys_pc, virt_pc)) } - pub fn replace(&mut self, phys_pc: u64, block: CompiledBlock) { - self.blocks.insert(phys_pc, block); + pub fn insert(&mut self, phys_pc: u64, virt_pc: u64, block: CompiledBlock) { + self.blocks.insert((phys_pc, virt_pc), block); + } + + pub fn replace(&mut self, phys_pc: u64, virt_pc: u64, block: CompiledBlock) { + self.blocks.insert((phys_pc, virt_pc), block); } /// Invalidate all blocks that overlap a physical address range. /// Called when self-modifying code is detected or CACHE instruction executes. 
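/// A block survives only if it lies entirely outside [phys_start, phys_end);
/// any overlap drops it. The virtual half of the key is ignored here.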
pub fn invalidate_range(&mut self, phys_start: u64, phys_end: u64) { - self.blocks.retain(|&addr, block| { + self.blocks.retain(|&(addr, _), block| { let block_end = addr + (block.len_mips as u64 * 4); addr >= phys_end || block_end <= phys_start }); } @@ -125,7 +139,7 @@ self.blocks.len() } - pub fn iter(&self) -> impl Iterator<Item = (&u64, &CompiledBlock)> { + pub fn iter(&self) -> impl Iterator<Item = (&(u64, u64), &CompiledBlock)> { self.blocks.iter() } } diff --git a/src/jit/codegen_test.rs b/src/jit/codegen_test.rs new file mode 100644 index 0000000..2ea3d8f --- /dev/null +++ b/src/jit/codegen_test.rs @@ -0,0 +1,1587 @@ +/// JIT codegen tests: compile single MIPS basic blocks via Cranelift and compare +/// results against the interpreter for the same starting state. +/// +/// Each test: +/// 1. Creates an executor with PassthroughTlb + PassthroughCache + MockMemory. +/// 2. Writes instruction word(s) to MockMemory at TEST_PC (kseg0 0x80010000). +/// 3. Sets up initial GPR/hi/lo state. +/// 4. Runs the JIT path (compile + sync_from + call + sync_to). +/// 5. Restores identical state and runs the interpreter path (step()). +/// 6. Panics with a diff if any GPR, PC, hi, or lo differs. +#[cfg(all(test, feature = "jit"))] +mod tests { + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; + + use crate::jit::cache::BlockTier; + use crate::jit::compiler::BlockCompiler; + use crate::jit::context::JitContext; + use crate::jit::helpers::HelperPtrs; + use crate::mips_cache_v2::PassthroughCache; + use crate::mips_exec::{decode_into, DecodedInstr, MipsCpuConfig, MipsExecutor}; + use crate::mips_isa::*; + use crate::mips_tlb::PassthroughTlb; + use crate::traits::{BusDevice, BusRead16, BusRead32, BusRead64, BusRead8, BUS_OK}; + + // Virtual PC in kseg0 (0x80000000–0x9FFFFFFF), maps to physical 0x00010000 via & 0x1FFFFFFF. + const TEST_PC: u64 = 0x8001_0000; + // Virtual data address in kseg0, maps to physical 0x00020000. 
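+ // (As with TEST_PC above, kseg0 VAs translate by masking: phys = va & 0x1FFF_FFFF.)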
+ const DATA_ADDR: u64 = 0x8002_0000; + + // ── MockMemory ─────────────────────────────────────────────────────────────── + + pub struct MockMemory { + pub data: Mutex<HashMap<u64, u8>>, + } + + impl MockMemory { + pub fn new() -> Self { + Self { data: Mutex::new(HashMap::new()) } + } + + pub fn get_byte(&self, addr: u64) -> u8 { + *self.data.lock().unwrap().get(&addr).unwrap_or(&0) + } + + pub fn set_byte(&self, addr: u64, val: u8) { + self.data.lock().unwrap().insert(addr, val); + } + + pub fn get_word(&self, addr: u64) -> u32 { + let mut b = [0u8; 4]; + for i in 0..4 { b[i] = self.get_byte(addr + i as u64); } + u32::from_be_bytes(b) + } + + pub fn set_word(&self, addr: u64, val: u32) { + let b = val.to_be_bytes(); + for i in 0..4 { self.set_byte(addr + i as u64, b[i]); } + } + + pub fn get_double(&self, addr: u64) -> u64 { + let mut b = [0u8; 8]; + for i in 0..8 { b[i] = self.get_byte(addr + i as u64); } + u64::from_be_bytes(b) + } + + pub fn set_double(&self, addr: u64, val: u64) { + let b = val.to_be_bytes(); + for i in 0..8 { self.set_byte(addr + i as u64, b[i]); } + } + } + + impl BusDevice for MockMemory { + fn read8(&self, addr: u32) -> BusRead8 { BusRead8::ok(self.get_byte(addr as u64)) } + fn write8(&self, addr: u32, val: u8) -> u32 { self.set_byte(addr as u64, val); BUS_OK } + fn read16(&self, addr: u32) -> BusRead16 { + let a = (addr & !1) as u64; + let mut b = [0u8; 2]; + for i in 0..2 { b[i] = self.get_byte(a + i as u64); } + BusRead16::ok(u16::from_be_bytes(b)) + } + fn write16(&self, addr: u32, val: u16) -> u32 { + let a = (addr & !1) as u64; + let b = val.to_be_bytes(); + for i in 0..2 { self.set_byte(a + i as u64, b[i]); } + BUS_OK + } + fn read32(&self, addr: u32) -> BusRead32 { + BusRead32::ok(self.get_word((addr & !3) as u64)) + } + fn write32(&self, addr: u32, val: u32) -> u32 { + self.set_word((addr & !3) as u64, val); + BUS_OK + } + fn read64(&self, addr: u32) -> BusRead64 { + BusRead64::ok(self.get_double((addr & !7) as u64)) + } + fn write64(&self, addr: u32, val: u64) -> u32 { + self.set_double((addr & !7) as u64, val); + BUS_OK + } + } + + // ── Executor factory ───────────────────────────────────────────────────────── + + fn create_executor() -> (MipsExecutor, Arc<MockMemory>) { + let mem = Arc::new(MockMemory::new()); + let bus: Arc<dyn BusDevice> = mem.clone(); + let cfg = MipsCpuConfig::indy(); + let exec = MipsExecutor::new(bus, PassthroughTlb::default(), &cfg); + (exec, mem) + } + + // ── Instruction word builders ───────────────────────────────────────────────── + + fn make_r(op: u32, rs: u32, rt: u32, rd: u32, sa: u32, funct: u32) -> u32 { + (op << 26) | ((rs & 0x1F) << 21) | ((rt & 0x1F) << 16) | ((rd & 0x1F) << 11) + | ((sa & 0x1F) << 6) | (funct & 0x3F) + } + + fn make_i(op: u32, rs: u32, rt: u32, imm: u16) -> u32 { + (op << 26) | ((rs & 0x1F) << 21) | ((rt & 0x1F) << 16) | (imm as u32) + } + + fn make_j(op: u32, target: u32) -> u32 { + (op << 26) | (target & 0x3FF_FFFF) + } + + // NOP = SLL $0, $0, 0 = 0x0000_0000 + const NOP: u32 = 0; + + // ── Test harness ───────────────────────────────────────────────────────────── + + /// State snapshot for comparison. 
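+ /// Holds just the architectural state these tests diff: GPRs, pc, hi and lo.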
+ #[derive(Clone, Debug)] + struct CpuState { + gpr: [u64; 32], + pc: u64, + hi: u64, + lo: u64, + } + + impl CpuState { + fn capture(exec: &MipsExecutor) -> Self { + CpuState { + gpr: exec.core.gpr, + pc: exec.core.pc, + hi: exec.core.hi, + lo: exec.core.lo, + } + } + + fn restore(&self, exec: &mut MipsExecutor) { + exec.core.gpr = self.gpr; + exec.core.pc = self.pc; + exec.core.hi = self.hi; + exec.core.lo = self.lo; + exec.in_delay_slot = false; + exec.delay_slot_target = 0; + } + } + + fn diff_states(label: &str, jit: &CpuState, interp: &CpuState) { + let mut diffs = Vec::new(); + for i in 0..32 { + if jit.gpr[i] != interp.gpr[i] { + diffs.push(format!( + " gpr[{i:2}]: JIT={:#018x} INTERP={:#018x}", + jit.gpr[i], interp.gpr[i] + )); + } + } + if jit.pc != interp.pc { + diffs.push(format!( + " pc: JIT={:#018x} INTERP={:#018x}", + jit.pc, interp.pc + )); + } + if jit.hi != interp.hi { + diffs.push(format!( + " hi: JIT={:#018x} INTERP={:#018x}", + jit.hi, interp.hi + )); + } + if jit.lo != interp.lo { + diffs.push(format!( + " lo: JIT={:#018x} INTERP={:#018x}", + jit.lo, interp.lo + )); + } + if !diffs.is_empty() { + panic!("{label}: JIT vs interpreter mismatch:\n{}", diffs.join("\n")); + } + } + + /// Build a decoded instruction list from raw words at TEST_PC (kseg0). + /// The physical address is TEST_PC & 0x1FFFFFFF. Instrs are written to MockMemory + /// so the interpreter's fetch path also works. + fn prepare_block( + exec: &mut MipsExecutor, + mem: &MockMemory, + words: &[u32], + ) -> Vec<(u32, DecodedInstr)> { + // Write instruction bytes at the physical address + let phys = (TEST_PC & 0x1FFF_FFFF) as u64; + for (i, &w) in words.iter().enumerate() { + mem.set_word(phys + i as u64 * 4, w); + } + // Invalidate the instruction cache so the interpreter fetches fresh bytes + exec.core.nanotlb_invalidate(); + + // Build decoded instruction list (mirrors trace_block logic without the private call) + words.iter().enumerate().map(|(_, &raw)| { + let mut d = DecodedInstr::default(); + d.raw = raw; + decode_into::(&mut d); + (raw, d) + }).collect() + } + + /// Run a single-block JIT test with `interp_steps` interpreter steps. 
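+ /// Panics (in diff_states) listing every GPR/pc/hi/lo mismatch found.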
+ /// + /// - `setup`: initialise GPRs/hi/lo before running + /// - `words`: instruction words (written to memory AND compiled as the JIT block) + /// - `tier`: compilation tier + /// - `interp_steps`: how many step() calls needed to produce equivalent state + /// (1 for non-branch, 2 for branch+delay-slot) + fn run_jit_test( + label: &str, + setup: impl Fn(&mut MipsExecutor), + words: &[u32], + tier: BlockTier, + interp_steps: usize, + ) { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + + let instrs = prepare_block(&mut exec, &mem, words); + + // Apply initial state + setup(&mut exec); + exec.core.pc = TEST_PC; + + // Capture state for restoring before the interpreter run + let saved = CpuState::capture(&exec); + + // ── JIT path ───────────────────────────────────────────────────────────── + let helpers = HelperPtrs::new::(); + let mut compiler = BlockCompiler::new(&helpers); + let block = compiler + .compile_block(&instrs, TEST_PC, tier) + .expect("compile_block returned None"); + + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + + let entry: extern "C" fn(*mut JitContext) = + unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + + let jit_state = CpuState::capture(&exec); + + // ── Interpreter path ───────────────────────────────────────────────────── + saved.restore(&mut exec); + + for _ in 0..interp_steps { + exec.step(); + } + + let interp_state = CpuState::capture(&exec); + + diff_states(label, &jit_state, &interp_state); + } + + /// Like run_jit_test but also compares MockMemory contents after execution + /// (for store tests). `mem_checks` is a list of (phys_addr, expected_bytes). + fn run_jit_store_test( + label: &str, + setup: impl Fn(&mut MipsExecutor, &MockMemory), + words: &[u32], + interp_steps: usize, + mem_checks: &[(u64, Vec)], + ) { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + + let instrs = prepare_block(&mut exec, &mem, words); + + setup(&mut exec, &mem); + exec.core.pc = TEST_PC; + + let saved = CpuState::capture(&exec); + + // ── JIT path ───────────────────────────────────────────────────────────── + let helpers = HelperPtrs::new::(); + let mut compiler = BlockCompiler::new(&helpers); + let block = compiler + .compile_block(&instrs, TEST_PC, BlockTier::Full) + .expect("compile_block returned None"); + + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + + let entry: extern "C" fn(*mut JitContext) = + unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + + let jit_state = CpuState::capture(&exec); + + // Verify memory contents after JIT execution + for (phys_addr, expected) in mem_checks { + for (i, &exp_byte) in expected.iter().enumerate() { + let got = mem.get_byte(phys_addr + i as u64); + assert_eq!(got, exp_byte, + "{label}: JIT memory mismatch at phys {:#010x}+{i}: got {got:#04x}, want {exp_byte:#04x}", + phys_addr); + } + } + + // ── Interpreter path ───────────────────────────────────────────────────── + // Clear memory area written by JIT so interpreter writes fresh + let data_phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + for i in 0..16 { + mem.set_byte(data_phys + i, 0); + } + saved.restore(&mut exec); + + for _ in 0..interp_steps { + exec.step(); + } + + let interp_state = CpuState::capture(&exec); + diff_states(label, &jit_state, &interp_state); + + // Re-verify memory against 
interpreter's writes + for (phys_addr, expected) in mem_checks { + for (i, &exp_byte) in expected.iter().enumerate() { + let got = mem.get_byte(phys_addr + i as u64); + assert_eq!(got, exp_byte, + "{label}: interp memory mismatch at phys {:#010x}+{i}: got {got:#04x}, want {exp_byte:#04x}", + phys_addr); + } + } + } + + // ── ALU register ops (OP_SPECIAL) ──────────────────────────────────────────── + + #[test] + fn test_jit_addu() { + // ADDU rd=3, rs=1, rt=2 + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_ADDU); + for (a, b) in [(10u64, 20u64), (0, 0), (0xFFFF_FFFF, 1), (0x7FFF_FFFE, 2)] { + run_jit_test( + &format!("ADDU {a:#x}+{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_subu() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_SUBU); + for (a, b) in [(30u64, 20u64), (0, 0), (0, 1), (0x80000000, 1)] { + run_jit_test( + &format!("SUBU {a:#x}-{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_and() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_AND); + for (a, b) in [ + (0xAAAA_AAAA_AAAA_AAAAu64, 0x5555_5555_5555_5555u64), + (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF), + (0, 0xFFFF_FFFF_FFFF_FFFF), + (0xDEAD_BEEF, 0xFFFF_0000), + ] { + run_jit_test( + &format!("AND {a:#x}&{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_or() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_OR); + for (a, b) in [ + (0xAAAA_AAAAu64, 0x5555_5555u64), + (0, 0), + (0xFFFF_FFFF_FFFF_FFFF, 0), + (0xDEAD_0000, 0x0000_BEEF), + ] { + run_jit_test( + &format!("OR {a:#x}|{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_xor() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_XOR); + for (a, b) in [ + (0xAAAA_AAAAu64, 0xAAAA_AAAAu64), + (0, 0xFFFF_FFFF_FFFF_FFFF), + (0x1234_5678_9ABC_DEF0, 0xFEDC_BA98_7654_3210), + (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF), + ] { + run_jit_test( + &format!("XOR {a:#x}^{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_nor() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_NOR); + for (a, b) in [ + (0u64, 0u64), + (0xAAAA_AAAA, 0x5555_5555), + (0xFFFF_FFFF_FFFF_FFFF, 0), + (0x1234_5678, 0x9ABC_DEF0), + ] { + run_jit_test( + &format!("NOR {a:#x} nor {b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_slt() { + // SLT rd=3, rs=1, rt=2 (signed compare) + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_SLT); + for (a, b) in [ + (0u64, 1u64), // 0 < 1: 1 + (1u64, 0u64), // 1 < 0: 0 + (0xFFFF_FFFF_FFFF_FFFF, 0), // -1 < 0: 1 + (0, 0xFFFF_FFFF_FFFF_FFFF), // 0 < -1: 0 + (0x7FFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF), // large pos < -1: 0 + ] { + run_jit_test( + &format!("SLT {a:#x} < {b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_sltu() { + // SLTU rd=3, rs=1, rt=2 (unsigned compare) + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_SLTU); + for (a, b) in [ + (0u64, 1u64), + (1u64, 0u64), + (0xFFFF_FFFF_FFFF_FFFF, 0), // large > 0: 0 + (0, 0xFFFF_FFFF_FFFF_FFFF), // 0 < large: 1 + (0x8000_0000_0000_0000, 
0x7FFF_FFFF_FFFF_FFFF), // min > max as unsigned + ] { + run_jit_test( + &format!("SLTU {a:#x} < {b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_daddu() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_DADDU); + for (a, b) in [ + (0u64, 0u64), + (0xFFFF_FFFF_FFFF_FFFF, 1), + (0x7FFF_FFFF_FFFF_FFFF, 1), + (0x1234_5678_9ABC_DEF0, 0x0FED_CBA9_8765_4321), + ] { + run_jit_test( + &format!("DADDU {a:#x}+{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_dsubu() { + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_DSUBU); + for (a, b) in [ + (100u64, 50u64), + (0, 1), + (0x8000_0000_0000_0000, 1), + (0, 0xFFFF_FFFF_FFFF_FFFF), + ] { + run_jit_test( + &format!("DSUBU {a:#x}-{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + // ── 32-bit shifts ───────────────────────────────────────────────────────────── + + #[test] + fn test_jit_sll() { + for sa in [0u32, 1, 8, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_SLL); + for val in [0u64, 1, 0xFFFF_FFFF, 0x8000_0000] { + run_jit_test( + &format!("SLL {val:#x} << {sa}"), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_srl() { + for sa in [0u32, 1, 8, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_SRL); + for val in [0u64, 0xFFFF_FFFF, 0x8000_0000, 0x8000_0001] { + run_jit_test( + &format!("SRL {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_sra() { + for sa in [0u32, 1, 8, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_SRA); + // Include a negative value (high bit set) to test arithmetic shift + for val in [0u64, 0x7FFF_FFFF, 0x8000_0000, 0xFFFF_FFFF] { + run_jit_test( + &format!("SRA {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_sllv() { + let instr = make_r(OP_SPECIAL, 2, 1, 3, 0, FUNCT_SLLV); // rd=3 = rt=1 << rs=2 + for (val, sa) in [(1u64, 0u64), (1, 16), (0xFFFF_FFFF, 1), (0x0000_FFFF, 31)] { + run_jit_test( + &format!("SLLV {val:#x} << {sa}"), + |e| { e.core.write_gpr(1, val); e.core.write_gpr(2, sa); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_srlv() { + let instr = make_r(OP_SPECIAL, 2, 1, 3, 0, FUNCT_SRLV); + for (val, sa) in [(0xFFFF_FFFFu64, 0u64), (0xFFFF_FFFF, 8), (0x8000_0000, 1), (1, 31)] { + run_jit_test( + &format!("SRLV {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); e.core.write_gpr(2, sa); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_srav() { + let instr = make_r(OP_SPECIAL, 2, 1, 3, 0, FUNCT_SRAV); + for (val, sa) in [(0x8000_0000u64, 0u64), (0x8000_0000, 1), (0x8000_0000, 16), (0x7FFF_FFFF, 31)] { + run_jit_test( + &format!("SRAV {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); e.core.write_gpr(2, sa); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + // ── 64-bit shifts ──────────────────────────────────────────────────────────── + + #[test] + fn test_jit_dsll() { + for sa in [0u32, 1, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_DSLL); + for val in [1u64, 0xFFFF_FFFF_FFFF_FFFF, 0x8000_0000_0000_0000] { + run_jit_test( + &format!("DSLL {val:#x} << {sa}"), + |e| { e.core.write_gpr(1, val); }, + 
&[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_dsrl() { + for sa in [0u32, 1, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_DSRL); + for val in [0xFFFF_FFFF_FFFF_FFFFu64, 1, 0x8000_0000_0000_0000] { + run_jit_test( + &format!("DSRL {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_dsra() { + for sa in [0u32, 1, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_DSRA); + for val in [0x8000_0000_0000_0000u64, 0x7FFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF] { + run_jit_test( + &format!("DSRA {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_dsll32() { + // DSLL32 sa=0 means shift by 32; sa=1 means shift by 33, etc. + for sa in [0u32, 1, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_DSLL32); + for val in [1u64, 0xFFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF] { + run_jit_test( + &format!("DSLL32 {val:#x} << {}", sa + 32), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_dsrl32() { + for sa in [0u32, 1, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_DSRL32); + for val in [0xFFFF_FFFF_FFFF_FFFFu64, 0x8000_0000_0000_0000, 1] { + run_jit_test( + &format!("DSRL32 {val:#x} >> {}", sa + 32), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_dsra32() { + for sa in [0u32, 1, 16, 31] { + let instr = make_r(OP_SPECIAL, 0, 1, 2, sa, FUNCT_DSRA32); + for val in [0x8000_0000_0000_0000u64, 0x7FFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF] { + run_jit_test( + &format!("DSRA32 {val:#x} >> {}", sa + 32), + |e| { e.core.write_gpr(1, val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + } + + #[test] + fn test_jit_dsllv() { + let instr = make_r(OP_SPECIAL, 2, 1, 3, 0, FUNCT_DSLLV); + for (val, sa) in [(1u64, 0u64), (1, 32), (0xFFFF_FFFF, 32), (0x1, 63)] { + run_jit_test( + &format!("DSLLV {val:#x} << {sa}"), + |e| { e.core.write_gpr(1, val); e.core.write_gpr(2, sa); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_dsrlv() { + let instr = make_r(OP_SPECIAL, 2, 1, 3, 0, FUNCT_DSRLV); + for (val, sa) in [(0xFFFF_FFFF_FFFF_FFFFu64, 0u64), (0xFFFF_FFFF_FFFF_FFFF, 32), (1, 1), (0x8000_0000_0000_0000, 63)] { + run_jit_test( + &format!("DSRLV {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); e.core.write_gpr(2, sa); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_dsrav() { + let instr = make_r(OP_SPECIAL, 2, 1, 3, 0, FUNCT_DSRAV); + for (val, sa) in [(0x8000_0000_0000_0000u64, 0u64), (0x8000_0000_0000_0000, 32), (0x8000_0000_0000_0000, 63), (0x7FFF_FFFF_FFFF_FFFF, 32)] { + run_jit_test( + &format!("DSRAV {val:#x} >> {sa}"), + |e| { e.core.write_gpr(1, val); e.core.write_gpr(2, sa); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + // ── Multiply / Divide ──────────────────────────────────────────────────────── + + #[test] + fn test_jit_mult() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_MULT); + for (a, b) in [ + (0u64, 0u64), + (1, 1), + (0x7FFF_FFFF, 2), + (0xFFFF_FFFF, 0xFFFF_FFFF), // -1 × -1 signed 32 + ] { + run_jit_test( + &format!("MULT {a:#x}*{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_multu() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_MULTU); + for (a, b) in [ + 
(0u64, 0u64), + (0xFFFF_FFFF, 0xFFFF_FFFF), + (0x8000_0000, 2), + (1234, 5678), + ] { + run_jit_test( + &format!("MULTU {a:#x}*{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_div_nonzero() { + // Only test non-zero divisors: interpreter returns EXEC_COMPLETE but leaves + // hi/lo unchanged on division-by-zero (undefined per MIPS spec); JIT uses + // a different safe-divisor fallback. Also skip INT_MIN / -1: the interpreter + // uses wrapping_div (result = INT_MIN) but Cranelift's sdiv raises SIGFPE. + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_DIV); + for (a, b) in [ + (100u64, 7u64), + (0xFFFF_FFFF, 2), // -1 / 2 signed + (1, 3), + (0x7FFF_FFFF, 0xFFFF_FFFF), // INT_MAX / -1 = -INT_MAX + ] { + run_jit_test( + &format!("DIV {a:#x}/{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_divu_nonzero() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_DIVU); + for (a, b) in [ + (100u64, 7u64), + (0xFFFF_FFFF, 2), + (0xDEAD_BEEF, 0x1000), + (1, 1), + ] { + run_jit_test( + &format!("DIVU {a:#x}/{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_dmult() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_DMULT); + for (a, b) in [ + (0u64, 0u64), + (1, 0xFFFF_FFFF_FFFF_FFFF), + (0x7FFF_FFFF_FFFF_FFFF, 2), + (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF), // (-1) * (-1) + ] { + run_jit_test( + &format!("DMULT {a:#x}*{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_dmultu() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_DMULTU); + for (a, b) in [ + (0u64, 0u64), + (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF), + (0x8000_0000_0000_0000, 2), + (0x1234_5678_9ABC_DEF0, 0x1), + ] { + run_jit_test( + &format!("DMULTU {a:#x}*{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_ddiv_nonzero() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_DDIV); + for (a, b) in [ + (100u64, 7u64), + (0xFFFF_FFFF_FFFF_FFFF, 2u64), // -1 / 2 + (1, 3), + (0x1234_5678_9ABC_DEF0, 0x1000), + ] { + run_jit_test( + &format!("DDIV {a:#x}/{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_ddivu_nonzero() { + let instr = make_r(OP_SPECIAL, 1, 2, 0, 0, FUNCT_DDIVU); + for (a, b) in [ + (100u64, 7u64), + (0xFFFF_FFFF_FFFF_FFFF, 2u64), + (0xDEAD_BEEF_CAFE_BABE, 0x100), + (1, 1), + ] { + run_jit_test( + &format!("DDIVU {a:#x}/{b:#x}"), + |e| { e.core.write_gpr(1, a); e.core.write_gpr(2, b); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + // ── HI/LO moves ────────────────────────────────────────────────────────────── + + #[test] + fn test_jit_mfhi_mflo() { + let mfhi = make_r(OP_SPECIAL, 0, 0, 3, 0, FUNCT_MFHI); + let mflo = make_r(OP_SPECIAL, 0, 0, 4, 0, FUNCT_MFLO); + for (hi_val, lo_val) in [ + (0u64, 0u64), + (0xDEAD_BEEF_CAFE_BABEu64, 0x1234_5678_9ABC_DEF0u64), + (0xFFFF_FFFF_FFFF_FFFF, 1), + ] { + run_jit_test( + &format!("MFHI hi={hi_val:#x}"), + |e| { e.core.hi = hi_val; }, + &[mfhi], + BlockTier::Alu, + 1, + ); + run_jit_test( + &format!("MFLO lo={lo_val:#x}"), + |e| { e.core.lo = lo_val; }, + &[mflo], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn 
test_jit_mthi_mtlo() { + let mthi = make_r(OP_SPECIAL, 1, 0, 0, 0, FUNCT_MTHI); + let mtlo = make_r(OP_SPECIAL, 1, 0, 0, 0, FUNCT_MTLO); + for val in [0u64, 0xDEAD_BEEF_CAFE_BABEu64, 0xFFFF_FFFF_FFFF_FFFF] { + run_jit_test( + &format!("MTHI rs={val:#x}"), + |e| { e.core.write_gpr(1, val); }, + &[mthi], + BlockTier::Alu, + 1, + ); + run_jit_test( + &format!("MTLO rs={val:#x}"), + |e| { e.core.write_gpr(1, val); }, + &[mtlo], + BlockTier::Alu, + 1, + ); + } + } + + // ── Conditional moves ──────────────────────────────────────────────────────── + + #[test] + fn test_jit_movz() { + // MOVZ rd=3, rs=1, rt=2 — if rt==0 then rd=rs else rd unchanged + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_MOVZ); + // rt=0: should move + run_jit_test( + "MOVZ rt=0", + |e| { e.core.write_gpr(1, 0xABCD); e.core.write_gpr(2, 0); e.core.write_gpr(3, 0x1234); }, + &[instr], + BlockTier::Alu, + 1, + ); + // rt!=0: should not move + run_jit_test( + "MOVZ rt!=0", + |e| { e.core.write_gpr(1, 0xABCD); e.core.write_gpr(2, 1); e.core.write_gpr(3, 0x1234); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + + #[test] + fn test_jit_movn() { + // MOVN rd=3, rs=1, rt=2 — if rt!=0 then rd=rs else rd unchanged + let instr = make_r(OP_SPECIAL, 1, 2, 3, 0, FUNCT_MOVN); + run_jit_test( + "MOVN rt!=0", + |e| { e.core.write_gpr(1, 0xABCD); e.core.write_gpr(2, 42); e.core.write_gpr(3, 0x1234); }, + &[instr], + BlockTier::Alu, + 1, + ); + run_jit_test( + "MOVN rt=0", + |e| { e.core.write_gpr(1, 0xABCD); e.core.write_gpr(2, 0); e.core.write_gpr(3, 0x1234); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + + // ── SYNC ───────────────────────────────────────────────────────────────────── + + #[test] + fn test_jit_sync() { + let instr = make_r(OP_SPECIAL, 0, 0, 0, 0, FUNCT_SYNC); + run_jit_test("SYNC", |_| {}, &[instr], BlockTier::Alu, 1); + } + + // ── Immediate ALU ops ──────────────────────────────────────────────────────── + + #[test] + fn test_jit_addiu() { + // ADDIU rt=2, rs=1, imm + for (rs_val, imm) in [ + (0u64, 0i16), + (10, -5), + (0xFFFF_FFFF, 1), + (0, -1), + (0x7FFF_FFFF, 1), + ] { + let instr = make_i(OP_ADDIU, 1, 2, imm as u16); + run_jit_test( + &format!("ADDIU {rs_val:#x}+{imm}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_daddiu() { + for (rs_val, imm) in [ + (0u64, 0i16), + (0xFFFF_FFFF_FFFF_FFFF, 1), + (0, -1), + (100, -200), + ] { + let instr = make_i(OP_DADDIU, 1, 2, imm as u16); + run_jit_test( + &format!("DADDIU {rs_val:#x}+{imm}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_slti() { + for (rs_val, imm) in [ + (0u64, 0i16), + (0xFFFF_FFFF_FFFF_FFFF, 0), // -1 < 0: 1 + (0, -1i16), // 0 < -1: 0 + (100, 200), + ] { + let instr = make_i(OP_SLTI, 1, 2, imm as u16); + run_jit_test( + &format!("SLTI {rs_val:#x} < {imm}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_sltiu() { + for (rs_val, imm) in [ + (0u64, 1u16), + (1u64, 0u16), + (0u64, 0u16), + (0xFFFF_FFFF_FFFF_FFFF, 1), // large < 1 unsigned: 0 + ] { + let instr = make_i(OP_SLTIU, 1, 2, imm); + run_jit_test( + &format!("SLTIU {rs_val:#x} < {imm}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_andi() { + for (rs_val, imm) in [ + (0xFFFF_FFFF_FFFF_FFFFu64, 0xFFFFu16), + (0xAAAA_AAAA_AAAA_AAAA, 0x5555), + (0xDEAD_BEEF, 0xFF00), + (0, 0xFFFF), + ] { + let instr = make_i(OP_ANDI, 1, 2, 
imm); + run_jit_test( + &format!("ANDI {rs_val:#x}&{imm:#x}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_ori() { + for (rs_val, imm) in [ + (0u64, 0xFFFFu16), + (0xFFFF_FFFF_FFFF_0000u64, 0xFFFF), + (0, 0), + (0xDEAD_0000, 0xBEEF), + ] { + let instr = make_i(OP_ORI, 1, 2, imm); + run_jit_test( + &format!("ORI {rs_val:#x}|{imm:#x}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_xori() { + for (rs_val, imm) in [ + (0xFFFFu64, 0xFFFFu16), + (0, 0xFFFF), + (0xDEAD_BEEF, 0xBEEF), + (0, 0), + ] { + let instr = make_i(OP_XORI, 1, 2, imm); + run_jit_test( + &format!("XORI {rs_val:#x}^{imm:#x}"), + |e| { e.core.write_gpr(1, rs_val); }, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + #[test] + fn test_jit_lui() { + for imm in [0u16, 1, 0x7FFF, 0x8000, 0xFFFF] { + let instr = make_i(OP_LUI, 0, 1, imm); + run_jit_test( + &format!("LUI imm={imm:#x}"), + |_| {}, + &[instr], + BlockTier::Alu, + 1, + ); + } + } + + // ── Branches ──────────────────────────────────────────────────────────────── + // + // Branch blocks include the branch + a NOP delay slot. JIT computes the exit + // PC directly. The interpreter needs 2 step() calls (branch + delay slot). + // Branch offset is encoded as the signed number of instructions relative to + // PC+4, sign-extended and shifted left 2. We use offset=+4 words (16 bytes + // ahead of the delay slot) for the taken case. + // + // TEST_PC = 0x80010000 + // branch at offset 0: PC+4 = 0x80010004 + // delay slot at offset 4: PC+8 = 0x80010008 + // taken target = PC+4 + imm*4 = 0x80010004 + 4*4 = 0x80010014 (imm=4) + // not-taken = PC+8 = 0x80010008 + + #[test] + fn test_jit_beq_taken() { + let branch = make_i(OP_BEQ, 1, 2, 4u16); // beq $1, $2, +4 (taken when equal) + run_jit_test( + "BEQ taken", + |e| { e.core.write_gpr(1, 42); e.core.write_gpr(2, 42); }, + &[branch, NOP], + BlockTier::Alu, + 2, + ); + } + + #[test] + fn test_jit_beq_not_taken() { + let branch = make_i(OP_BEQ, 1, 2, 4u16); + run_jit_test( + "BEQ not taken", + |e| { e.core.write_gpr(1, 1); e.core.write_gpr(2, 2); }, + &[branch, NOP], + BlockTier::Alu, + 2, + ); + } + + #[test] + fn test_jit_bne_taken() { + let branch = make_i(OP_BNE, 1, 2, 4u16); + run_jit_test( + "BNE taken", + |e| { e.core.write_gpr(1, 1); e.core.write_gpr(2, 2); }, + &[branch, NOP], + BlockTier::Alu, + 2, + ); + } + + #[test] + fn test_jit_bne_not_taken() { + let branch = make_i(OP_BNE, 1, 2, 4u16); + run_jit_test( + "BNE not taken", + |e| { e.core.write_gpr(1, 99); e.core.write_gpr(2, 99); }, + &[branch, NOP], + BlockTier::Alu, + 2, + ); + } + + #[test] + fn test_jit_blez() { + let branch = make_i(OP_BLEZ, 1, 0, 4u16); + // zero: taken + run_jit_test("BLEZ zero", |e| { e.core.write_gpr(1, 0); }, &[branch, NOP], BlockTier::Alu, 2); + // negative: taken + run_jit_test("BLEZ neg", |e| { e.core.write_gpr(1, 0xFFFF_FFFF_FFFF_FFFF); }, &[branch, NOP], BlockTier::Alu, 2); + // positive: not taken + run_jit_test("BLEZ pos", |e| { e.core.write_gpr(1, 1); }, &[branch, NOP], BlockTier::Alu, 2); + } + + #[test] + fn test_jit_bgtz() { + let branch = make_i(OP_BGTZ, 1, 0, 4u16); + // positive: taken + run_jit_test("BGTZ pos", |e| { e.core.write_gpr(1, 1); }, &[branch, NOP], BlockTier::Alu, 2); + // zero: not taken + run_jit_test("BGTZ zero", |e| { e.core.write_gpr(1, 0); }, &[branch, NOP], BlockTier::Alu, 2); + // negative: not taken + run_jit_test("BGTZ neg", |e| { e.core.write_gpr(1, 
0xFFFF_FFFF_FFFF_FFFF); }, &[branch, NOP], BlockTier::Alu, 2); + } + + #[test] + fn test_jit_j() { + // J target: target26 << 2 in the same 256MB region. + // TEST_PC = 0x8001_0000, PC+4 = 0x8001_0004 + // region = (PC+4) & 0xFFFF_FFFF_F000_0000 = 0x8000_0000 + // target26 encodes instr-words offset: let's jump to 0x8000_4000 + // target26 = (0x8000_4000 >> 2) & 0x3FF_FFFF = 0x1000 + let target26: u32 = (0x8000_4000u32 >> 2) & 0x3FF_FFFF; + let instr = make_j(OP_J, target26); + run_jit_test( + "J", + |_| {}, + &[instr, NOP], + BlockTier::Alu, + 2, + ); + } + + #[test] + fn test_jit_jal() { + // JAL stores PC+8 in $ra (r31) and jumps. + // TEST_PC = 0x8001_0000 → PC+8 = 0x8001_0008 + let target26: u32 = (0x8000_8000u32 >> 2) & 0x3FF_FFFF; + let instr = make_j(OP_JAL, target26); + run_jit_test( + "JAL", + |_| {}, + &[instr, NOP], + BlockTier::Alu, + 2, + ); + } + + #[test] + fn test_jit_jr() { + // JR $1 — jump to address in r1. + // r1 = 0x8003_0000 (must be in kseg0 and aligned) + let instr = make_r(OP_SPECIAL, 1, 0, 0, 0, FUNCT_JR); + run_jit_test( + "JR", + |e| { e.core.write_gpr(1, 0x8003_0000); }, + &[instr, NOP], + BlockTier::Alu, + 2, + ); + } + + // ── Loads ──────────────────────────────────────────────────────────────────── + // + // DATA_ADDR = 0x8002_0000, physical = 0x0002_0000. + // We pre-populate MockMemory at that physical address before each load test. + + #[test] + fn test_jit_lb() { + // LB rt=2, offset=0(rs=1): load signed byte + let instr = make_i(OP_LB, 1, 2, 0); + // Positive byte (0x42) + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + mem.set_byte((DATA_ADDR & 0x1FFF_FFFF) as u64, 0x42); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states("LB positive", &jit_state, &CpuState::capture(&exec)); + + // Negative byte (0x80 → -128) + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + mem.set_byte((DATA_ADDR & 0x1FFF_FFFF) as u64, 0x80); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states("LB negative", &jit_state, &CpuState::capture(&exec)); + } + + #[test] + fn test_jit_lbu() { + // LBU rt=2, offset=0(rs=1): load zero-extended byte + let instr = make_i(OP_LBU, 1, 2, 0); + for byte_val in [0u8, 0x42, 0x80, 0xFF] { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + mem.set_byte((DATA_ADDR & 0x1FFF_FFFF) as
u64, byte_val); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers) + .compile_block(&instrs, TEST_PC, BlockTier::Loads) + .unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states(&format!("LBU {byte_val:#x}"), &jit_state, &CpuState::capture(&exec)); + } + } + + #[test] + fn test_jit_lh() { + // LH rt=2, offset=0(rs=1): load signed halfword + let instr = make_i(OP_LH, 1, 2, 0); + for hw in [0u16, 0x1234, 0x8000, 0xFFFF] { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + let phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + mem.set_byte(phys, (hw >> 8) as u8); + mem.set_byte(phys + 1, (hw & 0xFF) as u8); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states(&format!("LH {hw:#x}"), &jit_state, &CpuState::capture(&exec)); + } + } + + #[test] + fn test_jit_lhu() { + let instr = make_i(OP_LHU, 1, 2, 0); + for hw in [0u16, 0x1234, 0x8000, 0xFFFF] { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + let phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + mem.set_byte(phys, (hw >> 8) as u8); + mem.set_byte(phys + 1, (hw & 0xFF) as u8); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states(&format!("LHU {hw:#x}"), &jit_state, &CpuState::capture(&exec)); + } + } + + #[test] + fn test_jit_lw() { + let instr = make_i(OP_LW, 1, 2, 0); + for wval in [0u32, 0x1234_5678, 0x8000_0000, 0xFFFF_FFFF] { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + mem.set_word((DATA_ADDR & 0x1FFF_FFFF) as u64, wval); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); +
ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states(&format!("LW {wval:#x}"), &jit_state, &CpuState::capture(&exec)); + } + } + + #[test] + fn test_jit_lwu() { + let instr = make_i(OP_LWU, 1, 2, 0); + for wval in [0u32, 0x1234_5678, 0x8000_0000, 0xFFFF_FFFF] { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + mem.set_word((DATA_ADDR & 0x1FFF_FFFF) as u64, wval); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states(&format!("LWU {wval:#x}"), &jit_state, &CpuState::capture(&exec)); + } + } + + #[test] + fn test_jit_ld() { + let instr = make_i(OP_LD, 1, 2, 0); + for dval in [0u64, 0x1234_5678_9ABC_DEF0, 0x8000_0000_0000_0000, 0xFFFF_FFFF_FFFF_FFFF] { + let (mut exec, mem) = create_executor(); + exec.core.pc = TEST_PC; + let instrs = prepare_block(&mut exec, &mem, &[instr]); + mem.set_double((DATA_ADDR & 0x1FFF_FFFF) as u64, dval); + exec.core.write_gpr(1, DATA_ADDR); + let saved = CpuState::capture(&exec); + let helpers = HelperPtrs::new::<MockMemory>(); + let block = BlockCompiler::new(&helpers).compile_block(&instrs, TEST_PC, BlockTier::Loads).unwrap(); + let mut ctx = JitContext::new(); + ctx.sync_from_executor(&exec); + ctx.executor_ptr = &mut exec as *mut _ as u64; + let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; + entry(&mut ctx); + ctx.sync_to_executor(&mut exec); + let jit_state = CpuState::capture(&exec); + saved.restore(&mut exec); + exec.step(); + diff_states(&format!("LD {dval:#x}"), &jit_state, &CpuState::capture(&exec)); + } + } + + // ── Stores ─────────────────────────────────────────────────────────────────── + + #[test] + fn test_jit_sb() { + // SB rt=2, offset=0(rs=1) + let instr = make_i(OP_SB, 1, 2, 0); + for bval in [0u64, 0x42, 0x80, 0xFF] { + let phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + run_jit_store_test( + &format!("SB {bval:#x}"), + |e, _| { e.core.write_gpr(1, DATA_ADDR); e.core.write_gpr(2, bval); }, + &[instr], + 1, + &[(phys, vec![(bval & 0xFF) as u8])], + ); + } + } + + #[test] + fn test_jit_sh() { + let instr = make_i(OP_SH, 1, 2, 0); + for hval in [0u64, 0x1234, 0x8000, 0xFFFF] { + let phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + let hw = (hval & 0xFFFF) as u16; + run_jit_store_test( + &format!("SH {hval:#x}"), + |e, _| { e.core.write_gpr(1, DATA_ADDR); e.core.write_gpr(2, hval); }, + &[instr], + 1, + &[(phys, vec![(hw >> 8) as u8, (hw & 0xFF) as u8])], + ); + } + } + + #[test] + fn test_jit_sw() { + let instr = make_i(OP_SW, 1, 2, 0); + for wval in [0u64, 0x1234_5678, 0x8000_0000, 0xFFFF_FFFF] { + let phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + let w = (wval & 0xFFFF_FFFF) as u32; + let wb = w.to_be_bytes(); + run_jit_store_test( + &format!("SW {wval:#x}"), + |e, _| { e.core.write_gpr(1, DATA_ADDR); e.core.write_gpr(2, wval); }, + &[instr], + 1, + &[(phys, wb.to_vec())], + ); + } + } + + #[test] + fn test_jit_sd() { + let instr = make_i(OP_SD, 1, 2, 0); +
for dval in [0u64, 0x1234_5678_9ABC_DEF0, 0x8000_0000_0000_0000, 0xFFFF_FFFF_FFFF_FFFF] { + let phys = (DATA_ADDR & 0x1FFF_FFFF) as u64; + let db = dval.to_be_bytes(); + run_jit_store_test( + &format!("SD {dval:#x}"), + |e, _| { e.core.write_gpr(1, DATA_ADDR); e.core.write_gpr(2, dval); }, + &[instr], + 1, + &[(phys, db.to_vec())], + ); + } + } +} diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs index cb5461b..d0df257 100644 --- a/src/jit/compiler.rs +++ b/src/jit/compiler.rs @@ -30,12 +30,22 @@ pub struct BlockCompiler { fn_write_u32: FuncId, fn_write_u64: FuncId, fn_interp_step: FuncId, + fn_mfc0: FuncId, + fn_dmfc0: FuncId, + fn_mtc0: FuncId, + fn_dmtc0: FuncId, } impl BlockCompiler { pub fn new(helpers: &HelperPtrs) -> Self { let mut flag_builder = settings::builder(); - flag_builder.set("opt_level", "speed").unwrap(); + // opt_level=none: Cranelift skips several optimization passes. + // Generated code is ~10-20% slower per instruction than "speed", + // but compile time drops 3-5x. Profiling showed 66% of MIPS-CPU + // thread time was in Cranelift passes, so this is the right trade-off + // for our interpreter-first JIT (hot blocks run a few hundred times + // before being superseded by chained neighbors). + flag_builder.set("opt_level", "none").unwrap(); flag_builder.set("is_pic", "false").unwrap(); let isa_builder = cranelift_native::builder().expect("host ISA not supported"); @@ -53,6 +63,10 @@ impl BlockCompiler { jit_builder.symbol("jit_write_u32", helpers.write_u32); jit_builder.symbol("jit_write_u64", helpers.write_u64); jit_builder.symbol("jit_interp_step", helpers.interp_step); + jit_builder.symbol("jit_mfc0", helpers.mfc0); + jit_builder.symbol("jit_dmfc0", helpers.dmfc0); + jit_builder.symbol("jit_mtc0", helpers.mtc0); + jit_builder.symbol("jit_dmtc0", helpers.dmtc0); let mut jit_module = JITModule::new(jit_builder); @@ -90,6 +104,13 @@ impl BlockCompiler { step_sig.returns.push(AbiParam::new(types::I64)); let fn_interp_step = jit_module.declare_function("jit_interp_step", Linkage::Import, &step_sig).unwrap(); + // mfc0/dmfc0(ctx_ptr, exec_ptr, rd) -> u64 — same shape as a read + let fn_mfc0 = jit_module.declare_function("jit_mfc0", Linkage::Import, &read_sig).unwrap(); + let fn_dmfc0 = jit_module.declare_function("jit_dmfc0", Linkage::Import, &read_sig).unwrap(); + // mtc0/dmtc0(ctx_ptr, exec_ptr, rd, value) -> u64 — same shape as a write + let fn_mtc0 = jit_module.declare_function("jit_mtc0", Linkage::Import, &write_sig).unwrap(); + let fn_dmtc0 = jit_module.declare_function("jit_dmtc0", Linkage::Import, &write_sig).unwrap(); + Self { ctx: jit_module.make_context(), jit_module, @@ -98,6 +119,8 @@ impl BlockCompiler { fn_read_u8, fn_read_u16, fn_read_u32, fn_read_u64, fn_write_u8, fn_write_u16, fn_write_u32, fn_write_u64, fn_interp_step, + fn_mfc0, fn_dmfc0, + fn_mtc0, fn_dmtc0, } } @@ -157,6 +180,10 @@ impl BlockCompiler { write_u32: self.jit_module.declare_func_in_func(self.fn_write_u32, &mut builder.func), write_u64: self.jit_module.declare_func_in_func(self.fn_write_u64, &mut builder.func), interp_step: self.jit_module.declare_func_in_func(self.fn_interp_step, &mut builder.func), + mfc0: self.jit_module.declare_func_in_func(self.fn_mfc0, &mut builder.func), + dmfc0: self.jit_module.declare_func_in_func(self.fn_dmfc0, &mut builder.func), + mtc0: self.jit_module.declare_func_in_func(self.fn_mtc0, &mut builder.func), + dmtc0: self.jit_module.declare_func_in_func(self.fn_dmtc0, &mut builder.func), }; // Load GPRs 1-31 from JitContext (gpr[0] is always 0) @@ -237,6 +264,45
@@ impl BlockCompiler { branch_exit_pc = Some(target_val); break; } + EmitResult::BranchLikely { taken, not_taken, cond } => { + compiled_count += 1; + idx += 1; + if idx < instrs.len() { + let old_gpr = gpr; + let old_hi = hi; + let old_lo = lo; + let old_modified = modified_gprs; + let (_, delay_d) = &instrs[idx]; + let delay_pc = block_pc.wrapping_add(idx as u64 * 4); + let delay_result = emit_instruction( + &mut builder, ctx_ptr, exec_ptr, &helpers, + &mut gpr, &mut hi, &mut lo, &mut modified_gprs, delay_d, delay_pc, tier, + ); + match delay_result { + EmitResult::Ok => { + for i in 1..32usize { + if gpr[i] != old_gpr[i] { + gpr[i] = builder.ins().select(cond, gpr[i], old_gpr[i]); + } + } + if hi != old_hi { hi = builder.ins().select(cond, hi, old_hi); } + if lo != old_lo { lo = builder.ins().select(cond, lo, old_lo); } + compiled_count += 1; + } + _ => { + gpr = old_gpr; + hi = old_hi; + lo = old_lo; + modified_gprs = old_modified; + compiled_count -= 1; + break; + } + } + } + let target = builder.ins().select(cond, taken, not_taken); + branch_exit_pc = Some(target); + break; + } EmitResult::Stop => break, } } @@ -289,6 +355,8 @@ impl BlockCompiler { let code_ptr = self.jit_module.get_finalized_function(func_id); let code_size = 0u32; // JITModule doesn't expose size easily; not critical + let content_hash = hash_block_instrs(instrs); + Some(CompiledBlock { entry: code_ptr, phys_addr: 0, // filled in by caller @@ -296,24 +364,51 @@ impl BlockCompiler { len_mips: compiled_count, len_native: code_size, tier, - // Full-tier blocks contain stores that modify memory. Speculative - // rollback restores CPU/TLB state but NOT memory, so read-modify-write - // sequences get double-applied on rollback. Non-speculative blocks skip - // snapshot/rollback — on exception, the store emitter's flushed GPRs and - // faulting PC (already in executor via sync_to) are used directly. - speculative: tier != BlockTier::Full, + // Speculative blocks get snapshot/rollback on exception, providing + // self-healing: codegen errors cause exceptions → rollback to correct + // state → demotion after 3 failures → bad block replaced. + // + // Non-speculative is needed ONLY when the block contains stores, because + // rollback can't undo memory writes (RMW double-apply). Load-only blocks + // at any tier should always be speculative for the safety net. + speculative: !block_has_stores(instrs), hit_count: 0, exception_count: 0, stable_hits: 0, + content_hash, }) } } +/// Check if a block contains any store instructions (SB/SH/SW/SD). +/// Store-containing blocks must be non-speculative because rollback can't undo +/// memory writes. Load-only blocks should be speculative for codegen safety. +fn block_has_stores(instrs: &[(u32, DecodedInstr)]) -> bool { + use crate::mips_isa::*; + instrs.iter().any(|(_, d)| matches!(d.op as u32, OP_SB | OP_SH | OP_SW | OP_SD)) +} + +/// FNV-1a 32-bit hash of raw instruction words. Used to detect stale profile +/// entries: a different DSO loaded at the same virtual address will have the +/// same length but different instruction bytes. +pub fn hash_block_instrs(instrs: &[(u32, DecodedInstr)]) -> u32 { + let mut hash: u32 = 0x811c9dc5; + for (raw, _) in instrs { + for byte in raw.to_le_bytes() { + hash ^= byte as u32; + hash = hash.wrapping_mul(0x01000193); + } + } + hash +} + /// Helper function references for memory operations within a compiled function.
struct EmitHelpers { read_u8: FuncRef, read_u16: FuncRef, read_u32: FuncRef, read_u64: FuncRef, write_u8: FuncRef, write_u16: FuncRef, write_u32: FuncRef, write_u64: FuncRef, interp_step: FuncRef, + mfc0: FuncRef, dmfc0: FuncRef, + mtc0: FuncRef, dmtc0: FuncRef, } /// Result of emitting a single instruction. @@ -322,6 +417,9 @@ enum EmitResult { Ok, /// Instruction is a branch; the Value is the computed target PC. Branch(Value), + /// Branch-likely: delay slot only executes when taken. The compile loop + /// uses `select` to conditionally apply delay-slot side effects. + BranchLikely { taken: Value, not_taken: Value, cond: Value }, /// Instruction is not compilable — terminate block before it. Stop, } @@ -349,7 +447,7 @@ fn emit_instruction( match op { OP_SPECIAL => { - let result = emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct); + let result = emit_special(builder, gpr, hi, lo, d, rs, rt, rd, sa, funct, instr_pc); // Conservative: mark rd modified for all SPECIAL ops that return Ok. // Harmless for ops that don't write rd (JR, MTHI, MTLO) since flush // will simply store the still-valid value that was loaded at block entry. @@ -358,8 +456,8 @@ fn emit_instruction( } result } - OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } - OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_ADDI | OP_ADDIU => { emit_addiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } + OP_DADDI | OP_DADDIU => { emit_daddiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } OP_SLTI => { emit_slti(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } OP_SLTIU => { emit_sltiu(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } OP_ANDI => { emit_andi(builder, gpr, rs, rt, d); *modified_gprs |= 1 << rt; EmitResult::Ok } @@ -400,10 +498,82 @@ fn emit_instruction( OP_BLEZ => emit_blez(builder, gpr, rs, d, instr_pc, false), OP_BGTZ => emit_bgtz(builder, gpr, rs, d, instr_pc, false), + // --- Branch-likely --- + OP_BEQL => emit_beq(builder, gpr, rs, rt, d, instr_pc, true), + OP_BNEL => emit_bne(builder, gpr, rs, rt, d, instr_pc, true), + OP_BLEZL => emit_blez(builder, gpr, rs, d, instr_pc, true), + OP_BGTZL => emit_bgtz(builder, gpr, rs, d, instr_pc, true), + + // --- REGIMM: BLTZ / BGEZ / BLTZAL / BGEZAL + likely variants --- + OP_REGIMM => { + let rt_code = d.rt as u32; + match rt_code { + RT_BLTZ => emit_bltz(builder, gpr, rs, d, instr_pc, false), + RT_BGEZ => emit_bgez(builder, gpr, rs, d, instr_pc, false), + RT_BLTZL => emit_bltz(builder, gpr, rs, d, instr_pc, true), + RT_BGEZL => emit_bgez(builder, gpr, rs, d, instr_pc, true), + RT_BLTZAL => { + let link = builder.ins().iconst(types::I64, instr_pc.wrapping_add(8) as i64); + gpr[31] = link; + *modified_gprs |= 1u32 << 31; + emit_bltz(builder, gpr, rs, d, instr_pc, false) + } + RT_BGEZAL => { + let link = builder.ins().iconst(types::I64, instr_pc.wrapping_add(8) as i64); + gpr[31] = link; + *modified_gprs |= 1u32 << 31; + emit_bgez(builder, gpr, rs, d, instr_pc, false) + } + RT_BLTZALL => { + let link = builder.ins().iconst(types::I64, instr_pc.wrapping_add(8) as i64); + gpr[31] = link; + *modified_gprs |= 1u32 << 31; + emit_bltz(builder, gpr, rs, d, instr_pc, true) + } + RT_BGEZALL => { + let link = builder.ins().iconst(types::I64, instr_pc.wrapping_add(8) as i64); + gpr[31] = link; + *modified_gprs |= 1u32 << 31; + emit_bgez(builder, gpr, rs, d, instr_pc, true) + } + _ => 
EmitResult::Stop, + } + } + // --- Jumps --- OP_J => emit_j(builder, gpr, d, instr_pc), OP_JAL => { *modified_gprs |= 1 << 31; emit_jal(builder, gpr, d, instr_pc) } + // --- COP0: MFC0 / DMFC0 / MTC0 / DMTC0 --- + // CFC0/CTC0/TLB*/ERET still fall through to Stop. + OP_COP0 => { + let sub = rs as u32; // rs field encodes the COP0 operation + match sub { + RS_MFC0 | RS_DMFC0 => { + let helper = if sub == RS_MFC0 { helpers.mfc0 } else { helpers.dmfc0 }; + flush_modified_gprs(builder, gpr, ctx_ptr, modified_gprs); + let rd_val = builder.ins().iconst(types::I64, rd as i64); + let call = builder.ins().call(helper, &[ctx_ptr, exec_ptr, rd_val]); + let result = builder.inst_results(call)[0]; + gpr[rt] = result; + *modified_gprs |= 1u32 << rt; + EmitResult::Ok + } + RS_MTC0 | RS_DMTC0 => { + let helper = if sub == RS_MTC0 { helpers.mtc0 } else { helpers.dmtc0 }; + // Flush dirty GPRs so write_cp0 side effects (which may re- + // derive translation state from full register context) see + // a consistent picture. + flush_modified_gprs(builder, gpr, ctx_ptr, modified_gprs); + let rd_val = builder.ins().iconst(types::I64, rd as i64); + let value = gpr[rt]; + let _ = builder.ins().call(helper, &[ctx_ptr, exec_ptr, rd_val, value]); + EmitResult::Ok + } + _ => EmitResult::Stop, + } + } + _ => EmitResult::Stop, } } @@ -415,6 +585,7 @@ fn emit_special( lo: &mut Value, d: &DecodedInstr, rs: usize, rt: usize, rd: usize, sa: u32, funct: u32, + instr_pc: u64, ) -> EmitResult { match funct { // --- Shifts (immediate) --- @@ -477,13 +648,12 @@ fn emit_special( // --- JR / JALR --- FUNCT_JR => { let target = gpr[rs]; EmitResult::Branch(target) } FUNCT_JALR => { + // JALR rd, rs: jump to gpr[rs], link PC+8 into gpr[rd]. + // rd defaults to $ra ($31) when not specified in assembly. let target = gpr[rs]; - let instr_pc_plus_8 = d.imm; // we'll handle this in dispatch; for now use rd - // JALR stores return address in rd (default $ra=31) - // But we don't know the PC here... pass it via a different mechanism. - // Actually: JALR rd, rs — stores PC+8 in rd. - // We don't have the PC as a value here. Let's defer JALR to interpreter. - EmitResult::Stop + let link = builder.ins().iconst(types::I64, instr_pc.wrapping_add(8) as i64); + gpr[rd] = link; + EmitResult::Branch(target) } // --- SYNC (barrier, NOP for JIT) --- @@ -948,35 +1118,44 @@ fn emit_store( // The compiled block stores this PC and returns. Delay slots are handled by // the dispatch loop (the next instruction after the branch is interpreted). 
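// Illustrative sketch (not part of the patch; the helper name is invented):
// the exit-PC computation that emit_beq and friends build as Cranelift IR,
// restated as plain Rust. The emitters add d.imm directly, which implies the
// decoder stores the offset already sign-extended and pre-shifted; this
// version spells out the architectural sign-extend-and-shift from the raw
// 16-bit field instead.
fn beq_exit_pc(instr_pc: u64, imm16: u16, rs: u64, rt: u64) -> u64 {
    let offset = ((imm16 as i16 as i64) << 2) as u64; // sign-extend, x4 bytes
    let taken = instr_pc.wrapping_add(4).wrapping_add(offset);
    let not_taken = instr_pc.wrapping_add(8); // fall through past the delay slot
    // The JIT emits icmp + select rather than a real branch, so the compiled
    // block stays one straight-line region. With the test values above, BEQ at
    // 0x8001_0000 and imm16 = 4 gives taken = 0x8001_0014, not_taken = 0x8001_0008.
    if rs == rt { taken } else { not_taken }
}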
+fn emit_branch_result( + builder: &mut FunctionBuilder, taken: Value, not_taken: Value, cond: Value, likely: bool, +) -> EmitResult { + if likely { + EmitResult::BranchLikely { taken, not_taken, cond } + } else { + let target = builder.ins().select(cond, taken, not_taken); + EmitResult::Branch(target) + } +} + fn emit_beq( builder: &mut FunctionBuilder, gpr: &[Value; 32], - rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, + rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, likely: bool, ) -> EmitResult { let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); - let not_taken_pc = instr_pc.wrapping_add(8); // skip delay slot + let not_taken_pc = instr_pc.wrapping_add(8); let taken = builder.ins().iconst(types::I64, taken_pc as i64); let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); let cond = builder.ins().icmp(IntCC::Equal, gpr[rs], gpr[rt]); - let target = builder.ins().select(cond, taken, not_taken); - EmitResult::Branch(target) + emit_branch_result(builder, taken, not_taken, cond, likely) } fn emit_bne( builder: &mut FunctionBuilder, gpr: &[Value; 32], - rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, + rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, likely: bool, ) -> EmitResult { let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); let not_taken_pc = instr_pc.wrapping_add(8); let taken = builder.ins().iconst(types::I64, taken_pc as i64); let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); let cond = builder.ins().icmp(IntCC::NotEqual, gpr[rs], gpr[rt]); - let target = builder.ins().select(cond, taken, not_taken); - EmitResult::Branch(target) + emit_branch_result(builder, taken, not_taken, cond, likely) } fn emit_blez( builder: &mut FunctionBuilder, gpr: &[Value; 32], - rs: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, + rs: usize, d: &DecodedInstr, instr_pc: u64, likely: bool, ) -> EmitResult { let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); let not_taken_pc = instr_pc.wrapping_add(8); @@ -984,13 +1163,12 @@ fn emit_blez( let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); let zero = builder.ins().iconst(types::I64, 0); let cond = builder.ins().icmp(IntCC::SignedLessThanOrEqual, gpr[rs], zero); - let target = builder.ins().select(cond, taken, not_taken); - EmitResult::Branch(target) + emit_branch_result(builder, taken, not_taken, cond, likely) } fn emit_bgtz( builder: &mut FunctionBuilder, gpr: &[Value; 32], - rs: usize, d: &DecodedInstr, instr_pc: u64, _likely: bool, + rs: usize, d: &DecodedInstr, instr_pc: u64, likely: bool, ) -> EmitResult { let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); let not_taken_pc = instr_pc.wrapping_add(8); @@ -998,8 +1176,33 @@ fn emit_bgtz( let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); let zero = builder.ins().iconst(types::I64, 0); let cond = builder.ins().icmp(IntCC::SignedGreaterThan, gpr[rs], zero); - let target = builder.ins().select(cond, taken, not_taken); - EmitResult::Branch(target) + emit_branch_result(builder, taken, not_taken, cond, likely) +} + +fn emit_bltz( + builder: &mut FunctionBuilder, gpr: &[Value; 32], + rs: usize, d: &DecodedInstr, instr_pc: u64, likely: bool, +) -> EmitResult { + let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); + let not_taken_pc = instr_pc.wrapping_add(8); + let taken = builder.ins().iconst(types::I64, taken_pc as 
i64); + let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); + let zero = builder.ins().iconst(types::I64, 0); + let cond = builder.ins().icmp(IntCC::SignedLessThan, gpr[rs], zero); + emit_branch_result(builder, taken, not_taken, cond, likely) +} + +fn emit_bgez( + builder: &mut FunctionBuilder, gpr: &[Value; 32], + rs: usize, d: &DecodedInstr, instr_pc: u64, likely: bool, +) -> EmitResult { + let taken_pc = instr_pc.wrapping_add(4).wrapping_add(d.imm as i32 as i64 as u64); + let not_taken_pc = instr_pc.wrapping_add(8); + let taken = builder.ins().iconst(types::I64, taken_pc as i64); + let not_taken = builder.ins().iconst(types::I64, not_taken_pc as i64); + let zero = builder.ins().iconst(types::I64, 0); + let cond = builder.ins().icmp(IntCC::SignedGreaterThanOrEqual, gpr[rs], zero); + emit_branch_result(builder, taken, not_taken, cond, likely) } fn emit_j( diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs index c9d6c2e..ffd7994 100644 --- a/src/jit/dispatch.rs +++ b/src/jit/dispatch.rs @@ -7,10 +7,59 @@ //! The probe interval adapts dynamically: frequent cache hits → shorter interval //! (probe more often), frequent misses → longer interval (less overhead). +use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Mutex; + +// Diagnostic: counts how many times a specific (non-compilable) instruction +// type caused trace_block to terminate. Key encoding: +// bits 31..26: op +// bits 25..20: funct (for OP_SPECIAL) or rt (for OP_REGIMM), 0 otherwise +// bits 15..14: tier (0=Alu, 1=Loads, 2=Full) +// Values are occurrence counts. Printed at shutdown to guide which +// instructions to add next for the biggest block-length wins. +static TERMINATION_STATS: Mutex<Option<HashMap<u32, u64>>> = Mutex::new(None); + +fn record_termination(d: &DecodedInstr, tier: BlockTier) { + let op = d.op as u32; + let secondary = if op == crate::mips_isa::OP_SPECIAL { + d.funct as u32 + } else if op == crate::mips_isa::OP_REGIMM { + d.rt as u32 + } else { + 0 + }; + let tier_bits = match tier { + BlockTier::Alu => 0, + BlockTier::Loads => 1, + BlockTier::Full => 2, + }; + let key = (op << 26) | ((secondary & 0x3F) << 20) | (tier_bits << 14); + if let Ok(mut guard) = TERMINATION_STATS.lock() { + let map = guard.get_or_insert_with(HashMap::new); + *map.entry(key).or_insert(0) += 1; + } +} + +fn dump_termination_stats() { + let Ok(guard) = TERMINATION_STATS.lock() else { return; }; + let Some(map) = guard.as_ref() else { return; }; + if map.is_empty() { return; } + let mut entries: Vec<(u32, u64)> = map.iter().map(|(k, v)| (*k, *v)).collect(); + entries.sort_by(|a, b| b.1.cmp(&a.1)); + eprintln!("JIT: top block-termination causes (op/secondary/tier → count):"); + for (key, count) in entries.iter().take(20) { + let op = (key >> 26) & 0x3F; + let secondary = (key >> 20) & 0x3F; + let tier = (key >> 14) & 0x3; + let tier_name = match tier { 0 => "Alu", 1 => "Loads", 2 => "Full", _ => "?"
}; + eprintln!("  op={:#04x} secondary={:#04x} tier={} count={}", + op, secondary, tier_name, count); + } +} use crate::mips_exec::{MipsExecutor, DecodedInstr, EXEC_BREAKPOINT, decode_into}; -use crate::mips_tlb::{Tlb, AccessType}; +use crate::mips_tlb::Tlb; use crate::mips_cache_v2::MipsCache; use super::cache::{BlockTier, CodeCache, TierConfig}; @@ -19,6 +68,7 @@ use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION}; use super::helpers::HelperPtrs; use super::profile::{self, ProfileEntry}; use super::snapshot::CpuRollbackSnapshot; +use super::trace::{TraceWriter, TraceRecord}; const MAX_BLOCK_LEN: usize = 64; @@ -51,7 +101,7 @@ impl ProbeController { let base = std::env::var("IRIS_JIT_PROBE").ok() .and_then(|v| v.parse().ok()).unwrap_or(200u32); let min = std::env::var("IRIS_JIT_PROBE_MIN").ok() - .and_then(|v| v.parse().ok()).unwrap_or(100u32); + .and_then(|v| v.parse().ok()).unwrap_or(50u32); let max = std::env::var("IRIS_JIT_PROBE_MAX").ok() .and_then(|v| v.parse().ok()).unwrap_or(2000u32); Self { @@ -146,6 +196,8 @@ pub fn run_jit_dispatch( let mut ctx = JitContext::new(); ctx.executor_ptr = exec_ptr as u64; + let mut trace_writer = TraceWriter::from_env(); + let mut total_jit_instrs: u64 = 0; let mut total_interp_steps: u64 = 0; let mut blocks_compiled: u64 = 0; @@ -153,30 +205,30 @@ let mut demotions: u64 = 0; let mut rollbacks: u64 = 0; - // Load saved profile and eagerly compile hot blocks - { - let exec = unsafe { &mut *exec_ptr }; - let profile_entries = profile::load_profile(); - let mut profile_compiled = 0u64; - for entry in &profile_entries { - let tier = if entry.tier > max_tier { max_tier } else { entry.tier }; - if tier == BlockTier::Alu { - continue; - } - let instrs = trace_block(exec, entry.virt_pc, tier); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, entry.virt_pc, tier) { - block.phys_addr = entry.phys_pc; - cache.insert(entry.phys_pc, block); - blocks_compiled += 1; - profile_compiled += 1; - } - } - } - if profile_compiled > 0 { - eprintln!("JIT profile: pre-compiled {} blocks from profile", profile_compiled); - } - } + // Chain diagnostics: how often does the chain break, and why? + let mut chain_starts: u64 = 0; // times we entered the chain loop + let mut chain_blocks_run: u64 = 0; // total chained block executions + let mut chain_break_excluded: u64 = 0; // PROM/exc/delay at next_pc + let mut chain_break_translate: u64 = 0; // translate_pc failed + let mut chain_break_miss: u64 = 0; // cache miss + let mut chain_break_exc: u64 = 0; // exception in chained block + let mut chain_break_limit: u64 = 0; // hit MAX_CHAIN_INSTRS + + // Load saved profile. Bulk pre-compilation at startup evicts L2/D-cache + // lines the kernel depends on (caused UTLB panics in prior attempts). + // Instead, replay drip-feeds one entry per probe — but only after the + // kernel reaches userspace, so PROM/early-kernel boot is completely + // unaffected. Entries arrive sorted by hit_count descending (hottest first). + let mut profile_queue: VecDeque<ProfileEntry> = + VecDeque::from(profile::load_profile()); + let profile_total: u64 = profile_queue.len() as u64; + // Phase 1 state: detect boot has settled (kernel has reached userspace).
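// Distilled form of the two-phase boot-settle gate used below (an
// illustrative sketch: the struct and method names are invented, while the
// kuseg test and the 100-probe threshold mirror the real code):
struct BootSettle { saw_userspace: bool, settled_probes: u32 }
impl BootSettle {
    // Called once per probe with the current 32-bit PC. Latches the first
    // kuseg PC (pc32 < 0x8000_0000), then opens once 100 such probes have
    // accumulated, counting from and including the latching probe.
    fn probe(&mut self, pc32: u32) -> bool {
        if pc32 < 0x8000_0000 { self.saw_userspace = true; }
        if self.saw_userspace { self.settled_probes += 1; }
        self.settled_probes >= 100
    }
}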
+ let mut saw_userspace = false; + let mut boot_settled_count: u32 = 0; + const BOOT_SETTLE_THRESHOLD: u32 = 100; + let mut profile_replay_active = false; + let mut profile_replayed: u64 = 0; + let mut profile_stale: u64 = 0; while running.load(Ordering::Relaxed) { let mut steps_in_batch: u32 = 0; @@ -219,6 +271,26 @@ pub fn run_jit_dispatch( continue; } + // Boot-settle detection (Phase 1 of deferred profile replay). + // Wait for the first userspace PC (pc32 < 0x80000000 = kuseg), + // then count 100 more probes before activating replay. This + // ensures the kernel is past init, running init(1M) or later + // user processes, before we start re-compiling saved blocks. + if !profile_replay_active && !profile_queue.is_empty() { + if !saw_userspace && pc32 < 0x80000000 { + saw_userspace = true; + } + if saw_userspace { + boot_settled_count += 1; + if boot_settled_count >= BOOT_SETTLE_THRESHOLD { + profile_replay_active = true; + eprintln!("JIT profile: boot settled, replaying {} saved blocks", + profile_queue.len()); + } + } + } + + let phys_pc = { let exec = unsafe { &mut *exec_ptr }; match translate_pc(exec, pc) { @@ -227,8 +299,9 @@ pub fn run_jit_dispatch( } }; - if let Some(block) = cache.lookup(phys_pc) { + if let Some(block) = cache.lookup(phys_pc, pc) { probe.record_hit(); + let block_len = block.len_mips; let block_tier = block.tier; let is_speculative = block.speculative; @@ -265,7 +338,7 @@ pub fn run_jit_dispatch( snap.restore(exec); rollbacks += 1; - if let Some(block) = cache.lookup_mut(phys_pc) { + if let Some(block) = cache.lookup_mut(phys_pc, pc) { block.hit_count += 1; block.exception_count += 1; block.stable_hits = 0; @@ -396,35 +469,27 @@ pub fn run_jit_dispatch( exec.core.cp0_count = prev.wrapping_add(count_advance); if exec.core.cp0_compare.wrapping_sub(prev) <= count_advance { exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); } // Credit local_cycles so the stats display shows correct MHz exec.core.local_cycles += n; - // Check for pending interrupts — JIT blocks don't check per- - // instruction like the interpreter does. If an external interrupt - // arrived during the block, service it now via one interpreter step. + // Merge external interrupt bits into cp0_cause so the + // interpreter sees them on its next step. Don't call exec.step() + // here — that would double-count cp0_count (the post-block + // advancement above already accounted for all block instructions, + // and step() would add yet another count_step tick per interrupt). 
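// The wraparound-safe Compare test used for the cp0_count advance above,
// factored into a standalone sketch (hypothetical helper, not in the patch).
// Count moves from `prev` to `prev + advance`; IP7 must fire iff Compare
// falls inside [prev, prev + advance], modulo 2^32:
fn compare_crossed(prev: u32, advance: u32, compare: u32) -> bool {
    compare.wrapping_sub(prev) <= advance
}
// Example: prev = 0xFFFF_FFF0, advance = 0x20, compare = 0x0000_0008:
// 0x8 - 0xFFFF_FFF0 wraps to 0x18, and 0x18 <= 0x20, so the timer bit is
// set even though Compare is numerically smaller than prev.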
let pending = exec.core.interrupts.load(Ordering::Relaxed); - if (pending | exec.core.cp0_cause as u64) != 0 { - // Merge external IP bits (IP2-IP6) into Cause (same as step() does) + if pending != 0 { use crate::mips_core::{CAUSE_IP2, CAUSE_IP3, CAUSE_IP4, CAUSE_IP5, CAUSE_IP6}; let ext_mask = CAUSE_IP2 | CAUSE_IP3 | CAUSE_IP4 | CAUSE_IP5 | CAUSE_IP6; exec.core.cp0_cause = (exec.core.cp0_cause & !ext_mask) | (pending as u32 & ext_mask); - if exec.core.interrupts_enabled() { - let ip = exec.core.cp0_cause & crate::mips_core::CAUSE_IP_MASK; - let im = exec.core.cp0_status & crate::mips_core::STATUS_IM_MASK; - if (ip & im) != 0 { - // Pending unmasked interrupt — let the interpreter handle it - exec.step(); - total_interp_steps += 1; - steps_in_batch += 1; - } - } } } // Update stats and check for promotion - if let Some(block) = cache.lookup_mut(phys_pc) { + if let Some(block) = cache.lookup_mut(phys_pc, pc) { block.hit_count += 1; block.stable_hits += 1; block.exception_count = 0; @@ -449,6 +514,170 @@ pub fn run_jit_dispatch( total_jit_instrs += block_len as u64; steps_in_batch += block_len; + + // Block chaining: after a normal JIT exit, try to run more + // cached blocks without returning to the interpreter burst. + // Skip in verify mode (per-block verification needed) and + // for speculative blocks (snapshot/rollback adds complexity). + // Break chain on: cache miss, exception, pending interrupt, + // PROM/exc-vector/delay-slot PC, or max cumulative instrs. + if !verify_mode { + let mut chain_instrs: u32 = 0; + const MAX_CHAIN_INSTRS: u32 = 32; + chain_starts += 1; + + loop { + if chain_instrs >= MAX_CHAIN_INSTRS { + chain_break_limit += 1; + break; + } + let (next_pc, next_delay) = (exec.core.pc, exec.in_delay_slot); + let next_pc32 = next_pc as u32; + let in_prom = (next_pc32 >= 0x9FC00000 && next_pc32 < 0xA0000000) + || (next_pc32 >= 0xBFC00000); + let in_exc = next_pc32 >= 0x80000000 && next_pc32 < 0x80000400; + if in_prom || in_exc || next_delay { + chain_break_excluded += 1; + break; + } + + // NOTE: No interrupt check here. IRIX's device + // interrupts (IP2-IP6) are level-triggered and + // frequently asserted; checking "enabled+pending" + // would break nearly every chain immediately. + // MAX_CHAIN_INSTRS caps worst-case interrupt + // delivery latency. + + let next_phys = match translate_pc(exec, next_pc) { + Some(p) => p, + None => { chain_break_translate += 1; break; } + }; + + let (next_entry, next_block_len, next_is_speculative) = + match cache.lookup(next_phys, next_pc) { + Some(b) => (b.entry, b.len_mips, b.speculative), + None => { + // Compile on miss at max_tier (not Alu). + // The main path always starts at Alu, but + // that fails if the first instruction is + // a load/store — leaving these PCs forever + // uncached. Compile at max_tier directly + // since Loads/Full tiers are proven stable. + let instrs = trace_block(exec, next_pc, max_tier); + if !instrs.is_empty() { + if let Some(mut block) = compiler.compile_block(&instrs, next_pc, max_tier) { + block.phys_addr = next_phys; + cache.insert(next_phys, next_pc, block); + blocks_compiled += 1; + probe.set_cache_size(cache.len() as u32); + } + } + chain_break_miss += 1; + break; + } + }; + + probe.record_hit(); + chain_blocks_run += 1; + + // Snapshot for speculative chained blocks (same + // as main path). Non-speculative: no snapshot. 
+ let next_snapshot = if next_is_speculative { + exec.tlb.clone_as_mips_tlb().map(|tlb| { + CpuRollbackSnapshot::capture(exec, tlb) + }) + } else { + None + }; + + ctx.sync_from_executor(exec); + ctx.exit_reason = 0; + let entry: extern "C" fn(*mut JitContext) = unsafe { + std::mem::transmute(next_entry) + }; + entry(&mut ctx); + ctx.sync_to_executor(exec); + + if ctx.exit_reason == EXIT_EXCEPTION { + // Exception in chained block. Speculative: roll + // back and update demotion tracking. Either way: + // advance cp0_count for instructions before the + // fault, step the interpreter once, break chain. + if let Some(snap) = &next_snapshot { + if next_is_speculative { + snap.restore(exec); + rollbacks += 1; + if let Some(blk) = cache.lookup_mut(next_phys, next_pc) { + blk.hit_count += 1; + blk.exception_count += 1; + blk.stable_hits = 0; + if blk.exception_count >= tier_cfg.demote { + if let Some(lower) = blk.tier.demote() { + demotions += 1; + eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", + next_pc, blk.tier, lower, blk.exception_count); + recompile_block_at_tier( + &mut compiler, &mut cache, exec, + next_phys, next_pc, lower, + &mut blocks_compiled, + ); + } else { + blk.speculative = false; + } + } + } + } + } + let instrs_before_fault = ctx.pc.wrapping_sub(next_pc) / 4; + if instrs_before_fault > 0 { + let advance = exec.core.count_step.wrapping_mul(instrs_before_fault); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(advance); + if exec.core.cp0_compare.wrapping_sub(prev) <= advance { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + } + exec.core.local_cycles += instrs_before_fault; + } + exec.step(); + total_interp_steps += 1; + steps_in_batch += 1; + chain_break_exc += 1; + break; + } + + // Normal exit: post-block bookkeeping (identical to + // the main path's cp0_count advance + interrupt merge). + let n = next_block_len as u64; + let count_advance = exec.core.count_step.wrapping_mul(n); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(count_advance); + if exec.core.cp0_compare.wrapping_sub(prev) <= count_advance { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); + } + exec.core.local_cycles += n; + let pending = exec.core.interrupts.load(Ordering::Relaxed); + if pending != 0 { + use crate::mips_core::{CAUSE_IP2, CAUSE_IP3, CAUSE_IP4, CAUSE_IP5, CAUSE_IP6}; + let ext_mask = CAUSE_IP2 | CAUSE_IP3 | CAUSE_IP4 | CAUSE_IP5 | CAUSE_IP6; + exec.core.cp0_cause = (exec.core.cp0_cause & !ext_mask) + | (pending as u32 & ext_mask); + } + + // Update block stats (no promotion in chain path — + // the block already went through promotion checks + // on its main-path execution). 
+ if let Some(blk) = cache.lookup_mut(next_phys, next_pc) { + blk.hit_count += 1; + blk.stable_hits += 1; + blk.exception_count = 0; + } + + total_jit_instrs += next_block_len as u64; + steps_in_batch += next_block_len; + chain_instrs += next_block_len; + } + } } } else { probe.record_miss(); @@ -458,7 +687,7 @@ pub fn run_jit_dispatch( if !instrs.is_empty() { if let Some(mut block) = compiler.compile_block(&instrs, pc, BlockTier::Alu) { block.phys_addr = phys_pc; - cache.insert(phys_pc, block); + cache.insert(phys_pc, pc, block); blocks_compiled += 1; probe.set_cache_size(cache.len() as u32); if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { @@ -468,6 +697,31 @@ pub fn run_jit_dispatch( } } } + + // Phase 2 of deferred profile replay: drip-feed one saved block + // per probe as background work. The normal probe logic above + // already ran (including compile-current-PC on miss), so this + // is purely additive. Saved entries are sorted by hit_count + // descending, so the hottest blocks replay first. + if profile_replay_active { + if let Some(entry) = profile_queue.pop_front() { + let exec = unsafe { &mut *exec_ptr }; + replay_one_profile_entry( + &entry, &mut compiler, &mut cache, exec, + &mut blocks_compiled, &mut profile_replayed, + &mut profile_stale, + ); + probe.set_cache_size(cache.len() as u32); + if profile_replayed > 0 && profile_replayed % 1000 == 0 { + eprintln!("JIT profile: replayed {}/{} ({} stale)", + profile_replayed, profile_total, profile_stale); + } + if profile_queue.is_empty() { + eprintln!("JIT profile: replay complete, {} compiled / {} stale", + profile_replayed, profile_stale); + } + } + } } { @@ -475,6 +729,28 @@ pub fn run_jit_dispatch( exec.flush_cycles(); } + // Write trace record at 100K instruction milestones. + // Both JIT and interpreter runs log at the same milestones so + // records align for offline comparison. + if let Some(tw) = &mut trace_writer { + let total = total_interp_steps + total_jit_instrs; + let prev_total = total.saturating_sub(BATCH_SIZE as u64); + let milestone = 100_000u64; + if total / milestone != prev_total / milestone { + let exec = unsafe { &*exec_ptr }; + tw.write_record(&TraceRecord { + insn_count: (total / milestone) * milestone, + pc: exec.core.pc, + cp0_count: exec.core.cp0_count, + cp0_status: exec.core.cp0_status, + cp0_cause: exec.core.cp0_cause, + in_delay_slot: exec.in_delay_slot as u8, + _pad: [0; 7], + gpr_hash: TraceRecord::hash_gprs(&exec.core.gpr), + }); + } + } + let total = total_interp_steps + total_jit_instrs; if total % 10000000 < BATCH_SIZE as u64 { let exec = unsafe { &*exec_ptr }; @@ -500,14 +776,31 @@ pub fn run_jit_dispatch( eprintln!("JIT: shutdown. {} blocks, {} jit / {} interp / {} total ({:.1}% jit), {}↑ {}↓ {}⟲, final_probe={}", blocks_compiled, total_jit_instrs, total_interp_steps, total, jit_pct, promotions, demotions, rollbacks, probe.interval); - - // Save profile: all blocks above Alu tier + let chain_avg = if chain_starts > 0 { + chain_blocks_run as f64 / chain_starts as f64 + } else { 0.0 }; + eprintln!("JIT: chains: {} starts, {} blocks run (avg {:.2}/chain), breaks: excluded={} translate={} miss={} exc={} limit={}", + chain_starts, chain_blocks_run, chain_avg, + chain_break_excluded, chain_break_translate, chain_break_miss, + chain_break_exc, chain_break_limit); + dump_termination_stats(); + + // Save profile: kernel-space blocks above Alu tier only. 
Userspace blocks + are per-process and ephemeral — a saved userspace VA may belong to a + completely different process next session, or to nothing. Kernel code + (kseg0/kseg1, pc32 >= 0x80000000) is shared and stable, so replay is + meaningful. Without this filter the profile grows unboundedly with + ephemeral process blocks and replay causes post-login corruption. let profile_entries: Vec<ProfileEntry> = cache.iter() .filter(|(_, block)| block.tier > BlockTier::Alu) - .map(|(&phys_pc, block)| ProfileEntry { + .filter(|(_, block)| (block.virt_addr as u32) >= 0x80000000) + .map(|(&(phys_pc, _virt_pc), block)| ProfileEntry { phys_pc, virt_pc: block.virt_addr, tier: block.tier, + len_mips: block.len_mips, + content_hash: block.content_hash, + hit_count: block.hit_count, }) .collect(); if !profile_entries.is_empty() { @@ -517,6 +810,71 @@ } } +/// Replay one profile entry: re-derive physical address from saved virt_pc, +/// re-trace the block, validate content hash, and insert into the cache if +/// everything still matches. Enters with zeroed counters — the profile is a +/// hint, not a guarantee, and each replayed block must re-prove stability +/// this session before being trusted. +/// +/// Silently discards entries that can't be validated (unmapped pages, +/// different code at the saved VA, already-cached blocks). +fn replay_one_profile_entry( + entry: &ProfileEntry, + compiler: &mut BlockCompiler, + cache: &mut CodeCache, + exec: &mut MipsExecutor, + blocks_compiled: &mut u64, + profile_replayed: &mut u64, + profile_stale: &mut u64, +) { + // Re-derive phys_pc — saved phys_pc is for diagnostics only. TLB state + // differs between sessions, so the same virt_pc may map elsewhere now. + let phys_pc = match translate_pc(exec, entry.virt_pc) { + Some(p) => p, + None => { *profile_stale += 1; return; } // page not mapped this session + }; + + // Skip if a block already exists at this (phys_pc, virt_pc). This can + // happen if normal compilation beat us to it, or a prior replay already + // processed this entry (defensive). + if cache.contains(phys_pc, entry.virt_pc) { + return; + } + + let instrs = trace_block(exec, entry.virt_pc, entry.tier); + if instrs.is_empty() { + *profile_stale += 1; + return; + } + + // Cheap length check first, then definitive hash check. Either mismatch + // means the code at this VA is different from what we saw last session. + if instrs.len() as u32 != entry.len_mips { + *profile_stale += 1; + return; + } + let content_hash = super::compiler::hash_block_instrs(&instrs); + if content_hash != entry.content_hash { + *profile_stale += 1; + return; + } + + if let Some(mut block) = compiler.compile_block(&instrs, entry.virt_pc, entry.tier) { + block.phys_addr = phys_pc; + // Zero all counters — no penalty baggage from prior session. + // speculative is left as compile_block set it: store-containing blocks + // are NOT speculative because rollback can't undo stores (memory + // diverges from CPU state); load-only blocks are speculative and will + // re-prove stability via the normal snapshot/rollback path this session. + block.hit_count = 0; + block.stable_hits = 0; + block.exception_count = 0; + cache.insert(phys_pc, entry.virt_pc, block); + *blocks_compiled += 1; + *profile_replayed += 1; + } +} + /// Recompile a block at a different tier, replacing the existing cache entry.
/// Recompile a block at a different tier, replacing the existing cache entry.
fn recompile_block_at_tier(
    compiler: &mut BlockCompiler,
@@ -531,7 +889,7 @@
    if !instrs.is_empty() {
        if let Some(mut block) = compiler.compile_block(&instrs, virt_pc, tier) {
            block.phys_addr = phys_pc;
-            cache.replace(phys_pc, block);
+            cache.replace(phys_pc, virt_pc, block);
            *blocks_compiled += 1;
        }
    }
@@ -541,6 +899,9 @@ fn interpreter_loop(
    exec: &mut MipsExecutor,
    running: &AtomicBool,
) {
+    let mut trace_writer = TraceWriter::from_env();
+    let mut total_steps: u64 = 0;
+
    while running.load(Ordering::Relaxed) {
        #[cfg(feature = "lightning")]
        for _ in 0..1000 {
@@ -555,7 +916,25 @@
                break;
            }
        }
+        total_steps += 10000;
        exec.flush_cycles();
+
+        if let Some(tw) = &mut trace_writer {
+            let prev = total_steps.saturating_sub(10000);
+            let milestone = 100_000u64;
+            if total_steps / milestone != prev / milestone {
+                tw.write_record(&TraceRecord {
+                    insn_count: (total_steps / milestone) * milestone,
+                    pc: exec.core.pc,
+                    cp0_count: exec.core.cp0_count,
+                    cp0_status: exec.core.cp0_status,
+                    cp0_cause: exec.core.cp0_cause,
+                    in_delay_slot: exec.in_delay_slot as u8,
+                    _pad: [0; 7],
+                    gpr_hash: TraceRecord::hash_gprs(&exec.core.gpr),
+                });
+            }
+        }
    }
}

@@ -563,7 +942,12 @@ fn translate_pc(
    exec: &mut MipsExecutor,
    virt_pc: u64,
) -> Option<u64> {
-    let result = (exec.translate_fn)(exec, virt_pc, AccessType::Fetch);
+    // Use debug_translate (translate_impl) to avoid CP0 side effects.
+    // The non-debug path writes cp0_badvaddr, cp0_entryhi, cp0_context, and
+    // cp0_xcontext on TLB miss — corrupting state that the next real TLB
+    // exception handler depends on. We only need the physical address for
+    // the cache lookup; we must not touch CP0 state.
+    let result = exec.debug_translate(virt_pc);
    if result.is_exception() { None } else { Some(result.phys as u64) }
}

@@ -572,10 +956,20 @@ fn trace_block(
    start_pc: u64,
    tier: BlockTier,
) -> Vec<(u32, DecodedInstr)> {
-    let mut instrs = Vec::with_capacity(MAX_BLOCK_LEN);
+    let max_len = MAX_BLOCK_LEN;
+    let mut instrs = Vec::with_capacity(max_len);
    let mut pc = start_pc;
-    for _ in 0..MAX_BLOCK_LEN {
+    // Full-tier blocks accumulate up to max_helpers load/store helper calls
+    // before terminating. Each helper emits an ok_block/exc_block CFG diamond.
+    // Too many chained diamonds trip Cranelift's regalloc2 and produce wrong
+    // code (confirmed by IRIS_JIT_VERIFY catching real GPR mismatches). The
+    // safe ceiling was determined empirically: aarch64 tolerates 3, x86_64
+    // only 1. Bumping past this threshold produces silent miscompilations.
+    let max_helpers: u32 = if cfg!(target_arch = "aarch64") { 3 } else { 1 };
+    let mut helper_count: u32 = 0;
+
+    for _ in 0..max_len {
        let raw = match exec.debug_fetch_instr(pc) {
            Ok(w) => w,
            Err(_) => break,
@@ -585,20 +979,33 @@
        d.raw = raw;
        decode_into::<T>(&mut d);
-        if !is_compilable_for_tier(&d, tier) { break; }
+        if !is_compilable_for_tier(&d, tier) {
+            record_termination(&d, tier);
+            break;
+        }
        let is_branch = is_branch_or_jump(&d);
-        // Terminate Full-tier blocks after each store to keep blocks short.
-        // Long blocks with multiple load/store helper calls create complex CFG
-        // (ok_block/exc_block diamonds) that triggers Cranelift regalloc2 issues
-        // on x86_64, causing rare but fatal codegen corruption.
-        let is_store = tier == BlockTier::Full && is_compilable_store(&d);
-        instrs.push((raw, d));
-        if is_store {
+        // Full-tier: terminate BEFORE stores. Store-containing blocks must
+        // be non-speculative (they can't roll back memory), which disables
+        // the self-healing safety net (rollback + demotion on codegen error).
+        // By excluding stores, all Full-tier blocks stay load-only →
+        // speculative → self-healing. Stores go to the interpreter, where
+        // they're always correct.
+        if tier == BlockTier::Full && is_compilable_store(&d) && !jit_no_stores() {
+            record_termination(&d, tier);
            break;
        }
+        let is_helper_instr = tier == BlockTier::Full && is_compilable_load(&d);
+        instrs.push((raw, d));
+
+        if is_helper_instr {
+            helper_count += 1;
+            if helper_count >= max_helpers {
+                break;
+            }
+        }
+
        if is_branch {
            pc = pc.wrapping_add(4);
            let mut delay_ok = false;
@@ -606,11 +1013,13 @@
                let mut delay_d = DecodedInstr::default();
                delay_d.raw = delay_raw;
                decode_into::<T>(&mut delay_d);
-                // Exclude stores from delay slots: if the delay slot faults,
-                // the JIT exception path loses delay-slot context (sync_to clears
-                // in_delay_slot), so handle_exception sets wrong cp0_epc/BD bit,
-                // and on ERET the branch is permanently skipped → crash.
-                if is_compilable_for_tier(&delay_d, tier) && !is_compilable_store(&delay_d) {
+                // Exclude loads AND stores from delay slots: if the delay slot
+                // faults (TLB miss, bus error), the JIT exception path loses
+                // delay-slot context (sync_to clears in_delay_slot), so
+                // handle_exception sets the wrong cp0_epc/BD bit, and on ERET
+                // the branch is permanently skipped → crash.
+                let delay_can_fault = is_compilable_load(&delay_d) || is_compilable_store(&delay_d);
+                if is_compilable_for_tier(&delay_d, tier) && !delay_can_fault {
                    instrs.push((delay_raw, delay_d));
                    delay_ok = true;
                }
@@ -625,11 +1034,26 @@
    instrs
}

+// IRIS_JIT_NO_STORES=1 disables store compilation in Full tier for diagnostic
+// bisection. Full tier then still compiles loads (it behaves like a faster
+// Loads tier with higher promotion priority). If this flag fixes the 4dwm
+// glitch, the bug is specifically in store compilation, not in the
+// promotion/chaining path.
+static JIT_NO_STORES: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
+
+fn jit_no_stores() -> bool {
+    *JIT_NO_STORES.get_or_init(|| std::env::var("IRIS_JIT_NO_STORES").map(|v| v == "1").unwrap_or(false))
+}
+
fn is_compilable_for_tier(d: &DecodedInstr, tier: BlockTier) -> bool {
    if is_compilable_alu(d) || is_branch_or_jump(d) { return true; }
    match tier {
        BlockTier::Alu => false,
        BlockTier::Loads => is_compilable_load(d),
+        // Full tier accepts loads + stores in the compilable check, but stores
+        // are terminated before inclusion in trace_block (they trigger a break
+        // before being pushed). This keeps the block load-only → speculative →
+        // self-healing. The NO_STORES diagnostic flag skips even the
+        // termination check (stores are then not compilable at all).
        BlockTier::Full => is_compilable_load(d) || is_compilable_store(d),
    }
}
@@ -652,8 +1076,14 @@ fn is_compilable_alu(d: &DecodedInstr) -> bool {
            FUNCT_DSLLV | FUNCT_DSRLV | FUNCT_DSRAV |
            FUNCT_SYNC
        ),
-        OP_ADDIU | OP_DADDIU | OP_SLTI | OP_SLTIU |
+        OP_ADDI | OP_ADDIU | OP_DADDI | OP_DADDIU | OP_SLTI | OP_SLTIU |
        OP_ANDI | OP_ORI | OP_XORI | OP_LUI => true,
+        // MFC0/DMFC0 are read-only CP0 accesses — safe at Alu tier.
+        // MTC0/DMTC0 have side effects (Status→translate_fn, Compare→timer
+        // recalibration, Count writes, TLB ASID). Tested once and caused OOM
+        // during boot — needs deeper investigation. Emitters exist but the
+        // gate stays off until we understand the failure mode.
+        OP_COP0 => matches!(d.rs as u32, RS_MFC0 | RS_DMFC0),
        _ => false,
    }
}
@@ -676,8 +1106,12 @@ fn is_branch_or_jump(d: &DecodedInstr) -> bool {
    use crate::mips_isa::*;
    match d.op as u32 {
        OP_BEQ | OP_BNE | OP_BLEZ | OP_BGTZ => true,
+        OP_BEQL | OP_BNEL | OP_BLEZL | OP_BGTZL => true,
        OP_J | OP_JAL => true,
-        OP_SPECIAL => matches!(d.funct as u32, FUNCT_JR),
+        OP_SPECIAL => matches!(d.funct as u32, FUNCT_JR | FUNCT_JALR),
+        OP_REGIMM => matches!(d.rt as u32,
+            RT_BLTZ | RT_BGEZ | RT_BLTZAL | RT_BGEZAL |
+            RT_BLTZL | RT_BGEZL | RT_BLTZALL | RT_BGEZALL),
        _ => false,
    }
}
diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs
index 2c4f9f2..303badd 100644
--- a/src/jit/helpers.rs
+++ b/src/jit/helpers.rs
@@ -129,6 +129,50 @@ pub extern "C" fn jit_interp_one_step(
    0
}

+// ─── CP0 helpers ─────────────────────────────────────────────────────────────
+
+/// MFC0: read CP0 register `rd` as 32-bit sign-extended to 64.
+/// Random (rd=1) depends on cycle count; flush before read.
+pub extern "C" fn jit_mfc0<T>(
+    _ctx_ptr: *mut JitContext, exec_ptr: *mut u8, rd: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T>(exec_ptr) };
+    let rd_u32 = rd as u32;
+    if rd_u32 == 1 { exec.flush_cycles(); }
+    let v = exec.core.read_cp0(rd_u32);
+    // sign-extend 32→64 to match interpreter exec_mfc0
+    v as u32 as i32 as i64 as u64
+}
+
+/// DMFC0: read CP0 register `rd` as a full 64-bit value.
+pub extern "C" fn jit_dmfc0<T>(
+    _ctx_ptr: *mut JitContext, exec_ptr: *mut u8, rd: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T>(exec_ptr) };
+    let rd_u32 = rd as u32;
+    if rd_u32 == 1 { exec.flush_cycles(); }
+    exec.core.read_cp0(rd_u32)
+}
+
+/// MTC0: write low 32 bits of `value` (sign-extended) into CP0 register `rd`.
+/// write_cp0 handles side effects (Status→translate_fn, Compare→timer, etc.).
+pub extern "C" fn jit_mtc0<T>(
+    _ctx_ptr: *mut JitContext, exec_ptr: *mut u8, rd: u64, value: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T>(exec_ptr) };
+    exec.core.write_cp0(rd as u32, value as u32 as i32 as i64 as u64);
+    0
+}
+
+/// DMTC0: write the full 64-bit `value` into CP0 register `rd`.
+pub extern "C" fn jit_dmtc0<T>(
+    _ctx_ptr: *mut JitContext, exec_ptr: *mut u8, rd: u64, value: u64,
+) -> u64 {
+    let exec = unsafe { &mut *opaque_exec::<T>(exec_ptr) };
+    exec.core.write_cp0(rd as u32, value);
+    0
+}
+
/// Collection of monomorphized helper function pointers.
pub struct HelperPtrs {
    pub read_u8: *const u8,
@@ -140,6 +184,10 @@ pub struct HelperPtrs {
    pub write_u32: *const u8,
    pub write_u64: *const u8,
    pub interp_step: *const u8,
+    pub mfc0: *const u8,
+    pub dmfc0: *const u8,
+    pub mtc0: *const u8,
+    pub dmtc0: *const u8,
}

impl HelperPtrs {
@@ -154,6 +202,10 @@ impl HelperPtrs {
            write_u32: jit_write_u32::<T> as *const u8,
            write_u64: jit_write_u64::<T> as *const u8,
            interp_step: jit_interp_one_step::<T> as *const u8,
+            mfc0: jit_mfc0::<T> as *const u8,
+            dmfc0: jit_dmfc0::<T> as *const u8,
+            mtc0: jit_mtc0::<T> as *const u8,
+            dmtc0: jit_dmtc0::<T> as *const u8,
        }
    }
}
diff --git a/src/jit/mod.rs b/src/jit/mod.rs
index c98854d..15b7645 100644
--- a/src/jit/mod.rs
+++ b/src/jit/mod.rs
@@ -10,6 +10,8 @@ pub mod dispatch;
pub mod helpers;
pub mod profile;
pub mod snapshot;
+pub mod trace;
+pub mod codegen_test;

pub use context::JitContext;
pub use cache::{CodeCache, CompiledBlock};
diff --git a/src/jit/profile.rs b/src/jit/profile.rs
index 12a2650..b9d6b32 100644
--- a/src/jit/profile.rs
+++ b/src/jit/profile.rs
@@ -1,8 +1,12 @@
//! JIT profile cache: persists hot block metadata across emulator runs.
//!
-//! On shutdown, saves (phys_pc, virt_pc, tier) tuples for all blocks above Alu tier.
-//! On startup, loads the profile and eagerly compiles those blocks at their saved tier
-//! (still speculative until they prove stable again). Eliminates warmup time.
+//! On shutdown, saves metadata (virt_pc, phys_pc, tier, len_mips, content_hash,
+//! hit_count) for all blocks above Alu tier, sorted by hit_count descending so
+//! the hottest blocks replay first. On startup, the dispatch loop loads the
+//! profile into a queue but does NOT compile upfront — bulk pre-compilation
+//! evicts L2/D-cache lines the kernel depends on during early boot and causes
+//! UTLB panics. Instead, replay drip-feeds one entry per probe once the
+//! kernel has reached userspace (boot-settle detection).

use std::fs;
use std::io::{self, Read, Write, BufReader, BufWriter};
@@ -13,13 +17,26 @@ use super::cache::BlockTier;

/// One entry in the profile: a block that reached a tier worth persisting.
#[derive(Debug, Clone)]
pub struct ProfileEntry {
+    /// Saved for diagnostics only — NOT used for cache insertion. Physical
+    /// addresses depend on TLB state, which may differ between sessions.
    pub phys_pc: u64,
+    /// Virtual PC — stable across sessions, used to re-trace the block.
    pub virt_pc: u64,
    pub tier: BlockTier,
+    /// Instruction count at save time. Cheap staleness check.
+    pub len_mips: u32,
+    /// FNV-1a 32-bit hash of the raw instruction words. Definitive staleness
+    /// check: catches same-length different-code cases (DSO reused at same VA).
+    pub content_hash: u32,
+    /// Hit count from the previous session, used to prioritize replay order.
+    pub hit_count: u32,
}

const PROFILE_MAGIC: &[u8; 4] = b"IRJP"; // IRIS JIT Profile
-const PROFILE_VERSION: u8 = 1;
+const PROFILE_VERSION: u8 = 2;
+
+// On-disk entry: 8+8+1+4+4+4 = 29 bytes, padded to 32 for alignment.
+const ENTRY_BYTES: usize = 32;

/// Default profile path: ~/.iris/jit-profile.bin
fn default_profile_path() -> PathBuf {
@@ -39,6 +56,7 @@ pub fn profile_path() -> PathBuf {
}

/// Load profile entries from disk. Returns empty vec on any error.
+/// Entries are returned in save order (sorted by hit_count descending).
pub fn load_profile() -> Vec<ProfileEntry> {
    let path = profile_path();
    let file = match fs::File::open(&path) {
@@ -55,7 +73,8 @@
    let mut version = [0u8; 1];
    if reader.read_exact(&mut version).is_err() || version[0] != PROFILE_VERSION {
-        eprintln!("JIT profile: version mismatch in {:?}, ignoring", path);
+        eprintln!("JIT profile: version mismatch in {:?} (found {}, need {}), ignoring",
+            path, version[0], PROFILE_VERSION);
        return Vec::new();
    }
@@ -67,53 +86,72 @@
    let mut entries = Vec::with_capacity(count);
    for _ in 0..count {
-        let mut buf = [0u8; 17]; // 8 + 8 + 1
+        let mut buf = [0u8; ENTRY_BYTES];
        if reader.read_exact(&mut buf).is_err() { break; }
-        let phys_pc = u64::from_le_bytes(buf[0..8].try_into().unwrap());
-        let virt_pc = u64::from_le_bytes(buf[8..16].try_into().unwrap());
-        let tier = match buf[16] {
+        let phys_pc = u64::from_le_bytes(buf[0..8].try_into().unwrap());
+        let virt_pc = u64::from_le_bytes(buf[8..16].try_into().unwrap());
+        let tier_byte = buf[16];
+        let len_mips = u32::from_le_bytes(buf[17..21].try_into().unwrap());
+        let content_hash = u32::from_le_bytes(buf[21..25].try_into().unwrap());
+        let hit_count = u32::from_le_bytes(buf[25..29].try_into().unwrap());
+        // buf[29..32] is padding, ignore
+        let tier = match tier_byte {
            0 => BlockTier::Alu,
            1 => BlockTier::Loads,
            2 => BlockTier::Full,
            _ => continue,
        };
-        entries.push(ProfileEntry { phys_pc, virt_pc, tier });
+        entries.push(ProfileEntry {
+            phys_pc, virt_pc, tier, len_mips, content_hash, hit_count,
+        });
    }

    eprintln!("JIT profile: loaded {} entries from {:?}", entries.len(), path);
    entries
}

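For orientation, the drip-feed replay in dispatch.rs (phase 2 above) consumes these entries through a queue. The exact setup inside run_jit_dispatch is not part of this diff, so the following is only a sketch of plausible wiring; the variable names mirror the ones the dispatch loop uses:

    // Sketch of assumed wiring — run_jit_dispatch's real setup is not shown
    // in this diff. load_profile() returns entries hottest-first, so
    // pop_front() replays them in priority order.
    use std::collections::VecDeque;

    let entries = profile::load_profile();
    let profile_total = entries.len() as u64;
    let mut profile_queue: VecDeque<ProfileEntry> = entries.into();
    let mut profile_replay_active = false; // flips true once boot settles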
-/// Save profile entries to disk.
+/// Save profile entries to disk. Writes atomically via tmp file + rename to
+/// avoid truncated files on interrupted writes. Sorts entries by hit_count
+/// descending so the hottest blocks are first in the queue on next load.
pub fn save_profile(entries: &[ProfileEntry]) -> io::Result<()> {
    let path = profile_path();
-    // Ensure parent directory exists
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }

-    let file = fs::File::create(&path)?;
-    let mut writer = BufWriter::new(file);
-
-    writer.write_all(PROFILE_MAGIC)?;
-    writer.write_all(&[PROFILE_VERSION])?;
-    writer.write_all(&(entries.len() as u32).to_le_bytes())?;
+    let mut sorted: Vec<&ProfileEntry> = entries.iter().collect();
+    sorted.sort_by(|a, b| b.hit_count.cmp(&a.hit_count));
+
+    let tmp_path = path.with_extension("bin.tmp");
+    {
+        let file = fs::File::create(&tmp_path)?;
+        let mut writer = BufWriter::new(file);
+
+        writer.write_all(PROFILE_MAGIC)?;
+        writer.write_all(&[PROFILE_VERSION])?;
+        writer.write_all(&(sorted.len() as u32).to_le_bytes())?;
+
+        for entry in &sorted {
+            let mut buf = [0u8; ENTRY_BYTES];
+            buf[0..8].copy_from_slice(&entry.phys_pc.to_le_bytes());
+            buf[8..16].copy_from_slice(&entry.virt_pc.to_le_bytes());
+            buf[16] = match entry.tier {
+                BlockTier::Alu => 0,
+                BlockTier::Loads => 1,
+                BlockTier::Full => 2,
+            };
+            buf[17..21].copy_from_slice(&entry.len_mips.to_le_bytes());
+            buf[21..25].copy_from_slice(&entry.content_hash.to_le_bytes());
+            buf[25..29].copy_from_slice(&entry.hit_count.to_le_bytes());
+            writer.write_all(&buf)?;
+        }

-    for entry in entries {
-        writer.write_all(&entry.phys_pc.to_le_bytes())?;
-        writer.write_all(&entry.virt_pc.to_le_bytes())?;
-        let tier_byte = match entry.tier {
-            BlockTier::Alu => 0u8,
-            BlockTier::Loads => 1u8,
-            BlockTier::Full => 2u8,
-        };
-        writer.write_all(&[tier_byte])?;
+        writer.flush()?;
    }
-
-    writer.flush()?;
-    eprintln!("JIT profile: saved {} entries to {:?}", entries.len(), path);
+    fs::rename(&tmp_path, &path)?;
+    eprintln!("JIT profile: saved {} entries to {:?}", sorted.len(), path);
    Ok(())
}
diff --git a/src/jit/trace.rs b/src/jit/trace.rs
new file mode 100644
index 0000000..5e2cf7b
--- /dev/null
+++ b/src/jit/trace.rs
@@ -0,0 +1,96 @@
+//! Lightweight binary state trace for differential JIT debugging.
+//!
+//! Enable with `IRIS_JIT_TRACE=<path>`. At each 100K-instruction milestone
+//! (checked once per BATCH_SIZE batch), writes a 48-byte record of key
+//! architectural state. Run once with JIT, once interpreter-only, then diff
+//! with tools/diff-trace.py to find the first state divergence.
+//!
+//! Read-only access to executor state. No interaction with sync/snapshot/verify.
+
+use std::fs::File;
+use std::io::{self, BufWriter, Write};
+
+/// Fixed-size trace record. 48 bytes, written as raw bytes (little-endian).
+#[repr(C, packed)]
+#[derive(Clone, Copy)]
+pub struct TraceRecord {
+    pub insn_count: u64,
+    pub pc: u64,
+    pub cp0_count: u64,
+    pub cp0_status: u32,
+    pub cp0_cause: u32,
+    pub in_delay_slot: u8,
+    pub _pad: [u8; 7],
+    pub gpr_hash: u64,
+}
+
+const _: () = assert!(std::mem::size_of::<TraceRecord>() == 48);
+
+impl TraceRecord {
+    /// XOR-fold all 32 GPRs into a single u64 fingerprint.
+    pub fn hash_gprs(gpr: &[u64; 32]) -> u64 {
+        let mut h: u64 = 0;
+        for &v in gpr.iter() {
+            h ^= v;
+        }
+        // Mix the folded value with a rotated copy of itself to spread bits
+        h ^= h.rotate_left(17);
+        h
+    }
+}
+
+/// Buffered binary trace writer. Call `write_record` after each batch.
+pub struct TraceWriter {
+    out: BufWriter<File>,
+    records: u64,
+}
+
+impl TraceWriter {
+    /// Open a trace file for writing.
+    pub fn new(path: &str) -> io::Result<Self> {
+        let file = File::create(path)?;
+        eprintln!("iris: JIT trace enabled, writing to {}", path);
+        Ok(Self {
+            out: BufWriter::with_capacity(64 * 1024, file),
+            records: 0,
+        })
+    }
+
+    /// Try to create from the IRIS_JIT_TRACE env var. Returns None if it is
+    /// unset, empty, or the file can't be opened.
+    pub fn from_env() -> Option<Self> {
+        let path = std::env::var("IRIS_JIT_TRACE").ok()?;
+        if path.is_empty() { return None; }
+        match Self::new(&path) {
+            Ok(w) => Some(w),
+            Err(e) => {
+                eprintln!("iris: failed to open trace file '{}': {}", path, e);
+                None
+            }
+        }
+    }
+
+    /// Write a trace record. This is a raw byte write, no serialization overhead.
+    pub fn write_record(&mut self, rec: &TraceRecord) {
+        let bytes = unsafe {
+            std::slice::from_raw_parts(
+                rec as *const TraceRecord as *const u8,
+                std::mem::size_of::<TraceRecord>(),
+            )
+        };
+        // Ignore write errors (trace is best-effort, don't crash the emulator)
+        let _ = self.out.write_all(bytes);
+        self.records += 1;
+    }
+
+    /// Flush and report stats.
+    pub fn finish(&mut self) {
+        let _ = self.out.flush();
+        eprintln!("iris: JIT trace finished, {} records written", self.records);
+    }
+}
+
+impl Drop for TraceWriter {
+    fn drop(&mut self) {
+        self.finish();
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 506c3ff..ccc3bb7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -22,6 +22,32 @@ fn main() {
        .join()
        .unwrap();
    machine.register_system_controller();
+
+    // DIAG: optionally enable verbose logging from startup via IRIS_DEBUG_LOG.
+    // IRIS_DEBUG_LOG="mc,mips" enables those modules. "all" enables everything.
+    // Output is broadcast to a stderr sink so jit-diag.sh's tee captures it inline.
+    if let Ok(spec) = std::env::var("IRIS_DEBUG_LOG") {
+        if let Some(dl) = iris::devlog::DEVLOG.get() {
+            // Register stderr as a sink so dlog output reaches our captured log.
+            let stderr_sink: iris::devlog::DevLogWriter = std::sync::Arc::new(
+                parking_lot::Mutex::new(std::io::stderr()),
+            );
+            dl.add_sink(stderr_sink);
+
+            for name in spec.split(',').map(str::trim).filter(|s| !s.is_empty()) {
+                if name == "all" {
+                    for m in iris::devlog::LogModule::all() { dl.enable(*m); }
+                    eprintln!("DIAG: enabled all log modules -> stderr");
+                } else if let Some(m) = iris::devlog::LogModule::from_str(name) {
+                    dl.enable(m);
+                    eprintln!("DIAG: enabled log module {} -> stderr", m.name());
+                } else {
+                    eprintln!("DIAG: unknown log module '{}'", name);
+                }
+            }
+        }
+    }
+
    machine.start();
    std::thread::spawn(|| {
        Machine::run_console_client();
diff --git a/tools/diff-trace.py b/tools/diff-trace.py
new file mode 100755
index 0000000..4f3ea6d
--- /dev/null
+++ b/tools/diff-trace.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Compare two IRIS JIT trace files and report the first divergence.
+
+Usage:
+    python3 diff-trace.py trace-interp.bin trace-jit.bin
+
+Trace files are produced by setting IRIS_JIT_TRACE=<path> when running iris.
+
+Each record is 48 bytes:
+    u64 insn_count
+    u64 pc
+    u64 cp0_count
+    u32 cp0_status
+    u32 cp0_cause
+    u8  in_delay_slot
+    u8[7] pad
+    u64 gpr_hash
+"""
+
+import struct
+import sys
+
+RECORD_SIZE = 48
+RECORD_FMT = '<QQQIIB7xQ'
+FIELDS = ('insn_count', 'pc', 'cp0_count', 'cp0_status', 'cp0_cause',
+          'in_delay_slot', 'gpr_hash')
+
+def read_records(path):
+    """Read a trace file into a list of per-record field dicts."""
+    records = []
+    with open(path, 'rb') as f:
+        while True:
+            buf = f.read(RECORD_SIZE)
+            if len(buf) < RECORD_SIZE:
+                break
+            records.append(dict(zip(FIELDS, struct.unpack(RECORD_FMT, buf))))
+    return records
+
+def main():
+    if len(sys.argv) != 3:
+        print("usage: diff-trace.py <trace-a> <trace-b>")
+        sys.exit(1)
+
+    path_a, path_b = sys.argv[1], sys.argv[2]
+    recs_a = read_records(path_a)
+    recs_b = read_records(path_b)
+
+    print(f"Trace A ({path_a}): {len(recs_a)} records")
+    print(f"Trace B ({path_b}): {len(recs_b)} records")
+
+    if not recs_a or not recs_b:
+        print("ERROR: one or both traces are empty")
+        sys.exit(1)
+
+    # Match records by closest insn_count
+    b_idx = 0
+    divergences = 0
+    max_report = 10
+
+    for a in recs_a:
+        # Find closest record in B
+        while b_idx < len(recs_b) - 1:
+            curr_diff = abs(recs_b[b_idx]['insn_count'] - a['insn_count'])
+            next_diff = abs(recs_b[b_idx + 1]['insn_count'] - a['insn_count'])
+            if next_diff < curr_diff:
+                b_idx += 1
+            else:
+                break
+
+        b = recs_b[b_idx]
+
+        # Skip if instruction counts are too far apart (> 2x BATCH_SIZE)
+        if abs(a['insn_count'] - b['insn_count']) > 20000:
+            continue
+
+        # Compare fields (skip insn_count since it won't match exactly)
+        mismatches = []
+        for field in FIELDS[1:]:  # skip insn_count
+            va, vb = a[field], b[field]
+            if va != vb:
+                mismatches.append(field)
+
+        if mismatches:
+            divergences += 1
+            if divergences <= max_report:
+                print(f"\nDIVERGE near insn_count {a['insn_count']} (B record at {b['insn_count']}):")
+                for field in FIELDS[1:]:
+                    va, vb = a[field], b[field]
+                    status = "MISMATCH" if field in mismatches else "ok"
+                    if field in ('cp0_status', 'cp0_cause'):
+                        print(f"    {field:15s}: A={va:08x} B={vb:08x}  {status}")
+                    elif field == 'in_delay_slot':
+                        print(f"    {field:15s}: A={va} B={vb}  {status}")
+                    else:
+                        print(f"    {field:15s}: A={va:016x} B={vb:016x}  {status}")
+
+    if divergences == 0:
+        print("\nTraces match (no divergences found)")
+    else:
+        shown = min(divergences, max_report)
+        print(f"\nTotal: {divergences} divergences ({shown} shown)")
+
+    sys.exit(1 if divergences > 0 else 0)
+
+if __name__ == '__main__':
+    main()
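A typical differential session with these pieces, assuming the defaults above: capture a baseline with `IRIS_JIT_TRACE=trace-interp.bin ./jit-diag.sh nojit` (interpreter only, but built with the jit feature so the trace module is compiled in), capture the suspect run with `IRIS_JIT_TRACE=trace-jit.bin ./jit-diag.sh jit`, then run `python3 tools/diff-trace.py trace-interp.bin trace-jit.bin`. Because both loops emit records at the same 100K-instruction milestones, the first reported divergence localizes the fault to a single 100K-instruction window; from there, IRIS_JIT_VERIFY or the IRIS_JIT_NO_STORES bisection flag can narrow it to a specific block or instruction class.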