techomancer · techomancer · Apr 16, 2026 · Apr 9, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/jit-diag.sh b/jit-diag.sh
@@ -1,52 +1,94 @@
 #!/bin/bash
 # JIT diagnostic launcher — runs emulator and captures output for analysis
-# Usage: ./jit-diag.sh [mode]
+# Usage: ./jit-diag.sh [-f|--features <list>] [mode]
 #   mode: "jit"      — JIT enabled (default)
 #         "verify"   — JIT with verification
 #         "nojit"    — interpreter only through JIT dispatch
 #         "interp"   — pure interpreter (no JIT feature, baseline)
 #         "perf"     — perf profile, interpreter only (text report for analysis)
 #         "perf-jit" — perf profile with JIT enabled
+#         "smoke"    — headless boot smoke test
+#
+#   -f / --features <list>  comma-separated extra features appended to the
+#                           mode's base feature list (e.g. "developer" to
+#                           enable dlog_dev! macros for IRIS_DEBUG_LOG tracing)
 #
 # All IRIS_JIT_* env vars are passed through automatically:
 #   IRIS_JIT_MAX_TIER=0 ./jit-diag.sh jit
 #   IRIS_JIT_PROBE=500 IRIS_JIT_PROBE_MIN=100 ./jit-diag.sh jit
+#   IRIS_DEBUG_LOG=mc ./jit-diag.sh -f developer interp
+
+EXTRA_FEATURES=""
+# Parse leading options. The first argument that is not a recognised option
+# ends the loop and is left in $1 to be read as the mode.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -f|--features)
+            # Guard against a missing value: `shift 2` with only one argument
+            # left fails without shifting, so $1 would stay "-f" and this
+            # loop would never terminate.
+            if [[ $# -lt 2 ]]; then
+                echo "error: $1 requires a comma-separated feature list" >&2
+                exit 2
+            fi
+            EXTRA_FEATURES="$2"
+            shift 2
+            ;;
+        -h|--help)
+            # Print the usage header (lines 2-20 of this script).
+            sed -n '2,20p' "$0"
+            exit 0
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
 
 MODE="${1:-jit}"
 OUTFILE="jit-diag-$(date +%Y%m%d-%H%M%S)-${MODE}.log"
 
+# Use the ncargo wrapper so we hit the rustup-pinned nightly toolchain
+# (rust-toolchain.toml) regardless of any homebrew rust on PATH.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CARGO="${SCRIPT_DIR}/ncargo"
+
+features() {
+    local base="$1"
+    if [[ -n "$EXTRA_FEATURES" ]]; then
+        echo "${base},${EXTRA_FEATURES}"
+    else
+        echo "$base"
+    fi
+}
+
 # Collect all IRIS_JIT_* env vars for display and passthrough
 JIT_VARS=$(env | grep '^IRIS_JIT_' | tr '\n' ' ')
 
 echo "=== IRIS JIT Diagnostic ===" | tee "$OUTFILE"
 echo "Mode: $MODE" | tee -a "$OUTFILE"
 echo "Date: $(date)" | tee -a "$OUTFILE"
 echo "Host: $(uname -m) $(uname -s) $(uname -r)" | tee -a "$OUTFILE"
-echo "Rust: $(rustc --version)" | tee -a "$OUTFILE"
+RUSTC_BIN="$("$HOME/.cargo/bin/rustup" which rustc 2>/dev/null || command -v rustc)"
+echo "Rust: $("$RUSTC_BIN" --version)" | tee -a "$OUTFILE"
 [ -n "$JIT_VARS" ] && echo "Env: $JIT_VARS" | tee -a "$OUTFILE"
 echo "" | tee -a "$OUTFILE"
 
 case "$MODE" in
   jit)
-    echo "Running: IRIS_JIT=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE"
-    IRIS_JIT=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE"
+    F="$(features jit,lightning)"
+    echo "Running: IRIS_JIT=1 ${JIT_VARS}${CARGO} run --release --features ${F}" | tee -a "$OUTFILE"
+    IRIS_JIT=1 "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE"
     ;;
   verify)
-    echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 ${JIT_VARS}cargo run --release --features jit,lightning" | tee -a "$OUTFILE"
-    IRIS_JIT=1 IRIS_JIT_VERIFY=1 cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE"
+    F="$(features jit,lightning)"
+    echo "Running: IRIS_JIT=1 IRIS_JIT_VERIFY=1 ${JIT_VARS}${CARGO} run --release --features ${F}" | tee -a "$OUTFILE"
+    IRIS_JIT=1 IRIS_JIT_VERIFY=1 "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE"
     ;;
   nojit)
-    echo "Running: cargo run --release --features jit,lightning (no IRIS_JIT)" | tee -a "$OUTFILE"
-    cargo run --release --features jit,lightning 2>&1 | tee -a "$OUTFILE"
+    F="$(features jit,lightning)"
+    echo "Running: ${CARGO} run --release --features ${F} (no IRIS_JIT)" | tee -a "$OUTFILE"
+    "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE"
     ;;
   interp)
-    echo "Running: cargo run --release --features lightning (no jit feature)" | tee -a "$OUTFILE"
-    cargo run --release --features lightning 2>&1 | tee -a "$OUTFILE"
+    F="$(features lightning)"
+    echo "Running: ${CARGO} run --release --features ${F} (no jit feature)" | tee -a "$OUTFILE"
+    "$CARGO" run --release --features "$F" 2>&1 | tee -a "$OUTFILE"
     ;;
   perf)
     PERFREPORT="perf-report-$(date +%Y%m%d-%H%M%S).txt"
     echo "Building (profiling profile, no jit feature)..." | tee -a "$OUTFILE"
-    cargo build --profile profiling --features lightning 2>&1 | tee -a "$OUTFILE"
+    "$CARGO" build --profile profiling --features lightning 2>&1 | tee -a "$OUTFILE"
     echo "--- Press Ctrl-C when you have enough samples ---"
     perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris
     echo "Processing perf data..." | tee -a "$OUTFILE"
@@ -56,16 +98,107 @@ case "$MODE" in
   perf-jit)
     PERFREPORT="perf-report-jit-$(date +%Y%m%d-%H%M%S).txt"
     echo "Building (profiling profile, jit feature)..." | tee -a "$OUTFILE"
-    cargo build --profile profiling --features jit,lightning 2>&1 | tee -a "$OUTFILE"
-    echo "--- Press Ctrl-C when you have enough samples ---"
-    IRIS_JIT=1 perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris
-    echo "Processing perf data..." | tee -a "$OUTFILE"
-    perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1
+    # Honour -f/--features here too: extras are appended to the base list.
+    F="$(features jit,lightning)"
+    "$CARGO" build --profile profiling --features "$F" 2>&1 | tee -a "$OUTFILE"
+    # Pick the profiler by host OS: perf on Linux, sample on macOS.
+    case "$(uname -s)" in
+      Linux)
+        echo "--- Press Ctrl-C when you have enough samples ---"
+        IRIS_JIT=1 perf record -F 99 --call-graph dwarf -o perf.data -- ./target/profiling/iris
+        echo "Processing perf data..." | tee -a "$OUTFILE"
+        perf report --stdio --no-children -i perf.data > "$PERFREPORT" 2>&1
+        ;;
+      Darwin)
+        # Sampling window in seconds; override with PERF_DURATION.
+        DURATION="${PERF_DURATION:-30}"
+        echo "--- Launching iris; will sample for ${DURATION}s after it starts ---"
+        IRIS_JIT=1 ./target/profiling/iris &
+        IRIS_PID=$!
+        # Give the emulator a moment to start before attaching the sampler.
+        sleep 2
+        echo "Sampling PID $IRIS_PID for ${DURATION}s..."
+        # A sampler failure is tolerated so the emulator is still shut down below.
+        sample "$IRIS_PID" "$DURATION" -f "$PERFREPORT" 2>&1 || true
+        kill "$IRIS_PID" 2>/dev/null
+        wait "$IRIS_PID" 2>/dev/null
+        ;;
+      *)
+        echo "Unsupported platform for profiling: $(uname -s)"
+        exit 1
+        ;;
+    esac
     echo "Perf report saved to: $PERFREPORT"
     ;;
+  smoke)
+    # Headless boot smoke test: builds, boots IRIX with JIT under a timeout,
+    # then greps the captured log for milestones.
+    # Uses COW overlay to protect disk image. Exits 0 if all milestones pass,
+    # 1 if any check (or the build) fails.
+    TIMEOUT="${IRIS_SMOKE_TIMEOUT:-120}"
+    # Honour -f/--features here too: extras are appended to the base list.
+    F="$(features jit,lightning)"
+    echo "Running: headless smoke test (timeout=${TIMEOUT}s)" | tee -a "$OUTFILE"
+
+    # Clean up stale overlays
+    rm -f scsi1.raw.overlay scsi2.raw.overlay
+
+    # Build outside the timeout so compile time is not counted against the boot.
+    "$CARGO" build --release --features "$F" 2>&1 | tee -a "$OUTFILE"
+    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+      echo "Build failed; skipping smoke run" | tee -a "$OUTFILE"
+      exit 1
+    fi
+
+    # Run headless with JIT, capture output, kill after timeout.
+    # The pipeline runs in the foreground and PIPESTATUS[0] is read because
+    # `$!` on a backgrounded pipeline is the PID of tee, not of the emulator.
+    IRIS_JIT=1 timeout "$TIMEOUT" "$CARGO" run --release --features "$F" -- --headless 2>&1 | tee -a "$OUTFILE"
+    EXIT=${PIPESTATUS[0]}
+
+    echo "" | tee -a "$OUTFILE"
+    echo "=== Smoke Test Results ===" | tee -a "$OUTFILE"
+    echo "Emulator exit status: $EXIT (124 = stopped by timeout)" | tee -a "$OUTFILE"
+
+    PASS=0
+    FAIL=0
+
+    # check_milestone NAME PATTERN
+    # Records a PASS if PATTERN (a grep basic regex) occurs anywhere in the
+    # log file, otherwise a FAIL. Updates the global PASS/FAIL counters.
+    check_milestone() {
+      local name="$1"
+      local pattern="$2"
+      if grep -q "$pattern" "$OUTFILE"; then
+        echo "  PASS: $name" | tee -a "$OUTFILE"
+        PASS=$((PASS + 1))
+      else
+        echo "  FAIL: $name (pattern '$pattern' not found)" | tee -a "$OUTFILE"
+        FAIL=$((FAIL + 1))
+      fi
+    }
+
+    check_milestone "JIT initialized"    "JIT: adaptive mode"
+    check_milestone "First compilation"  "JIT: compiled #1"
+    check_milestone "Blocks compiled"    "JIT:.*blocks,"
+
+    # Check for crashes
+    if grep -qiE "KERNEL FAULT|PANIC|TLBMISS.*KERNEL|SEGV" "$OUTFILE"; then
+      echo "  FAIL: kernel panic detected" | tee -a "$OUTFILE"
+      FAIL=$((FAIL + 1))
+    else
+      echo "  PASS: no kernel panic" | tee -a "$OUTFILE"
+      PASS=$((PASS + 1))
+    fi
+
+    # Check instruction count reached a reasonable level.
+    # Takes the last "JIT: <n> total" figure in the log. sed is used instead
+    # of `grep -oP` because BSD grep (macOS) has no -P option.
+    LAST_TOTAL=$(sed -n 's/.*JIT: \([0-9][0-9]*\) total.*/\1/p' "$OUTFILE" | tail -1)
+    if [ -n "$LAST_TOTAL" ] && [ "$LAST_TOTAL" -gt 100000000 ] 2>/dev/null; then
+      echo "  PASS: reached ${LAST_TOTAL} instructions" | tee -a "$OUTFILE"
+      PASS=$((PASS + 1))
+    else
+      echo "  FAIL: instruction count too low (${LAST_TOTAL:-0})" | tee -a "$OUTFILE"
+      FAIL=$((FAIL + 1))
+    fi
+
+    echo "" | tee -a "$OUTFILE"
+    echo "Score: $PASS passed, $FAIL failed" | tee -a "$OUTFILE"
+
+    # Clean up overlay
+    rm -f scsi1.raw.overlay scsi2.raw.overlay
+
+    if [ "$FAIL" -gt 0 ]; then
+      exit 1
+    fi
+    exit 0
+    ;;
   *)
     echo "Unknown mode: $MODE"
-    echo "Usage: $0 [jit|verify|nojit|interp|perf|perf-jit]"
+    echo "Usage: $0 [jit|verify|nojit|interp|perf|perf-jit|smoke]"
     exit 1
     ;;
 esac

diff --git a/rules/jit/cranelift-opt-levelnone-is-the-right-trade-for-throughput-jits.md b/rules/jit/cranelift-opt-levelnone-is-the-right-trade-for-throughput-jits.md
@@ -0,0 +1,28 @@
+# Cranelift opt_level=none is the right trade for throughput JITs
+
+**Keywords:** cranelift, opt_level, compilation speed, throughput, JIT overhead, perf
+**Category:** jit
+
+# Cranelift opt_level=none for Interpreter-First JITs
+
+For a JIT where blocks compile frequently and run a few hundred times each, **`opt_level = "none"` beats `"speed"` for total throughput** by 2-3x. Generated native code is ~10-20% slower per instruction, but compile time drops ~3-5x.
+
+## Why
+Profiling showed 66% of MIPS-CPU thread time was inside Cranelift compilation passes. Switching to `opt_level = "none"` left that percentage unchanged but increased total emulator throughput 2.5x — the same share of time is spent in Cranelift, but more actual execution happens per second. The generated-code slowdown is dwarfed by the compile speedup because every block gets compiled whether it runs 10 times or 10,000 times.
+
+## How to apply
+In `BlockCompiler::new` (`src/jit/compiler.rs`):
+```rust
+flag_builder.set("opt_level", "none").unwrap();
+```
+
+This is specifically correct when:
+- The JIT is interpreter-first (blocks share execution time with interpreter)
+- Most blocks are compiled once and run a moderate number of times
+- Chain-compile-on-miss aggressively fills the cache with blocks that are used briefly
+
+It would be wrong if blocks ran billions of times each (classic hot-loop JIT scenario), where spending more compile time for better native code pays off.
+
+## Measurement
+Use macOS `sample` (or Linux `perf`) on the running emulator. Count samples inside `cranelift_*` symbols vs total thread samples. Compare instructions/second before and after (log total count, divide by wall-clock).
+
diff --git a/rules/jit/cranelift-regalloc2-helper-diamond-limit-is-platform-dependent.md b/rules/jit/cranelift-regalloc2-helper-diamond-limit-is-platform-dependent.md
@@ -0,0 +1,30 @@
+# Cranelift regalloc2 helper-diamond limit is platform-dependent
+
+**Keywords:** cranelift, regalloc2, aarch64, x86_64, helper call, ok_block, exc_block, diamond, Full tier, block length
+**Category:** jit
+
+# Cranelift Helper-Diamond Limit Differs by Architecture
+
+Full-tier JIT blocks terminate after N load/store helper calls to avoid Cranelift regalloc2 miscompilations. The safe N is platform-specific: **aarch64 tolerates 3, x86_64 only 1**. Bumping above the threshold produces silent miscompilations (confirmed by IRIS_JIT_VERIFY catching real GPR mismatches).
+
+## Why
+Each load or store helper call emits an `ok_block` / `exc_block` CFG diamond (the helper can return an exception status, so we branch after it). Multiple chained diamonds create complex control flow that stresses Cranelift's regalloc2 allocator. On x86_64 (15 GPRs), register pressure plus the CFG complexity hits an edge case and produces wrong code. On aarch64 (30 GPRs) there is more headroom: 3 diamonds are tolerable, but 4 starts failing.
+
+## How to apply
+In `src/jit/dispatch.rs` `trace_block`:
+```rust
+let max_helpers: u32 = if cfg!(target_arch = "aarch64") { 3 } else { 1 };
+let mut helper_count: u32 = 0;
+// ... inside loop ...
+if tier == BlockTier::Full && (is_compilable_store(&d) || is_compilable_load(&d)) {
+    helper_count += 1;
+    instrs.push((raw, d));
+    if helper_count >= max_helpers { break; }
+}
+```
+
+Don't raise the aarch64 limit without running `IRIS_JIT_VERIFY=1` for 500M+ instructions to catch silent miscompilations. GPR mismatches at Full tier, len=3+ with off-by-small-number values (jit=0x97 interp=0x98) are the signature.
+
+## History
+Original code had `if has_helper { break; }` unconditionally with a comment citing x86_64 regalloc2 issues. Binary-searched on aarch64: max_helpers=2 works, 3 works, 4 produces real codegen mismatches in verify mode. Kept at 3 for safety margin.
+
diff --git a/rules/jit/jit-block-chaining-needs-max-chain-instrs-cap-for-interrupt-timing.md b/rules/jit/jit-block-chaining-needs-max-chain-instrs-cap-for-interrupt-timing.md
@@ -0,0 +1,24 @@
+# JIT block chaining needs MAX_CHAIN_INSTRS cap for interrupt timing
+
+**Keywords:** jit, chaining, interrupt latency, MAX_CHAIN_INSTRS, interpreter burst, cp0_count, timing, ugly login
+**Category:** jit
+
+# JIT Chain Length Affects Interrupt Delivery
+
+Block chaining (running one cached block after another without returning to the interpreter burst) must be capped by cumulative instruction count, NOT by chain block count. 32 instructions is safe; 64 causes "ugly login" / timing-dependent corruption.
+
+## Why
+The interpreter checks pending interrupts before every single instruction. The JIT defers interrupt checking to post-block bookkeeping (cp0_count advance + merge IP bits into cp0_cause). Without chaining, a block exit returns to the interpreter burst which immediately sees the merged interrupts.
+
+With chaining, multiple blocks execute back-to-back. Each chained block does its own post-block cp0_count advance and IP-bit merge, so timer interrupts get set in cp0_cause correctly — BUT the actual interrupt dispatch is deferred until the chain ends and the interpreter runs again. Worst-case interrupt delivery latency = MAX_CHAIN_INSTRS.
+
+IRIX has code paths that depend on interrupt timing tight enough that 32 instructions is tolerable but 64 is not. Measured empirically: MAX_CHAIN_INSTRS=32 boots cleanly, =64 produces timing-dependent boot failures.
+
+## How to apply
+In the chain loop in `run_jit_dispatch`, accumulate `chain_instrs += next_block_len` and `break` when it reaches 32. Don't check "is interrupt pending" inside the chain — IRIX's level-triggered device interrupts (IP2-IP6) are almost always asserted, which would break every chain after one block.
+
+If timing-related crashes reappear after touching chain code, **reduce MAX_CHAIN_INSTRS before debugging codegen**. This is the user's own heuristic, validated in practice.
+
+## History
+Initial chaining implementation checked `interrupts_enabled() && (cp0_cause & cp0_status & IM) != 0` to break the chain. Broke every chain immediately because devices are constantly asserted → JIT% barely moved. Removing the check but keeping MAX_CHAIN_INSTRS=32 gave clean boots with 3-4x more JIT coverage.
+
diff --git a/rules/jit/jit-delay-slot-fault-handling-exclude-all-faulting-instructions.md b/rules/jit/jit-delay-slot-fault-handling-exclude-all-faulting-instructions.md
@@ -0,0 +1,32 @@
+# JIT delay slot fault handling — exclude ALL faulting instructions
+
+**Keywords:** jit, delay slot, loads, stores, cp0_epc, BD bit, branch, ERET, in_delay_slot, trace_block
+**Category:** jit
+
+# JIT Delay Slot Fault Handling
+
+Any instruction that can fault (load, store, FPU op, COP0 side effect, etc.) MUST be excluded from JIT-compiled branch delay slots.
+
+## Why
+If a delay slot instruction faults (TLB miss, bus error), the JIT exception path runs `sync_to_executor`, which explicitly clears `in_delay_slot` and `delay_slot_target` (context.rs, by design — compiled blocks normally handle their own delay slots).
+
+Then `exec.step()` re-executes at the faulting PC without delay-slot context. If it faults again, `handle_exception` sets `cp0_epc = faulting_PC` with **BD=0**. On ERET, the CPU returns to the faulting load/store, not to the branch. **The branch is permanently skipped** — execution diverges silently until something crashes.
+
+## Symptoms
+Process crashes mid-boot, "ugly login screen", graphics corruption, TLB panics. Appears as silent state corruption, not a direct fault — the divergence accumulates before manifesting.
+
+## How to apply
+In `src/jit/dispatch.rs` `trace_block`, when inspecting the delay slot instruction after a branch:
+```rust
+let delay_can_fault = is_compilable_load(&delay_d) || is_compilable_store(&delay_d);
+if is_compilable_for_tier(&delay_d, tier) && !delay_can_fault {
+    // compile delay slot into block
+}
+// else: drop delay slot AND the branch (pop both; block ends before branch)
+```
+
+Whenever adding a new JIT-compilable instruction type that can fault (e.g., LWL/LWR, LL/SC, FPU loads/stores), extend `delay_can_fault` to exclude it from delay slots.
+
+## History
+The codebase already excluded stores from delay slots with a detailed comment, but the same fix wasn't applied to loads. Adding the Loads tier silently corrupted IRIX boot for weeks until binary-searching the block length (max=1 works, max=3 fails) isolated it.
+
diff --git a/rules/jit/jit-profile-pre-compilation-at-startup-causes-prom-hang.md b/rules/jit/jit-profile-pre-compilation-at-startup-causes-prom-hang.md
@@ -0,0 +1,23 @@
+# JIT profile pre-compilation at startup causes PROM hang
+
+**Keywords:** jit, profile, pre-compilation, PROM, debug_fetch_instr, CpuBusErrorDevice, MC CPU Error, boot hang
+**Category:** jit
+
+# JIT Profile Pre-compilation Breaks Boot
+
+Pre-compiling above-Alu JIT blocks from a saved profile at startup causes the PROM to hang in a retry loop. Pre-compiling AFTER PROM exit causes IRIX kernel panics (UTLB miss).
+
+## Why (startup variant)
+Profile entries contain kernel/userspace virtual PCs from a previous session. At startup, the kernel isn't loaded yet — those physical addresses are served by `CpuBusErrorDevice` (the bus error catcher for unmapped regions). Each `debug_fetch_instr` during pre-compilation triggers `mc.report_cpu_error()`, dirtying `REG_CPU_ERROR_ADDR` / `REG_CPU_ERROR_STAT` on the emulated Memory Controller. The PROM reads those registers during hardware init, sees errors, and retries forever.
+
+## Why (post-PROM variant, UTLB panic)
+Even deferring pre-compilation until after PROM exit and compiling incrementally (64 entries per dispatch batch) triggered IRIX UTLB-miss panics shortly after kernel boot. Exact mechanism unknown — suspected that the bulk `debug_fetch_instr` calls evict L2 lines that the kernel's initial data structures depend on (L2 is inclusive of D-cache on emulated R4000), or nanotlb[Fetch] state interference. Unresolved.
+
+## How to apply
+The `load_profile()` call in `run_jit_dispatch` should NOT feed into `compile_block` during boot. Blocks compile on-demand when first hit, which is safe because the kernel is already resident by then.
+
+If you want persistent compiled blocks across sessions, store the **compiled native code bytes** in the profile rather than re-tracing. That avoids `debug_fetch_instr` entirely.
+
+## History
+Discovered when investigating why max_tier=1 (Loads) hung in PROM while max_tier=0 (Alu) booted fine. MAX_TIER=0 skipped all pre-compilation (entries get capped to Alu, then `continue`); MAX_TIER≥1 actually traced and compiled. The MC:CPU Error messages during pre-compilation were the smoking gun.
+