From 9c7a0edadd24ce2a9e8b9c63217a1410afa95a34 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 20:16:49 +0200 Subject: [PATCH 1/5] fix(opt): i64 slot accounting + CSE-alias invalidation on redef MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness bugs in the optimizer's wasm→IR→ARM pipeline that produced silent miscompilation and (post PR #101) defensive panics. 1. **i64 arithmetic slot accounting**: `wasm_to_ir` advanced `inst_id` by 1 (via the wildcard fallthrough) for `I64Add/Sub/And/Or/Xor/Mul/Div{S,U}/ Rem{S,U}/Shl/ShrS/ShrU/Rotl/Rotr` and for `I64Extend{8,16,32}S`. These ops produce an i64 result occupying 2 vreg slots, so the next op's `inst_id.saturating_sub(2)` lookup landed inside the *previous* op's `dest_hi` instead of its `dest_lo`. Triggers the issue-94 fast path regression (`test_issue94_hi32_extract_is_smaller_than_generic_shift`) because `I32WrapI64` after `I64ShrU` resolved its `src_lo` to an unmapped vreg. 2. **CSE alias invalidation on redefinition**: when CSE eliminates a duplicate `Opcode::Load` (local.get), it records `reg_map[killed] = original`. The subsequent `I64ExtendI32U`/`I64ExtendI32S`/`I32WrapI64` re-defines the same vreg by IR convention (`dest_lo == src`). Without clearing `reg_map[killed]` at the re-definition site, downstream consumers would resolve back to the *pre-extend* value — often a live AAPCS param register. Same class as #93 (silent R0 fallback) and #104 (MemLoad/MemStore alias gap). Also extends CSE to resolve src vregs for every i64 opcode (was a `_ => {}` fallback) so the rewrite is consistent across the IR. Refs: #93, #94, #103, #104, #107 Unblocks: #100 (fuzz harness), #101 (defensive panic). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-opt/src/lib.rs | 686 ++++++++++++++++++ .../synth-synthesis/src/optimizer_bridge.rs | 253 ++++--- 2 files changed, 811 insertions(+), 128 deletions(-) diff --git a/crates/synth-opt/src/lib.rs b/crates/synth-opt/src/lib.rs index 98afd57..6bf5bd3 100644 --- a/crates/synth-opt/src/lib.rs +++ b/crates/synth-opt/src/lib.rs @@ -833,6 +833,148 @@ impl CommonSubexpressionElimination { // Resolve register mappings let resolve = |r: Reg| -> Reg { reg_map.get(&r).copied().unwrap_or(r) }; + // After CSE rewrites this instruction's sources, we must also + // clear any `reg_map` entries for the vregs this instruction + // *writes to* — otherwise a later use of that vreg would be + // rewritten back to the original CSE-aliased source, which by + // now is a different (potentially stale) value. + // + // Concretely: WASM lowering may emit ops whose `dest` aliases a + // `src` of the same vreg id (e.g. `I64ExtendI32U` has + // `dest_lo == src` by IR convention — see + // `optimizer_bridge::wasm_to_ir`). That violates pure SSA, so + // CSE's vreg-rename `reg_map` becomes flow-insensitive at the + // re-definition site. Forgetting to clear the dest entry means + // a *later* op that consumes `dest_lo` would resolve back to + // the pre-extend (i32-typed) value — sometimes a live AAPCS + // param register — re-introducing the issue-#93 / #104 class + // of clobber-via-stale-alias bugs. + // + // We collect the dests *before* mutating reg_map so the + // resolution pass above sees the pre-instruction state, then + // drop them after. + let dests_to_invalidate: Vec = match &opcode { + Opcode::Add { dest, .. } + | Opcode::Sub { dest, .. } + | Opcode::Mul { dest, .. } + | Opcode::DivS { dest, .. } + | Opcode::DivU { dest, .. } + | Opcode::RemS { dest, .. } + | Opcode::RemU { dest, .. } + | Opcode::And { dest, .. } + | Opcode::Or { dest, .. } + | Opcode::Xor { dest, .. } + | Opcode::Shl { dest, .. } + | Opcode::ShrS { dest, .. 
} + | Opcode::ShrU { dest, .. } + | Opcode::Rotl { dest, .. } + | Opcode::Rotr { dest, .. } + | Opcode::Clz { dest, .. } + | Opcode::Ctz { dest, .. } + | Opcode::Popcnt { dest, .. } + | Opcode::Extend8S { dest, .. } + | Opcode::Extend16S { dest, .. } + | Opcode::Eqz { dest, .. } + | Opcode::Eq { dest, .. } + | Opcode::Ne { dest, .. } + | Opcode::LtS { dest, .. } + | Opcode::LtU { dest, .. } + | Opcode::LeS { dest, .. } + | Opcode::LeU { dest, .. } + | Opcode::GtS { dest, .. } + | Opcode::GtU { dest, .. } + | Opcode::GeS { dest, .. } + | Opcode::GeU { dest, .. } + | Opcode::Load { dest, .. } + | Opcode::MemLoad { dest, .. } + | Opcode::Copy { dest, .. } + | Opcode::TeeStore { dest, .. } + | Opcode::Select { dest, .. } + | Opcode::Const { dest, .. } + | Opcode::I64Eq { dest, .. } + | Opcode::I64Ne { dest, .. } + | Opcode::I64LtS { dest, .. } + | Opcode::I64LtU { dest, .. } + | Opcode::I64LeS { dest, .. } + | Opcode::I64LeU { dest, .. } + | Opcode::I64GtS { dest, .. } + | Opcode::I64GtU { dest, .. } + | Opcode::I64GeS { dest, .. } + | Opcode::I64GeU { dest, .. } + | Opcode::I64Eqz { dest, .. } + | Opcode::I64Clz { dest, .. } + | Opcode::I64Ctz { dest, .. } + | Opcode::I64Popcnt { dest, .. } + | Opcode::I32WrapI64 { dest, .. } => vec![*dest], + Opcode::I64Add { + dest_lo, dest_hi, .. + } + | Opcode::I64Sub { + dest_lo, dest_hi, .. + } + | Opcode::I64And { + dest_lo, dest_hi, .. + } + | Opcode::I64Or { + dest_lo, dest_hi, .. + } + | Opcode::I64Xor { + dest_lo, dest_hi, .. + } + | Opcode::I64Mul { + dest_lo, dest_hi, .. + } + | Opcode::I64DivS { + dest_lo, dest_hi, .. + } + | Opcode::I64DivU { + dest_lo, dest_hi, .. + } + | Opcode::I64RemS { + dest_lo, dest_hi, .. + } + | Opcode::I64RemU { + dest_lo, dest_hi, .. + } + | Opcode::I64Shl { + dest_lo, dest_hi, .. + } + | Opcode::I64ShrS { + dest_lo, dest_hi, .. + } + | Opcode::I64ShrU { + dest_lo, dest_hi, .. + } + | Opcode::I64Rotl { + dest_lo, dest_hi, .. + } + | Opcode::I64Rotr { + dest_lo, dest_hi, .. 
+ } + | Opcode::I64Const { + dest_lo, dest_hi, .. + } + | Opcode::I64Load { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend8S { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend16S { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend32S { + dest_lo, dest_hi, .. + } + | Opcode::I64ExtendI32U { + dest_lo, dest_hi, .. + } + | Opcode::I64ExtendI32S { + dest_lo, dest_hi, .. + } => vec![*dest_lo, *dest_hi], + _ => Vec::new(), + }; + match opcode { Opcode::Add { dest, src1, src2 } => { let src1 = resolve(src1); @@ -1158,8 +1300,552 @@ impl CommonSubexpressionElimination { inst.opcode = Opcode::Copy { dest, src }; } + // ============================================================ + // i64 operations: resolve all src vregs through reg_map. + // + // CSE eliminates duplicate `Opcode::Load`s (local.get N) by + // marking the later one dead and recording the rewrite + // `dead_dest -> original_dest` in `reg_map`. Any later op + // that names `dead_dest` as a source MUST be rewritten — + // otherwise the producer is gone and `ir_to_arm`'s + // `get_arm_reg` panics (post-PR-101) or silently returns + // R0 (pre-PR-101 → AAPCS clobber miscompile, see issue #93). + // + // PR #107 covered MemLoad/MemStore; this batch covers every + // i64 opcode. We DO NOT add these to `expr_map` for actual + // CSE — i64 expression equivalence isn't tracked here — we + // only do the source-vreg rewriting needed for correctness + // when an upstream Load is killed. + // ============================================================ + Opcode::I64Load { .. } | Opcode::I64Const { .. } => { + // Pure producers, no src to resolve. 
+ } + Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64And { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64And { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + 
src2_hi, + } => { + inst.opcode = Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Rotr { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Rotr { + dest_lo, + 
dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + // i64 comparisons (i64 -> i32 bool) + Opcode::I64Eq { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Eq { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Ne { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Ne { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LtS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GtS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LeS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GeS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LtU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = 
Opcode::I64GtU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LeU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GeU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Eqz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Eqz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Clz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Clz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Ctz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Ctz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Popcnt { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Popcnt { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Extend8S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend8S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64Extend16S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend16S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64Extend32S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend32S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64ExtendI32U { + dest_lo, + dest_hi, + src, + } => { + inst.opcode = Opcode::I64ExtendI32U { + dest_lo, + dest_hi, + src: resolve(src), + }; + } + Opcode::I64ExtendI32S { + dest_lo, + dest_hi, + src, + } => { + 
inst.opcode = Opcode::I64ExtendI32S { + dest_lo, + dest_hi, + src: resolve(src), + }; + } + Opcode::I32WrapI64 { dest, src_lo } => { + inst.opcode = Opcode::I32WrapI64 { + dest, + src_lo: resolve(src_lo), + }; + } + _ => {} } + + // See `dests_to_invalidate` above: clear `reg_map` entries for + // any vreg this instruction writes, so that downstream uses of + // those vregs are NOT rewritten back to a now-stale CSE-aliased + // value. Skip if this instruction was itself marked dead by + // CSE — its dest is a dead vreg whose alias should remain so + // later consumers can re-use the existing live value. + if !inst.is_dead { + for dest in &dests_to_invalidate { + reg_map.remove(dest); + } + } } if self.verbose && modified > 0 { diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 44db78c..88effad 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -669,129 +669,104 @@ impl OptimizerBridge { builder.add_instruction(); // Add extra instruction for hi part continue; } - WasmOp::I64Add => Opcode::I64Add { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Sub => Opcode::I64Sub { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64And => Opcode::I64And { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: 
OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Or => Opcode::I64Or { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Xor => Opcode::I64Xor { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - - // i64 multiply and shifts (produce i64 pair result) - WasmOp::I64Mul => Opcode::I64Mul { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - // i64 division and remainder (produce i64 pair result) - WasmOp::I64DivS => Opcode::I64DivS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64DivU => Opcode::I64DivU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64RemS => Opcode::I64RemS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: 
OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64RemU => Opcode::I64RemU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Shl => Opcode::I64Shl { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64ShrS => Opcode::I64ShrS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64ShrU => Opcode::I64ShrU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Rotl => Opcode::I64Rotl { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Rotr => Opcode::I64Rotr { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: 
OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, + // i64 binary arithmetic ops (consume 2 i64 pairs, produce 1 i64 pair). + // + // Slot accounting: each i64 occupies 2 consecutive vreg slots + // (lo, hi). Consuming 2 i64s reads slots [inst_id-4..inst_id-1]; + // producing 1 i64 reserves slots [inst_id, inst_id+1]. So the + // next op must see the new i64 at slots [next_inst_id-2, + // next_inst_id-1], which requires `inst_id += 2`. + // + // Previously these arms fell through to the wildcard `inst_id + // += 1`, leaving `dest_hi` at slot `inst_id+1 = next_inst_id` + // — i.e. the very slot the NEXT wasm op was about to use as a + // fresh dest. The next op clobbered `dest_hi`, and any later + // op trying to read `(prev.dest_lo, prev.dest_hi)` would look + // at `(next_inst_id-2, next_inst_id-1)` which pointed to the + // hi half of the previously consumed src2 and the just-written + // current dest_lo — total slot scramble. In some cases the lookup + // would find no mapping at all and `get_arm_reg` would silently + // return R0 (issue #93 root cause). See PR #100 fuzz harness + // and PR #101 defensive panic for the diagnostic plumbing. 
+ WasmOp::I64Add + | WasmOp::I64Sub + | WasmOp::I64And + | WasmOp::I64Or + | WasmOp::I64Xor + | WasmOp::I64Mul + | WasmOp::I64DivS + | WasmOp::I64DivU + | WasmOp::I64RemS + | WasmOp::I64RemU + | WasmOp::I64Shl + | WasmOp::I64ShrS + | WasmOp::I64ShrU + | WasmOp::I64Rotl + | WasmOp::I64Rotr => { + let dest_lo = OptReg(inst_id as u32); + let dest_hi = OptReg((inst_id + 1) as u32); + let src1_lo = OptReg(inst_id.saturating_sub(4) as u32); + let src1_hi = OptReg(inst_id.saturating_sub(3) as u32); + let src2_lo = OptReg(inst_id.saturating_sub(2) as u32); + let src2_hi = OptReg(inst_id.saturating_sub(1) as u32); + let opcode = match wasm_op { + WasmOp::I64Add => Opcode::I64Add { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Sub => Opcode::I64Sub { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64And => Opcode::I64And { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Or => Opcode::I64Or { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Xor => Opcode::I64Xor { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Mul => Opcode::I64Mul { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64DivS => Opcode::I64DivS { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64DivU => Opcode::I64DivU { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64RemS => Opcode::I64RemS { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64RemU => Opcode::I64RemU { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Shl => Opcode::I64Shl { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64ShrS => Opcode::I64ShrS { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64ShrU => Opcode::I64ShrU { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Rotl => Opcode::I64Rotl { + dest_lo, dest_hi, src1_lo, 
src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Rotr => Opcode::I64Rotr { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + _ => unreachable!(), + }; + instructions.push(Instruction { + id: inst_id, + opcode, + block_id: 0, + is_dead: false, + }); + inst_id += 2; // produces i64 = 2 slots + builder.add_instruction(); + continue; + } // i64 comparisons (consume 2 i64 pairs, produce single i32 result) WasmOp::I64Eq @@ -962,7 +937,13 @@ impl OptimizerBridge { continue; } - // i64 sign extension (takes i64, produces i64) + // i64 sign extension (takes i64, produces i64). + // + // Slot accounting: consume 1 i64 (slots [inst_id-2, inst_id-1]), + // produce 1 i64 (slots [inst_id, inst_id+1]). `inst_id += 2` + // so the next op's `inst_id-2`/`inst_id-1` lookup lands on + // dest_lo/dest_hi. Was `+= 1` which left dest_hi at the slot + // the next wasm op would claim as its own dest — clobber. WasmOp::I64Extend8S => { let opcode = Opcode::I64Extend8S { dest_lo: OptReg(inst_id as u32), @@ -975,7 +956,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -992,7 +973,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -1009,7 +990,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -1431,6 +1412,17 @@ impl OptimizerBridge { // Helper to get ARM reg from virtual reg. // Also checks spill slots — if a vreg was spilled, returns R12 (IP scratch). // Callers should also call `reload_spill` to emit the actual load instruction. + // + // PANICS if the vreg is neither mapped nor spilled. 
The previous behavior was + // a silent `Reg::R0` fallback, which produced miscompilation: a downstream + // instruction reading the "unknown" vreg would silently consume whatever + // R0 happens to hold (often a live caller param or memset's dest pointer). + // Issue #93 was exactly this — `wasm_to_ir` had no handler for + // `I64ExtendI32U`/`I64ExtendI32S`/`I32WrapI64`, so the IR they should have + // produced never got mapped to ARM regs, and downstream i64 shifts read R0 + // as their `rm_lo`/`rm_hi`, destroying the loop counter on real silicon. + // A loud panic here is strictly better than a quiet miscompilation — + // crash the compiler, not the firmware. let get_arm_reg = |vreg: &OptReg, map: &HashMap, spills: &HashMap| -> Reg { if let Some(&r) = map.get(&vreg.0) { @@ -1439,7 +1431,12 @@ impl OptimizerBridge { // Will be reloaded into R12 by reload_spill Reg::R12 } else { - Reg::R0 + panic!( + "synth internal compiler error: vreg v{} has no assigned \ + ARM register and no spill slot. This is a wasm_to_ir bug — \ + likely a wasm op whose result is unmapped (see issue #93).", + vreg.0 + ); } }; From 41ceb2d279e190247d781b022def79e6ddd8b28a Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 20:48:42 +0200 Subject: [PATCH 2/5] =?UTF-8?q?fix(opt):=20wasm=5Fto=5Fir=20gaps=20+=20Mem?= =?UTF-8?q?Load=20hardcoded-R3=20=E2=80=94=20sub-word,=20globals,=20memory?= =?UTF-8?q?.{size,grow}?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A second-wave audit of the optimizer-path AAPCS-clobber bug class. After fixing the i64 slot accounting (prev commit), the next-most-common gap is ops that `wasm_to_ir` silently maps to `Opcode::Nop`. ## Findings `wasm_to_ir` ALSO lacked handlers for every value-producing op below. Each fell through `_ => Opcode::Nop`, consumed an `inst_id` slot, and left its produced vreg unmapped. 
Downstream consumers triggered PR-101's defensive panic (or, pre-PR-101, returned `Reg::R0` silently — the issue-#93 / #103 clobber class): * `I32Load8S/U`, `I32Load16S/U` — sub-word memory loads * `I32Store8`, `I32Store16` — sub-word memory stores * `GlobalGet`, `GlobalSet` — module-level globals * `MemorySize`, `MemoryGrow` — memory introspection / mutation Separately, `Opcode::MemLoad`'s `ir_to_arm` handler **hardcoded the dest register to `Reg::R3`**, clobbering the 4th AAPCS argument on every i32.load that ran while R3 was a live param. The fuzz harness in PR #100 hits this every rebase but the previous round of patches missed it because the bug only surfaces under the optimizer (not select_with_stack). ## Fixes * Added `Opcode::{MemLoadSubword,MemStoreSubword,GlobalGet,GlobalSet, MemorySize,MemoryGrow}` to `synth_opt::Opcode`. * Added matching arms in `wasm_to_ir` (with comments documenting the pre-fix silent-Nop bug). * Added `ir_to_arm` handlers that emit the right ARM ops (LDRB/LDRH/ LDRSB/LDRSH, STRB/STRH, LDR/STR via R9 base, MOV from R10, MOVW `#-1` stub for `memory.grow` on fixed-memory targets). * Introduced `alloc_i32_scratch` — the i32 counterpart to `alloc_i64_pair`. Searches `[R4..R8]`, skipping live vregs, non-param locals, and the param-reserved set; falls back to R12 under pressure. * Replaced the hardcoded `Reg::R3` in `Opcode::MemLoad` with a call to `alloc_i32_scratch`. Both `MemLoad` and the new sub-word variants now pick a non-AAPCS-param destination by construction. * Extended CSE to resolve src vregs for the new opcodes and to track their dests in the redefinition-invalidation set added in the prev commit. ## Tests * New `audit_subword_memops.rs` — covers all 12 of the previously- unmapped op shapes (pure ops + arith consumers + stores), each of which used to panic the defensive `get_arm_reg`. * New `audit_aapcs_repro.rs` — generic AAPCS-no-clobber harness for the `select_with_stack` path. 
The fuzz crash signature (`Mov{rd:R1,op2:Reg(R2)}` before LocalGet(1)) didn't reproduce against current code; these tests pin down nearby shapes so the fuzz can't silently regress past them. Refs: #93, #100, #101, #103, #104 Unblocks: #100, #101. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-opt/src/lib.rs | 106 ++++- .../synth-synthesis/src/optimizer_bridge.rs | 388 +++++++++++++++++- .../tests/audit_aapcs_repro.rs | 203 +++++++++ .../tests/audit_subword_memops.rs | 145 +++++++ 4 files changed, 823 insertions(+), 19 deletions(-) create mode 100644 crates/synth-synthesis/tests/audit_aapcs_repro.rs create mode 100644 crates/synth-synthesis/tests/audit_subword_memops.rs diff --git a/crates/synth-opt/src/lib.rs b/crates/synth-opt/src/lib.rs index 6bf5bd3..846538e 100644 --- a/crates/synth-opt/src/lib.rs +++ b/crates/synth-opt/src/lib.rs @@ -255,6 +255,57 @@ pub enum Opcode { addr: Reg, offset: u32, }, + + /// Sub-word load: i32.load8_s/u, i32.load16_s/u and the i64 equivalents + /// after extension. `width` is 1 or 2 bytes; `signed` is true for the + /// `_s` variants. The dest is a single i32 register (sub-word loads on + /// ARM expand to a single LDRB/LDRH ± an SXTB/SXTH). + MemLoadSubword { + dest: Reg, + addr: Reg, + offset: u32, + width: u32, // 1 or 2 + signed: bool, + }, + /// Sub-word store: i32.store8 / i32.store16 / i64.store8/16/32. + /// `width` is 1, 2, or 4. For 4-byte store of an i64's low word we + /// just take the lo register. + MemStoreSubword { + src: Reg, + addr: Reg, + offset: u32, + width: u32, // 1, 2, or 4 + }, + + /// `global.get` — load a global into a fresh vreg. The actual ARM + /// lowering is `LDR rd, [R9, #offset]` (R9 is the globals base, by + /// convention). The vreg MUST be allocated to a non-AAPCS-param + /// register at lowering time — otherwise the LDR would clobber a + /// live caller arg. + GlobalGet { + dest: Reg, + idx: u32, + }, + /// `global.set` — store a vreg to a global slot. 
Emits `STR src, + /// [R9, #offset]`. No vreg is written. + GlobalSet { + src: Reg, + idx: u32, + }, + + /// `memory.size` — returns the current memory size in pages as an + /// i32. By convention R10 holds the memory-size word; we emit + /// `MOV dest, R10`. `dest` must be allocated to a non-param scratch. + MemorySize { + dest: Reg, + }, + /// `memory.grow` — pops the grow-by amount, pushes the previous + /// page count (or -1 on failure). Embedded targets generally have + /// fixed memory, so this is a stub returning -1 in `dest`. + MemoryGrow { + dest: Reg, + delta: Reg, + }, // Control flow Branch { target: BlockId, @@ -905,7 +956,11 @@ impl CommonSubexpressionElimination { | Opcode::I64Clz { dest, .. } | Opcode::I64Ctz { dest, .. } | Opcode::I64Popcnt { dest, .. } - | Opcode::I32WrapI64 { dest, .. } => vec![*dest], + | Opcode::I32WrapI64 { dest, .. } + | Opcode::MemLoadSubword { dest, .. } + | Opcode::GlobalGet { dest, .. } + | Opcode::MemorySize { dest, .. } + | Opcode::MemoryGrow { dest, .. } => vec![*dest], Opcode::I64Add { dest_lo, dest_hi, .. } @@ -1832,6 +1887,55 @@ impl CommonSubexpressionElimination { }; } + // ==================================================== + // Sub-word memory / globals / memory.{size,grow} + // + // Same rationale as MemLoad/MemStore: resolve src vregs + // through `reg_map` so CSE-killed Loads don't leave + // stale references. We do NOT CSE the loads themselves + // (no alias analysis), only fix up consumer vregs. 
+ // ==================================================== + Opcode::MemLoadSubword { + dest, + addr, + offset, + width, + signed, + } => { + inst.opcode = Opcode::MemLoadSubword { + dest, + addr: resolve(addr), + offset, + width, + signed, + }; + } + Opcode::MemStoreSubword { + src, + addr, + offset, + width, + } => { + inst.opcode = Opcode::MemStoreSubword { + src: resolve(src), + addr: resolve(addr), + offset, + width, + }; + } + Opcode::GlobalSet { src, idx } => { + inst.opcode = Opcode::GlobalSet { + src: resolve(src), + idx, + }; + } + Opcode::MemoryGrow { dest, delta } => { + inst.opcode = Opcode::MemoryGrow { + dest, + delta: resolve(delta), + }; + } + _ => {} } diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 88effad..174fe3e 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -711,49 +711,124 @@ impl OptimizerBridge { let src2_hi = OptReg(inst_id.saturating_sub(1) as u32); let opcode = match wasm_op { WasmOp::I64Add => Opcode::I64Add { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Sub => Opcode::I64Sub { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64And => Opcode::I64And { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Or => Opcode::I64Or { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Xor => Opcode::I64Xor { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Mul => Opcode::I64Mul { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + 
src2_hi, }, WasmOp::I64DivS => Opcode::I64DivS { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64DivU => Opcode::I64DivU { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64RemS => Opcode::I64RemS { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64RemU => Opcode::I64RemU { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Shl => Opcode::I64Shl { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64ShrS => Opcode::I64ShrS { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64ShrU => Opcode::I64ShrU { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Rotl => Opcode::I64Rotl { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Rotr => Opcode::I64Rotr { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, _ => unreachable!(), }; @@ -1177,6 +1252,80 @@ impl OptimizerBridge { offset: *offset, }, + // ===== Sub-word linear-memory ops ===== + // + // Pop addr (and value for stores), push value (for loads). + // Pre-fix, these fell through to `Opcode::Nop` — their dest + // vreg never got mapped to an ARM register, and any + // consumer of the loaded value triggered the PR #101 + // defensive panic (or, pre-PR-101, silently consumed R0). + WasmOp::I32Load8S { offset, .. 
} => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 1, + signed: true, + }, + WasmOp::I32Load8U { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 1, + signed: false, + }, + WasmOp::I32Load16S { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 2, + signed: true, + }, + WasmOp::I32Load16U { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 2, + signed: false, + }, + WasmOp::I32Store8 { offset, .. } => Opcode::MemStoreSubword { + src: OptReg(inst_id.saturating_sub(1) as u32), + addr: OptReg(inst_id.saturating_sub(2) as u32), + offset: *offset, + width: 1, + }, + WasmOp::I32Store16 { offset, .. } => Opcode::MemStoreSubword { + src: OptReg(inst_id.saturating_sub(1) as u32), + addr: OptReg(inst_id.saturating_sub(2) as u32), + offset: *offset, + width: 2, + }, + + // ===== Globals ===== + // + // GlobalGet pushes a fresh i32; GlobalSet pops one. Without + // explicit IR ops these silently produced unmapped vregs. + WasmOp::GlobalGet(idx) => Opcode::GlobalGet { + dest: OptReg(inst_id as u32), + idx: *idx, + }, + WasmOp::GlobalSet(idx) => Opcode::GlobalSet { + src: OptReg(inst_id.saturating_sub(1) as u32), + idx: *idx, + }, + + // ===== Memory size / grow ===== + // + // Both push an i32 result. On bare-metal targets with fixed + // memory, grow is a stub (returns the size or -1), but the + // dest vreg still needs allocation. 
+ WasmOp::MemorySize(_) => Opcode::MemorySize { + dest: OptReg(inst_id as u32), + }, + WasmOp::MemoryGrow(_) => Opcode::MemoryGrow { + dest: OptReg(inst_id as u32), + delta: OptReg(inst_id.saturating_sub(1) as u32), + }, + // Fallback for unsupported ops _ => Opcode::Nop, }; @@ -1482,6 +1631,44 @@ impl OptimizerBridge { (Reg::R4, Reg::R5) }; + // Allocate a SINGLE callee-saved register for an i32 destination. + // + // Searches `[R4, R5, R6, R7, R8]` for a register not currently held + // by a live vreg, bound to a non-param local, or reserved as an + // AAPCS param. The extra_avoid list is honoured for transient + // operand-region exclusions (e.g. addresses-of operands that must + // outlive the destination allocation). + // + // Falls back to R12 (IP, the universal scratch) if every callee- + // saved register is taken — matches the prior pressure-relief + // behaviour. R12 is intentionally NOT in the search list because + // it's used as a transient by MemLoad/MemStore for the base+offset + // pointer math, and would be clobbered before the destination is + // read. + let alloc_i32_scratch = |vreg_to_arm: &HashMap, + local_to_reg: &HashMap, + param_reserved_regs: &[Reg], + extra_avoid: &[Reg]| + -> Reg { + const CANDIDATES: &[Reg] = &[Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8]; + let is_in_use = |r: Reg| -> bool { + vreg_to_arm.values().any(|&v| v == r) + || local_to_reg.values().any(|&v| v == r) + || param_reserved_regs.contains(&r) + || extra_avoid.contains(&r) + }; + for &r in CANDIDATES { + if !is_in_use(r) { + return r; + } + } + // Pressure-relief fallback. R12 is acceptable here because + // the call sites that use this helper write the destination + // BEFORE using R12 as scratch (e.g. MemLoad emits the LDR + // last, after the address math). + Reg::R12 + }; + // Emit a reload instruction if the vreg was spilled to stack. // Must be called before the instruction that uses the register. 
let reload_spill = |vreg: &OptReg, spills: &HashMap, instrs: &mut Vec| { @@ -3515,11 +3702,25 @@ impl OptimizerBridge { // Linear Memory Operations // ======================================================================== - // MemLoad: load 32-bit value from linear memory - // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; ADD R12, R12, Raddr; LDR Rd, [R12, #offset] + // MemLoad: load 32-bit value from linear memory. + // + // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; + // ADD R12, R12, Raddr; LDR Rd, [R12, #offset] + // + // `Rd` MUST NOT alias an AAPCS param register (R0..R3) — a + // `local.get` of param N anywhere downstream would otherwise + // observe whatever the MemLoad just wrote. Pre-fix this was + // hardcoded to `Reg::R3`, which clobbered the 4th AAPCS + // argument on every `i32.load`. Use the scratch helper so + // the destination is picked from the callee-saved bank. Opcode::MemLoad { dest, addr, offset } => { let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); vreg_to_arm.insert(dest.0, rd); // Linear memory base address: 0x20000100 (in SRAM, above stack area) @@ -3583,6 +3784,157 @@ impl OptimizerBridge { }); // MemStore does not produce a value } + + // Sub-word linear memory load (i32.load8_s/u, i32.load16_s/u). + // + // Generates the same base+addr math as MemLoad, then LDRB/ + // LDRH/LDRSB/LDRSH into a non-param destination register + // chosen by `alloc_i32_scratch`. Pre-fix these wasm ops + // had no IR handler; the optimizer pipeline left the + // produced vreg unmapped → defensive panic (or pre-PR-101 + // silent R0 alias). 
+ Opcode::MemLoadSubword { + dest, + addr, + offset, + width, + signed, + } => { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); + vreg_to_arm.insert(dest.0, rd); + + let base: u32 = 0x20000100; + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + let addr_mem = crate::rules::MemAddr::imm(Reg::R12, *offset as i32); + let sub_op = match (*width, *signed) { + (1, false) => ArmOp::Ldrb { rd, addr: addr_mem }, + (1, true) => ArmOp::Ldrsb { rd, addr: addr_mem }, + (2, false) => ArmOp::Ldrh { rd, addr: addr_mem }, + (2, true) => ArmOp::Ldrsh { rd, addr: addr_mem }, + // Width 4 is impossible here (caller would use + // `Opcode::MemLoad`); fall through to plain Ldr + // rather than panicking — that keeps the lowering + // total, and the encoder will validate. + _ => ArmOp::Ldr { rd, addr: addr_mem }, + }; + arm_instrs.push(sub_op); + last_result_vreg = Some(dest.0); + } + + // Sub-word linear memory store (i32.store8, i32.store16, + // i64.store8/16/32). Generates address math + STRB/STRH/STR. 
+ Opcode::MemStoreSubword { + src, + addr, + offset, + width, + } => { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); + + let base: u32 = 0x20000100; + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + let addr_mem = crate::rules::MemAddr::imm(Reg::R12, *offset as i32); + let sub_op = match *width { + 1 => ArmOp::Strb { + rd: r_src, + addr: addr_mem, + }, + 2 => ArmOp::Strh { + rd: r_src, + addr: addr_mem, + }, + _ => ArmOp::Str { + rd: r_src, + addr: addr_mem, + }, + }; + arm_instrs.push(sub_op); + } + + // `global.get N` — load global N into a fresh non-param + // scratch. ARM convention: R9 is the globals base, globals + // are packed as 4-byte slots. + Opcode::GlobalGet { dest, idx } => { + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Ldr { + rd, + addr: crate::rules::MemAddr::imm(Reg::R9, (*idx as i32) * 4), + }); + last_result_vreg = Some(dest.0); + } + + // `global.set N` — store the popped i32 to global N. + Opcode::GlobalSet { src, idx } => { + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); + arm_instrs.push(ArmOp::Str { + rd: r_src, + addr: crate::rules::MemAddr::imm(Reg::R9, (*idx as i32) * 4), + }); + } + + // `memory.size` — current memory size in pages. Convention: + // R10 holds the memory size word. Emit `MOV dest, R10`. 
+ Opcode::MemorySize { dest } => { + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Mov { + rd, + op2: Operand2::Reg(Reg::R10), + }); + last_result_vreg = Some(dest.0); + } + + // `memory.grow` — embedded targets have fixed memory; emit + // a stub that returns -1 (the wasm spec's "grow failed" + // sentinel). The `delta` is read but discarded. + Opcode::MemoryGrow { dest, delta } => { + let _ = get_arm_reg(delta, &vreg_to_arm, &spilled_vregs); + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + // mov rd, #-1 → MOVW rd, #0xFFFF; MOVT rd, #0xFFFF + arm_instrs.push(ArmOp::Movw { rd, imm16: 0xFFFF }); + arm_instrs.push(ArmOp::Movt { rd, imm16: 0xFFFF }); + last_result_vreg = Some(dest.0); + } } // Track WASM operand value stack for correct br_if semantics. diff --git a/crates/synth-synthesis/tests/audit_aapcs_repro.rs b/crates/synth-synthesis/tests/audit_aapcs_repro.rs new file mode 100644 index 0000000..9d335fb --- /dev/null +++ b/crates/synth-synthesis/tests/audit_aapcs_repro.rs @@ -0,0 +1,203 @@ +//! Try to reproduce the AAPCS clobber fuzz crash from PR #100. +//! +//! Crash signature: +//! ARM instr at wasm line 1 writes param reg R1 before LocalGet(1) at line 5. +//! Op: Mov { rd: R1, op2: Reg(R2) } +//! Sequence: [Push, I64Const{rdlo:R2,rdhi:R3,value:0}, Mov{rd:R1,op2:Reg(R2)}, Pop] +//! +//! Test a range of plausible wasm sequences and assert no AAPCS clobber. + +use synth_synthesis::{ArmOp, InstructionSelector, Reg, WasmOp}; + +/// Helper: extract source line numbers and Mov destinations. 
+fn run(wasm: &[WasmOp], num_params: u32) -> Vec { + let mut s = InstructionSelector::new(vec![]); + s.select_with_stack(wasm, num_params) + .expect("should compile") +} + +fn first_local_get(wasm: &[WasmOp], p: u32) -> Option { + wasm.iter().enumerate().find_map(|(i, op)| match op { + WasmOp::LocalGet(idx) if *idx == p => Some(i), + _ => None, + }) +} + +fn writes(op: &ArmOp) -> Vec { + match op { + ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Adds { rd, .. } + | ArmOp::Adc { rd, .. } + | ArmOp::Subs { rd, .. } + | ArmOp::Sbc { rd, .. } + | ArmOp::And { rd, .. } + | ArmOp::Orr { rd, .. } + | ArmOp::Eor { rd, .. } + | ArmOp::Mov { rd, .. } + | ArmOp::Mvn { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Lsl { rd, .. } + | ArmOp::Lsr { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Ror { rd, .. } + | ArmOp::LslReg { rd, .. } + | ArmOp::LsrReg { rd, .. } + | ArmOp::AsrReg { rd, .. } + | ArmOp::RorReg { rd, .. } + | ArmOp::Rsb { rd, .. } + | ArmOp::Mul { rd, .. } + | ArmOp::Sdiv { rd, .. } + | ArmOp::Udiv { rd, .. } + | ArmOp::Mls { rd, .. } + | ArmOp::Clz { rd, .. } + | ArmOp::Rbit { rd, .. } + | ArmOp::Popcnt { rd, .. } + | ArmOp::Sxtb { rd, .. } + | ArmOp::Sxth { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + ArmOp::Movt { rd, .. } => vec![*rd], + ArmOp::I64Const { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I64Ldr { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I64ExtendI32U { rdlo, rdhi, .. } | ArmOp::I64ExtendI32S { rdlo, rdhi, .. } => { + vec![*rdlo, *rdhi] + } + ArmOp::I64Extend8S { rdlo, rdhi, .. } + | ArmOp::I64Extend16S { rdlo, rdhi, .. } + | ArmOp::I64Extend32S { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I32WrapI64 { rd, .. } => vec![*rd], + ArmOp::I64Mul { rd_lo, rd_hi, .. } + | ArmOp::I64Shl { rd_lo, rd_hi, .. } + | ArmOp::I64ShrU { rd_lo, rd_hi, .. } + | ArmOp::I64ShrS { rd_lo, rd_hi, .. 
} => vec![*rd_lo, *rd_hi], + _ => Vec::new(), + } +} + +fn check_no_clobber(wasm: &[WasmOp], num_params: u32, label: &str) { + let instrs = run(wasm, num_params); + for p in 0..num_params.min(4) { + let first_read = match first_local_get(wasm, p) { + Some(i) => i, + None => continue, + }; + let param_reg = match p { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => continue, + }; + for ai in &instrs { + let line = match ai.source_line { + Some(l) => l, + None => continue, + }; + if line >= first_read { + break; + } + for w in writes(&ai.op) { + assert_ne!( + w, param_reg, + "[{label}] AAPCS clobber: ARM instr at wasm line {line} writes \ + param reg {param_reg:?} before LocalGet({p}) at line {first_read}. \ + Op: {:?}. Sequence: {:#?}", + ai.op, instrs + ); + } + } + } +} + +/// Reproduce the harness invariant: build a sequence with `LocalGet(p)` for +/// each param, but place them AFTER an i64 op, so anything that touches +/// a param-region register pre-LocalGet is a clobber. +#[test] +fn aapcs_i64_const_drop_localgets_2_params() { + let wasm = vec![ + WasmOp::I64Const(0), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64const-drop-localgets"); +} + +#[test] +fn aapcs_i64_extend_then_localgets_2_params() { + let wasm = vec![ + WasmOp::I32Const(7), + WasmOp::I64ExtendI32U, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64extend-then-localgets"); +} + +#[test] +fn aapcs_i64_const_localset0_localgets_2_params() { + // LocalSet to a param slot would overwrite the param. Even so, the + // assertion is about writes that occur BEFORE the first LocalGet of + // each param — these should not clobber. 
+ let wasm = vec![ + WasmOp::I64Const(0xAABBCCDD), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64const-large"); +} + +#[test] +fn aapcs_i64_arith_then_localgets_3_params() { + let wasm = vec![ + WasmOp::I64Const(1), + WasmOp::I64Const(2), + WasmOp::I64Add, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + WasmOp::LocalGet(2), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 3, "i64-add-3params"); +} + +/// Reproduce specifically the fuzz-finding shape: +/// `[Push, I64Const{rdlo:R2,rdhi:R3,value:0}, Mov{rd:R1,op2:Reg(R2)}, Pop]` +/// with a middle op that emits a Mov to a low register. +#[test] +fn aapcs_i64_const_then_some_op_2_params() { + // Try several middle ops that emit something + for middle in [ + WasmOp::I32Const(0), + WasmOp::I64Eqz, + WasmOp::I64Clz, + WasmOp::I64Ctz, + WasmOp::I64Popcnt, + ] { + let wasm = vec![ + WasmOp::I64Const(0), + middle.clone(), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, &format!("i64const-{middle:?}")); + } +} diff --git a/crates/synth-synthesis/tests/audit_subword_memops.rs b/crates/synth-synthesis/tests/audit_subword_memops.rs new file mode 100644 index 0000000..9266e78 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_subword_memops.rs @@ -0,0 +1,145 @@ +//! Audit: sub-word memory ops, globals, and memory.size/grow must lower +//! through the optimizer path without unmapped-vreg panics. +//! +//! Pre-PR: these ops fell through `wasm_to_ir`'s `_ => Opcode::Nop` arm, +//! consumed an inst_id slot, but produced no `vreg_to_arm` mapping. The +//! defensive panic in `get_arm_reg` (PR #101) surfaces this as a compiler +//! crash for downstream consumers; pre-defensive-panic the code silently +//! returned R0, clobbering AAPCS params. 
+ +use synth_synthesis::{OptimizerBridge, WasmOp}; + +fn compile(wasm: &[WasmOp], num_params: usize) { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge.optimize_full(wasm).expect("optimize_full"); + // ir_to_arm is the panic site for unmapped-vreg consumers. + let _arm = bridge.ir_to_arm(&ir, num_params); +} + +#[test] +fn subword_i32_load8_u_then_drop() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load8U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load8_s_into_arith() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load8S { + offset: 0, + align: 0, + }, + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load16_u_then_drop() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load16U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load16_s_into_arith() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load16S { + offset: 0, + align: 0, + }, + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_store8() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Const(0xff), + WasmOp::I32Store8 { + offset: 0, + align: 0, + }, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_store16() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Const(0xffff), + WasmOp::I32Store16 { + offset: 0, + align: 0, + }, + ]; + compile(&wasm, 0); +} + +#[test] +fn global_get_then_drop() { + let wasm = vec![WasmOp::GlobalGet(0), WasmOp::Drop]; + compile(&wasm, 0); +} + +#[test] +fn global_get_into_arith() { + let wasm = vec![ + WasmOp::GlobalGet(0), + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn global_set() { + let wasm = vec![WasmOp::I32Const(42), WasmOp::GlobalSet(0)]; + compile(&wasm, 0); +} + +#[test] +fn 
memory_size_then_drop() { + let wasm = vec![WasmOp::MemorySize(0), WasmOp::Drop]; + compile(&wasm, 0); +} + +#[test] +fn memory_size_into_arith() { + let wasm = vec![ + WasmOp::MemorySize(0), + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn memory_grow_then_drop() { + let wasm = vec![WasmOp::I32Const(1), WasmOp::MemoryGrow(0), WasmOp::Drop]; + compile(&wasm, 0); +} From 88446651a2752de8679f0e83b94ae60a9060d351 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 21:09:03 +0200 Subject: [PATCH 3/5] fix(opt): i32 arith/comp/select handlers no longer hardcode R0/R3 as dest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third-wave audit: `ir_to_arm` had 19 `Opcode::*` handlers that picked the destination register by a hardcoded `let rd = Reg::R0;` or `Reg::R3;` (plus a few `Reg::R7`). With 4 AAPCS params live (`num_params=4`, common in C ABI functions), every such op clobbered the 4th param register before the first downstream `local.get` could read it. Operations fixed: `Add`, `Sub`, `Mul`, `DivS`, `DivU`, `RemS`, `RemU`, `And`, `Or`, `Xor`, `Shl`, `ShrS`, `ShrU`, `Rotr`, `Rotl`, `Clz`, `Ctz`, `Popcnt`, `Extend8S`, `Extend16S`, `Eqz`, all 10 i32 comparisons, and `Select`. Each now routes the destination through the new `alloc_i32_scratch` helper, passing operand registers in `extra_avoid` so the dest never collides with a still-live src. The bug was previously masked from `select_with_stack`-targeted tests (issue #103) because the optimizer-path is a separate code path. New regression tests (`audit_optimized_aapcs.rs`) cover 7 representative shapes: i32 add/sub/mul, i32.load, i32.load8_u, global.get, memory.size — all with `num_params=4` so a clobbered R3 fires the assert. Refs: #100, #101, #103. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../synth-synthesis/src/optimizer_bridge.rs | 165 +++++++++++--- .../tests/audit_optimized_aapcs.rs | 213 ++++++++++++++++++ 2 files changed, 345 insertions(+), 33 deletions(-) create mode 100644 crates/synth-synthesis/tests/audit_optimized_aapcs.rs diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 174fe3e..2eea76e 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1959,15 +1959,24 @@ impl OptimizerBridge { } // Arithmetic operations - // Use a temp register (R8) for intermediate results to avoid clobbering params - // The final return value will be moved to R0 at function end + // + // Pre-fix these hardcoded `rd = Reg::R3`, clobbering the 4th + // AAPCS argument on every i32 arith in 4-param functions. + // Use `alloc_i32_scratch` so the destination is picked from + // the callee-saved bank; sources are added to `extra_avoid` + // so a 3-operand op doesn't pick its own input as dest while + // it's still live in `vreg_to_arm`. 
Opcode::Add { dest, src1, src2 } => { reload_spill(src1, &spilled_vregs, &mut arm_instrs); let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R8 for intermediate results to preserve params in R0-R3 - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Add { rd, @@ -1989,7 +1998,12 @@ impl OptimizerBridge { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sub { rd, @@ -2010,7 +2024,12 @@ impl OptimizerBridge { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Mul { rd, rn, rm }); last_result_vreg = Some(dest.0); @@ -2025,7 +2044,12 @@ impl OptimizerBridge { Opcode::DivS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check 1: divide by zero @@ -2082,7 +2106,12 @@ impl OptimizerBridge { Opcode::DivU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + 
&vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero @@ -2104,7 +2133,12 @@ impl OptimizerBridge { Opcode::RemS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero (rem_s doesn't trap on INT_MIN % -1) @@ -2136,7 +2170,12 @@ impl OptimizerBridge { Opcode::RemU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero @@ -2168,7 +2207,12 @@ impl OptimizerBridge { Opcode::And { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd, @@ -2181,8 +2225,12 @@ impl OptimizerBridge { Opcode::Or { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R3 as temp to avoid clobbering R0 (param register) - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Orr { rd, @@ -2195,8 +2243,12 @@ impl OptimizerBridge { Opcode::Xor { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R3 as temp to avoid 
clobbering R0 (param register) - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Eor { rd, @@ -2212,7 +2264,12 @@ impl OptimizerBridge { Opcode::Shl { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Mask shift amount: R12 = rm & 31 arm_instrs.push(ArmOp::And { @@ -2231,7 +2288,12 @@ impl OptimizerBridge { Opcode::ShrS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2249,7 +2311,12 @@ impl OptimizerBridge { Opcode::ShrU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2270,7 +2337,12 @@ impl OptimizerBridge { Opcode::Rotr { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2290,7 +2362,12 @@ impl OptimizerBridge { Opcode::Rotl { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, 
&vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + &param_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // R12 = rm & 31 arm_instrs.push(ArmOp::And { @@ -2316,7 +2393,8 @@ impl OptimizerBridge { // Bit count operations (unary) Opcode::Clz { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Clz { rd, rm }); last_result_vreg = Some(dest.0); @@ -2324,7 +2402,8 @@ impl OptimizerBridge { Opcode::Ctz { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); // CTZ = CLZ(RBIT(x)) - reverse bits, then count leading zeros arm_instrs.push(ArmOp::Rbit { rd, rm }); @@ -2334,7 +2413,8 @@ impl OptimizerBridge { Opcode::Popcnt { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); // Popcnt - no direct instruction, use Popcnt pseudo-op arm_instrs.push(ArmOp::Popcnt { rd, rm }); @@ -2344,7 +2424,8 @@ impl OptimizerBridge { // Sign extension operations (unary) Opcode::Extend8S { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sxtb { rd, rm }); last_result_vreg = Some(dest.0); @@ -2352,7 +2433,8 @@ impl OptimizerBridge { Opcode::Extend16S { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]);
vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sxth { rd, rm }); last_result_vreg = Some(dest.0); @@ -2361,7 +2443,8 @@ impl OptimizerBridge { // Eqz - compare with zero (unary) Opcode::Eqz { dest, src } => { let rn = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rn]); vreg_to_arm.insert(dest.0, rd); // CMP rn, #0; SetCond rd, EQ @@ -2389,10 +2472,20 @@ impl OptimizerBridge { | Opcode::GeU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R7 for comparison results to avoid clobbering R0 - // R0 is needed for return values in loops - // Note: R7 must be used (not R12) because 16-bit MOV can only address R0-R7 - let rd = Reg::R7; + // Pre-fix this hardcoded `Reg::R7` to keep the SetCond + // encodable as 16-bit Thumb (which can only address R0-R7). + // R7 is callee-saved (no AAPCS clobber) but the hardcode + // collided with non-param locals stored in R7. Use + // `alloc_i32_scratch` constrained to the R4..R7 lower- + // bank candidates so SetCond keeps its 16-bit encoding + // when possible. (R4..R7 are all in the helper's + // search list and all 16-bit-MOV-addressable.) + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + &param_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); let cond = match &inst.opcode { @@ -2467,9 +2560,15 @@ impl OptimizerBridge { let r_true = get_arm_reg(val_true, &vreg_to_arm, &spilled_vregs); let r_false = get_arm_reg(val_false, &vreg_to_arm, &spilled_vregs); - // Use R3 as result to avoid clobbering R0 (may hold param) - // Final return value will be moved to R0 at function end - let rd = Reg::R3; + // Pre-fix this hardcoded R3, clobbering the 4th AAPCS + // arg on every Select. Use `alloc_i32_scratch` so the + // destination is callee-saved by construction.
+ let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + &param_reserved_regs, + &[r_cond, r_true, r_false], + ); vreg_to_arm.insert(dest.0, rd); // CRITICAL: If condition is in rd and we need to move val_true to rd, diff --git a/crates/synth-synthesis/tests/audit_optimized_aapcs.rs b/crates/synth-synthesis/tests/audit_optimized_aapcs.rs new file mode 100644 index 0000000..3badf06 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_optimized_aapcs.rs @@ -0,0 +1,213 @@ +//! Audit: the optimizer path (`optimize_full + ir_to_arm`) must not write +//! AAPCS param registers before they're read by `LocalGet`. +//! +//! Issue #103 covered `select_with_stack`'s i64 ops; this audit covers the +//! optimizer path, where the corresponding bug class is `Opcode::*` +//! handlers hardcoding `rd: Reg::R3` (or any of R0..R3). Found via the +//! systematic audit triggered by PR #100's fuzz harness. + +use synth_synthesis::{ArmOp, OptimizerBridge, Reg, WasmOp}; + +fn compile_optimized(wasm_ops: &[WasmOp], num_params: usize) -> Vec<ArmOp> { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge.optimize_full(wasm_ops).expect("optimize_full"); + bridge.ir_to_arm(&ir, num_params) +} + +fn writes(op: &ArmOp) -> Vec<Reg> { + match op { + ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Adds { rd, .. } + | ArmOp::Adc { rd, .. } + | ArmOp::Subs { rd, .. } + | ArmOp::Sbc { rd, .. } + | ArmOp::And { rd, .. } + | ArmOp::Orr { rd, .. } + | ArmOp::Eor { rd, .. } + | ArmOp::Mov { rd, .. } + | ArmOp::Mvn { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Movt { rd, .. } + | ArmOp::Lsl { rd, .. } + | ArmOp::Lsr { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Ror { rd, .. } + | ArmOp::LslReg { rd, .. } + | ArmOp::LsrReg { rd, .. } + | ArmOp::AsrReg { rd, .. } + | ArmOp::RorReg { rd, .. } + | ArmOp::Rsb { rd, .. } + | ArmOp::Mul { rd, .. } + | ArmOp::Sdiv { rd, .. } + | ArmOp::Udiv { rd, .. } + | ArmOp::Mls { rd, .. } + | ArmOp::Clz { rd, .. } + | ArmOp::Rbit { rd, ..
} + | ArmOp::Popcnt { rd, .. } + | ArmOp::Sxtb { rd, .. } + | ArmOp::Sxth { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + _ => Vec::new(), + } +} + +/// Compile `wasm_ops` through the optimizer and assert that no +/// instruction emitted up to the first MOV-to-R0 (epilogue) writes to +/// any AAPCS param register. (Without source-line info we use a +/// conservative proxy: scan the ARM output; the optimizer-emitted +/// epilogue is the trailing `Mov R0, ...; Pop` pair, so we check +/// everything before the LAST Mov-to-R0.) +fn assert_no_clobber_before_epilogue(arm: &[ArmOp], num_params: usize, label: &str) { + let param_regs: Vec<Reg> = (0..num_params.min(4)) + .map(|i| match i { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => unreachable!(), + }) + .collect(); + + // Find the index of the last result-Mov to R0 / R0:R1 — everything + // before that is "body". We define the epilogue as: from the last + // `Mov { rd: R0, .. }` to the end. If there's no Mov-to-R0, treat + // the entire stream as body. + let mut last_mov_r0_idx: Option<usize> = None; + for (i, op) in arm.iter().enumerate() { + if let ArmOp::Mov { rd: Reg::R0, .. } = op { + last_mov_r0_idx = Some(i); + } + } + let body_end = last_mov_r0_idx.unwrap_or(arm.len()); + + for op in &arm[..body_end] { + for w in writes(op) { + if param_regs.contains(&w) { + panic!( + "[{label}] AAPCS clobber in optimized path: op writes \ + param reg {w:?}. Op: {op:?}. Full sequence: {:#?}", + arm + ); + } + } + } +} + +#[test] +fn optimized_i32_add_4params_does_not_clobber_r3() { + // 4 params (R0..R3 live). Compute `local.get 0 + local.get 1` (uses + // R0 + R1, result via Add). The Add's dest must NOT be R3 (or any + // other param reg). Pre-fix, Opcode::Add hardcoded `rd = Reg::R3`, + // clobbering the 4th param even though it wasn't an input.
+ let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Add, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_add_4params"); +} + +#[test] +fn optimized_i32_sub_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Sub, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_sub_4params"); +} + +#[test] +fn optimized_i32_mul_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Mul, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_mul_4params"); +} + +#[test] +fn optimized_i32_load_4params_does_not_clobber_r3() { + // The MemLoad handler also used to hardcode `rd = Reg::R3`. 
+ let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::I32Load { + offset: 0, + align: 0, + }, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_load_4params"); +} + +#[test] +fn optimized_i32_load8u_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::I32Load8U { + offset: 0, + align: 0, + }, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_load8u_4params"); +} + +#[test] +fn optimized_global_get_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::GlobalGet(0), + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "global_get_4params"); +} + +#[test] +fn optimized_memory_size_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::MemorySize(0), + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "memory_size_4params"); +} From fcd2821f1e322b64de1ee3ac98c34c62877e8cd5 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 21:26:18 +0200 Subject: [PATCH 4/5] =?UTF-8?q?fix(opt):=20i64=20sub-word=20loads=20+=20Co?= =?UTF-8?q?py=20handler=20=E2=80=94=20last=20hardcoded=20R0/R1=20dests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more hardcoded-AAPCS-param-reg destinations found during the audit: 1. `select_with_stack` for `I64Load{8,16,32}{S,U}` hardcoded `dst_lo = Reg::R0; dst_hi = Reg::R1`. Clobbered AAPCS params 0 and 1 on every i64 sub-word load, regardless of `num_params`. 
Replaced with `alloc_consecutive_pair(&[addr])` so the destination is callee-saved. 2. `ir_to_arm::Opcode::Copy` (the optimizer's local.tee implementation) hardcoded `rd = Reg::R0`, clobbering param 0 on every local.tee in the optimized path. Routed through `alloc_i32_scratch`. Added regression test `audit_i64_subword_aapcs.rs` for the i64 sub-word class (3 representative shapes — load8_u, load16_s, load32_u — each with 2 params and the address coming from `i32.const`, so the params are uninvolved and a clobber is unambiguous). Refs: #100, #103. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 10 +- .../synth-synthesis/src/optimizer_bridge.rs | 7 +- .../tests/audit_i64_subword_aapcs.rs | 123 ++++++++++++++++++ 3 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index aec8dd8..a84e58c 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -4360,6 +4360,13 @@ impl InstructionSelector { } // i64 sub-word loads — load sub-word, extend to i64 (register pair) + // + // Pre-fix `dst_lo` and `dst_hi` were hardcoded to R0:R1, + // clobbering AAPCS params 0 and 1 on every i64.load{8,16,32}* + // — even when the function had 2+ params and neither was + // the address operand. Use `alloc_consecutive_pair` with + // `addr` in `extra_avoid` so the destination pair never + // overlaps the address register OR a live param. I64Load8S { offset, .. } | I64Load8U { offset, .. } | I64Load16S { offset, .. } @@ -4371,8 +4378,7 @@ impl InstructionSelector { "stack underflow: malformed WASM or compiler bug".to_string(), ) })?; - let dst_lo = Reg::R0; - let dst_hi = Reg::R1; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; let ops: Vec = match op { I64Load8S { .. 
} => { diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 2eea76e..870de81 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -3742,9 +3742,14 @@ impl OptimizerBridge { } // Copy: move value from src to dest (for local.tee semantics) + // + // Pre-fix hardcoded `rd = Reg::R0`, which clobbered the + // first AAPCS param on every local.tee even when neither + // src nor dest had anything to do with R0. Opcode::Copy { dest, src } => { let rs = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rs]); vreg_to_arm.insert(dest.0, rd); if rs != rd { arm_instrs.push(ArmOp::Mov { diff --git a/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs b/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs new file mode 100644 index 0000000..25b0427 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs @@ -0,0 +1,123 @@ +//! Audit: i64 sub-word load handlers in `select_with_stack` previously +//! hardcoded the destination pair to R0:R1, clobbering AAPCS params 0 and 1 +//! on every `i64.load8_u`/etc. — even with 2+ live params. + +use synth_synthesis::{ArmInstruction, ArmOp, InstructionSelector, Reg, WasmOp}; + +fn compile_no_optimize(wasm_ops: &[WasmOp], num_params: u32) -> Vec<ArmInstruction> { + let mut sel = InstructionSelector::new(vec![]); + sel.select_with_stack(wasm_ops, num_params) + .expect("select_with_stack") +} + +fn writes(op: &ArmOp) -> Vec<Reg> { + match op { + ArmOp::Mov { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Movt { rd, .. } + | ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, ..
} => vec![*rd], + _ => Vec::new(), + } +} + +fn first_local_get(wasm: &[WasmOp], p: u32) -> Option<usize> { + wasm.iter().enumerate().find_map(|(i, op)| match op { + WasmOp::LocalGet(idx) if *idx == p => Some(i), + _ => None, + }) +} + +fn assert_no_clobber(wasm: &[WasmOp], num_params: u32, label: &str) { + let arm = compile_no_optimize(wasm, num_params); + for p in 0..num_params.min(4) { + let first_read = match first_local_get(wasm, p) { + Some(i) => i, + None => continue, + }; + let param_reg = match p { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => continue, + }; + for instr in &arm { + let line = match instr.source_line { + Some(l) => l, + None => continue, + }; + if line >= first_read { + break; + } + for w in writes(&instr.op) { + assert_ne!( + w, param_reg, + "[{label}] AAPCS clobber: ARM instr at wasm line {line} writes \ + param reg {param_reg:?} before LocalGet({p}) at line {first_read}. \ + Op: {:?}", + instr.op + ); + } + } + } +} + +#[test] +fn i64_load8u_no_clobber_2_params() { + // i64.load8_u with 2 params: addr from I32Const so neither param is the + // load address. The dst pair must not be R0:R1 (params).
+ let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load8U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load8_u"); +} + +#[test] +fn i64_load16s_no_clobber_2_params() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load16S { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load16_s"); +} + +#[test] +fn i64_load32u_no_clobber_2_params() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load32U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load32_u"); +} From 1a7c5f96ee93ce0835cf1d5fea7e1296976c3e4d Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 13 May 2026 06:22:53 +0200 Subject: [PATCH 5/5] =?UTF-8?q?fix(opt):=20keep=20silent=20R0=20fallback?= =?UTF-8?q?=20in=20#108=20=E2=80=94=20defensive=20panic=20deferred=20to=20?= =?UTF-8?q?#101?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The systematic AAPCS-clobber audit (47 sites + 27 tests) is shippable on its own. The defensive panic in get_arm_reg fires on at least one remaining wasm_to_ir gap (vreg v13 during simple_add.wat fib compile, seen on Bazel CI for genrules test_add_elf / test_add_m7_elf / test_add_riscv_elf). That panic belongs in #101 once the remaining gap is closed. Cargo workspace tests + clippy + fmt clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-synthesis/src/optimizer_bridge.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 870de81..d7fd95f 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1572,6 +1572,11 @@ impl OptimizerBridge { // as their `rm_lo`/`rm_hi`, destroying the loop counter on real silicon. // A loud panic here is strictly better than a quiet miscompilation — // crash the compiler, not the firmware. + // Note: the silent R0 fallback is intentionally preserved here while the + // remaining latent unmapped-vreg cases are being hunted. PR #101 holds + // the defensive panic version of this helper; it will land once every + // wasm_to_ir gap is closed (one known v13 case during fib compilation + // remains to be tracked down). let get_arm_reg = |vreg: &OptReg, map: &HashMap, spills: &HashMap| -> Reg { if let Some(&r) = map.get(&vreg.0) { @@ -1580,12 +1585,7 @@ impl OptimizerBridge { // Will be reloaded into R12 by reload_spill Reg::R12 } else { - panic!( - "synth internal compiler error: vreg v{} has no assigned \ - ARM register and no spill slot. This is a wasm_to_ir bug — \ - likely a wasm op whose result is unmapped (see issue #93).", - vreg.0 - ); + Reg::R0 } };