From 9c7a0edadd24ce2a9e8b9c63217a1410afa95a34 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 20:16:49 +0200 Subject: [PATCH 1/5] fix(opt): i64 slot accounting + CSE-alias invalidation on redef MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness bugs in the optimizer's wasm→IR→ARM pipeline that produced silent miscompilation and (post PR #101) defensive panics. 1. **i64 arithmetic slot accounting**: `wasm_to_ir` advanced `inst_id` by 1 (via the wildcard fallthrough) for `I64Add/Sub/And/Or/Xor/Mul/Div{S,U}/ Rem{S,U}/Shl/ShrS/ShrU/Rotl/Rotr` and for `I64Extend{8,16,32}S`. These ops produce an i64 result occupying 2 vreg slots, so the next op's `inst_id.saturating_sub(2)` lookup landed inside the *previous* op's `dest_hi` instead of its `dest_lo`. Triggers the issue-94 fast path regression (`test_issue94_hi32_extract_is_smaller_than_generic_shift`) because `I32WrapI64` after `I64ShrU` resolved its `src_lo` to an unmapped vreg. 2. **CSE alias invalidation on redefinition**: when CSE eliminates a duplicate `Opcode::Load` (local.get), it records `reg_map[killed] = original`. The subsequent `I64ExtendI32U`/`I64ExtendI32S`/`I32WrapI64` re-defines the same vreg by IR convention (`dest_lo == src`). Without clearing `reg_map[killed]` at the re-definition site, downstream consumers would resolve back to the *pre-extend* value — often a live AAPCS param register. Same class as #93 (silent R0 fallback) and #104 (MemLoad/MemStore alias gap). Also extends CSE to resolve src vregs for every i64 opcode (was a `_ => {}` fallback) so the rewrite is consistent across the IR. Refs: #93, #94, #103, #104, #107 Unblocks: #100 (fuzz harness), #101 (defensive panic). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-opt/src/lib.rs | 686 ++++++++++++++++++ .../synth-synthesis/src/optimizer_bridge.rs | 253 ++++--- 2 files changed, 811 insertions(+), 128 deletions(-) diff --git a/crates/synth-opt/src/lib.rs b/crates/synth-opt/src/lib.rs index 98afd57..6bf5bd3 100644 --- a/crates/synth-opt/src/lib.rs +++ b/crates/synth-opt/src/lib.rs @@ -833,6 +833,148 @@ impl CommonSubexpressionElimination { // Resolve register mappings let resolve = |r: Reg| -> Reg { reg_map.get(&r).copied().unwrap_or(r) }; + // After CSE rewrites this instruction's sources, we must also + // clear any `reg_map` entries for the vregs this instruction + // *writes to* — otherwise a later use of that vreg would be + // rewritten back to the original CSE-aliased source, which by + // now is a different (potentially stale) value. + // + // Concretely: WASM lowering may emit ops whose `dest` aliases a + // `src` of the same vreg id (e.g. `I64ExtendI32U` has + // `dest_lo == src` by IR convention — see + // `optimizer_bridge::wasm_to_ir`). That violates pure SSA, so + // CSE's vreg-rename `reg_map` becomes flow-insensitive at the + // re-definition site. Forgetting to clear the dest entry means + // a *later* op that consumes `dest_lo` would resolve back to + // the pre-extend (i32-typed) value — sometimes a live AAPCS + // param register — re-introducing the issue-#93 / #104 class + // of clobber-via-stale-alias bugs. + // + // We collect the dests *before* mutating reg_map so the + // resolution pass above sees the pre-instruction state, then + // drop them after. + let dests_to_invalidate: Vec = match &opcode { + Opcode::Add { dest, .. } + | Opcode::Sub { dest, .. } + | Opcode::Mul { dest, .. } + | Opcode::DivS { dest, .. } + | Opcode::DivU { dest, .. } + | Opcode::RemS { dest, .. } + | Opcode::RemU { dest, .. } + | Opcode::And { dest, .. } + | Opcode::Or { dest, .. } + | Opcode::Xor { dest, .. } + | Opcode::Shl { dest, .. } + | Opcode::ShrS { dest, .. 
} + | Opcode::ShrU { dest, .. } + | Opcode::Rotl { dest, .. } + | Opcode::Rotr { dest, .. } + | Opcode::Clz { dest, .. } + | Opcode::Ctz { dest, .. } + | Opcode::Popcnt { dest, .. } + | Opcode::Extend8S { dest, .. } + | Opcode::Extend16S { dest, .. } + | Opcode::Eqz { dest, .. } + | Opcode::Eq { dest, .. } + | Opcode::Ne { dest, .. } + | Opcode::LtS { dest, .. } + | Opcode::LtU { dest, .. } + | Opcode::LeS { dest, .. } + | Opcode::LeU { dest, .. } + | Opcode::GtS { dest, .. } + | Opcode::GtU { dest, .. } + | Opcode::GeS { dest, .. } + | Opcode::GeU { dest, .. } + | Opcode::Load { dest, .. } + | Opcode::MemLoad { dest, .. } + | Opcode::Copy { dest, .. } + | Opcode::TeeStore { dest, .. } + | Opcode::Select { dest, .. } + | Opcode::Const { dest, .. } + | Opcode::I64Eq { dest, .. } + | Opcode::I64Ne { dest, .. } + | Opcode::I64LtS { dest, .. } + | Opcode::I64LtU { dest, .. } + | Opcode::I64LeS { dest, .. } + | Opcode::I64LeU { dest, .. } + | Opcode::I64GtS { dest, .. } + | Opcode::I64GtU { dest, .. } + | Opcode::I64GeS { dest, .. } + | Opcode::I64GeU { dest, .. } + | Opcode::I64Eqz { dest, .. } + | Opcode::I64Clz { dest, .. } + | Opcode::I64Ctz { dest, .. } + | Opcode::I64Popcnt { dest, .. } + | Opcode::I32WrapI64 { dest, .. } => vec![*dest], + Opcode::I64Add { + dest_lo, dest_hi, .. + } + | Opcode::I64Sub { + dest_lo, dest_hi, .. + } + | Opcode::I64And { + dest_lo, dest_hi, .. + } + | Opcode::I64Or { + dest_lo, dest_hi, .. + } + | Opcode::I64Xor { + dest_lo, dest_hi, .. + } + | Opcode::I64Mul { + dest_lo, dest_hi, .. + } + | Opcode::I64DivS { + dest_lo, dest_hi, .. + } + | Opcode::I64DivU { + dest_lo, dest_hi, .. + } + | Opcode::I64RemS { + dest_lo, dest_hi, .. + } + | Opcode::I64RemU { + dest_lo, dest_hi, .. + } + | Opcode::I64Shl { + dest_lo, dest_hi, .. + } + | Opcode::I64ShrS { + dest_lo, dest_hi, .. + } + | Opcode::I64ShrU { + dest_lo, dest_hi, .. + } + | Opcode::I64Rotl { + dest_lo, dest_hi, .. + } + | Opcode::I64Rotr { + dest_lo, dest_hi, .. 
+ } + | Opcode::I64Const { + dest_lo, dest_hi, .. + } + | Opcode::I64Load { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend8S { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend16S { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend32S { + dest_lo, dest_hi, .. + } + | Opcode::I64ExtendI32U { + dest_lo, dest_hi, .. + } + | Opcode::I64ExtendI32S { + dest_lo, dest_hi, .. + } => vec![*dest_lo, *dest_hi], + _ => Vec::new(), + }; + match opcode { Opcode::Add { dest, src1, src2 } => { let src1 = resolve(src1); @@ -1158,8 +1300,552 @@ impl CommonSubexpressionElimination { inst.opcode = Opcode::Copy { dest, src }; } + // ============================================================ + // i64 operations: resolve all src vregs through reg_map. + // + // CSE eliminates duplicate `Opcode::Load`s (local.get N) by + // marking the later one dead and recording the rewrite + // `dead_dest -> original_dest` in `reg_map`. Any later op + // that names `dead_dest` as a source MUST be rewritten — + // otherwise the producer is gone and `ir_to_arm`'s + // `get_arm_reg` panics (post-PR-101) or silently returns + // R0 (pre-PR-101 → AAPCS clobber miscompile, see issue #93). + // + // PR #107 covered MemLoad/MemStore; this batch covers every + // i64 opcode. We DO NOT add these to `expr_map` for actual + // CSE — i64 expression equivalence isn't tracked here — we + // only do the source-vreg rewriting needed for correctness + // when an upstream Load is killed. + // ============================================================ + Opcode::I64Load { .. } | Opcode::I64Const { .. } => { + // Pure producers, no src to resolve. 
+ } + Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64And { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64And { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + 
src2_hi, + } => { + inst.opcode = Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Rotr { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Rotr { + dest_lo, + 
dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + // i64 comparisons (i64 -> i32 bool) + Opcode::I64Eq { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Eq { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Ne { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Ne { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LtS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GtS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LeS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GeS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LtU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = 
Opcode::I64GtU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LeU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GeU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Eqz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Eqz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Clz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Clz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Ctz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Ctz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Popcnt { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Popcnt { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Extend8S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend8S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64Extend16S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend16S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64Extend32S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend32S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64ExtendI32U { + dest_lo, + dest_hi, + src, + } => { + inst.opcode = Opcode::I64ExtendI32U { + dest_lo, + dest_hi, + src: resolve(src), + }; + } + Opcode::I64ExtendI32S { + dest_lo, + dest_hi, + src, + } => { + 
inst.opcode = Opcode::I64ExtendI32S { + dest_lo, + dest_hi, + src: resolve(src), + }; + } + Opcode::I32WrapI64 { dest, src_lo } => { + inst.opcode = Opcode::I32WrapI64 { + dest, + src_lo: resolve(src_lo), + }; + } + _ => {} } + + // See `dests_to_invalidate` above: clear `reg_map` entries for + // any vreg this instruction writes, so that downstream uses of + // those vregs are NOT rewritten back to a now-stale CSE-aliased + // value. Skip if this instruction was itself marked dead by + // CSE — its dest is a dead vreg whose alias should remain so + // later consumers can re-use the existing live value. + if !inst.is_dead { + for dest in &dests_to_invalidate { + reg_map.remove(dest); + } + } } if self.verbose && modified > 0 { diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 44db78c..88effad 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -669,129 +669,104 @@ impl OptimizerBridge { builder.add_instruction(); // Add extra instruction for hi part continue; } - WasmOp::I64Add => Opcode::I64Add { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Sub => Opcode::I64Sub { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64And => Opcode::I64And { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: 
OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Or => Opcode::I64Or { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Xor => Opcode::I64Xor { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - - // i64 multiply and shifts (produce i64 pair result) - WasmOp::I64Mul => Opcode::I64Mul { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - // i64 division and remainder (produce i64 pair result) - WasmOp::I64DivS => Opcode::I64DivS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64DivU => Opcode::I64DivU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64RemS => Opcode::I64RemS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: 
OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64RemU => Opcode::I64RemU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Shl => Opcode::I64Shl { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64ShrS => Opcode::I64ShrS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64ShrU => Opcode::I64ShrU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Rotl => Opcode::I64Rotl { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Rotr => Opcode::I64Rotr { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: 
OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, + // i64 binary arithmetic ops (consume 2 i64 pairs, produce 1 i64 pair). + // + // Slot accounting: each i64 occupies 2 consecutive vreg slots + // (lo, hi). Consuming 2 i64s reads slots [inst_id-4..inst_id-1]; + // producing 1 i64 reserves slots [inst_id, inst_id+1]. So the + // next op must see the new i64 at slots [next_inst_id-2, + // next_inst_id-1], which requires `inst_id += 2`. + // + // Previously these arms fell through to the wildcard `inst_id + // += 1`, leaving `dest_hi` at slot `inst_id+1 = next_inst_id` + // — i.e. the very slot the NEXT wasm op was about to use as a + // fresh dest. The next op clobbered `dest_hi`, and any later + // op trying to read `(prev.dest_lo, prev.dest_hi)` would look + // at `(next_inst_id-2, next_inst_id-1)` which pointed to the + // hi half of the previously consumed src2 and the just-written + // current dest_lo — total slot scramble. In some cases the lookup + // would find no mapping at all and `get_arm_reg` would silently + // return R0 (issue #93 root cause). See PR #100 fuzz harness + // and PR #101 defensive panic for the diagnostic plumbing. 
+ WasmOp::I64Add + | WasmOp::I64Sub + | WasmOp::I64And + | WasmOp::I64Or + | WasmOp::I64Xor + | WasmOp::I64Mul + | WasmOp::I64DivS + | WasmOp::I64DivU + | WasmOp::I64RemS + | WasmOp::I64RemU + | WasmOp::I64Shl + | WasmOp::I64ShrS + | WasmOp::I64ShrU + | WasmOp::I64Rotl + | WasmOp::I64Rotr => { + let dest_lo = OptReg(inst_id as u32); + let dest_hi = OptReg((inst_id + 1) as u32); + let src1_lo = OptReg(inst_id.saturating_sub(4) as u32); + let src1_hi = OptReg(inst_id.saturating_sub(3) as u32); + let src2_lo = OptReg(inst_id.saturating_sub(2) as u32); + let src2_hi = OptReg(inst_id.saturating_sub(1) as u32); + let opcode = match wasm_op { + WasmOp::I64Add => Opcode::I64Add { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Sub => Opcode::I64Sub { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64And => Opcode::I64And { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Or => Opcode::I64Or { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Xor => Opcode::I64Xor { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Mul => Opcode::I64Mul { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64DivS => Opcode::I64DivS { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64DivU => Opcode::I64DivU { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64RemS => Opcode::I64RemS { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64RemU => Opcode::I64RemU { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Shl => Opcode::I64Shl { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64ShrS => Opcode::I64ShrS { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64ShrU => Opcode::I64ShrU { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Rotl => Opcode::I64Rotl { + dest_lo, dest_hi, src1_lo, 
src1_hi, src2_lo, src2_hi, + }, + WasmOp::I64Rotr => Opcode::I64Rotr { + dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + }, + _ => unreachable!(), + }; + instructions.push(Instruction { + id: inst_id, + opcode, + block_id: 0, + is_dead: false, + }); + inst_id += 2; // produces i64 = 2 slots + builder.add_instruction(); + continue; + } // i64 comparisons (consume 2 i64 pairs, produce single i32 result) WasmOp::I64Eq @@ -962,7 +937,13 @@ impl OptimizerBridge { continue; } - // i64 sign extension (takes i64, produces i64) + // i64 sign extension (takes i64, produces i64). + // + // Slot accounting: consume 1 i64 (slots [inst_id-2, inst_id-1]), + // produce 1 i64 (slots [inst_id, inst_id+1]). `inst_id += 2` + // so the next op's `inst_id-2`/`inst_id-1` lookup lands on + // dest_lo/dest_hi. Was `+= 1` which left dest_hi at the slot + // the next wasm op would claim as its own dest — clobber. WasmOp::I64Extend8S => { let opcode = Opcode::I64Extend8S { dest_lo: OptReg(inst_id as u32), @@ -975,7 +956,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -992,7 +973,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -1009,7 +990,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -1431,6 +1412,17 @@ impl OptimizerBridge { // Helper to get ARM reg from virtual reg. // Also checks spill slots — if a vreg was spilled, returns R12 (IP scratch). // Callers should also call `reload_spill` to emit the actual load instruction. + // + // PANICS if the vreg is neither mapped nor spilled. 
The previous behavior was + // a silent `Reg::R0` fallback, which produced miscompilation: a downstream + // instruction reading the "unknown" vreg would silently consume whatever + // R0 happens to hold (often a live caller param or memset's dest pointer). + // Issue #93 was exactly this — `wasm_to_ir` had no handler for + // `I64ExtendI32U`/`I64ExtendI32S`/`I32WrapI64`, so the IR they should have + // produced never got mapped to ARM regs, and downstream i64 shifts read R0 + // as their `rm_lo`/`rm_hi`, destroying the loop counter on real silicon. + // A loud panic here is strictly better than a quiet miscompilation — + // crash the compiler, not the firmware. let get_arm_reg = |vreg: &OptReg, map: &HashMap, spills: &HashMap| -> Reg { if let Some(&r) = map.get(&vreg.0) { @@ -1439,7 +1431,12 @@ impl OptimizerBridge { // Will be reloaded into R12 by reload_spill Reg::R12 } else { - Reg::R0 + panic!( + "synth internal compiler error: vreg v{} has no assigned \ + ARM register and no spill slot. This is a wasm_to_ir bug — \ + likely a wasm op whose result is unmapped (see issue #93).", + vreg.0 + ); } }; From 41ceb2d279e190247d781b022def79e6ddd8b28a Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 20:48:42 +0200 Subject: [PATCH 2/5] =?UTF-8?q?fix(opt):=20wasm=5Fto=5Fir=20gaps=20+=20Mem?= =?UTF-8?q?Load=20hardcoded-R3=20=E2=80=94=20sub-word,=20globals,=20memory?= =?UTF-8?q?.{size,grow}?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A second-wave audit of the optimizer-path AAPCS-clobber bug class. After fixing the i64 slot accounting (prev commit), the next-most-common gap is ops that `wasm_to_ir` silently maps to `Opcode::Nop`. ## Findings `wasm_to_ir` ALSO lacked handlers for every value-producing op below. Each fell through `_ => Opcode::Nop`, consumed an `inst_id` slot, and left its produced vreg unmapped. 
Downstream consumers triggered PR-101's defensive panic (or, pre-PR-101, returned `Reg::R0` silently — the issue-#93 / #103 clobber class): * `I32Load8S/U`, `I32Load16S/U` — sub-word memory loads * `I32Store8`, `I32Store16` — sub-word memory stores * `GlobalGet`, `GlobalSet` — module-level globals * `MemorySize`, `MemoryGrow` — memory introspection / mutation Separately, `Opcode::MemLoad`'s `ir_to_arm` handler **hardcoded the dest register to `Reg::R3`**, clobbering the 4th AAPCS argument on every i32.load that ran while R3 was a live param. The fuzz harness in PR #100 hits this every rebase but the previous round of patches missed it because the bug only surfaces under the optimizer (not select_with_stack). ## Fixes * Added `Opcode::{MemLoadSubword,MemStoreSubword,GlobalGet,GlobalSet, MemorySize,MemoryGrow}` to `synth_opt::Opcode`. * Added matching arms in `wasm_to_ir` (with comments documenting the pre-fix silent-Nop bug). * Added `ir_to_arm` handlers that emit the right ARM ops (LDRB/LDRH/ LDRSB/LDRSH, STRB/STRH, LDR/STR via R9 base, MOV from R10, MOVW `#-1` stub for `memory.grow` on fixed-memory targets). * Introduced `alloc_i32_scratch` — the i32 counterpart to `alloc_i64_pair`. Searches `[R4..R8]`, skipping live vregs, non-param locals, and the param-reserved set; falls back to R12 under pressure. * Replaced the hardcoded `Reg::R3` in `Opcode::MemLoad` with a call to `alloc_i32_scratch`. Both `MemLoad` and the new sub-word variants now pick a non-AAPCS-param destination by construction. * Extended CSE to resolve src vregs for the new opcodes and to track their dests in the redefinition-invalidation set added in the prev commit. ## Tests * New `audit_subword_memops.rs` — covers all 12 of the previously- unmapped op shapes (pure ops + arith consumers + stores), each of which used to panic the defensive `get_arm_reg`. * New `audit_aapcs_repro.rs` — generic AAPCS-no-clobber harness for the `select_with_stack` path. 
The fuzz crash signature (`Mov{rd:R1,op2:Reg(R2)}` before LocalGet(1)) didn't reproduce against current code; these tests pin down nearby shapes so the fuzz can't silently regress past them. Refs: #93, #100, #101, #103, #104 Unblocks: #100, #101. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-opt/src/lib.rs | 106 ++++- .../synth-synthesis/src/optimizer_bridge.rs | 388 +++++++++++++++++- .../tests/audit_aapcs_repro.rs | 203 +++++++++ .../tests/audit_subword_memops.rs | 145 +++++++ 4 files changed, 823 insertions(+), 19 deletions(-) create mode 100644 crates/synth-synthesis/tests/audit_aapcs_repro.rs create mode 100644 crates/synth-synthesis/tests/audit_subword_memops.rs diff --git a/crates/synth-opt/src/lib.rs b/crates/synth-opt/src/lib.rs index 6bf5bd3..846538e 100644 --- a/crates/synth-opt/src/lib.rs +++ b/crates/synth-opt/src/lib.rs @@ -255,6 +255,57 @@ pub enum Opcode { addr: Reg, offset: u32, }, + + /// Sub-word load: i32.load8_s/u, i32.load16_s/u and the i64 equivalents + /// after extension. `width` is 1 or 2 bytes; `signed` is true for the + /// `_s` variants. The dest is a single i32 register (sub-word loads on + /// ARM expand to a single LDRB/LDRH ± an SXTB/SXTH). + MemLoadSubword { + dest: Reg, + addr: Reg, + offset: u32, + width: u32, // 1 or 2 + signed: bool, + }, + /// Sub-word store: i32.store8 / i32.store16 / i64.store8/16/32. + /// `width` is 1, 2, or 4. For 4-byte store of an i64's low word we + /// just take the lo register. + MemStoreSubword { + src: Reg, + addr: Reg, + offset: u32, + width: u32, // 1, 2, or 4 + }, + + /// `global.get` — load a global into a fresh vreg. The actual ARM + /// lowering is `LDR rd, [R9, #offset]` (R9 is the globals base, by + /// convention). The vreg MUST be allocated to a non-AAPCS-param + /// register at lowering time — otherwise the LDR would clobber a + /// live caller arg. + GlobalGet { + dest: Reg, + idx: u32, + }, + /// `global.set` — store a vreg to a global slot. 
Emits `STR src, + /// [R9, #offset]`. No vreg is written. + GlobalSet { + src: Reg, + idx: u32, + }, + + /// `memory.size` — returns the current memory size in pages as an + /// i32. By convention R10 holds the memory-size word; we emit + /// `MOV dest, R10`. `dest` must be allocated to a non-param scratch. + MemorySize { + dest: Reg, + }, + /// `memory.grow` — pops the grow-by amount, pushes the previous + /// page count (or -1 on failure). Embedded targets generally have + /// fixed memory, so this is a stub returning -1 in `dest`. + MemoryGrow { + dest: Reg, + delta: Reg, + }, // Control flow Branch { target: BlockId, @@ -905,7 +956,11 @@ impl CommonSubexpressionElimination { | Opcode::I64Clz { dest, .. } | Opcode::I64Ctz { dest, .. } | Opcode::I64Popcnt { dest, .. } - | Opcode::I32WrapI64 { dest, .. } => vec![*dest], + | Opcode::I32WrapI64 { dest, .. } + | Opcode::MemLoadSubword { dest, .. } + | Opcode::GlobalGet { dest, .. } + | Opcode::MemorySize { dest, .. } + | Opcode::MemoryGrow { dest, .. } => vec![*dest], Opcode::I64Add { dest_lo, dest_hi, .. } @@ -1832,6 +1887,55 @@ impl CommonSubexpressionElimination { }; } + // ==================================================== + // Sub-word memory / globals / memory.{size,grow} + // + // Same rationale as MemLoad/MemStore: resolve src vregs + // through `reg_map` so CSE-killed Loads don't leave + // stale references. We do NOT CSE the loads themselves + // (no alias analysis), only fix up consumer vregs. 
+ // ==================================================== + Opcode::MemLoadSubword { + dest, + addr, + offset, + width, + signed, + } => { + inst.opcode = Opcode::MemLoadSubword { + dest, + addr: resolve(addr), + offset, + width, + signed, + }; + } + Opcode::MemStoreSubword { + src, + addr, + offset, + width, + } => { + inst.opcode = Opcode::MemStoreSubword { + src: resolve(src), + addr: resolve(addr), + offset, + width, + }; + } + Opcode::GlobalSet { src, idx } => { + inst.opcode = Opcode::GlobalSet { + src: resolve(src), + idx, + }; + } + Opcode::MemoryGrow { dest, delta } => { + inst.opcode = Opcode::MemoryGrow { + dest, + delta: resolve(delta), + }; + } + _ => {} } diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 88effad..174fe3e 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -711,49 +711,124 @@ impl OptimizerBridge { let src2_hi = OptReg(inst_id.saturating_sub(1) as u32); let opcode = match wasm_op { WasmOp::I64Add => Opcode::I64Add { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Sub => Opcode::I64Sub { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64And => Opcode::I64And { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Or => Opcode::I64Or { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Xor => Opcode::I64Xor { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Mul => Opcode::I64Mul { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + 
src2_hi, }, WasmOp::I64DivS => Opcode::I64DivS { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64DivU => Opcode::I64DivU { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64RemS => Opcode::I64RemS { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64RemU => Opcode::I64RemU { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Shl => Opcode::I64Shl { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64ShrS => Opcode::I64ShrS { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64ShrU => Opcode::I64ShrU { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Rotl => Opcode::I64Rotl { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, WasmOp::I64Rotr => Opcode::I64Rotr { - dest_lo, dest_hi, src1_lo, src1_hi, src2_lo, src2_hi, + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, }, _ => unreachable!(), }; @@ -1177,6 +1252,80 @@ impl OptimizerBridge { offset: *offset, }, + // ===== Sub-word linear-memory ops ===== + // + // Pop addr (and value for stores), push value (for loads). + // Pre-fix, these fell through to `Opcode::Nop` — their dest + // vreg never got mapped to an ARM register, and any + // consumer of the loaded value triggered the PR #101 + // defensive panic (or, pre-PR-101, silently consumed R0). + WasmOp::I32Load8S { offset, .. 
} => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 1, + signed: true, + }, + WasmOp::I32Load8U { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 1, + signed: false, + }, + WasmOp::I32Load16S { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 2, + signed: true, + }, + WasmOp::I32Load16U { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 2, + signed: false, + }, + WasmOp::I32Store8 { offset, .. } => Opcode::MemStoreSubword { + src: OptReg(inst_id.saturating_sub(1) as u32), + addr: OptReg(inst_id.saturating_sub(2) as u32), + offset: *offset, + width: 1, + }, + WasmOp::I32Store16 { offset, .. } => Opcode::MemStoreSubword { + src: OptReg(inst_id.saturating_sub(1) as u32), + addr: OptReg(inst_id.saturating_sub(2) as u32), + offset: *offset, + width: 2, + }, + + // ===== Globals ===== + // + // GlobalGet pushes a fresh i32; GlobalSet pops one. Without + // explicit IR ops these silently produced unmapped vregs. + WasmOp::GlobalGet(idx) => Opcode::GlobalGet { + dest: OptReg(inst_id as u32), + idx: *idx, + }, + WasmOp::GlobalSet(idx) => Opcode::GlobalSet { + src: OptReg(inst_id.saturating_sub(1) as u32), + idx: *idx, + }, + + // ===== Memory size / grow ===== + // + // Both push an i32 result. On bare-metal targets with fixed + // memory, grow is a stub (returns the size or -1), but the + // dest vreg still needs allocation. 
+ WasmOp::MemorySize(_) => Opcode::MemorySize { + dest: OptReg(inst_id as u32), + }, + WasmOp::MemoryGrow(_) => Opcode::MemoryGrow { + dest: OptReg(inst_id as u32), + delta: OptReg(inst_id.saturating_sub(1) as u32), + }, + // Fallback for unsupported ops _ => Opcode::Nop, }; @@ -1482,6 +1631,44 @@ impl OptimizerBridge { (Reg::R4, Reg::R5) }; + // Allocate a SINGLE callee-saved register for an i32 destination. + // + // Searches `[R4, R5, R6, R7, R8]` for a register not currently held + // by a live vreg, bound to a non-param local, or reserved as an + // AAPCS param. The extra_avoid list is honoured for transient + // operand-region exclusions (e.g. addresses-of operands that must + // outlive the destination allocation). + // + // Falls back to R12 (IP, the universal scratch) if every callee- + // saved register is taken — matches the prior pressure-relief + // behaviour. R12 is intentionally NOT in the search list because + // it's used as a transient by MemLoad/MemStore for the base+offset + // pointer math, and would be clobbered before the destination is + // read. + let alloc_i32_scratch = |vreg_to_arm: &HashMap, + local_to_reg: &HashMap, + param_reserved_regs: &[Reg], + extra_avoid: &[Reg]| + -> Reg { + const CANDIDATES: &[Reg] = &[Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8]; + let is_in_use = |r: Reg| -> bool { + vreg_to_arm.values().any(|&v| v == r) + || local_to_reg.values().any(|&v| v == r) + || param_reserved_regs.contains(&r) + || extra_avoid.contains(&r) + }; + for &r in CANDIDATES { + if !is_in_use(r) { + return r; + } + } + // Pressure-relief fallback. R12 is acceptable here because + // the call sites that use this helper write the destination + // BEFORE using R12 as scratch (e.g. MemLoad emits the LDR + // last, after the address math). + Reg::R12 + }; + // Emit a reload instruction if the vreg was spilled to stack. // Must be called before the instruction that uses the register. 
let reload_spill = |vreg: &OptReg, spills: &HashMap, instrs: &mut Vec| { @@ -3515,11 +3702,25 @@ impl OptimizerBridge { // Linear Memory Operations // ======================================================================== - // MemLoad: load 32-bit value from linear memory - // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; ADD R12, R12, Raddr; LDR Rd, [R12, #offset] + // MemLoad: load 32-bit value from linear memory. + // + // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; + // ADD R12, R12, Raddr; LDR Rd, [R12, #offset] + // + // `Rd` MUST NOT alias an AAPCS param register (R0..R3) — a + // `local.get` of param N anywhere downstream would otherwise + // observe whatever the MemLoad just wrote. Pre-fix this was + // hardcoded to `Reg::R3`, which clobbered the 4th AAPCS + // argument on every `i32.load`. Use the scratch helper so + // the destination is picked from the callee-saved bank. Opcode::MemLoad { dest, addr, offset } => { let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); vreg_to_arm.insert(dest.0, rd); // Linear memory base address: 0x20000100 (in SRAM, above stack area) @@ -3583,6 +3784,157 @@ impl OptimizerBridge { }); // MemStore does not produce a value } + + // Sub-word linear memory load (i32.load8_s/u, i32.load16_s/u). + // + // Generates the same base+addr math as MemLoad, then LDRB/ + // LDRH/LDRSB/LDRSH into a non-param destination register + // chosen by `alloc_i32_scratch`. Pre-fix these wasm ops + // had no IR handler; the optimizer pipeline left the + // produced vreg unmapped → defensive panic (or pre-PR-101 + // silent R0 alias). 
+ Opcode::MemLoadSubword { + dest, + addr, + offset, + width, + signed, + } => { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); + vreg_to_arm.insert(dest.0, rd); + + let base: u32 = 0x20000100; + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + let addr_mem = crate::rules::MemAddr::imm(Reg::R12, *offset as i32); + let sub_op = match (*width, *signed) { + (1, false) => ArmOp::Ldrb { rd, addr: addr_mem }, + (1, true) => ArmOp::Ldrsb { rd, addr: addr_mem }, + (2, false) => ArmOp::Ldrh { rd, addr: addr_mem }, + (2, true) => ArmOp::Ldrsh { rd, addr: addr_mem }, + // Width 4 is impossible here (caller would use + // `Opcode::MemLoad`); fall through to plain Ldr + // rather than panicking — that keeps the lowering + // total, and the encoder will validate. + _ => ArmOp::Ldr { rd, addr: addr_mem }, + }; + arm_instrs.push(sub_op); + last_result_vreg = Some(dest.0); + } + + // Sub-word linear memory store (i32.store8, i32.store16, + // i64.store8/16/32). Generates address math + STRB/STRH/STR. 
+ Opcode::MemStoreSubword { + src, + addr, + offset, + width, + } => { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); + + let base: u32 = 0x20000100; + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + let addr_mem = crate::rules::MemAddr::imm(Reg::R12, *offset as i32); + let sub_op = match *width { + 1 => ArmOp::Strb { + rd: r_src, + addr: addr_mem, + }, + 2 => ArmOp::Strh { + rd: r_src, + addr: addr_mem, + }, + _ => ArmOp::Str { + rd: r_src, + addr: addr_mem, + }, + }; + arm_instrs.push(sub_op); + } + + // `global.get N` — load global N into a fresh non-param + // scratch. ARM convention: R9 is the globals base, globals + // are packed as 4-byte slots. + Opcode::GlobalGet { dest, idx } => { + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Ldr { + rd, + addr: crate::rules::MemAddr::imm(Reg::R9, (*idx as i32) * 4), + }); + last_result_vreg = Some(dest.0); + } + + // `global.set N` — store the popped i32 to global N. + Opcode::GlobalSet { src, idx } => { + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); + arm_instrs.push(ArmOp::Str { + rd: r_src, + addr: crate::rules::MemAddr::imm(Reg::R9, (*idx as i32) * 4), + }); + } + + // `memory.size` — current memory size in pages. Convention: + // R10 holds the memory size word. Emit `MOV dest, R10`. 
+ Opcode::MemorySize { dest } => { + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Mov { + rd, + op2: Operand2::Reg(Reg::R10), + }); + last_result_vreg = Some(dest.0); + } + + // `memory.grow` — embedded targets have fixed memory; emit + // a stub that returns -1 (the wasm spec's "grow failed" + // sentinel). The `delta` is read but discarded. + Opcode::MemoryGrow { dest, delta } => { + let _ = get_arm_reg(delta, &vreg_to_arm, &spilled_vregs); + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + // mov rd, #-1 → MOVW rd, #0xFFFF; MOVT rd, #0xFFFF + arm_instrs.push(ArmOp::Movw { rd, imm16: 0xFFFF }); + arm_instrs.push(ArmOp::Movt { rd, imm16: 0xFFFF }); + last_result_vreg = Some(dest.0); + } } // Track WASM operand value stack for correct br_if semantics. diff --git a/crates/synth-synthesis/tests/audit_aapcs_repro.rs b/crates/synth-synthesis/tests/audit_aapcs_repro.rs new file mode 100644 index 0000000..9d335fb --- /dev/null +++ b/crates/synth-synthesis/tests/audit_aapcs_repro.rs @@ -0,0 +1,203 @@ +//! Try to reproduce the AAPCS clobber fuzz crash from PR #100. +//! +//! Crash signature: +//! ARM instr at wasm line 1 writes param reg R1 before LocalGet(1) at line 5. +//! Op: Mov { rd: R1, op2: Reg(R2) } +//! Sequence: [Push, I64Const{rdlo:R2,rdhi:R3,value:0}, Mov{rd:R1,op2:Reg(R2)}, Pop] +//! +//! Test a range of plausible wasm sequences and assert no AAPCS clobber. + +use synth_synthesis::{ArmOp, InstructionSelector, Reg, WasmOp}; + +/// Helper: extract source line numbers and Mov destinations. 
+fn run(wasm: &[WasmOp], num_params: u32) -> Vec { + let mut s = InstructionSelector::new(vec![]); + s.select_with_stack(wasm, num_params) + .expect("should compile") +} + +fn first_local_get(wasm: &[WasmOp], p: u32) -> Option { + wasm.iter().enumerate().find_map(|(i, op)| match op { + WasmOp::LocalGet(idx) if *idx == p => Some(i), + _ => None, + }) +} + +fn writes(op: &ArmOp) -> Vec { + match op { + ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Adds { rd, .. } + | ArmOp::Adc { rd, .. } + | ArmOp::Subs { rd, .. } + | ArmOp::Sbc { rd, .. } + | ArmOp::And { rd, .. } + | ArmOp::Orr { rd, .. } + | ArmOp::Eor { rd, .. } + | ArmOp::Mov { rd, .. } + | ArmOp::Mvn { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Lsl { rd, .. } + | ArmOp::Lsr { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Ror { rd, .. } + | ArmOp::LslReg { rd, .. } + | ArmOp::LsrReg { rd, .. } + | ArmOp::AsrReg { rd, .. } + | ArmOp::RorReg { rd, .. } + | ArmOp::Rsb { rd, .. } + | ArmOp::Mul { rd, .. } + | ArmOp::Sdiv { rd, .. } + | ArmOp::Udiv { rd, .. } + | ArmOp::Mls { rd, .. } + | ArmOp::Clz { rd, .. } + | ArmOp::Rbit { rd, .. } + | ArmOp::Popcnt { rd, .. } + | ArmOp::Sxtb { rd, .. } + | ArmOp::Sxth { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + ArmOp::Movt { rd, .. } => vec![*rd], + ArmOp::I64Const { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I64Ldr { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I64ExtendI32U { rdlo, rdhi, .. } | ArmOp::I64ExtendI32S { rdlo, rdhi, .. } => { + vec![*rdlo, *rdhi] + } + ArmOp::I64Extend8S { rdlo, rdhi, .. } + | ArmOp::I64Extend16S { rdlo, rdhi, .. } + | ArmOp::I64Extend32S { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I32WrapI64 { rd, .. } => vec![*rd], + ArmOp::I64Mul { rd_lo, rd_hi, .. } + | ArmOp::I64Shl { rd_lo, rd_hi, .. } + | ArmOp::I64ShrU { rd_lo, rd_hi, .. } + | ArmOp::I64ShrS { rd_lo, rd_hi, .. 
} => vec![*rd_lo, *rd_hi], + _ => Vec::new(), + } +} + +fn check_no_clobber(wasm: &[WasmOp], num_params: u32, label: &str) { + let instrs = run(wasm, num_params); + for p in 0..num_params.min(4) { + let first_read = match first_local_get(wasm, p) { + Some(i) => i, + None => continue, + }; + let param_reg = match p { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => continue, + }; + for ai in &instrs { + let line = match ai.source_line { + Some(l) => l, + None => continue, + }; + if line >= first_read { + break; + } + for w in writes(&ai.op) { + assert_ne!( + w, param_reg, + "[{label}] AAPCS clobber: ARM instr at wasm line {line} writes \ + param reg {param_reg:?} before LocalGet({p}) at line {first_read}. \ + Op: {:?}. Sequence: {:#?}", + ai.op, instrs + ); + } + } + } +} + +/// Reproduce the harness invariant: build a sequence with `LocalGet(p)` for +/// each param, but place them AFTER an i64 op, so anything that touches +/// a param-region register pre-LocalGet is a clobber. +#[test] +fn aapcs_i64_const_drop_localgets_2_params() { + let wasm = vec![ + WasmOp::I64Const(0), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64const-drop-localgets"); +} + +#[test] +fn aapcs_i64_extend_then_localgets_2_params() { + let wasm = vec![ + WasmOp::I32Const(7), + WasmOp::I64ExtendI32U, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64extend-then-localgets"); +} + +#[test] +fn aapcs_i64_const_localset0_localgets_2_params() { + // LocalSet to a param slot would overwrite the param. Even so, the + // assertion is about writes that occur BEFORE the first LocalGet of + // each param — these should not clobber. 
+ let wasm = vec![ + WasmOp::I64Const(0xAABBCCDD), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64const-large"); +} + +#[test] +fn aapcs_i64_arith_then_localgets_3_params() { + let wasm = vec![ + WasmOp::I64Const(1), + WasmOp::I64Const(2), + WasmOp::I64Add, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + WasmOp::LocalGet(2), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 3, "i64-add-3params"); +} + +/// Reproduce specifically the fuzz-finding shape: +/// `[Push, I64Const{rdlo:R2,rdhi:R3,value:0}, Mov{rd:R1,op2:Reg(R2)}, Pop]` +/// with a middle op that emits a Mov to a low register. +#[test] +fn aapcs_i64_const_then_some_op_2_params() { + // Try several middle ops that emit something + for middle in [ + WasmOp::I32Const(0), + WasmOp::I64Eqz, + WasmOp::I64Clz, + WasmOp::I64Ctz, + WasmOp::I64Popcnt, + ] { + let wasm = vec![ + WasmOp::I64Const(0), + middle.clone(), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, &format!("i64const-{middle:?}")); + } +} diff --git a/crates/synth-synthesis/tests/audit_subword_memops.rs b/crates/synth-synthesis/tests/audit_subword_memops.rs new file mode 100644 index 0000000..9266e78 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_subword_memops.rs @@ -0,0 +1,145 @@ +//! Audit: sub-word memory ops, globals, and memory.size/grow must lower +//! through the optimizer path without unmapped-vreg panics. +//! +//! Pre-PR: these ops fell through `wasm_to_ir`'s `_ => Opcode::Nop` arm, +//! consumed an inst_id slot, but produced no `vreg_to_arm` mapping. The +//! defensive panic in `get_arm_reg` (PR #101) surfaces this as a compiler +//! crash for downstream consumers; pre-defensive-panic the code silently +//! returned R0, clobbering AAPCS params. 
+ +use synth_synthesis::{OptimizerBridge, WasmOp}; + +fn compile(wasm: &[WasmOp], num_params: usize) { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge.optimize_full(wasm).expect("optimize_full"); + // ir_to_arm is the panic site for unmapped-vreg consumers. + let _arm = bridge.ir_to_arm(&ir, num_params); +} + +#[test] +fn subword_i32_load8_u_then_drop() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load8U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load8_s_into_arith() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load8S { + offset: 0, + align: 0, + }, + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load16_u_then_drop() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load16U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load16_s_into_arith() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load16S { + offset: 0, + align: 0, + }, + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_store8() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Const(0xff), + WasmOp::I32Store8 { + offset: 0, + align: 0, + }, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_store16() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Const(0xffff), + WasmOp::I32Store16 { + offset: 0, + align: 0, + }, + ]; + compile(&wasm, 0); +} + +#[test] +fn global_get_then_drop() { + let wasm = vec![WasmOp::GlobalGet(0), WasmOp::Drop]; + compile(&wasm, 0); +} + +#[test] +fn global_get_into_arith() { + let wasm = vec![ + WasmOp::GlobalGet(0), + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn global_set() { + let wasm = vec![WasmOp::I32Const(42), WasmOp::GlobalSet(0)]; + compile(&wasm, 0); +} + +#[test] +fn 
memory_size_then_drop() { + let wasm = vec![WasmOp::MemorySize(0), WasmOp::Drop]; + compile(&wasm, 0); +} + +#[test] +fn memory_size_into_arith() { + let wasm = vec![ + WasmOp::MemorySize(0), + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn memory_grow_then_drop() { + let wasm = vec![WasmOp::I32Const(1), WasmOp::MemoryGrow(0), WasmOp::Drop]; + compile(&wasm, 0); +} From 88446651a2752de8679f0e83b94ae60a9060d351 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 21:09:03 +0200 Subject: [PATCH 3/5] fix(opt): i32 arith/comp/select handlers no longer hardcode R0/R3 as dest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third-wave audit: `ir_to_arm` had 19 `Opcode::*` handlers that picked the destination register by a hardcoded `let rd = Reg::R0;` or `Reg::R3;` (plus a few `Reg::R7`). With 4 AAPCS params live (`num_params=4`, common in C ABI functions), every such op clobbered the 4th param register before the first downstream `local.get` could read it. Operations fixed: `Add`, `Sub`, `Mul`, `DivS`, `DivU`, `RemS`, `RemU`, `And`, `Or`, `Xor`, `Shl`, `ShrS`, `ShrU`, `Rotr`, `Rotl`, `Clz`, `Ctz`, `Popcnt`, `Extend8S`, `Extend16S`, `Eqz`, all 10 i32 comparisons, and `Select`. Each now routes the destination through the new `alloc_i32_scratch` helper, passing operand registers in `extra_avoid` so the dest never collides with a still-live src. The bug was previously masked from `select_with_stack`-targeted tests (issue #103) because the optimizer-path is a separate code path. New regression tests (`audit_optimized_aapcs.rs`) cover 7 representative shapes: i32 add/sub/mul, i32.load, i32.load8_u, global.get, memory.size — all with `num_params=4` so a clobbered R3 fires the assert. Refs: #100, #101, #103. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../synth-synthesis/src/optimizer_bridge.rs | 165 +++++++++++--- .../tests/audit_optimized_aapcs.rs | 213 ++++++++++++++++++ 2 files changed, 345 insertions(+), 33 deletions(-) create mode 100644 crates/synth-synthesis/tests/audit_optimized_aapcs.rs diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 174fe3e..2eea76e 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1959,15 +1959,24 @@ impl OptimizerBridge { } // Arithmetic operations - // Use a temp register (R8) for intermediate results to avoid clobbering params - // The final return value will be moved to R0 at function end + // + // Pre-fix these hardcoded `rd = Reg::R3`, clobbering the 4th + // AAPCS argument on every i32 arith in 4-param functions. + // Use `alloc_i32_scratch` so the destination is picked from + // the callee-saved bank; sources are added to `extra_avoid` + // so a 3-operand op doesn't pick its own input as dest while + // it's still live in `vreg_to_arm`. 
Opcode::Add { dest, src1, src2 } => { reload_spill(src1, &spilled_vregs, &mut arm_instrs); let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R8 for intermediate results to preserve params in R0-R3 - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Add { rd, @@ -1989,7 +1998,12 @@ impl OptimizerBridge { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sub { rd, @@ -2010,7 +2024,12 @@ impl OptimizerBridge { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Mul { rd, rn, rm }); last_result_vreg = Some(dest.0); @@ -2025,7 +2044,12 @@ impl OptimizerBridge { Opcode::DivS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check 1: divide by zero @@ -2082,7 +2106,12 @@ impl OptimizerBridge { Opcode::DivU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + 
&vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero @@ -2104,7 +2133,12 @@ impl OptimizerBridge { Opcode::RemS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero (rem_s doesn't trap on INT_MIN % -1) @@ -2136,7 +2170,12 @@ impl OptimizerBridge { Opcode::RemU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero @@ -2168,7 +2207,12 @@ impl OptimizerBridge { Opcode::And { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd, @@ -2181,8 +2225,12 @@ impl OptimizerBridge { Opcode::Or { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R3 as temp to avoid clobbering R0 (param register) - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Orr { rd, @@ -2195,8 +2243,12 @@ impl OptimizerBridge { Opcode::Xor { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R3 as temp to avoid 
clobbering R0 (param register) - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Eor { rd, @@ -2212,7 +2264,12 @@ impl OptimizerBridge { Opcode::Shl { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Mask shift amount: R12 = rm & 31 arm_instrs.push(ArmOp::And { @@ -2231,7 +2288,12 @@ impl OptimizerBridge { Opcode::ShrS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2249,7 +2311,12 @@ impl OptimizerBridge { Opcode::ShrU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2270,7 +2337,12 @@ impl OptimizerBridge { Opcode::Rotr { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2290,7 +2362,12 @@ impl OptimizerBridge { Opcode::Rotl { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, 
&vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + &param_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // R12 = rm & 31 arm_instrs.push(ArmOp::And { @@ -2316,7 +2393,8 @@ impl OptimizerBridge { // Bit count operations (unary) Opcode::Clz { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Clz { rd, rm }); last_result_vreg = Some(dest.0); @@ -2324,7 +2402,8 @@ impl OptimizerBridge { Opcode::Ctz { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); // CTZ = CLZ(RBIT(x)) - reverse bits, then count leading zeros arm_instrs.push(ArmOp::Rbit { rd, rm }); @@ -2334,7 +2413,8 @@ impl OptimizerBridge { Opcode::Popcnt { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); // Popcnt - no direct instruction, use Popcnt pseudo-op arm_instrs.push(ArmOp::Popcnt { rd, rm }); @@ -2344,7 +2424,8 @@ impl OptimizerBridge { // Sign extension operations (unary) Opcode::Extend8S { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sxtb { rd, rm }); last_result_vreg = Some(dest.0); @@ -2352,7 +2433,8 @@ impl OptimizerBridge { Opcode::Extend16S { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rm]);
vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sxth { rd, rm }); last_result_vreg = Some(dest.0); @@ -2361,7 +2443,8 @@ impl OptimizerBridge { // Eqz - compare with zero (unary) Opcode::Eqz { dest, src } => { let rn = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rn]); vreg_to_arm.insert(dest.0, rd); // CMP rn, #0; SetCond rd, EQ @@ -2389,10 +2472,20 @@ impl OptimizerBridge { | Opcode::GeU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R7 for comparison results to avoid clobbering R0 - // R0 is needed for return values in loops - // Note: R7 must be used (not R12) because 16-bit MOV can only address R0-R7 - let rd = Reg::R7; + // Pre-fix this hardcoded `Reg::R7` to keep the SetCond + // encodable as 16-bit Thumb (which can only address R0-R7). + // R7 is callee-saved (no AAPCS clobber) but the hardcode + // collided with non-param locals stored in R7. Use + // `alloc_i32_scratch` constrained to the R4..R7 lower- + // bank candidates so SetCond keeps its 16-bit encoding + // when possible. (R4..R7 are all in the helper's + // search list and all 16-bit-MOV-addressable.) + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + &param_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); let cond = match &inst.opcode { @@ -2467,9 +2560,15 @@ impl OptimizerBridge { let r_true = get_arm_reg(val_true, &vreg_to_arm, &spilled_vregs); let r_false = get_arm_reg(val_false, &vreg_to_arm, &spilled_vregs); - // Use R3 as result to avoid clobbering R0 (may hold param) - // Final return value will be moved to R0 at function end - let rd = Reg::R3; + // Pre-fix this hardcoded R3, clobbering the 4th AAPCS + // arg on every Select. Use `alloc_i32_scratch` so the + // destination is callee-saved by construction.
+ let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + &param_reserved_regs, + &[r_cond, r_true, r_false], + ); vreg_to_arm.insert(dest.0, rd); // CRITICAL: If condition is in rd and we need to move val_true to rd, diff --git a/crates/synth-synthesis/tests/audit_optimized_aapcs.rs b/crates/synth-synthesis/tests/audit_optimized_aapcs.rs new file mode 100644 index 0000000..3badf06 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_optimized_aapcs.rs @@ -0,0 +1,213 @@ +//! Audit: the optimizer path (`optimize_full + ir_to_arm`) must not write +//! AAPCS param registers before they're read by `LocalGet`. +//! +//! Issue #103 covered `select_with_stack`'s i64 ops; this audit covers the +//! optimizer path, where the corresponding bug class is `Opcode::*` +//! handlers hardcoding `rd: Reg::R3` (or any of R0..R3). Found via the +//! systematic audit triggered by PR #100's fuzz harness. + +use synth_synthesis::{ArmOp, OptimizerBridge, Reg, WasmOp}; + +fn compile_optimized(wasm_ops: &[WasmOp], num_params: usize) -> Vec<ArmOp> { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge.optimize_full(wasm_ops).expect("optimize_full"); + bridge.ir_to_arm(&ir, num_params) +} + +fn writes(op: &ArmOp) -> Vec<Reg> { + match op { + ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Adds { rd, .. } + | ArmOp::Adc { rd, .. } + | ArmOp::Subs { rd, .. } + | ArmOp::Sbc { rd, .. } + | ArmOp::And { rd, .. } + | ArmOp::Orr { rd, .. } + | ArmOp::Eor { rd, .. } + | ArmOp::Mov { rd, .. } + | ArmOp::Mvn { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Movt { rd, .. } + | ArmOp::Lsl { rd, .. } + | ArmOp::Lsr { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Ror { rd, .. } + | ArmOp::LslReg { rd, .. } + | ArmOp::LsrReg { rd, .. } + | ArmOp::AsrReg { rd, .. } + | ArmOp::RorReg { rd, .. } + | ArmOp::Rsb { rd, .. } + | ArmOp::Mul { rd, .. } + | ArmOp::Sdiv { rd, .. } + | ArmOp::Udiv { rd, .. } + | ArmOp::Mls { rd, .. } + | ArmOp::Clz { rd, .. } + | ArmOp::Rbit { rd, ..
} + | ArmOp::Popcnt { rd, .. } + | ArmOp::Sxtb { rd, .. } + | ArmOp::Sxth { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + _ => Vec::new(), + } +} + +/// Compile `wasm_ops` through the optimizer and assert that no +/// instruction emitted up to the first MOV-to-R0 (epilogue) writes to +/// any AAPCS param register. (Without source-line info we use a +/// conservative proxy: scan the ARM output; the optimizer-emitted +/// epilogue is the trailing `Mov R0, ...; Pop` pair, so we check +/// everything before the LAST Mov-to-R0.) +fn assert_no_clobber_before_epilogue(arm: &[ArmOp], num_params: usize, label: &str) { + let param_regs: Vec<Reg> = (0..num_params.min(4)) + .map(|i| match i { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => unreachable!(), + }) + .collect(); + + // Find the index of the last result-Mov to R0 / R0:R1 — everything + // before that is "body". We define the epilogue as: from the last + // `Mov { rd: R0, .. }` to the end. If there's no Mov-to-R0, treat + // the entire stream as body. + let mut last_mov_r0_idx: Option<usize> = None; + for (i, op) in arm.iter().enumerate() { + if let ArmOp::Mov { rd: Reg::R0, .. } = op { + last_mov_r0_idx = Some(i); + } + } + let body_end = last_mov_r0_idx.unwrap_or(arm.len()); + + for op in &arm[..body_end] { + for w in writes(op) { + if param_regs.contains(&w) { + panic!( + "[{label}] AAPCS clobber in optimized path: op writes \ + param reg {w:?}. Op: {op:?}. Full sequence: {:#?}", + arm + ); + } + } + } +} + +#[test] +fn optimized_i32_add_4params_does_not_clobber_r3() { + // 4 params (R0..R3 live). Compute `local.get 0 + local.get 1` (uses + // R0 + R1, result via Add). The Add's dest must NOT be R3 (or any + // other param reg). Pre-fix, Opcode::Add hardcoded `rd = Reg::R3`, + // clobbering the 4th param even though it wasn't an input.
+ let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Add, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_add_4params"); +} + +#[test] +fn optimized_i32_sub_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Sub, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_sub_4params"); +} + +#[test] +fn optimized_i32_mul_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Mul, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_mul_4params"); +} + +#[test] +fn optimized_i32_load_4params_does_not_clobber_r3() { + // The MemLoad handler also used to hardcode `rd = Reg::R3`. 
+ let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::I32Load { + offset: 0, + align: 0, + }, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_load_4params"); +} + +#[test] +fn optimized_i32_load8u_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::I32Load8U { + offset: 0, + align: 0, + }, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_load8u_4params"); +} + +#[test] +fn optimized_global_get_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::GlobalGet(0), + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "global_get_4params"); +} + +#[test] +fn optimized_memory_size_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::MemorySize(0), + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "memory_size_4params"); +} From fcd2821f1e322b64de1ee3ac98c34c62877e8cd5 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 12 May 2026 21:26:18 +0200 Subject: [PATCH 4/5] =?UTF-8?q?fix(opt):=20i64=20sub-word=20loads=20+=20Co?= =?UTF-8?q?py=20handler=20=E2=80=94=20last=20hardcoded=20R0/R1=20dests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more hardcoded-AAPCS-param-reg destinations found during the audit: 1. `select_with_stack` for `I64Load{8,16,32}{S,U}` hardcoded `dst_lo = Reg::R0; dst_hi = Reg::R1`. Clobbered AAPCS params 0 and 1 on every i64 sub-word load, regardless of `num_params`. 
Replaced with `alloc_consecutive_pair(&[addr])` so the destination is callee-saved. 2. `ir_to_arm::Opcode::Copy` (the optimizer's local.tee implementation) hardcoded `rd = Reg::R0`, clobbering param 0 on every local.tee in the optimized path. Routed through `alloc_i32_scratch`. Added regression test `audit_i64_subword_aapcs.rs` for the i64 sub-word class (3 representative shapes — load8_u, load16_s, load32_u — each with 2 params and the address coming from `i32.const`, so the params are uninvolved and a clobber is unambiguous). Refs: #100, #103. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 10 +- .../synth-synthesis/src/optimizer_bridge.rs | 7 +- .../tests/audit_i64_subword_aapcs.rs | 123 ++++++++++++++++++ 3 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index aec8dd8..a84e58c 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -4360,6 +4360,13 @@ impl InstructionSelector { } // i64 sub-word loads — load sub-word, extend to i64 (register pair) + // + // Pre-fix `dst_lo` and `dst_hi` were hardcoded to R0:R1, + // clobbering AAPCS params 0 and 1 on every i64.load{8,16,32}* + // — even when the function had 2+ params and neither was + // the address operand. Use `alloc_consecutive_pair` with + // `addr` in `extra_avoid` so the destination pair never + // overlaps the address register OR a live param. I64Load8S { offset, .. } | I64Load8U { offset, .. } | I64Load16S { offset, .. } @@ -4371,8 +4378,7 @@ impl InstructionSelector { "stack underflow: malformed WASM or compiler bug".to_string(), ) })?; - let dst_lo = Reg::R0; - let dst_hi = Reg::R1; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; let ops: Vec = match op { I64Load8S { .. 
} => { diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 2eea76e..870de81 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -3742,9 +3742,14 @@ impl OptimizerBridge { } // Copy: move value from src to dest (for local.tee semantics) + // + // Pre-fix hardcoded `rd = Reg::R0`, which clobbered the + // first AAPCS param on every local.tee even when neither + // src nor dest had anything to do with R0. Opcode::Copy { dest, src } => { let rs = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, &param_reserved_regs, &[rs]); vreg_to_arm.insert(dest.0, rd); if rs != rd { arm_instrs.push(ArmOp::Mov { diff --git a/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs b/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs new file mode 100644 index 0000000..25b0427 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs @@ -0,0 +1,123 @@ +//! Audit: i64 sub-word load handlers in `select_with_stack` previously +//! hardcoded the destination pair to R0:R1, clobbering AAPCS params 0 and 1 +//! on every `i64.load8_u`/etc. — even with 2+ live params. + +use synth_synthesis::{ArmInstruction, ArmOp, InstructionSelector, Reg, WasmOp}; + +fn compile_no_optimize(wasm_ops: &[WasmOp], num_params: u32) -> Vec<ArmInstruction> { + let mut sel = InstructionSelector::new(vec![]); + sel.select_with_stack(wasm_ops, num_params) + .expect("select_with_stack") +} + +fn writes(op: &ArmOp) -> Vec<Reg> { + match op { + ArmOp::Mov { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Movt { rd, .. } + | ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, ..
} => vec![*rd], + _ => Vec::new(), + } +} + +fn first_local_get(wasm: &[WasmOp], p: u32) -> Option<usize> { + wasm.iter().enumerate().find_map(|(i, op)| match op { + WasmOp::LocalGet(idx) if *idx == p => Some(i), + _ => None, + }) +} + +fn assert_no_clobber(wasm: &[WasmOp], num_params: u32, label: &str) { + let arm = compile_no_optimize(wasm, num_params); + for p in 0..num_params.min(4) { + let first_read = match first_local_get(wasm, p) { + Some(i) => i, + None => continue, + }; + let param_reg = match p { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => continue, + }; + for instr in &arm { + let line = match instr.source_line { + Some(l) => l, + None => continue, + }; + if line >= first_read { + break; + } + for w in writes(&instr.op) { + assert_ne!( + w, param_reg, + "[{label}] AAPCS clobber: ARM instr at wasm line {line} writes \ + param reg {param_reg:?} before LocalGet({p}) at line {first_read}. \ + Op: {:?}", + instr.op + ); + } + } + } +} + +#[test] +fn i64_load8u_no_clobber_2_params() { + // i64.load8_u with 2 params: addr from I32Const so neither param is the + // load address. The dst pair must not be R0:R1 (params).
+ let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load8U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load8_u"); +} + +#[test] +fn i64_load16s_no_clobber_2_params() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load16S { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load16_s"); +} + +#[test] +fn i64_load32u_no_clobber_2_params() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load32U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load32_u"); +} From 1a7c5f96ee93ce0835cf1d5fea7e1296976c3e4d Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 13 May 2026 06:22:53 +0200 Subject: [PATCH 5/5] =?UTF-8?q?fix(opt):=20keep=20silent=20R0=20fallback?= =?UTF-8?q?=20in=20#108=20=E2=80=94=20defensive=20panic=20deferred=20to=20?= =?UTF-8?q?#101?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The systematic AAPCS-clobber audit (47 sites + 27 tests) is shippable on its own. The defensive panic in get_arm_reg fires on at least one remaining wasm_to_ir gap (vreg v13 during simple_add.wat fib compile, seen on Bazel CI for genrules test_add_elf / test_add_m7_elf / test_add_riscv_elf). That panic belongs in #101 once the remaining gap is closed. Cargo workspace tests + clippy + fmt clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-synthesis/src/optimizer_bridge.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 870de81..d7fd95f 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1572,6 +1572,11 @@ impl OptimizerBridge { // as their `rm_lo`/`rm_hi`, destroying the loop counter on real silicon. // A loud panic here is strictly better than a quiet miscompilation — // crash the compiler, not the firmware. + // Note: the silent R0 fallback is intentionally preserved here while the + // remaining latent unmapped-vreg cases are being hunted. PR #101 holds + // the defensive panic version of this helper; it will land once every + // wasm_to_ir gap is closed (one known v13 case during fib compilation + // remains to be tracked down). let get_arm_reg = |vreg: &OptReg, map: &HashMap, spills: &HashMap| -> Reg { if let Some(&r) = map.get(&vreg.0) { @@ -1580,12 +1585,7 @@ impl OptimizerBridge { // Will be reloaded into R12 by reload_spill Reg::R12 } else { - panic!( - "synth internal compiler error: vreg v{} has no assigned \ - ARM register and no spill slot. This is a wasm_to_ir bug — \ - likely a wasm op whose result is unmapped (see issue #93).", - vreg.0 - ); + Reg::R0 } };