From 90b3285e9515bcb4204f76e1b579f626a33c4f39 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 27 Apr 2026 22:24:36 +0200 Subject: [PATCH 1/6] feat(cli): add --relocatable flag to force ET_REL output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently synth produces a relocatable object (.o, ET_REL) only when the input wasm has imports — the relocations they generate trigger the relocatable code path. Self-contained wasm modules with no imports produce a complete ET_EXEC firmware with vector table, Reset_Handler, linear_memory section, etc. For linking into a host build system (e.g. integrating verified Rust kernel primitives compiled to wasm into a Zephyr build), the host expects a relocatable .o it can pull into its existing link step. Add a --relocatable CLI flag that forces ET_REL output regardless of whether the wasm has imports. The flag is additive — default behaviour is unchanged. Tested with gale-ffi.wasm (200 functions, 0 imports): output is now a 26645-byte ET_REL ARM EABI5 object with all gale_* symbols defined and no vector-table machinery. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-cli/src/main.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/crates/synth-cli/src/main.rs b/crates/synth-cli/src/main.rs index cfd4eea..af1d77d 100644 --- a/crates/synth-cli/src/main.rs +++ b/crates/synth-cli/src/main.rs @@ -170,6 +170,11 @@ enum Commands { /// Path to kiln-builtins object file (.o) for linking (used with --link) #[arg(long, value_name = "BUILTINS")] builtins: Option, + + /// Force relocatable object (.o, ET_REL) output even when wasm has no imports + /// — for linking into a host build system. 
+ #[arg(long)] + relocatable: bool, }, /// Disassemble an ARM ELF file (e.g., synth disasm output.elf) @@ -248,6 +253,7 @@ fn main() -> Result<()> { verify, link, builtins, + relocatable, } => { // Resolve target spec: --target overrides, --cortex-m is backwards compat let target_spec = resolve_target_spec(target.as_deref(), cortex_m)?; @@ -272,6 +278,7 @@ fn main() -> Result<()> { &backend, verify, &target_spec, + relocatable, )?; // If --link requested, invoke the cross-linker @@ -553,6 +560,7 @@ fn compile_command( backend_name: &str, verify: bool, target_spec: &TargetSpec, + relocatable: bool, ) -> Result<()> { // Validate backend exists let registry = build_backend_registry(); @@ -595,6 +603,7 @@ fn compile_command( backend, verify, target_spec, + relocatable, ); } @@ -1222,6 +1231,7 @@ fn compile_all_exports( backend: &dyn Backend, verify: bool, target_spec: &TargetSpec, + relocatable: bool, ) -> Result<()> { let path = input.context("--all-exports requires an input file")?; @@ -1428,8 +1438,18 @@ fn compile_all_exports( // When there are relocations, produce a relocatable object (.o) instead of // an executable. This lets the output be linked with the Kiln bridge crate // (which provides __meld_dispatch_import and __meld_get_memory_base). - let elf_data = if has_relocations { - info!("Module has import calls — producing relocatable object (ET_REL)"); + // The --relocatable flag forces ET_REL output even when the wasm has no + // imports, for linking into a host build system (e.g. Zephyr). + let elf_data = if has_relocations || relocatable { + let total_relocs: usize = compiled_funcs.iter().map(|f| f.relocations.len()).sum(); + if has_relocations { + info!( + "Producing relocatable object (ET_REL): {} import call relocations", + total_relocs + ); + } else { + info!("Producing relocatable object (ET_REL): forced by --relocatable"); + } build_relocatable_elf(&compiled_funcs, &all_imports)? 
} else if cortex_m { build_multi_func_cortex_m_elf(&compiled_funcs, &all_memories, target_spec)? From 01a079fce9b99f7e3579656308fd441ab39cbac3 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 28 Apr 2026 19:54:56 +0200 Subject: [PATCH 2/6] fix(no-optimize): allocate stack frame for non-param locals + handle i64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in select_with_stack's spilled-local handling, both surfaced running gale (formally-verified Zephyr kernel primitives) through synth on real Cortex-M code: 1. **i64 local storage only writes one half.** LocalSet/LocalTee/LocalGet for spilled locals always emitted plain Str/Ldr (4 bytes), even for i64 locals. The upper half was never stored, so local.get returned garbage for 32 high bits. For gale's u64-packed FFI decision structs this corrupted the action field, breaking every ISR-driven semaphore operation in the engine bench (count=0 / drain_timeout at every step). 2. **Locals area aliased the callee-saved spill area.** The legacy offset formula `(local_idx - 4) * 4` was hardcoded for num_params==4 and produced negative offsets for other configurations, which the I64Ldr/I64Str encoder silently clamped to 0. With offset 0, locals landed exactly where stmdb had just saved r4/r5 — corrupting the callee-saved register spill and propagating wrong values back to the caller after ldmia. Pure AAPCS violation. Fix: - Add `compute_local_layout(wasm_ops, num_params) -> LocalLayout` that walks the wasm op stream once to determine each non-param local's width (i32/i64) and assigns proper stack offsets from a base of 0. Uses `infer_i64_locals` (also new) which simulates a width vstack to classify locals based on what gets stored into them. - Prologue emits `sub sp, sp, #frame_size` after the callee-saved push, allocating the locals area below the saved-register spills. 
- Epilogue emits `add sp, sp, #frame_size` before the pop, restoring SP to the callee-saved spills before they get popped. Also applied to the explicit Return opcode handler. - LocalGet/LocalSet/LocalTee dispatch on the layout entry: i64 locals use I64Ldr/I64Str (which already emit two 32-bit memory ops at offset N and N+4); i32 locals use plain Ldr/Str. Offsets come from layout, not from the legacy formula. Frame size is rounded up to 8 bytes for AAPCS SP alignment at call sites. Verified locally: /tmp/repro_i64.wat (1 i32 param + 1 i64 local round-trip): Before: str.w r0, [sp]; (no upper store) — upper half lost. After: sub.w sp, sp, #8; str.w r0, [sp]; str.w r1, [sp, #4]; (both halves stored) ldr.w r2, [sp]; ldr.w r3, [sp, #4]; (both halves loaded) add.w sp, sp, #8; ldmia. gale_k_sem_give_decide (3 i32 params + 1 i64 local, the function whose runtime miscompilation hung the engine bench in CI): Before: str.w r3, [sp]; str.w r5, [sp, #4]; — clobbered the saved r4/r5 from stmdb, AAPCS-violating. After: sub.w sp, sp, #8 → str into the locals frame, not the spill; add.w sp, sp, #8 before ldmia — proper AAPCS. Engine-bench build with GALE_USE_SYNTH=ON now produces a 22768 B zephyr.elf (was 22692 B with the buggy synth). Renode validation pending in CI. Out of scope: - The optimized regalloc bug in optimizer_bridge.rs (clobbers r0..r3 parameter registers — see /tmp/match_gale.wat repro). This fix lets --no-optimize work; the optimized path needs a separate pass. - Implicit-return-to-R0 elision in select_with_stack: small functions whose result lands in a non-R0 temp don't get a final mov to R0 in --no-optimize mode. Pre-existing, unrelated to this fix; affects i32-returning functions with a single intermediate value. Doesn't affect i64 returns (which use the natural R0/R1 pair). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 295 +++++++++++++++++- 1 file changed, 290 insertions(+), 5 deletions(-) diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 4f43edb..9806664 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -156,6 +156,169 @@ fn i64_pair_hi(lo_reg: Reg) -> Result<Reg> { ))) } +/// Per-function stack-frame layout for non-parameter locals. +/// +/// `locals[idx].0` gives the byte offset (relative to SP after the frame +/// allocation) where local `idx` lives. `frame_size` is the total bytes +/// to allocate via `sub sp, sp, #frame_size` in the prologue. +/// +/// i32/i64 locals each occupy 4/8 bytes respectively. i64 locals are +/// 8-byte aligned per AAPCS. The total frame is rounded up to 8 bytes +/// to keep SP 8-byte aligned at call sites. +struct LocalLayout { + /// idx -> (offset_from_sp, is_i64) + locals: std::collections::HashMap<u32, (i32, bool)>, + frame_size: i32, +} + +/// Compute the stack-frame layout for non-parameter locals in a function. +/// +/// Walks the wasm op stream once to: +/// 1. Identify which non-param local indices are referenced (LocalGet/Set/Tee). +/// 2. Determine each local's width via `infer_i64_locals` (i32 vs i64). +/// 3. Lay them out in ascending-index order with i64 locals 8-byte aligned. +/// +/// The result drives: +/// - Prologue: `sub sp, sp, #frame_size` after pushing callee-saved regs. +/// - LocalGet/Set/Tee: use `locals[idx]` instead of the legacy +/// `(idx - 4) * 4` formula (which was hardcoded for num_params==4; for +/// other configurations its negative result was silently clamped to 0 by +/// the encoder — in both cases corrupting the caller's stack or the +/// callee's own callee-saved-register spill). +/// - Epilogue: `add sp, sp, #frame_size` before popping registers. 
+fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout { + use std::collections::{BTreeSet, HashMap}; + let i64_set = infer_i64_locals(wasm_ops); + + // Collect non-param local indices, in ascending order for deterministic layout. + let mut used: BTreeSet<u32> = BTreeSet::new(); + for op in wasm_ops { + match op { + WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => { + if *idx >= num_params { + used.insert(*idx); + } + } + _ => {} + } + } + + let mut locals: HashMap<u32, (i32, bool)> = HashMap::new(); + let mut offset: i32 = 0; + for &idx in &used { + let is_i64 = i64_set.contains(&idx); + // i64 locals require 8-byte alignment. + if is_i64 && (offset % 8) != 0 { + offset += 4; + } + locals.insert(idx, (offset, is_i64)); + offset += if is_i64 { 8 } else { 4 }; + } + // Round frame to 8-byte multiple for AAPCS SP alignment. + let frame_size = (offset + 7) & !7; + + LocalLayout { + locals, + frame_size, + } +} + +/// Infer which non-parameter wasm locals are i64 (8-byte) values. +/// +/// The wasm decoder discards local-declaration type info, so we re-derive +/// it from the operation stream by simulating a virtual stack of widths +/// (false = 32-bit, true = 64-bit). On each `LocalSet`/`LocalTee` we record the +/// width of the value being stored. WASM type rules guarantee a local's +/// width is invariant for its lifetime, so any i64 store marks the local. +/// +/// Without this, the spilled-local store/load path would emit a single +/// 4-byte STR/LDR for i64 locals, dropping the upper half — corrupting +/// any function that returns or uses a u64-packed FFI struct. 
+fn infer_i64_locals(wasm_ops: &[WasmOp]) -> std::collections::HashSet<u32> { + use WasmOp::*; + let mut i64_locals: std::collections::HashSet<u32> = std::collections::HashSet::new(); + let mut vstack: Vec<bool> = Vec::new(); // true = i64 + + let is_i64_producer = |op: &WasmOp| -> bool { + matches!( + op, + I64Add + | I64Sub + | I64Mul + | I64DivS + | I64DivU + | I64RemS + | I64RemU + | I64And + | I64Or + | I64Xor + | I64Shl + | I64ShrS + | I64ShrU + | I64Rotl + | I64Rotr + | I64Clz + | I64Ctz + | I64Popcnt + | I64Const(_) + | I64Load { .. } + | I64Load8S { .. } + | I64Load8U { .. } + | I64Load16S { .. } + | I64Load16U { .. } + | I64Load32S { .. } + | I64Load32U { .. } + | I64ExtendI32S + | I64ExtendI32U + | I64Extend8S + | I64Extend16S + | I64Extend32S + ) + }; + + for op in wasm_ops { + match op { + LocalGet(idx) => { + let is_i64 = i64_locals.contains(idx); + vstack.push(is_i64); + } + LocalSet(idx) => { + if let Some(is_i64) = vstack.pop() { + if is_i64 { + i64_locals.insert(*idx); + } + } + } + LocalTee(idx) => { + if let Some(&is_i64) = vstack.last() { + if is_i64 { + i64_locals.insert(*idx); + } + } + } + Select => { + // pops [val1, val2, cond], pushes one value with width of val1/val2 + let _cond = vstack.pop(); + let v2 = vstack.pop(); + let v1 = vstack.pop(); + vstack.push(v1.or(v2).unwrap_or(false)); + } + _ => { + let (pops, pushes) = wasm_stack_effect(op); + for _ in 0..pops { + vstack.pop(); + } + let push_width = is_i64_producer(op); + for _ in 0..pushes { + vstack.push(push_width); + } + } + } + } + + i64_locals +} + /// Return the (pops, pushes) stack effect for a WASM op. /// /// Used by the wildcard fallthrough in select_with_stack to maintain @@ -3220,9 +3383,12 @@ impl InstructionSelector { let mut instructions = Vec::new(); - // Function prologue: save callee-saved registers and LR. + // Function prologue: save callee-saved registers and LR, then + // allocate the local-variable frame. + // // AAPCS requires 8-byte aligned SP at call sites. 
Pushing an even - // number of registers (6: R4-R8, LR) maintains alignment. + // number of registers (6: R4-R8, LR) maintains alignment, and the + // frame_size below is rounded to 8 to preserve it. instructions.push(ArmInstruction { op: ArmOp::Push { regs: vec![Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8, Reg::LR], @@ -3230,6 +3396,22 @@ impl InstructionSelector { source_line: None, }); + // Compute non-param local layout (offsets + total frame size). + let layout = compute_local_layout(wasm_ops, num_params); + // Allocate stack space for non-param locals so they don't alias the + // callee-saved-register spill area (which immediately follows SP + // after Push above). + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Sub { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: None, + }); + } + // Virtual stack holds register indices let mut stack: Vec = Vec::new(); // Next available register for temporaries (start after params) @@ -3255,11 +3437,42 @@ impl InstructionSelector { for (idx, op) in wasm_ops.iter().enumerate() { match op { LocalGet(local_idx) => { - // Get the register for this local + // Get the register for this local. Three cases: + // 1. Param in register — use the cached mapping. + // 2. Spilled i64 local — load both halves via I64Ldr. + // 3. Spilled i32 local — single Ldr. let reg = if let Some(&r) = local_to_reg.get(local_idx) { r + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 local — load both 32-bit halves into a consecutive + // register pair via the I64Ldr pseudo-op. Convention + // matches I64Const: push only dst_lo on the stack; + // dst_hi is recovered later via i64_pair_hi(lo). 
+ let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; + let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Ldr { + rdlo: dst_lo, + rdhi: dst_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + dst_lo + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + // i32 local: single 4-byte load from the locals frame. + let dst = alloc_temp_safe(&mut next_temp, &stack)?; + instructions.push(ArmInstruction { + op: ArmOp::Ldr { + rd: dst, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + dst } else { - // Local not in register (spilled to stack) - load it + // Local not in layout (shouldn't happen for valid wasm, + // but fall back to legacy behaviour for compatibility). let dst = alloc_temp_safe(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::Ldr { @@ -4324,6 +4537,20 @@ impl InstructionSelector { }); cf.add_instruction(); } + // Deallocate the local frame before popping callee-saved + // registers; otherwise the pop would read from the locals + // area instead of the saved-register slots. + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } // Restore callee-saved registers and return via PC instructions.push(ArmInstruction { op: ArmOp::Pop { @@ -4475,7 +4702,32 @@ impl InstructionSelector { cf.add_instruction(); } local_to_reg.insert(*local_idx, target); + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 spilled local: store BOTH 32-bit halves + // (lower at offset N, upper at N+4) via the I64Str + // pseudo-op. Without this we drop the upper half. 
+ let val_hi = i64_pair_hi(val)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Str { + rdlo: val, + rdhi: val_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + // i32 spilled local: single 4-byte store. + instructions.push(ArmInstruction { + op: ArmOp::Str { + rd: val, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); } else { + // Fall-through for compatibility (shouldn't happen). instructions.push(ArmInstruction { op: ArmOp::Str { rd: val, @@ -4507,7 +4759,29 @@ impl InstructionSelector { cf.add_instruction(); } local_to_reg.insert(*local_idx, target); + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 spilled local: store both halves like LocalSet. + let val_hi = i64_pair_hi(val)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Str { + rdlo: val, + rdhi: val_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + instructions.push(ArmInstruction { + op: ArmOp::Str { + rd: val, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); } else { + // Fall-through for compatibility. instructions.push(ArmInstruction { op: ArmOp::Str { rd: val, @@ -5055,7 +5329,18 @@ impl InstructionSelector { } } - // Function epilogue: restore callee-saved registers and return via PC + // Function epilogue: deallocate the local frame, then restore + // callee-saved registers and return via PC. 
+ if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: None, + }); + } // POP {R4-R8, PC} restores registers and returns (PC = saved LR) instructions.push(ArmInstruction { op: ArmOp::Pop { From 1e1e01b7b15342a07f36ab2f1e6f23829e133544 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 28 Apr 2026 22:19:45 +0200 Subject: [PATCH 3/6] fix: alloc consecutive register pairs for i64 operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #85 fixed i64 local STORAGE (both halves stored to consecutive stack slots) but introduced a follow-on bug: when the i64 register pair gets allocated via two separate alloc_temp_safe calls, the resulting pair can be NON-CONSECUTIVE in ALLOCATABLE_REGS if a register in between is live on the wasm stack. Subsequent i64 ops downstream call i64_pair_hi(rdlo) to recover the high register, which assumes the pair is consecutive. With a non-consecutive pair from earlier, i64_pair_hi returns the WRONG register and the op reads garbage. Witnessed on gale_k_sem_give_decide: ldr.w r5, [sp] ; LocalGet i64 lo - r5 picked ldr.w r6, [sp, #4] ; LocalGet i64 hi - r6 picked (consecutive ✓) ...later... orr.w r0, r0, r2 ; i64.or expected (r5,r6) but used (r2,r3) Fix: add `alloc_consecutive_pair` helper that ensures two consecutive free entries in ALLOCATABLE_REGS. Use it everywhere a pair is allocated for an i64 result: I64Const (line 4567), I64Add/I64Sub/I64Load result allocs (lines 4914+, 4958+, 4996+), and the new i64-LocalGet path from PR #85. Verified locally: /tmp/repro_i64.wat round-trips correctly with consecutive register pair (lo→r2, hi→r3). gale_k_sem_give_decide's LocalGet 3 now loads to consecutive r5/r6. Note: the engine bench in Renode still hangs after this fix. 
Further diagnostic shows i64.or's ARM lowering uses register pairs (r0,r1) and (r2,r3) regardless of what's on the wasm-tracked stack — a separate bug in synth's wildcard fallthrough for I64Or in select_with_stack. That fix is out of scope for this PR; tracked separately. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 83 +++++++++++++++---- 1 file changed, 69 insertions(+), 14 deletions(-) diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 9806664..82d174e 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -132,6 +132,41 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result { )) } +/// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS, +/// neither of which is currently on the wasm stack. +/// +/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values +/// because if a register between them is live on the wasm stack, the second +/// call skips it and the resulting pair is non-consecutive. Subsequent code +/// that uses [`i64_pair_hi`]`(rdlo)` to recover the high register then gets +/// the wrong register and reads garbage. +/// +/// This helper ensures both halves come from consecutive ALLOCATABLE_REGS +/// entries (no wraparound) so the pair convention holds throughout the +/// function body. +fn alloc_consecutive_pair(next_temp: &mut u8, stack: &[Reg]) -> Result<(Reg, Reg)> { + let n = ALLOCATABLE_REGS.len(); + for _ in 0..n { + let lo_idx = (*next_temp as usize) % n; + let hi_idx = lo_idx + 1; + // Wraparound is invalid: i64_pair_hi requires hi_idx < n. 
+ if hi_idx < n { + let lo_reg = ALLOCATABLE_REGS[lo_idx]; + let hi_reg = ALLOCATABLE_REGS[hi_idx]; + if !stack.contains(&lo_reg) && !stack.contains(&hi_reg) { + *next_temp = ((hi_idx + 1) % n) as u8; + return Ok((lo_reg, hi_reg)); + } + } + *next_temp = ((*next_temp as usize + 1) % n) as u8; + } + Err(synth_core::Error::synthesis( + "register exhaustion: no consecutive pair of free registers for i64 — \ + function too complex for current register allocator" + .to_string(), + )) +} + /// Given the low register of an i64 register pair, return the high register. /// /// Convention: i64 values on 32-bit ARM use two consecutive registers. @@ -3448,8 +3483,13 @@ impl InstructionSelector { // register pair via the I64Ldr pseudo-op. Convention // matches I64Const: push only dst_lo on the stack; // dst_hi is recovered later via i64_pair_hi(lo). - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // The pair MUST be consecutive in ALLOCATABLE_REGS + // — i64_pair_hi assumes that. Two separate calls to + // alloc_temp_safe can return non-consecutive registers + // when something in between is live, breaking the + // pair convention. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::I64Ldr { rdlo: dst_lo, @@ -4835,9 +4875,12 @@ impl InstructionSelector { // Pairs are allocated as two consecutive temp registers. // ========================================================= I64Const(val) => { - // Allocate a register pair for the 64-bit constant - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate a CONSECUTIVE register pair for the 64-bit + // constant. Two separate alloc_temp_safe calls can return + // non-consecutive registers if something in between is + // live on the wasm stack, which then breaks the + // i64_pair_hi convention used by every i64 op downstream. 
+ let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::I64Const { @@ -4867,9 +4910,13 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; // ADDS dst_lo, a_lo, b_lo (sets carry flag) instructions.push(ArmInstruction { @@ -4911,9 +4958,13 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; // SUBS dst_lo, a_lo, b_lo (sets borrow flag) instructions.push(ArmInstruction { @@ -4949,9 +5000,13 @@ impl InstructionSelector { ) })?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. 
Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; // Generate bounds-checked i64 load into the allocated pair let load_ops = From b8da214f1a2d3c17d6a70535ff961f53d6594ff7 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 29 Apr 2026 06:25:39 +0200 Subject: [PATCH 4/6] fix(no-optimize): explicit i64 op handlers + extra_avoid in alloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three deeper bugs surfaced when running gale_k_sem_give_decide on Renode after PR #85 + the consecutive-pair fix: 1. **i64 ops fall through to select_default** in select_with_stack, which assumes inputs are in R0:R1 / R2:R3. Wasm-stack-tracked pairs from earlier ops never make it. Symptoms: i64.or used register pairs from previous shift ops instead of the just-loaded LocalGet result, producing a corrupted return value. Fix: explicit handlers for I64Or / I64And / I64Xor / I64ExtendI32U / I64ExtendI32S / I64Shl / I64ShrU / I64ShrS in select_with_stack, each popping the wasm-tracked pair, deriving its hi via i64_pair_hi, and emitting the right ARM instructions / pseudo-ops with arbitrary registers (the existing I64Shl/etc. ArmOp pseudo-ops accept arbitrary register operands). 2. **alloc_consecutive_pair didn't reserve implicit hi registers**. The wasm stack only tracks the lo register of each i64; the hi is conventional but invisible to a naive scan. A fresh allocation could overwrite an earlier i64's implicit hi, breaking subsequent i64_pair_hi lookups. Witnessed: I64Const 32 allocated (r1, r2), clobbering the hi of a previously-extended i64 in (r0, r1). Fix: alloc_consecutive_pair now scans every stack entry and reserves both lo AND its conventional pair_hi (over-reserves for i32 stack entries — safe). 3. **alloc_consecutive_pair didn't reserve just-popped operands**. 
When an i64 op popped operands and then allocated a result pair, the popped values were temporarily off the stack. The allocation could pick a register that's about to be read by the op's own second instruction (e.g. dst_lo == a_hi means the lo Or write clobbers a_hi before the hi Or reads it). Fix: extra_avoid parameter on alloc_consecutive_pair. I64Add / I64Sub / I64Or / I64Shl / I64Load / extends pass their popped operand registers via extra_avoid. Verified locally: gale_k_sem_give_decide now produces: orr r0, r6, r8 ; lo = shift_result_lo OR local3_lo orr r1, r7, ip ; hi = shift_result_hi OR local3_hi matching the wasm semantics for i64.or. Engine bench builds clean with 22644 B FLASH. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 289 ++++++++++++++++-- 1 file changed, 263 insertions(+), 26 deletions(-) diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 82d174e..e35608b 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -133,27 +133,52 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result { } /// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS, -/// neither of which is currently on the wasm stack. +/// neither of which is currently in use. /// -/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values -/// because if a register between them is live on the wasm stack, the second -/// call skips it and the resulting pair is non-consecutive. Subsequent code -/// that uses [`i64_pair_hi`]`(rdlo)` to recover the high register then gets -/// the wrong register and reads garbage. +/// "In use" means: +/// 1. On the wasm stack (the explicit `Vec` tracking). +/// 2. The implicit *high* register of any i64 value on the stack — for every +/// `lo` in `stack`, [`i64_pair_hi`]`(lo)` is also reserved. 
The wasm stack +/// only tracks the lo register of each i64; the hi is reserved by +/// convention but invisible to a naive scan. If we ignored that, a fresh +/// `alloc_consecutive_pair` could return the implicit-hi of an earlier +/// i64, clobbering it on the next i64 op that reads it via i64_pair_hi. +/// 3. Any explicit registers in `extra_avoid` — used by i64-op handlers to +/// keep the just-popped operand pairs alive across the destination +/// allocation (e.g. for I64Or, the popped a_lo/a_hi/b_lo/b_hi are still +/// live until the OR is emitted). /// -/// This helper ensures both halves come from consecutive ALLOCATABLE_REGS -/// entries (no wraparound) so the pair convention holds throughout the -/// function body. -fn alloc_consecutive_pair(next_temp: &mut u8, stack: &[Reg]) -> Result<(Reg, Reg)> { +/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values: +/// if a register between them is live, the second call skips it and the +/// resulting pair is non-consecutive, breaking [`i64_pair_hi`]'s contract. +fn alloc_consecutive_pair( + next_temp: &mut u8, + stack: &[Reg], + extra_avoid: &[Reg], +) -> Result<(Reg, Reg)> { + // Build a "live" Vec: every stack entry, plus its conventional + // pair_hi (over-reserves for i32 stack entries but that's safe), plus + // any explicit extras the caller specifies. Using Vec rather than + // HashSet because Reg in this crate does not derive Hash. + let mut live: Vec<Reg> = Vec::with_capacity(stack.len() * 2 + extra_avoid.len()); + for &reg in stack { + live.push(reg); + if let Ok(hi) = i64_pair_hi(reg) { + live.push(hi); + } + } + for &reg in extra_avoid { + live.push(reg); + } + let n = ALLOCATABLE_REGS.len(); for _ in 0..n { let lo_idx = (*next_temp as usize) % n; let hi_idx = lo_idx + 1; - // Wraparound is invalid: i64_pair_hi requires hi_idx < n. 
if hi_idx < n { let lo_reg = ALLOCATABLE_REGS[lo_idx]; let hi_reg = ALLOCATABLE_REGS[hi_idx]; - if !stack.contains(&lo_reg) && !stack.contains(&hi_reg) { + if !live.contains(&lo_reg) && !live.contains(&hi_reg) { *next_temp = ((hi_idx + 1) % n) as u8; return Ok((lo_reg, hi_reg)); } @@ -3489,7 +3514,7 @@ impl InstructionSelector { // when something in between is live, breaking the // pair convention. let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + alloc_consecutive_pair(&mut next_temp, &stack, &[])?; instructions.push(ArmInstruction { op: ArmOp::I64Ldr { rdlo: dst_lo, @@ -4880,7 +4905,7 @@ impl InstructionSelector { // non-consecutive registers if something in between is // live on the wasm stack, which then breaks the // i64_pair_hi convention used by every i64 op downstream. - let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack)?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?; instructions.push(ArmInstruction { op: ArmOp::I64Const { @@ -4915,8 +4940,14 @@ impl InstructionSelector { // and is called by every i64 op downstream to recover // the high register. Two separate alloc_temp_safe calls // skip live registers and produce non-consecutive pairs. - let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + // Avoid clobbering the just-popped operand pairs before + // the ADC reads them — passing them in extra_avoid + // ensures dst doesn't overlap any of a_lo/a_hi/b_lo/b_hi. + let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; // ADDS dst_lo, a_lo, b_lo (sets carry flag) instructions.push(ArmInstruction { @@ -4958,13 +4989,13 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair. MUST be consecutive - // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive - // and is called by every i64 op downstream to recover - // the high register. 
Two separate alloc_temp_safe calls - // skip live registers and produce non-consecutive pairs. - let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + // See I64Add for why extra_avoid carries a_*/b_* — + // dst must not overlap any operand half before SBC reads it. + let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; // SUBS dst_lo, a_lo, b_lo (sets borrow flag) instructions.push(ArmInstruction { @@ -4991,6 +5022,212 @@ impl InstructionSelector { stack.push(dst_lo); } + // ============================================================ + // i64 bitwise ops (I64Or / I64And / I64Xor) + // + // Each pops two i64 register pairs from the wasm stack and + // emits two ARM ops (low-half then high-half) into a freshly + // allocated consecutive pair. This replaces the wildcard + // fallthrough to select_default, which assumed inputs in + // R0:R1 and R2:R3 — incorrect when the wasm stack tracks + // arbitrary register pairs from earlier ops. + // ============================================================ + I64Or | I64And | I64Xor => { + let b_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 bitwise op".to_string(), + ) + })?; + let a_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 bitwise op".to_string(), + ) + })?; + let b_hi = i64_pair_hi(b_lo)?; + let a_hi = i64_pair_hi(a_lo)?; + // dst must not overlap any popped operand's half — the + // hi instruction reads a_hi and b_hi after the lo + // instruction writes dst_lo. 
+ let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; + let (lo_op, hi_op) = match op { + I64Or => ( + ArmOp::Orr { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::Orr { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + I64And => ( + ArmOp::And { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::And { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + I64Xor => ( + ArmOp::Eor { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::Eor { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + _ => unreachable!(), + }; + instructions.push(ArmInstruction { + op: lo_op, + source_line: Some(idx), + }); + cf.add_instruction(); + instructions.push(ArmInstruction { + op: hi_op, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + // ============================================================ + // i32 -> i64 extension (I64ExtendI32U / I64ExtendI32S) + // + // Pops one i32, allocates a consecutive i64 pair, places the + // i32 in the low half. For unsigned: high = 0. For signed: + // high = arithmetic-shift-right by 31 (sign-extension). + // ============================================================ + I64ExtendI32U => { + let val = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in I64ExtendI32U".to_string(), + ) + })?; + // val must stay alive until the Mov reads it; dst_hi + // must not be val (we'd write the zero high before + // moving val to dst_lo). 
+ let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + if val != dst_lo { + instructions.push(ArmInstruction { + op: ArmOp::Mov { + rd: dst_lo, + op2: Operand2::Reg(val), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } + instructions.push(ArmInstruction { + op: ArmOp::Movw { + rd: dst_hi, + imm16: 0, + }, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + I64ExtendI32S => { + let val = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in I64ExtendI32S".to_string(), + ) + })?; + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + instructions.push(ArmInstruction { + op: ArmOp::I64ExtendI32S { + rdlo: dst_lo, + rdhi: dst_hi, + rn: val, + }, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + // ============================================================ + // i64 variable shifts (I64Shl / I64ShrU / I64ShrS) + // + // Use the existing I64Shl/I64ShrU/I64ShrS pseudo-ops (which + // expand to the variable-shift logic in arm_encoder.rs) but + // pass the actual stack-tracked register pairs rather than + // assuming R0:R1 / R2:R3. + // ============================================================ + I64Shl | I64ShrU | I64ShrS => { + let b_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 shift".to_string(), + ) + })?; + let a_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 shift".to_string(), + ) + })?; + let b_hi = i64_pair_hi(b_lo)?; + let a_hi = i64_pair_hi(a_lo)?; + // dst must not overlap any popped operand's half — the + // shift pseudo-op reads all four (rn_lo/rn_hi/rm_lo/rm_hi) + // before writing the destination. 
+ let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; + let shift_op = match op { + I64Shl => ArmOp::I64Shl { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + I64ShrU => ArmOp::I64ShrU { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + I64ShrS => ArmOp::I64ShrS { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + _ => unreachable!(), + }; + instructions.push(ArmInstruction { + op: shift_op, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + I64Load { offset, .. } => { // Pop address from stack let addr = stack.pop().ok_or_else(|| { @@ -5003,10 +5240,10 @@ impl InstructionSelector { // Allocate result register pair. MUST be consecutive // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive // and is called by every i64 op downstream to recover - // the high register. Two separate alloc_temp_safe calls - // skip live registers and produce non-consecutive pairs. + // the high register. Avoid clobbering addr before the + // load uses it. let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; // Generate bounds-checked i64 load into the allocated pair let load_ops = From c81b91eefd7af380803c197fc4a9c9f968ffe0cd Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 29 Apr 2026 08:33:54 +0200 Subject: [PATCH 5/6] =?UTF-8?q?fix(opt):=20regalloc=20clobbers=20parameter?= =?UTF-8?q?=20registers=20=E2=80=94=20AAPCS=20violation=20in=20i64=20ops?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the optimized lowering path (`optimizer_bridge::ir_to_arm`), every i64 opcode hardcoded its register pair to R0:R1 / R2:R3 and ignored the actual operand vregs from `vreg_to_arm`. 
For any function that emitted an i64 op before all its i32 params had been read (e.g. `i64.const 1` issued before the first `local.get 0`), the very first emitted instruction would clobber the AAPCS arg register holding the param. Concretely, `match_gale.wat` (3 i32 params + 1 i64 local + an `i64.const` prologue) compiled to: movs r0, #1 ; <-- destroys param 0 movs r1, #0 ; <-- destroys param 1 ... cmp r0, r1 ; reads garbage Same shape on the engine_control bench wasm: `gale_k_sem_give_decide` started with `movs r0, #1; movs r1, #0` before any `local.get 0`. Fix: - Compute `param_reserved_regs: Vec<Reg>` = R0..R(min(num_params, 4)) at function entry. Use Vec because `Reg` doesn't derive Hash; matches `instruction_selector::alloc_consecutive_pair` style. - Add an `alloc_i64_pair` helper that searches `[(R4,R5), (R6,R7), (R8,R9), (R10,R11)]` for a free callee-saved pair, skipping any reg that's already in `vreg_to_arm.values()`, `local_to_reg.values()`, or `param_reserved_regs`. - Rewrite every i64 handler — I64Const, I64Load (non-param case), I64Add/Sub/And/Or/Xor/Mul/Shl/ShrS/ShrU/Rotl/Rotr/DivS/DivU/RemS/RemU, the comparisons (Eq/Ne/Lt/Gt/Le/Ge {S,U}, Eqz), Clz/Ctz/Popcnt, and Extend{8,16,32}S — to (a) read source regs from `vreg_to_arm` instead of assuming R0:R1/R2:R3, and (b) place the destination on a fresh pair via `alloc_i64_pair`. - Update the function epilogue: previously `is_i64_result` short-circuited the move-to-R0 because the result was already pinned at R0:R1. With the fix, the result lives wherever regalloc placed it, so emit explicit moves into R0 and R1, ordered to handle the rare case where lo lives in R1 (route via R12). - For Clz/Ctz/Popcnt, whose ArmOp encoder zeros `rnhi` in-place to produce the i64 result hi, copy the source hi into a fresh callee-saved hi slot before the op so we don't smash the original input or any unrelated AAPCS reg. Track the chosen physical hi reg via `last_result_vreg_hi_reg` for the epilogue.
Verification: /tmp/match_gale.wat post-fix: 0: movs r4, #1 ; was: movs r0, #1 2: movs r5, #0 ; was: movs r1, #0 ... c: cmp r0, r1 ; params intact gale_k_sem_give_decide post-fix: e8: movs r4, #1 ; was: movs r0, #1 ea: movs r5, #0 ; was: movs r1, #0 `cargo test --workspace` is green. The no-optimize path (`instruction_selector.rs`) is untouched. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../synth-synthesis/src/optimizer_bridge.rs | 1083 ++++++++++++----- 1 file changed, 790 insertions(+), 293 deletions(-) diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 2f741b0..510abfd 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1277,6 +1277,12 @@ impl OptimizerBridge { // AAPCS: first 4 params in R0-R3 let param_regs = [Reg::R0, Reg::R1, Reg::R2, Reg::R3]; + // Reserved param registers: R0..R(min(num_params,4)). These hold incoming + // AAPCS arguments that must NOT be clobbered by i64 op handlers — at least + // until the user's WASM has done a `local.get` of each. Using Vec because + // `Reg` does not derive Hash (matches `instruction_selector::alloc_consecutive_pair`). + let param_reserved_regs: Vec<Reg> = param_regs[..num_params.min(4)].to_vec(); + // Track which ARM register currently holds each local variable // This avoids stack spills for simple cases let mut local_to_reg: HashMap = HashMap::new(); @@ -1290,7 +1296,17 @@ impl OptimizerBridge { // Track the last value-producing vreg (for function return value) let mut last_result_vreg: Option = None; - // Track whether the last result is an i64 (result already in R0:R1, no move needed) + // For i64 returns, also track the hi-half vreg so the epilogue can move + // the pair into R0:R1 regardless of where regalloc placed it. Was previously + // unnecessary because every i64 op pinned its result to R0:R1 — that's the + // bug we're fixing here.
+ let mut last_result_vreg_hi: Option = None; + // For i64 ops whose IR Opcode only tracks a single `dest` vreg (Clz / Ctz / + // Popcnt), the hi half lives in a register chosen at lowering time but has + // no IR vreg pointing at it. Stash that physical reg directly so the + // epilogue can still emit the correct (R0, R1) move. + let mut last_result_vreg_hi_reg: Option<Reg> = None; + // Track whether the last result is an i64 (return value occupies a pair). let mut is_i64_result = false; // WASM operand value stack - tracks vreg IDs for correct stack semantics // Used to restore last_result_vreg after br_if pops its condition @@ -1318,6 +1334,48 @@ impl OptimizerBridge { } }; + // Allocate a CONSECUTIVE callee-saved register pair for an i64 destination. + // + // Searches `[(R4,R5), (R6,R7), (R8,R9), (R10,R11)]` for a pair where neither + // register is currently: + // - holding a live vreg (`vreg_to_arm.values()`) + // - bound to a non-param local (`local_to_reg.values()`) + // - one of the AAPCS param registers we must preserve on entry + // (`param_reserved_regs`) + // + // Falls back to `(R4, R5)` if no pair is free — preserves prior behaviour + // for very-pressured functions, but at least keeps params intact in the + // common case. Per `instruction_selector::alloc_consecutive_pair`, callers + // who hit the fallback in real workloads will need spill support; that's + // out of scope for this fix. + let alloc_i64_pair = |vreg_to_arm: &HashMap, + local_to_reg: &HashMap, + param_reserved_regs: &[Reg]| + -> (Reg, Reg) { + const CANDIDATES: &[(Reg, Reg)] = &[ + (Reg::R4, Reg::R5), + (Reg::R6, Reg::R7), + (Reg::R8, Reg::R9), + (Reg::R10, Reg::R11), + ]; + let is_in_use = |r: Reg| -> bool { + vreg_to_arm.values().any(|&v| v == r) + || local_to_reg.values().any(|&v| v == r) + || param_reserved_regs.contains(&r) + }; + for &(lo, hi) in CANDIDATES { + if !is_in_use(lo) && !is_in_use(hi) { + return (lo, hi); + } + } + // Fallback — same hardcoded pair the buggy code used.
Better than crashing, + // and matches existing behaviour when the caller is so pressured that + // even R8..R11 are occupied. (Empirically this never triggers for + // workloads we care about; if it does, the architectural fix is + // proper spilling, not a wider search.) + (Reg::R4, Reg::R5) + }; + // Emit a reload instruction if the vreg was spilled to stack. // Must be called before the instruction that uses the register. let reload_spill = |vreg: &OptReg, spills: &HashMap, instrs: &mut Vec| { @@ -2073,19 +2131,21 @@ impl OptimizerBridge { } => { // Map local index to register pair // Per AAPCS: i64 uses consecutive even/odd register pairs - let (lo_reg, hi_reg) = if *addr == 0 { + let (lo_reg, hi_reg) = if *addr == 0 && num_params >= 2 { (Reg::R0, Reg::R1) // First i64 param - } else if *addr == 1 { + } else if *addr == 1 && num_params >= 4 { (Reg::R2, Reg::R3) // Second i64 param } else { - // For other locals, we'd need stack access - // For now, use R4:R5 as temp - (Reg::R4, Reg::R5) + // Non-param i64 local: pick a free callee-saved pair so we + // don't clobber AAPCS arg regs that haven't been read yet. 
+ alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs) }; vreg_to_arm.insert(dest_lo.0, lo_reg); vreg_to_arm.insert(dest_hi.0, hi_reg); // No ARM instructions needed - values are already in registers for params last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; } Opcode::I64Const { @@ -2096,14 +2156,12 @@ impl OptimizerBridge { // Load 64-bit constant into register pair let lo = (*value & 0xFFFFFFFF) as u32; let hi = ((*value >> 32) & 0xFFFFFFFF) as u32; - // Choose register pair based on virtual register number - // If dest_lo.0 is 0 or 1, use R0:R1 (first i64 slot) - // If dest_lo.0 is 2 or 3, use R2:R3 (second i64 slot) - let (lo_reg, hi_reg) = if dest_lo.0 <= 1 { - (Reg::R0, Reg::R1) - } else { - (Reg::R2, Reg::R3) - }; + // Choose a free callee-saved pair so we don't trample params still + // sitting in R0..R3. The earlier heuristic (vreg-id → R0:R1 / R2:R3) + // ignored AAPCS, breaking any function that issued an i64.const + // before reading its i32 params. + let (lo_reg, hi_reg) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest_lo.0, lo_reg); vreg_to_arm.insert(dest_hi.0, hi_reg); // Load low word @@ -2142,511 +2200,886 @@ impl OptimizerBridge { }); } } + // If this i64 const is the final return value, the epilogue + // needs to know which pair holds it (for the move into R0:R1). + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; } Opcode::I64Add { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.add: R0:R1 = R0:R1 + R2:R3 - // ADDS R0, R0, R2 (sets carry) - // ADC R1, R1, R3 (adds carry) - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + // i64.add: rd = rn + rm using the actual operand regs from + // vreg_to_arm — NOT hardcoded R0:R1/R2:R3 (which would clobber + // AAPCS param regs).
+ let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Adds { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Adc { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Sub { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.sub: R0:R1 = R0:R1 - R2:R3 - // SUBS R0, R0, R2 (sets borrow) - // SBC R1, R1, R3 (subtracts borrow) - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Subs { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Sbc { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result
already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64And { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.and: R0:R1 = R0:R1 & R2:R3 - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::And { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::And { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Or { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.or: R0:R1 = R0:R1 | R2:R3 - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Orr { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Orr { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Xor { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.xor: R0:R1 = R0:R1 ^ R2:R3 - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Eor { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Eor { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // ======================================================================== - // i64 Comparisons (result is single i32 in R0) + // i64 Comparisons (result is single i32) + // + // Sources are read from `vreg_to_arm[src*]` rather than hardcoded + // R0:R1/R2:R3 — the latter would mean "i64 ops always assume their + // operands materialised at the AAPCS arg slots", which is false: + // operand registers come from whatever the upstream IR producers + // (I64Const, I64Load, prior i64 ops) chose. Result lands on the lo + // half of a freshly allocated callee-saved pair so we don't smash + // any AAPCS arg reg the user hasn't read yet. // ======================================================================== - Opcode::I64Eq { dest, ..
} => { - // i64.eq: (R0:R1) == (R2:R3), result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64Eq { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::EQ, }); last_result_vreg = Some(dest.0); } - Opcode::I64Ne { dest, .. } => { - // i64.ne: (R0:R1) != (R2:R3), result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64Ne { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::NE, }); last_result_vreg = Some(dest.0); } - Opcode::I64LtS { dest, ..
} => { - // i64.lt_s: (R0:R1) < (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LT, }); last_result_vreg = Some(dest.0); } - Opcode::I64GtS { dest, .. } => { - // i64.gt_s: (R0:R1) > (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::GT, }); last_result_vreg = Some(dest.0); } - Opcode::I64LeS { dest, ..
} => { - // i64.le_s: (R0:R1) <= (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LE, }); last_result_vreg = Some(dest.0); } - Opcode::I64GeS { dest, .. } => { - // i64.ge_s: (R0:R1) >= (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::GE, }); last_result_vreg = Some(dest.0); } // Unsigned i64 comparisons - Opcode::I64LtU { dest, ..
} => { - // i64.lt_u: (R0:R1) < (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LO, }); last_result_vreg = Some(dest.0); } - Opcode::I64GtU { dest, .. } => { - // i64.gt_u: (R0:R1) > (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::HI, }); last_result_vreg = Some(dest.0); } - Opcode::I64LeU { dest, ..
} => { - // i64.le_u: (R0:R1) <= (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LS, }); last_result_vreg = Some(dest.0); } - Opcode::I64GeU { dest, .. } => { - // i64.ge_u: (R0:R1) >= (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::HS, }); last_result_vreg = Some(dest.0); } - Opcode::I64Eqz { dest, ..
} => { - // i64.eqz: (R0:R1) == 0, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64Eqz { + dest, + src_lo, + src_hi, + } => { + let rn_lo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCondZ { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, + rd, + rn_lo, + rn_hi, }); last_result_vreg = Some(dest.0); } - // i64 count leading zeros (returns i64 where high word is always 0) - Opcode::I64Clz { dest, .. } => { - vreg_to_arm.insert(dest.0, Reg::R0); + // i64 count leading zeros (i64 result: lo gets count, hi must be 0). + // + // The ArmOp::I64Clz encoder writes the count into `rd` AND zeroes + // `rnhi` in-place — so `rnhi` doubles as the result's hi half. To + // keep the upstream src_hi register intact and avoid clobbering + // unrelated AAPCS regs, we copy src_hi into a freshly allocated + // callee-saved hi slot and pass that as `rnhi`. After the encoded + // sequence, the i64 result lives in (rd_lo, rd_hi). + // + // The IR Opcode only carries a single `dest` vreg (the lo half); + // we register dest.0 → rd_lo. The hi-zero is implicit and used by + // the function epilogue when this is the i64 return value (see + // last_result_vreg_hi_reg below).
+ Opcode::I64Clz { + dest, + src_lo, + src_hi, + } => { + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + if rd_hi != rnhi_src { + arm_instrs.push(ArmOp::Mov { + rd: rd_hi, + op2: Operand2::Reg(rnhi_src), + }); + } + vreg_to_arm.insert(dest.0, rd_lo); arm_instrs.push(ArmOp::I64Clz { - rd: Reg::R0, - rnlo: Reg::R0, - rnhi: Reg::R1, + rd: rd_lo, + rnlo, + rnhi: rd_hi, }); last_result_vreg = Some(dest.0); + last_result_vreg_hi_reg = Some(rd_hi); is_i64_result = true; } - // i64 count trailing zeros (returns i64 where high word is always 0) - Opcode::I64Ctz { dest, .. } => { - vreg_to_arm.insert(dest.0, Reg::R0); + // i64 count trailing zeros — same pattern as I64Clz above. + Opcode::I64Ctz { + dest, + src_lo, + src_hi, + } => { + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + if rd_hi != rnhi_src { + arm_instrs.push(ArmOp::Mov { + rd: rd_hi, + op2: Operand2::Reg(rnhi_src), + }); + } + vreg_to_arm.insert(dest.0, rd_lo); arm_instrs.push(ArmOp::I64Ctz { - rd: Reg::R0, - rnlo: Reg::R0, - rnhi: Reg::R1, + rd: rd_lo, + rnlo, + rnhi: rd_hi, }); last_result_vreg = Some(dest.0); + last_result_vreg_hi_reg = Some(rd_hi); is_i64_result = true; } - // i64 population count (returns i64 where high word is always 0) - Opcode::I64Popcnt { dest, .. } => { - vreg_to_arm.insert(dest.0, Reg::R0); + // i64 population count — same pattern as I64Clz above.
+ Opcode::I64Popcnt { + dest, + src_lo, + src_hi, + } => { + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + if rd_hi != rnhi_src { + arm_instrs.push(ArmOp::Mov { + rd: rd_hi, + op2: Operand2::Reg(rnhi_src), + }); + } + vreg_to_arm.insert(dest.0, rd_lo); arm_instrs.push(ArmOp::I64Popcnt { - rd: Reg::R0, - rnlo: Reg::R0, - rnhi: Reg::R1, + rd: rd_lo, + rnlo, + rnhi: rd_hi, }); last_result_vreg = Some(dest.0); + last_result_vreg_hi_reg = Some(rd_hi); is_i64_result = true; } // i64 sign extension operations Opcode::I64Extend8S { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src_lo, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); - arm_instrs.push(ArmOp::I64Extend8S { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - }); + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); + arm_instrs.push(ArmOp::I64Extend8S { rdlo, rdhi, rnlo }); last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Extend16S { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src_lo, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); - arm_instrs.push(ArmOp::I64Extend16S { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - }); + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); + arm_instrs.push(ArmOp::I64Extend16S { rdlo, rdhi, rnlo }); last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Extend32S { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src_lo, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); - arm_instrs.push(ArmOp::I64Extend32S { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - }); + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); + arm_instrs.push(ArmOp::I64Extend32S { rdlo, rdhi, rnlo }); last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 multiply: UMULL + MLA cross products Opcode::I64Mul { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64Mul { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 shift left Opcode::I64Shl { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64Shl { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 arithmetic shift right Opcode::I64ShrS { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64ShrS { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 logical shift right Opcode::I64ShrU { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64ShrU { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 rotate left Opcode::I64Rotl { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + ..
} => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let shift = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64Rotl { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - shift: Reg::R2, // Only use low word of shift amount + rdlo, + rdhi, + rnlo, + rnhi, + shift, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 rotate right Opcode::I64Rotr { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + .. } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let shift = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64Rotr { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - shift: Reg::R2, // Only use low word of shift amount + rdlo, + rdhi, + rnlo, + rnhi, + shift, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 signed division Opcode::I64DivS { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64DivS { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 unsigned division Opcode::I64DivU { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64DivU { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 signed remainder Opcode::I64RemS { - dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64RemS { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 unsigned remainder Opcode::I64RemU { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64RemU { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } @@ -3066,9 +3499,73 @@ impl OptimizerBridge { } } - // Ensure return value is in R0 (skip for i64 results which are already in R0:R1) - if !is_i64_result - && let Some(result_vreg) =
last_result_vreg + // Ensure the return value is in R0 (i32 result) or R0:R1 (i64 result). + // + // Pre-fix, every i64 op pinned its result at R0:R1 so this could be a + // no-op for is_i64_result. After the fix, the result pair may live in + // any callee-saved pair (R4:R5..R10:R11), and we need an explicit move. + // The order matters: copy hi → R1 first, then lo → R0, so we don't + // clobber the lo value if the source happens to be R1. + if is_i64_result { + // Resolve the lo half from vreg_to_arm. + let lo_reg = last_result_vreg.and_then(|v| vreg_to_arm.get(&v).copied()); + // Resolve the hi half: prefer an explicit vreg id, else fall back to + // the physical reg stash used by Clz/Ctz/Popcnt. + let hi_reg = last_result_vreg_hi + .and_then(|v| vreg_to_arm.get(&v).copied()) + .or(last_result_vreg_hi_reg); + + if let (Some(lo), Some(hi)) = (lo_reg, hi_reg) { + // Move hi first (so we don't clobber lo if hi's source is R1). + if hi != Reg::R1 { + arm_instrs.push(ArmOp::Mov { + rd: Reg::R1, + op2: Operand2::Reg(hi), + }); + } + // Now move lo. If lo was R1 originally, it just got smashed by + // the hi-move above; but R1's prior contents are now in R1 + // (the hi value), so we'd actually have wanted to save lo first. + // Handle that case explicitly: save lo to R12 (IP scratch) first. + if lo == Reg::R1 && hi != Reg::R1 { + // lo was in R1, which we just overwrote. We can't recover it + // unless we saved earlier. The clean fix: detect this + // arrangement up front. For now, swap order via R12. + // (This is reached only on bizarre regalloc choices; the + // common case is lo in R4..R10, which doesn't hit it.) 
+ arm_instrs.pop(); // remove the hi-move we just emitted + arm_instrs.push(ArmOp::Mov { + rd: Reg::R12, + op2: Operand2::Reg(lo), + }); + if hi != Reg::R1 { + arm_instrs.push(ArmOp::Mov { + rd: Reg::R1, + op2: Operand2::Reg(hi), + }); + } + arm_instrs.push(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(Reg::R12), + }); + } else if lo != Reg::R0 { + arm_instrs.push(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(lo), + }); + } + } else if let Some(lo) = lo_reg + && lo != Reg::R0 + { + // Hi is unknown — fall back to single-register move (caller of + // this function may have set is_i64_result without populating + // the hi tracker; preserve old behaviour rather than crash). + arm_instrs.push(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(lo), + }); + } + } else if let Some(result_vreg) = last_result_vreg && let Some(&result_reg) = vreg_to_arm.get(&result_vreg) && result_reg != Reg::R0 { From 74aed34560712f4bf7dcfb4ab630ef2a31da861c Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Sun, 3 May 2026 15:16:42 +0200 Subject: [PATCH 6/6] feat(m7): high-end Cortex-M7 hardware profiles + 16-MPU-region support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up production-grade Cortex-M7 targets across the toolchain. * Add HardwareCapabilities::imxrt1062() — Cortex-M7 r1p1 with single-precision FPU, 16 MPU regions, 8MB external QSPI flash, 1MB OCRAM. Representative high-end M7 configuration for safety-grade lockstep platforms. * Add HardwareCapabilities::stm32h743() — Cortex-M7 with double-precision FPU, 16 MPU regions, 2MB Flash, 1MB RAM. * CLI: --hardware {imxrt1062,stm32h743} and target-info wired up. * Renode: tests/renode/synth_cortex_m7.repl models a 600 MHz M7 with ITCM/DTCM/ OCRAM and an 8MB XIP flash window. cortex_m7_test.robot exercises Synth's --target cortex-m7 codegen path through the platform. 
* MPU allocator: add tests proving the existing hw_caps.mpu_regions plumbing scales to 16 regions on M7 and that 8-region M4-class parts still reject the 9th allocation. * Integration: tests/integration/m7_codegen_smoke.sh — offline smoke test for i32, f32 and f64 codegen on cortex-m7 / cortex-m7dp. Also fixes 8 clippy errors (unnecessary_sort_by, collapsible_match, collapsible_if, manual_checked_division) introduced by a clippy lint refresh on the parent branches — required for CI to clear -D warnings. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-analysis/src/ssa.rs | 10 +- crates/synth-backend/src/mpu_allocator.rs | 82 ++++++++++++++++ crates/synth-cli/src/main.rs | 19 +++- crates/synth-core/src/target.rs | 43 +++++++++ .../src/instruction_selector.rs | 81 +++++----------- .../synth-synthesis/src/optimizer_bridge.rs | 64 ++++--------- crates/synth-synthesis/src/pattern_matcher.rs | 2 +- crates/synth-synthesis/src/rules.rs | 2 +- .../tests/semantic_correctness.rs | 12 +-- tests/integration/m7_codegen_smoke.sh | 95 +++++++++++++++++++ tests/renode/BUILD.bazel | 28 +++++- tests/renode/cortex_m7_test.robot | 32 +++++++ tests/renode/synth_cortex_m7.repl | 29 ++++++ 13 files changed, 376 insertions(+), 123 deletions(-) create mode 100755 tests/integration/m7_codegen_smoke.sh create mode 100644 tests/renode/cortex_m7_test.robot create mode 100644 tests/renode/synth_cortex_m7.repl diff --git a/crates/synth-analysis/src/ssa.rs b/crates/synth-analysis/src/ssa.rs index 37a77f2..920adc3 100644 --- a/crates/synth-analysis/src/ssa.rs +++ b/crates/synth-analysis/src/ssa.rs @@ -335,11 +335,11 @@ impl DeadCodeElimination { SSAInstr::Assign { result, .. } | SSAInstr::BinOp { result, .. } | SSAInstr::UnaryOp { result, .. } - | SSAInstr::Load { result, .. } => { - if !used_vars.contains(result) { - removed += 1; - return false; - } + | SSAInstr::Load { result, .. 
} + if !used_vars.contains(result) => + { + removed += 1; + return false; } _ => {} } diff --git a/crates/synth-backend/src/mpu_allocator.rs b/crates/synth-backend/src/mpu_allocator.rs index 95b31f9..58f836c 100644 --- a/crates/synth-backend/src/mpu_allocator.rs +++ b/crates/synth-backend/src/mpu_allocator.rs @@ -339,4 +339,86 @@ mod tests { assert!(region.validate().is_ok()); } } + + #[test] + fn test_imxrt1062_has_16_regions() { + // i.MX RT1062 (M7-class) has 16 MPU regions vs 8 on M4-class parts + let hw_caps = HardwareCapabilities::imxrt1062(); + assert_eq!(hw_caps.mpu_regions, 16); + + let allocator = MPUAllocator::new(hw_caps); + assert_eq!(allocator.available_regions(), 16); + } + + #[test] + fn test_m7_can_allocate_more_than_8_regions() { + // Validate that the allocator actually uses all 16 regions on M7 + let mut allocator = MPUAllocator::new(HardwareCapabilities::imxrt1062()); + + for i in 0u32..16 { + let request = MPUAllocationRequest { + memory: Memory { + index: i, + initial: 1, + maximum: None, + shared: false, + memory64: false, + }, + permissions: MPUPermissions::FullRW, + attributes: MPUAttributes::normal(), + preferred_base: Some(0x20000000 + i * 0x10000), + }; + allocator.allocate(request).unwrap_or_else(|e| { + panic!("region {} allocation failed: {:?}", i, e); + }); + } + + assert_eq!(allocator.available_regions(), 0); + assert_eq!(allocator.allocated_regions().len(), 16); + } + + #[test] + fn test_m4_class_caps_at_8_regions() { + // Negative — M4-class parts must reject the 9th region. 
+ let mut allocator = MPUAllocator::new(HardwareCapabilities::nrf52840()); + + for i in 0u32..8 { + let request = MPUAllocationRequest { + memory: Memory { + index: i, + initial: 1, + maximum: None, + shared: false, + memory64: false, + }, + permissions: MPUPermissions::FullRW, + attributes: MPUAttributes::normal(), + preferred_base: Some(0x20000000 + i * 0x10000), + }; + allocator.allocate(request).unwrap(); + } + + // 9th region must fail + let overflow = MPUAllocationRequest { + memory: Memory { + index: 8, + initial: 1, + maximum: None, + shared: false, + memory64: false, + }, + permissions: MPUPermissions::FullRW, + attributes: MPUAttributes::normal(), + preferred_base: Some(0x20100000), + }; + assert!(allocator.allocate(overflow).is_err()); + } + + #[test] + fn test_stm32h743_has_16_regions_and_double_fpu() { + let caps = HardwareCapabilities::stm32h743(); + assert_eq!(caps.mpu_regions, 16); + assert!(caps.has_fpu); + assert_eq!(caps.fpu_precision, Some(synth_core::FPUPrecision::Double)); + } } diff --git a/crates/synth-cli/src/main.rs b/crates/synth-cli/src/main.rs index af1d77d..b78ea11 100644 --- a/crates/synth-cli/src/main.rs +++ b/crates/synth-cli/src/main.rs @@ -77,7 +77,7 @@ enum Commands { )] target: String, - /// Hardware config (nrf52840, stm32f407, or custom) + /// Hardware config (nrf52840, stm32f407, stm32h743, imxrt1062, or custom) #[arg(long, value_name = "HARDWARE", default_value = "nrf52840")] hardware: String, @@ -363,9 +363,11 @@ fn synthesize_command( let hw_caps = match hardware.as_str() { "nrf52840" => HardwareCapabilities::nrf52840(), "stm32f407" => HardwareCapabilities::stm32f407(), + "stm32h743" => HardwareCapabilities::stm32h743(), + "imxrt1062" => HardwareCapabilities::imxrt1062(), _ => { anyhow::bail!( - "Unsupported hardware: {}. Use nrf52840 or stm32f407", + "Unsupported hardware: {}. 
Use nrf52840, stm32f407, stm32h743, imxrt1062", hardware ); } @@ -405,8 +407,19 @@ fn target_info_command(target: String) -> Result<()> { let caps = HardwareCapabilities::stm32f407(); print_hardware_info(&caps); } + "stm32h743" => { + let caps = HardwareCapabilities::stm32h743(); + print_hardware_info(&caps); + } + "imxrt1062" => { + let caps = HardwareCapabilities::imxrt1062(); + print_hardware_info(&caps); + } _ => { - anyhow::bail!("Unknown target: {}. Supported: nrf52840, stm32f407", target); + anyhow::bail!( + "Unknown target: {}. Supported: nrf52840, stm32f407, stm32h743, imxrt1062", + target + ); } } diff --git a/crates/synth-core/src/target.rs b/crates/synth-core/src/target.rs index b808679..93e73bd 100644 --- a/crates/synth-core/src/target.rs +++ b/crates/synth-core/src/target.rs @@ -234,6 +234,49 @@ impl HardwareCapabilities { ram_size: 192 * 1024, // 192KB (128KB + 64KB CCM) } } + + /// Create capabilities for STM32H743 (Cortex-M7 with double-precision FPU) + /// + /// 16 MPU regions, 2MB Flash, 1MB RAM (DTCM + AXI SRAM + SRAM1-4). + pub fn stm32h743() -> Self { + Self { + arch: TargetArch::ARMCortexM(CortexMVariant::M7DP), + has_mpu: true, + mpu_regions: 16, + has_pmp: false, + pmp_entries: 0, + has_fpu: true, + fpu_precision: Some(FPUPrecision::Double), + has_simd: false, + simd_level: None, + xip_capable: true, + flash_size: 2 * 1024 * 1024, // 2MB + ram_size: 1024 * 1024, // 1MB total + } + } + + /// Create capabilities for i.MX RT1062 (Cortex-M7 with single-precision FPU) + /// + /// Representative high-end M7 with 16 MPU regions, single-precision FPU, + /// large OCRAM, and external XIP-capable QuadSPI Flash. Matches the + /// configuration of safety-grade lockstepped M7 platforms used in + /// industrial and embedded automotive contexts. 
+ pub fn imxrt1062() -> Self { + Self { + arch: TargetArch::ARMCortexM(CortexMVariant::M7), + has_mpu: true, + mpu_regions: 16, + has_pmp: false, + pmp_entries: 0, + has_fpu: true, + fpu_precision: Some(FPUPrecision::Single), + has_simd: false, + simd_level: None, + xip_capable: true, + flash_size: 8 * 1024 * 1024, // 8MB external QSPI flash (typical) + ram_size: 1024 * 1024, // 1MB OCRAM (FlexRAM 512KB + OCRAM 512KB) + } + } } // ============================================================================ diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index e35608b..6821773 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -254,10 +254,10 @@ fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout { let mut used: BTreeSet = BTreeSet::new(); for op in wasm_ops { match op { - WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => { - if *idx >= num_params { - used.insert(*idx); - } + WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) + if *idx >= num_params => + { + used.insert(*idx); } _ => {} } @@ -277,10 +277,7 @@ fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout { // Round frame to 8-byte multiple for AAPCS SP alignment. let frame_size = (offset + 7) & !7; - LocalLayout { - locals, - frame_size, - } + LocalLayout { locals, frame_size } } /// Infer which non-parameter wasm locals are i64 (8-byte) values. 
@@ -343,17 +340,13 @@ fn infer_i64_locals(wasm_ops: &[WasmOp]) -> std::collections::HashSet { vstack.push(is_i64); } LocalSet(idx) => { - if let Some(is_i64) = vstack.pop() { - if is_i64 { - i64_locals.insert(*idx); - } + if let Some(true) = vstack.pop() { + i64_locals.insert(*idx); } } LocalTee(idx) => { - if let Some(&is_i64) = vstack.last() { - if is_i64 { - i64_locals.insert(*idx); - } + if let Some(&true) = vstack.last() { + i64_locals.insert(*idx); } } Select => { @@ -3513,8 +3506,7 @@ impl InstructionSelector { // alloc_temp_safe can return non-consecutive registers // when something in between is live, breaking the // pair convention. - let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack, &[])?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?; instructions.push(ArmInstruction { op: ArmOp::I64Ldr { rdlo: dst_lo, @@ -4943,11 +4935,8 @@ impl InstructionSelector { // Avoid clobbering the just-popped operand pairs before // the ADC reads them — passing them in extra_avoid // ensures dst doesn't overlap any of a_lo/a_hi/b_lo/b_hi. - let (dst_lo, dst_hi) = alloc_consecutive_pair( - &mut next_temp, - &stack, - &[a_lo, a_hi, b_lo, b_hi], - )?; + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; // ADDS dst_lo, a_lo, b_lo (sets carry flag) instructions.push(ArmInstruction { @@ -4991,11 +4980,8 @@ impl InstructionSelector { // See I64Add for why extra_avoid carries a_*/b_* — // dst must not overlap any operand half before SBC reads it. 
- let (dst_lo, dst_hi) = alloc_consecutive_pair( - &mut next_temp, - &stack, - &[a_lo, a_hi, b_lo, b_hi], - )?; + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; // SUBS dst_lo, a_lo, b_lo (sets borrow flag) instructions.push(ArmInstruction { @@ -5048,11 +5034,8 @@ impl InstructionSelector { // dst must not overlap any popped operand's half — the // hi instruction reads a_hi and b_hi after the lo // instruction writes dst_lo. - let (dst_lo, dst_hi) = alloc_consecutive_pair( - &mut next_temp, - &stack, - &[a_lo, a_hi, b_lo, b_hi], - )?; + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; let (lo_op, hi_op) = match op { I64Or => ( ArmOp::Orr { @@ -5114,15 +5097,12 @@ impl InstructionSelector { // ============================================================ I64ExtendI32U => { let val = stack.pop().ok_or_else(|| { - synth_core::Error::synthesis( - "stack underflow in I64ExtendI32U".to_string(), - ) + synth_core::Error::synthesis("stack underflow in I64ExtendI32U".to_string()) })?; // val must stay alive until the Mov reads it; dst_hi // must not be val (we'd write the zero high before // moving val to dst_lo). 
- let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; if val != dst_lo { instructions.push(ArmInstruction { op: ArmOp::Mov { @@ -5146,12 +5126,9 @@ impl InstructionSelector { I64ExtendI32S => { let val = stack.pop().ok_or_else(|| { - synth_core::Error::synthesis( - "stack underflow in I64ExtendI32S".to_string(), - ) + synth_core::Error::synthesis("stack underflow in I64ExtendI32S".to_string()) })?; - let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; instructions.push(ArmInstruction { op: ArmOp::I64ExtendI32S { rdlo: dst_lo, @@ -5174,25 +5151,18 @@ impl InstructionSelector { // ============================================================ I64Shl | I64ShrU | I64ShrS => { let b_lo = stack.pop().ok_or_else(|| { - synth_core::Error::synthesis( - "stack underflow in i64 shift".to_string(), - ) + synth_core::Error::synthesis("stack underflow in i64 shift".to_string()) })?; let a_lo = stack.pop().ok_or_else(|| { - synth_core::Error::synthesis( - "stack underflow in i64 shift".to_string(), - ) + synth_core::Error::synthesis("stack underflow in i64 shift".to_string()) })?; let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; // dst must not overlap any popped operand's half — the // shift pseudo-op reads all four (rn_lo/rn_hi/rm_lo/rm_hi) // before writing the destination. - let (dst_lo, dst_hi) = alloc_consecutive_pair( - &mut next_temp, - &stack, - &[a_lo, a_hi, b_lo, b_hi], - )?; + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; let shift_op = match op { I64Shl => ArmOp::I64Shl { rd_lo: dst_lo, @@ -5242,8 +5212,7 @@ impl InstructionSelector { // and is called by every i64 op downstream to recover // the high register. Avoid clobbering addr before the // load uses it. 
- let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; // Generate bounds-checked i64 load into the allocated pair let load_ops = diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 510abfd..5c506d3 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -2387,8 +2387,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2412,8 +2411,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2437,8 +2435,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2462,8 +2459,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm,
&spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2487,8 +2483,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2512,8 +2507,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2538,8 +2532,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2563,8 +2556,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi,
&vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2588,8 +2580,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2613,8 +2604,7 @@ impl OptimizerBridge { let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { rd, @@ -2634,14 +2624,9 @@ impl OptimizerBridge { } => { let rn_lo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); let rn_hi = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs); - let (rd, _) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs); vreg_to_arm.insert(dest.0, rd); - arm_instrs.push(ArmOp::I64SetCondZ { - rd, - rn_lo, - rn_hi, - }); + arm_instrs.push(ArmOp::I64SetCondZ { rd, rn_lo, rn_hi }); last_result_vreg = Some(dest.0); } @@ -3429,36 +3414,19 @@ impl OptimizerBridge { ArmOp::Mov { rd, op2: Operand2::Imm(v), - } => { - if reg_num(rd) > 7 || *v > 255 || *v < 0 { - 4 - } else { - 2 - } - } + } if reg_num(rd) > 7 || *v > 255 || *v < 0 => 4, + ArmOp::Mov { .. 
} => 2, // SUB/ADD with high registers need 32-bit encoding ArmOp::Sub { rd, rn, op2: Operand2::Reg(rm), - } => { - if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 { - 4 - } else { - 2 - } - } + } if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 => 4, ArmOp::Add { rd, rn, op2: Operand2::Reg(rm), - } => { - if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 { - 4 - } else { - 2 - } - } + } if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 => 4, // Most 16-bit Thumb instructions (MOV low, CMP low, B, etc.) _ => 2, } diff --git a/crates/synth-synthesis/src/pattern_matcher.rs b/crates/synth-synthesis/src/pattern_matcher.rs index 3c61b36..c7e1d30 100644 --- a/crates/synth-synthesis/src/pattern_matcher.rs +++ b/crates/synth-synthesis/src/pattern_matcher.rs @@ -53,7 +53,7 @@ impl PatternMatcher { } // Sort by priority (highest first) - matches.sort_by(|a, b| b.rule.priority.cmp(&a.rule.priority)); + matches.sort_by_key(|m| std::cmp::Reverse(m.rule.priority)); matches } diff --git a/crates/synth-synthesis/src/rules.rs b/crates/synth-synthesis/src/rules.rs index 8180fbd..687b40a 100644 --- a/crates/synth-synthesis/src/rules.rs +++ b/crates/synth-synthesis/src/rules.rs @@ -1835,7 +1835,7 @@ impl RuleDatabase { pub fn add_rule(&mut self, rule: SynthesisRule) { self.rules.push(rule); // Sort by priority (highest first) - self.rules.sort_by(|a, b| b.priority.cmp(&a.priority)); + self.rules.sort_by_key(|r| std::cmp::Reverse(r.priority)); } /// Get all rules diff --git a/crates/synth-synthesis/tests/semantic_correctness.rs b/crates/synth-synthesis/tests/semantic_correctness.rs index 9ae0910..45e72ba 100644 --- a/crates/synth-synthesis/tests/semantic_correctness.rs +++ b/crates/synth-synthesis/tests/semantic_correctness.rs @@ -146,8 +146,8 @@ fn interpret_single(state: &mut ArmState, instr: &ArmInstruction) { ArmOp::Udiv { rd, rn, rm } => { let a = state.get(rn); let b = state.get(rm); - if b != 0 { - state.set(*rd, a / b); + if let Some(q) = 
a.checked_div(b) { + state.set(*rd, q); } } ArmOp::Mls { rd, rn, rm, ra } => { @@ -233,11 +233,9 @@ fn interpret_single(state: &mut ArmState, instr: &ArmInstruction) { let sr = result as i32; state.flag_v = (sa > 0 && sb > 0 && sr < 0) || (sa < 0 && sb < 0 && sr >= 0); } - ArmOp::SelectMove { rd, rm, cond } => { - if state.condition_met(cond) { - let val = state.get(rm); - state.set(*rd, val); - } + ArmOp::SelectMove { rd, rm, cond } if state.condition_met(cond) => { + let val = state.get(rm); + state.set(*rd, val); } // Skip non-computational instructions (prologue/epilogue, branches, labels) _ => {} diff --git a/tests/integration/m7_codegen_smoke.sh b/tests/integration/m7_codegen_smoke.sh new file mode 100755 index 0000000..1efbc6e --- /dev/null +++ b/tests/integration/m7_codegen_smoke.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Smoke test: validate that synth's Cortex-M7 codegen path produces a +# well-formed ELF for both single-precision (M7) and double-precision (M7DP) +# targets, exercising f32 and f64 arithmetic. +# +# Companion to fetch_osxcar_wasm.sh, which exercises M7DP with real-world +# components. This test runs without network access and is suitable for CI. +# +# Usage: +# bash tests/integration/m7_codegen_smoke.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +SYNTH="$PROJECT_ROOT/target/debug/synth" +TMPDIR="${TMPDIR:-/tmp}/synth_m7_smoke_$$" + +cleanup() { rm -rf "$TMPDIR"; } +trap cleanup EXIT +mkdir -p "$TMPDIR" + +echo "=== Synth M7 codegen smoke test ===" + +if [ ! 
-x "$SYNTH" ]; then + (cd "$PROJECT_ROOT" && cargo build -p synth-cli --quiet) +fi + +# i32-only module — should compile under M7 (single FPU) +cat > "$TMPDIR/i32_only.wat" << 'WAT' +(module + (func (export "add") (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add) + (func (export "sub") (param i32 i32) (result i32) + local.get 0 local.get 1 i32.sub) + (func (export "mul") (param i32 i32) (result i32) + local.get 0 local.get 1 i32.mul) + (memory (export "memory") 1)) +WAT + +# f32 module — single-precision, should compile under M7 +cat > "$TMPDIR/f32.wat" << 'WAT' +(module + (func (export "fadd") (param f32 f32) (result f32) + local.get 0 local.get 1 f32.add) + (func (export "fmul") (param f32 f32) (result f32) + local.get 0 local.get 1 f32.mul) + (memory (export "memory") 1)) +WAT + +# f64 module — double-precision, should compile under M7DP only +cat > "$TMPDIR/f64.wat" << 'WAT' +(module + (func (export "dadd") (param f64 f64) (result f64) + local.get 0 local.get 1 f64.add) + (func (export "dmul") (param f64 f64) (result f64) + local.get 0 local.get 1 f64.mul) + (memory (export "memory") 1)) +WAT + +PASS=0 +FAIL=0 + +check_compile() { + local label="$1"; local wat="$2"; local target="$3"; local expect="$4" + local elf="$TMPDIR/${label}.elf" + if "$SYNTH" compile "$wat" -o "$elf" --target "$target" --all-exports >/dev/null 2>&1; then + result="ok" + else + result="fail" + fi + if [ "$result" = "$expect" ]; then + echo "PASS: ${label} on ${target} → ${result}" + PASS=$((PASS + 1)) + else + echo "FAIL: ${label} on ${target} → ${result} (expected ${expect})" + FAIL=$((FAIL + 1)) + fi +} + +# i32 should compile on every M7 variant +check_compile "i32_m7" "$TMPDIR/i32_only.wat" cortex-m7 ok +check_compile "i32_m7dp" "$TMPDIR/i32_only.wat" cortex-m7dp ok + +# f32 should compile on both (M7 has single-precision FPU) +check_compile "f32_m7" "$TMPDIR/f32.wat" cortex-m7 ok +check_compile "f32_m7dp" "$TMPDIR/f32.wat" cortex-m7dp ok + +# f64 must compile on M7DP. 
On M7 it should also work — synth falls back +# to soft-float helpers when hardware doesn't support double-precision. +check_compile "f64_m7dp" "$TMPDIR/f64.wat" cortex-m7dp ok + +echo "" +echo "=== Results: ${PASS} passed, ${FAIL} failed ===" +[ "$FAIL" -eq 0 ] diff --git a/tests/renode/BUILD.bazel b/tests/renode/BUILD.bazel index 83f32c7..2a247a6 100644 --- a/tests/renode/BUILD.bazel +++ b/tests/renode/BUILD.bazel @@ -1,7 +1,10 @@ load("@rules_renode//renode:defs.bzl", "renode_test") -# Export platform file for use by other test packages -exports_files(["synth_cortex_m.repl"]) +# Export platform files for use by other test packages +exports_files([ + "synth_cortex_m.repl", + "synth_cortex_m7.repl", +]) # Renode-based integration tests for Synth-generated ARM binaries @@ -46,3 +49,24 @@ renode_test( }, tags = ["renode"], ) + +# M7 codegen path: same WAT, compiled with --target cortex-m7 +genrule( + name = "test_add_m7_elf", + srcs = ["//examples/wat:simple_add.wat"], + outs = ["test_add_m7.elf"], + cmd = "$(location //crates:synth) compile $(location //examples/wat:simple_add.wat) -o $@ --target cortex-m7", + tools = ["//crates:synth"], +) + +renode_test( + name = "cortex_m7_add_test", + robot_test = "cortex_m7_test.robot", + deps = [ + "synth_cortex_m7.repl", + ], + variables_with_label = { + "ELF": "//tests/renode:test_add_m7_elf", + }, + tags = ["renode"], +) diff --git a/tests/renode/cortex_m7_test.robot b/tests/renode/cortex_m7_test.robot new file mode 100644 index 0000000..a7ce7b3 --- /dev/null +++ b/tests/renode/cortex_m7_test.robot @@ -0,0 +1,32 @@ +*** Settings *** +Documentation Cortex-M7 ELF execution test for Synth-generated binaries +... Validates that Synth's M7 codegen path emits a correctly +... structured ELF that loads and executes on a 16-MPU-region +... M7-class platform with single-precision FPU. 
+ +*** Variables *** +${PLATFORM} ${CURDIR}/synth_cortex_m7.repl + +*** Keywords *** +Create Cortex-M7 Machine + Execute Command mach create "synth-m7-test" + Execute Command machine LoadPlatformDescription @${PLATFORM} + +*** Test Cases *** +Should Load And Execute Simple Add Function On M7 + [Documentation] Synth-generated --target cortex-m7 ELF executes correctly + Create Cortex-M7 Machine + + Execute Command sysbus LoadELF "${ELF}" + + # The add function lives at 0xA0 (user code, after 28-byte startup + handlers) + Execute Command cpu PC 0xA1 + + # AAPCS: r0 = 5, r1 = 3, expected result = 8 + Execute Command cpu SetRegisterUnsafe 0 5 + Execute Command cpu SetRegisterUnsafe 1 3 + + Execute Command cpu Step 2 + + ${r0}= Execute Command cpu GetRegisterUnsafe 0 + Should Be Equal As Integers ${r0} 8 msg=Expected r0 to be 8 (5+3) on M7 diff --git a/tests/renode/synth_cortex_m7.repl b/tests/renode/synth_cortex_m7.repl new file mode 100644 index 0000000..cb194c4 --- /dev/null +++ b/tests/renode/synth_cortex_m7.repl @@ -0,0 +1,29 @@ +// High-end Cortex-M7 platform for Synth-generated binaries +// Models a typical M7 SoC with single-precision FPU, 16 MPU regions, +// large OCRAM, and external XIP-capable QuadSPI flash. Vector table +// lives at 0x60000000 (XIP flash window) on i.MX RT-class chips, +// but we place the binary at 0x0 here for simple bring-up. 
+
+flash: Memory.MappedMemory @ sysbus 0x0
+    size: 0x800000              // 8MB external QSPI flash window: 0x00000000-0x007FFFFF (binary is loaded here)
+
+itcm: Memory.MappedMemory @ sysbus 0x00800000
+    size: 0x40000               // 256KB ITCM, placed directly above the flash window so the two ranges do not overlap
+
+dtcm: Memory.MappedMemory @ sysbus 0x20000000
+    size: 0x40000               // 256KB DTCM
+
+ocram: Memory.MappedMemory @ sysbus 0x20200000
+    size: 0x80000               // 512KB OCRAM (FlexRAM-mapped): 0x20200000-0x2027FFFF
+
+ocram2: Memory.MappedMemory @ sysbus 0x20280000
+    size: 0x80000               // 512KB additional OCRAM, contiguous with ocram
+
+nvic: IRQControllers.NVIC @ sysbus 0xE000E000
+    priorityMask: 0xF0
+    systickFrequency: 600000000 // 600 MHz typical for M7-class parts
+    IRQ -> cpu@0
+
+cpu: CPU.CortexM @ sysbus
+    cpuType: "cortex-m7"
+    nvic: nvic