From 90b3285e9515bcb4204f76e1b579f626a33c4f39 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 27 Apr 2026 22:24:36 +0200 Subject: [PATCH 1/4] feat(cli): add --relocatable flag to force ET_REL output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently synth produces a relocatable object (.o, ET_REL) only when the input wasm has imports — the relocations they generate trigger the relocatable code path. Self-contained wasm modules with no imports produce a complete ET_EXEC firmware with vector table, Reset_Handler, linear_memory section, etc. For linking into a host build system (e.g. integrating verified Rust kernel primitives compiled to wasm into a Zephyr build), the host expects a relocatable .o it can pull into its existing link step. Add a --relocatable CLI flag that forces ET_REL output regardless of whether the wasm has imports. The flag is additive — default behaviour is unchanged. Tested with gale-ffi.wasm (200 functions, 0 imports): output is now a 26645-byte ET_REL ARM EABI5 object with all gale_* symbols defined and no vector-table machinery. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/synth-cli/src/main.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/crates/synth-cli/src/main.rs b/crates/synth-cli/src/main.rs index cfd4eea..af1d77d 100644 --- a/crates/synth-cli/src/main.rs +++ b/crates/synth-cli/src/main.rs @@ -170,6 +170,11 @@ enum Commands { /// Path to kiln-builtins object file (.o) for linking (used with --link) #[arg(long, value_name = "BUILTINS")] builtins: Option, + + /// Force relocatable object (.o, ET_REL) output even when wasm has no imports + /// — for linking into a host build system. + #[arg(long)] + relocatable: bool, }, /// Disassemble an ARM ELF file (e.g., synth disasm output.elf) @@ -248,6 +253,7 @@ fn main() -> Result<()> { verify, link, builtins, + relocatable, } => { // Resolve target spec: --target overrides, --cortex-m is backwards compat let target_spec = resolve_target_spec(target.as_deref(), cortex_m)?; @@ -272,6 +278,7 @@ fn main() -> Result<()> { &backend, verify, &target_spec, + relocatable, )?; // If --link requested, invoke the cross-linker @@ -553,6 +560,7 @@ fn compile_command( backend_name: &str, verify: bool, target_spec: &TargetSpec, + relocatable: bool, ) -> Result<()> { // Validate backend exists let registry = build_backend_registry(); @@ -595,6 +603,7 @@ fn compile_command( backend, verify, target_spec, + relocatable, ); } @@ -1222,6 +1231,7 @@ fn compile_all_exports( backend: &dyn Backend, verify: bool, target_spec: &TargetSpec, + relocatable: bool, ) -> Result<()> { let path = input.context("--all-exports requires an input file")?; @@ -1428,8 +1438,18 @@ fn compile_all_exports( // When there are relocations, produce a relocatable object (.o) instead of // an executable. This lets the output be linked with the Kiln bridge crate // (which provides __meld_dispatch_import and __meld_get_memory_base). - let elf_data = if has_relocations { - info!("Module has import calls — producing relocatable object (ET_REL)"); + // The --relocatable flag forces ET_REL output even when the wasm has no + // imports, for linking into a host build system (e.g. Zephyr). + let elf_data = if has_relocations || relocatable { + let total_relocs: usize = compiled_funcs.iter().map(|f| f.relocations.len()).sum(); + if has_relocations { + info!( + "Producing relocatable object (ET_REL): {} import call relocations", + total_relocs + ); + } else { + info!("Producing relocatable object (ET_REL): forced by --relocatable"); + } build_relocatable_elf(&compiled_funcs, &all_imports)? } else if cortex_m { build_multi_func_cortex_m_elf(&compiled_funcs, &all_memories, target_spec)? From 01a079fce9b99f7e3579656308fd441ab39cbac3 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 28 Apr 2026 19:54:56 +0200 Subject: [PATCH 2/4] fix(no-optimize): allocate stack frame for non-param locals + handle i64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in select_with_stack's spilled-local handling, both surfaced running gale (formally-verified Zephyr kernel primitives) through synth on real Cortex-M code: 1. **i64 local storage only writes one half.** LocalSet/LocalTee/LocalGet for spilled locals always emitted plain Str/Ldr (4 bytes), even for i64 locals. The upper half was never stored, so local.get returned garbage for 32 high bits. For gale's u64-packed FFI decision structs this corrupted the action field, breaking every ISR-driven semaphore operation in the engine bench (count=0 / drain_timeout at every step). 2. **Locals area aliased the callee-saved spill area.** The legacy offset formula `(local_idx - 4) * 4` was hardcoded for num_params==4 and produced negative offsets for other configurations, which the I64Ldr/I64Str encoder silently clamped to 0. With offset 0, locals landed exactly where stmdb had just saved r4/r5 — corrupting the callee-saved register spill and propagating wrong values back to the caller after ldmia. Pure AAPCS violation. Fix: - Add `compute_local_layout(wasm_ops, num_params) -> LocalLayout` that walks the wasm op stream once to determine each non-param local's width (i32/i64) and assigns proper stack offsets from a base of 0. Uses `infer_i64_locals` (also new) which simulates a width vstack to classify locals based on what gets stored into them. - Prologue emits `sub sp, sp, #frame_size` after the callee-saved push, allocating the locals area below the saved-register spills. - Epilogue emits `add sp, sp, #frame_size` before the pop, restoring SP to the callee-saved spills before they get popped. Also applied to the explicit Return opcode handler. - LocalGet/LocalSet/LocalTee dispatch on the layout entry: i64 locals use I64Ldr/I64Str (which already emit two 32-bit memory ops at offset N and N+4); i32 locals use plain Ldr/Str. Offsets come from layout, not from the legacy formula. Frame size is rounded up to 8 bytes for AAPCS SP alignment at call sites. Verified locally: /tmp/repro_i64.wat (1 i32 param + 1 i64 local round-trip): Before: str.w r0, [sp]; (no upper store) — upper half lost. After: sub.w sp, sp, #8; str.w r0, [sp]; str.w r1, [sp, #4]; (both halves stored) ldr.w r2, [sp]; ldr.w r3, [sp, #4]; (both halves loaded) add.w sp, sp, #8; ldmia. gale_k_sem_give_decide (3 i32 params + 1 i64 local, the function whose runtime miscompilation hung the engine bench in CI): Before: str.w r3, [sp]; str.w r5, [sp, #4]; — clobbered the saved r4/r5 from stmdb, AAPCS-violating. After: sub.w sp, sp, #8 → str into the locals frame, not the spill; add.w sp, sp, #8 before ldmia — proper AAPCS. Engine-bench build with GALE_USE_SYNTH=ON now produces a 22768 B zephyr.elf (was 22692 B with the buggy synth). Renode validation pending in CI. Out of scope: - The optimized regalloc bug in optimizer_bridge.rs (clobbers r0..r3 parameter registers — see /tmp/match_gale.wat repro). This fix lets --no-optimize work; the optimized path needs a separate pass. - Implicit-return-to-R0 elision in select_with_stack: small functions whose result lands in a non-R0 temp don't get a final mov to R0 in --no-optimize mode. Pre-existing, unrelated to this fix; affects i32-returning functions with a single intermediate value. Doesn't affect i64 returns (which use the natural R0/R1 pair). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 295 +++++++++++++++++- 1 file changed, 290 insertions(+), 5 deletions(-) diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 4f43edb..9806664 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -156,6 +156,169 @@ fn i64_pair_hi(lo_reg: Reg) -> Result { ))) } +/// Per-function stack-frame layout for non-parameter locals. +/// +/// `offsets[idx]` gives the byte offset (relative to SP after the frame +/// allocation) where local `idx` lives. `frame_size` is the total bytes +/// to allocate via `sub sp, sp, #frame_size` in the prologue. +/// +/// i32/i64 locals each occupy 4/8 bytes respectively. i64 locals are +/// 8-byte aligned per AAPCS. The total frame is rounded up to 8 bytes +/// to keep SP 8-byte aligned at call sites. +struct LocalLayout { + /// idx -> (offset_from_sp, is_i64) + locals: std::collections::HashMap, + frame_size: i32, +} + +/// Compute the stack-frame layout for non-parameter locals in a function. +/// +/// Walks the wasm op stream once to: +/// 1. Identify which non-param local indices are referenced (LocalGet/Set/Tee). +/// 2. Determine each local's width via `infer_i64_locals` (i32 vs i64). +/// 3. Lay them out in ascending-index order with i64 locals 8-byte aligned. +/// +/// The result drives: +/// - Prologue: `sub sp, sp, #frame_size` after pushing callee-saved regs. +/// - LocalGet/Set/Tee: use `offsets[idx]` instead of the legacy +/// `(idx - 4) * 4` formula (which only happened to work when num_params==4 +/// AND the formula's negative result was silently clamped to 0 by the +/// encoder, in both cases corrupting the caller's stack or the callee's +/// own callee-saved-register spill). +/// - Epilogue: `add sp, sp, #frame_size` before popping registers. +fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout { + use std::collections::{BTreeSet, HashMap}; + let i64_set = infer_i64_locals(wasm_ops); + + // Collect non-param local indices, in ascending order for deterministic layout. + let mut used: BTreeSet = BTreeSet::new(); + for op in wasm_ops { + match op { + WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => { + if *idx >= num_params { + used.insert(*idx); + } + } + _ => {} + } + } + + let mut locals: HashMap = HashMap::new(); + let mut offset: i32 = 0; + for &idx in &used { + let is_i64 = i64_set.contains(&idx); + // i64 locals require 8-byte alignment. + if is_i64 && (offset % 8) != 0 { + offset += 4; + } + locals.insert(idx, (offset, is_i64)); + offset += if is_i64 { 8 } else { 4 }; + } + // Round frame to 8-byte multiple for AAPCS SP alignment. + let frame_size = (offset + 7) & !7; + + LocalLayout { + locals, + frame_size, + } +} + +/// Infer which non-parameter wasm locals are i64 (8-byte) values. +/// +/// The wasm decoder discards local-declaration type info, so we re-derive +/// it from the operation stream by simulating a virtual stack of widths +/// (1 = 32-bit, 2 = 64-bit). On each `LocalSet`/`LocalTee` we record the +/// width of the value being stored. WASM type rules guarantee a local's +/// width is invariant for its lifetime, so the first store wins. +/// +/// Without this, the spilled-local store/load path would emit a single +/// 4-byte STR/LDR for i64 locals, dropping the upper half — corrupting +/// any function that returns or uses a u64-packed FFI struct. +fn infer_i64_locals(wasm_ops: &[WasmOp]) -> std::collections::HashSet { + use WasmOp::*; + let mut i64_locals: std::collections::HashSet = std::collections::HashSet::new(); + let mut vstack: Vec = Vec::new(); // true = i64 + + let is_i64_producer = |op: &WasmOp| -> bool { + matches!( + op, + I64Add + | I64Sub + | I64Mul + | I64DivS + | I64DivU + | I64RemS + | I64RemU + | I64And + | I64Or + | I64Xor + | I64Shl + | I64ShrS + | I64ShrU + | I64Rotl + | I64Rotr + | I64Clz + | I64Ctz + | I64Popcnt + | I64Const(_) + | I64Load { .. } + | I64Load8S { .. } + | I64Load8U { .. } + | I64Load16S { .. } + | I64Load16U { .. } + | I64Load32S { .. } + | I64Load32U { .. } + | I64ExtendI32S + | I64ExtendI32U + | I64Extend8S + | I64Extend16S + | I64Extend32S + ) + }; + + for op in wasm_ops { + match op { + LocalGet(idx) => { + let is_i64 = i64_locals.contains(idx); + vstack.push(is_i64); + } + LocalSet(idx) => { + if let Some(is_i64) = vstack.pop() { + if is_i64 { + i64_locals.insert(*idx); + } + } + } + LocalTee(idx) => { + if let Some(&is_i64) = vstack.last() { + if is_i64 { + i64_locals.insert(*idx); + } + } + } + Select => { + // pops [val1, val2, cond], pushes one value with width of val1/val2 + let _cond = vstack.pop(); + let v2 = vstack.pop(); + let v1 = vstack.pop(); + vstack.push(v1.or(v2).unwrap_or(false)); + } + _ => { + let (pops, pushes) = wasm_stack_effect(op); + for _ in 0..pops { + vstack.pop(); + } + let push_width = is_i64_producer(op); + for _ in 0..pushes { + vstack.push(push_width); + } + } + } + } + + i64_locals +} + /// Return the (pops, pushes) stack effect for a WASM op. /// /// Used by the wildcard fallthrough in select_with_stack to maintain @@ -3220,9 +3383,12 @@ impl InstructionSelector { let mut instructions = Vec::new(); - // Function prologue: save callee-saved registers and LR. + // Function prologue: save callee-saved registers and LR, then + // allocate the local-variable frame. + // // AAPCS requires 8-byte aligned SP at call sites. Pushing an even - // number of registers (6: R4-R8, LR) maintains alignment. + // number of registers (6: R4-R8, LR) maintains alignment, and the + // frame_size below is rounded to 8 to preserve it. instructions.push(ArmInstruction { op: ArmOp::Push { regs: vec![Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8, Reg::LR], @@ -3230,6 +3396,22 @@ impl InstructionSelector { source_line: None, }); + // Compute non-param local layout (offsets + total frame size). + let layout = compute_local_layout(wasm_ops, num_params); + // Allocate stack space for non-param locals so they don't alias the + // callee-saved-register spill area (which immediately follows SP + // after Push above). + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Sub { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: None, + }); + } + // Virtual stack holds register indices let mut stack: Vec = Vec::new(); // Next available register for temporaries (start after params) @@ -3255,11 +3437,42 @@ impl InstructionSelector { for (idx, op) in wasm_ops.iter().enumerate() { match op { LocalGet(local_idx) => { - // Get the register for this local + // Get the register for this local. Three cases: + // 1. Param in register — use the cached mapping. + // 2. Spilled i64 local — load both halves via I64Ldr. + // 3. Spilled i32 local — single Ldr. let reg = if let Some(&r) = local_to_reg.get(local_idx) { r + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 local — load both 32-bit halves into a consecutive + // register pair via the I64Ldr pseudo-op. Convention + // matches I64Const: push only dst_lo on the stack; + // dst_hi is recovered later via i64_pair_hi(lo). + let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; + let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Ldr { + rdlo: dst_lo, + rdhi: dst_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + dst_lo + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + // i32 local: single 4-byte load from the locals frame. + let dst = alloc_temp_safe(&mut next_temp, &stack)?; + instructions.push(ArmInstruction { + op: ArmOp::Ldr { + rd: dst, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + dst } else { - // Local not in register (spilled to stack) - load it + // Local not in layout (shouldn't happen for valid wasm, + // but fall back to legacy behaviour for compatibility). let dst = alloc_temp_safe(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::Ldr { @@ -4324,6 +4537,20 @@ impl InstructionSelector { }); cf.add_instruction(); } + // Deallocate the local frame before popping callee-saved + // registers; otherwise the pop would read from the locals + // area instead of the saved-register slots. + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } // Restore callee-saved registers and return via PC instructions.push(ArmInstruction { op: ArmOp::Pop { @@ -4475,7 +4702,32 @@ impl InstructionSelector { cf.add_instruction(); } local_to_reg.insert(*local_idx, target); + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 spilled local: store BOTH 32-bit halves + // (lower at offset N, upper at N+4) via the I64Str + // pseudo-op. Without this we drop the upper half. + let val_hi = i64_pair_hi(val)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Str { + rdlo: val, + rdhi: val_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + // i32 spilled local: single 4-byte store. + instructions.push(ArmInstruction { + op: ArmOp::Str { + rd: val, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); } else { + // Fall-through for compatibility (shouldn't happen). instructions.push(ArmInstruction { op: ArmOp::Str { rd: val, @@ -4507,7 +4759,29 @@ impl InstructionSelector { cf.add_instruction(); } local_to_reg.insert(*local_idx, target); + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 spilled local: store both halves like LocalSet. + let val_hi = i64_pair_hi(val)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Str { + rdlo: val, + rdhi: val_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + instructions.push(ArmInstruction { + op: ArmOp::Str { + rd: val, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); } else { + // Fall-through for compatibility. instructions.push(ArmInstruction { op: ArmOp::Str { rd: val, @@ -5055,7 +5329,18 @@ impl InstructionSelector { } } - // Function epilogue: restore callee-saved registers and return via PC + // Function epilogue: deallocate the local frame, then restore + // callee-saved registers and return via PC. + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: None, + }); + } // POP {R4-R8, PC} restores registers and returns (PC = saved LR) instructions.push(ArmInstruction { op: ArmOp::Pop { From 1e1e01b7b15342a07f36ab2f1e6f23829e133544 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 28 Apr 2026 22:19:45 +0200 Subject: [PATCH 3/4] fix: alloc consecutive register pairs for i64 operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #85 fixed i64 local STORAGE (both halves stored to consecutive stack slots) but introduced a follow-on bug: when the i64 register pair gets allocated via two separate alloc_temp_safe calls, the resulting pair can be NON-CONSECUTIVE in ALLOCATABLE_REGS if a register in between is live on the wasm stack. Subsequent i64 ops downstream call i64_pair_hi(rdlo) to recover the high register, which assumes the pair is consecutive. With a non-consecutive pair from earlier, i64_pair_hi returns the WRONG register and the op reads garbage. Witnessed on gale_k_sem_give_decide: ldr.w r5, [sp] ; LocalGet i64 lo - r5 picked ldr.w r6, [sp, #4] ; LocalGet i64 hi - r6 picked (consecutive ✓) ...later... orr.w r0, r0, r2 ; i64.or expected (r5,r6) but used (r2,r3) Fix: add `alloc_consecutive_pair` helper that ensures two consecutive free entries in ALLOCATABLE_REGS. Use it everywhere a pair is allocated for an i64 result: I64Const (line 4567), I64Add/Sub/Mul result allocs (lines 4914+, 4958+, 4996+), and the new i64-LocalGet path from PR #85. Verified locally: /tmp/repro_i64.wat round-trips correctly with consecutive register pair (lo→r2, hi→r3). gale_k_sem_give_decide's LocalGet 3 now loads to consecutive r5/r6. Note: the engine bench in Renode still hangs after this fix. Further diagnostic shows i64.or's ARM lowering uses register pairs (r0,r1) and (r2,r3) regardless of what's on the wasm-tracked stack — a separate bug in synth's wildcard fallthrough for I64Or in select_with_stack. That fix is out of scope for this PR; tracked separately. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 83 +++++++++++++++---- 1 file changed, 69 insertions(+), 14 deletions(-) diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 9806664..82d174e 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -132,6 +132,41 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result { )) } +/// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS, +/// neither of which is currently on the wasm stack. +/// +/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values +/// because if a register between them is live on the wasm stack, the second +/// call skips it and the resulting pair is non-consecutive. Subsequent code +/// that uses [`i64_pair_hi`]`(rdlo)` to recover the high register then gets +/// the wrong register and reads garbage. +/// +/// This helper ensures both halves come from consecutive ALLOCATABLE_REGS +/// entries (no wraparound) so the pair convention holds throughout the +/// function body. +fn alloc_consecutive_pair(next_temp: &mut u8, stack: &[Reg]) -> Result<(Reg, Reg)> { + let n = ALLOCATABLE_REGS.len(); + for _ in 0..n { + let lo_idx = (*next_temp as usize) % n; + let hi_idx = lo_idx + 1; + // Wraparound is invalid: i64_pair_hi requires hi_idx < n. + if hi_idx < n { + let lo_reg = ALLOCATABLE_REGS[lo_idx]; + let hi_reg = ALLOCATABLE_REGS[hi_idx]; + if !stack.contains(&lo_reg) && !stack.contains(&hi_reg) { + *next_temp = ((hi_idx + 1) % n) as u8; + return Ok((lo_reg, hi_reg)); + } + } + *next_temp = ((*next_temp as usize + 1) % n) as u8; + } + Err(synth_core::Error::synthesis( + "register exhaustion: no consecutive pair of free registers for i64 — \ + function too complex for current register allocator" + .to_string(), + )) +} + /// Given the low register of an i64 register pair, return the high register. /// /// Convention: i64 values on 32-bit ARM use two consecutive registers. @@ -3448,8 +3483,13 @@ impl InstructionSelector { // register pair via the I64Ldr pseudo-op. Convention // matches I64Const: push only dst_lo on the stack; // dst_hi is recovered later via i64_pair_hi(lo). - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // The pair MUST be consecutive in ALLOCATABLE_REGS + // — i64_pair_hi assumes that. Two separate calls to + // alloc_temp_safe can return non-consecutive registers + // when something in between is live, breaking the + // pair convention. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::I64Ldr { rdlo: dst_lo, @@ -4835,9 +4875,12 @@ impl InstructionSelector { // Pairs are allocated as two consecutive temp registers. // ========================================================= I64Const(val) => { - // Allocate a register pair for the 64-bit constant - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate a CONSECUTIVE register pair for the 64-bit + // constant. Two separate alloc_temp_safe calls can return + // non-consecutive registers if something in between is + // live on the wasm stack, which then breaks the + // i64_pair_hi convention used by every i64 op downstream. + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::I64Const { @@ -4867,9 +4910,13 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; // ADDS dst_lo, a_lo, b_lo (sets carry flag) instructions.push(ArmInstruction { @@ -4911,9 +4958,13 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; // SUBS dst_lo, a_lo, b_lo (sets borrow flag) instructions.push(ArmInstruction { @@ -4949,9 +5000,13 @@ impl InstructionSelector { ) })?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack)?; // Generate bounds-checked i64 load into the allocated pair let load_ops = From b8da214f1a2d3c17d6a70535ff961f53d6594ff7 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 29 Apr 2026 06:25:39 +0200 Subject: [PATCH 4/4] fix(no-optimize): explicit i64 op handlers + extra_avoid in alloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three deeper bugs surfaced when running gale_k_sem_give_decide on Renode after PR #85 + the consecutive-pair fix: 1. **i64 ops fall through to select_default** in select_with_stack, which assumes inputs are in R0:R1 / R2:R3. Wasm-stack-tracked pairs from earlier ops never make it. Symptoms: i64.or used register pairs from previous shift ops instead of the just-loaded LocalGet result, producing a corrupted return value. Fix: explicit handlers for I64Or / I64And / I64Xor / I64ExtendI32U / I64ExtendI32S / I64Shl / I64ShrU / I64ShrS in select_with_stack, each popping the wasm-tracked pair, deriving its hi via i64_pair_hi, and emitting the right ARM instructions / pseudo-ops with arbitrary registers (the existing I64Shl/etc. ArmOp pseudo-ops accept arbitrary register operands). 2. **alloc_consecutive_pair didn't reserve implicit hi registers**. The wasm stack only tracks the lo register of each i64; the hi is conventional but invisible to a naive scan. A fresh allocation could overwrite an earlier i64's implicit hi, breaking subsequent i64_pair_hi lookups. Witnessed: I64Const 32 allocated (r1, r2), clobbering the hi of a previously-extended i64 in (r0, r1). Fix: alloc_consecutive_pair now scans every stack entry and reserves both lo AND its conventional pair_hi (over-reserves for i32 stack entries — safe). 3. **alloc_consecutive_pair didn't reserve just-popped operands**. When an i64 op popped operands and then allocated a result pair, the popped values were temporarily off the stack. The allocation could pick a register that's about to be read by the op's own second instruction (e.g. dst_lo == a_hi means the lo Or write clobbers a_hi before the hi Or reads it). Fix: extra_avoid parameter on alloc_consecutive_pair. I64Add / I64Sub / I64Or / I64Shl / I64Load / extends pass their popped operand registers via extra_avoid. Verified locally: gale_k_sem_give_decide now produces: orr r0, r6, r8 ; lo = shift_result_lo OR local3_lo orr r1, r7, ip ; hi = shift_result_hi OR local3_hi matching the wasm semantics for i64.or. Engine bench builds clean with 22644 B FLASH. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/instruction_selector.rs | 289 ++++++++++++++++-- 1 file changed, 263 insertions(+), 26 deletions(-) diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 82d174e..e35608b 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -133,27 +133,52 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result { } /// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS, -/// neither of which is currently on the wasm stack. +/// neither of which is currently in use. /// -/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values -/// because if a register between them is live on the wasm stack, the second -/// call skips it and the resulting pair is non-consecutive. Subsequent code -/// that uses [`i64_pair_hi`]`(rdlo)` to recover the high register then gets -/// the wrong register and reads garbage. +/// "In use" means: +/// 1. On the wasm stack (the explicit `Vec` tracking). +/// 2. The implicit *high* register of any i64 value on the stack — for every +/// `lo` in `stack`, [`i64_pair_hi`]`(lo)` is also reserved. The wasm stack +/// only tracks the lo register of each i64; the hi is reserved by +/// convention but invisible to a naive scan. If we ignored that, a fresh +/// `alloc_consecutive_pair` could return the implicit-hi of an earlier +/// i64, clobbering it on the next i64 op that reads it via i64_pair_hi. +/// 3. Any explicit registers in `extra_avoid` — used by i64-op handlers to +/// keep the just-popped operand pairs alive across the destination +/// allocation (e.g. for I64Or, the popped a_lo/a_hi/b_lo/b_hi are still +/// live until the OR is emitted). /// -/// This helper ensures both halves come from consecutive ALLOCATABLE_REGS -/// entries (no wraparound) so the pair convention holds throughout the -/// function body. -fn alloc_consecutive_pair(next_temp: &mut u8, stack: &[Reg]) -> Result<(Reg, Reg)> { +/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values: +/// if a register between them is live, the second call skips it and the +/// resulting pair is non-consecutive, breaking [`i64_pair_hi`]'s contract. +fn alloc_consecutive_pair( + next_temp: &mut u8, + stack: &[Reg], + extra_avoid: &[Reg], +) -> Result<(Reg, Reg)> { + // Build a "live" Vec: every stack entry, plus its conventional + // pair_hi (over-reserves for i32 stack entries but that's safe), plus + // any explicit extras the caller specifies. Using Vec rather than + // HashSet because Reg in this crate does not derive Hash. + let mut live: Vec = Vec::with_capacity(stack.len() * 2 + extra_avoid.len()); + for ® in stack { + live.push(reg); + if let Ok(hi) = i64_pair_hi(reg) { + live.push(hi); + } + } + for ® in extra_avoid { + live.push(reg); + } + let n = ALLOCATABLE_REGS.len(); for _ in 0..n { let lo_idx = (*next_temp as usize) % n; let hi_idx = lo_idx + 1; - // Wraparound is invalid: i64_pair_hi requires hi_idx < n. if hi_idx < n { let lo_reg = ALLOCATABLE_REGS[lo_idx]; let hi_reg = ALLOCATABLE_REGS[hi_idx]; - if !stack.contains(&lo_reg) && !stack.contains(&hi_reg) { + if !live.contains(&lo_reg) && !live.contains(&hi_reg) { *next_temp = ((hi_idx + 1) % n) as u8; return Ok((lo_reg, hi_reg)); } @@ -3489,7 +3514,7 @@ impl InstructionSelector { // when something in between is live, breaking the // pair convention. let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + alloc_consecutive_pair(&mut next_temp, &stack, &[])?; instructions.push(ArmInstruction { op: ArmOp::I64Ldr { rdlo: dst_lo, @@ -4880,7 +4905,7 @@ impl InstructionSelector { // non-consecutive registers if something in between is // live on the wasm stack, which then breaks the // i64_pair_hi convention used by every i64 op downstream. - let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack)?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?; instructions.push(ArmInstruction { op: ArmOp::I64Const { @@ -4915,8 +4940,14 @@ impl InstructionSelector { // and is called by every i64 op downstream to recover // the high register. Two separate alloc_temp_safe calls // skip live registers and produce non-consecutive pairs. - let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + // Avoid clobbering the just-popped operand pairs before + // the ADC reads them — passing them in extra_avoid + // ensures dst doesn't overlap any of a_lo/a_hi/b_lo/b_hi. + let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; // ADDS dst_lo, a_lo, b_lo (sets carry flag) instructions.push(ArmInstruction { @@ -4958,13 +4989,13 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair. MUST be consecutive - // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive - // and is called by every i64 op downstream to recover - // the high register. Two separate alloc_temp_safe calls - // skip live registers and produce non-consecutive pairs. - let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + // See I64Add for why extra_avoid carries a_*/b_* — + // dst must not overlap any operand half before SBC reads it. + let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; // SUBS dst_lo, a_lo, b_lo (sets borrow flag) instructions.push(ArmInstruction { @@ -4991,6 +5022,212 @@ impl InstructionSelector { stack.push(dst_lo); } + // ============================================================ + // i64 bitwise ops (I64Or / I64And / I64Xor) + // + // Each pops two i64 register pairs from the wasm stack and + // emits two ARM ops (low-half then high-half) into a freshly + // allocated consecutive pair. This replaces the wildcard + // fallthrough to select_default, which assumed inputs in + // R0:R1 and R2:R3 — incorrect when the wasm stack tracks + // arbitrary register pairs from earlier ops. + // ============================================================ + I64Or | I64And | I64Xor => { + let b_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 bitwise op".to_string(), + ) + })?; + let a_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 bitwise op".to_string(), + ) + })?; + let b_hi = i64_pair_hi(b_lo)?; + let a_hi = i64_pair_hi(a_lo)?; + // dst must not overlap any popped operand's half — the + // hi instruction reads a_hi and b_hi after the lo + // instruction writes dst_lo. + let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; + let (lo_op, hi_op) = match op { + I64Or => ( + ArmOp::Orr { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::Orr { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + I64And => ( + ArmOp::And { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::And { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + I64Xor => ( + ArmOp::Eor { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::Eor { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + _ => unreachable!(), + }; + instructions.push(ArmInstruction { + op: lo_op, + source_line: Some(idx), + }); + cf.add_instruction(); + instructions.push(ArmInstruction { + op: hi_op, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + // ============================================================ + // i32 -> i64 extension (I64ExtendI32U / I64ExtendI32S) + // + // Pops one i32, allocates a consecutive i64 pair, places the + // i32 in the low half. For unsigned: high = 0. For signed: + // high = arithmetic-shift-right by 31 (sign-extension). + // ============================================================ + I64ExtendI32U => { + let val = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in I64ExtendI32U".to_string(), + ) + })?; + // val must stay alive until the Mov reads it; dst_hi + // must not be val (we'd write the zero high before + // moving val to dst_lo). + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + if val != dst_lo { + instructions.push(ArmInstruction { + op: ArmOp::Mov { + rd: dst_lo, + op2: Operand2::Reg(val), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } + instructions.push(ArmInstruction { + op: ArmOp::Movw { + rd: dst_hi, + imm16: 0, + }, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + I64ExtendI32S => { + let val = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in I64ExtendI32S".to_string(), + ) + })?; + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + instructions.push(ArmInstruction { + op: ArmOp::I64ExtendI32S { + rdlo: dst_lo, + rdhi: dst_hi, + rn: val, + }, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + // ============================================================ + // i64 variable shifts (I64Shl / I64ShrU / I64ShrS) + // + // Use the existing I64Shl/I64ShrU/I64ShrS pseudo-ops (which + // expand to the variable-shift logic in arm_encoder.rs) but + // pass the actual stack-tracked register pairs rather than + // assuming R0:R1 / R2:R3. + // ============================================================ + I64Shl | I64ShrU | I64ShrS => { + let b_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 shift".to_string(), + ) + })?; + let a_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 shift".to_string(), + ) + })?; + let b_hi = i64_pair_hi(b_lo)?; + let a_hi = i64_pair_hi(a_lo)?; + // dst must not overlap any popped operand's half — the + // shift pseudo-op reads all four (rn_lo/rn_hi/rm_lo/rm_hi) + // before writing the destination. + let (dst_lo, dst_hi) = alloc_consecutive_pair( + &mut next_temp, + &stack, + &[a_lo, a_hi, b_lo, b_hi], + )?; + let shift_op = match op { + I64Shl => ArmOp::I64Shl { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + I64ShrU => ArmOp::I64ShrU { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + I64ShrS => ArmOp::I64ShrS { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + _ => unreachable!(), + }; + instructions.push(ArmInstruction { + op: shift_op, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + I64Load { offset, .. } => { // Pop address from stack let addr = stack.pop().ok_or_else(|| { @@ -5003,10 +5240,10 @@ impl InstructionSelector { // Allocate result register pair. MUST be consecutive // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive // and is called by every i64 op downstream to recover - // the high register. Two separate alloc_temp_safe calls - // skip live registers and produce non-consecutive pairs. + // the high register. Avoid clobbering addr before the + // load uses it. let (dst_lo, dst_hi) = - alloc_consecutive_pair(&mut next_temp, &stack)?; + alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; // Generate bounds-checked i64 load into the allocated pair let load_ops =