From 90b3285e9515bcb4204f76e1b579f626a33c4f39 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Mon, 27 Apr 2026 22:24:36 +0200
Subject: [PATCH 1/4] feat(cli): add --relocatable flag to force ET_REL output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently synth produces a relocatable object (.o, ET_REL) only when the
input wasm has imports — the relocations they generate trigger the
relocatable code path. Self-contained wasm modules with no imports
produce a complete ET_EXEC firmware with vector table, Reset_Handler,
linear_memory section, etc.

For linking into a host build system (e.g. integrating verified Rust
kernel primitives compiled to wasm into a Zephyr build), the host
expects a relocatable .o it can pull into its existing link step. Add
a --relocatable CLI flag that forces ET_REL output regardless of
whether the wasm has imports.

The flag is additive — default behaviour is unchanged.

Tested with gale-ffi.wasm (200 functions, 0 imports): output is now
a 26645-byte ET_REL ARM EABI5 object with all gale_* symbols defined
and no vector-table machinery.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/synth-cli/src/main.rs | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/crates/synth-cli/src/main.rs b/crates/synth-cli/src/main.rs
index cfd4eea..af1d77d 100644
--- a/crates/synth-cli/src/main.rs
+++ b/crates/synth-cli/src/main.rs
@@ -170,6 +170,11 @@ enum Commands {
         /// Path to kiln-builtins object file (.o) for linking (used with --link)
         #[arg(long, value_name = "BUILTINS")]
         builtins: Option<PathBuf>,
+
+        /// Force relocatable object (.o, ET_REL) output even when wasm has no imports
+        /// — for linking into a host build system.
+        #[arg(long)]
+        relocatable: bool,
     },
 
     /// Disassemble an ARM ELF file (e.g., synth disasm output.elf)
@@ -248,6 +253,7 @@ fn main() -> Result<()> {
             verify,
             link,
             builtins,
+            relocatable,
         } => {
             // Resolve target spec: --target overrides, --cortex-m is backwards compat
             let target_spec = resolve_target_spec(target.as_deref(), cortex_m)?;
@@ -272,6 +278,7 @@ fn main() -> Result<()> {
                 &backend,
                 verify,
                 &target_spec,
+                relocatable,
             )?;
 
             // If --link requested, invoke the cross-linker
@@ -553,6 +560,7 @@ fn compile_command(
     backend_name: &str,
     verify: bool,
     target_spec: &TargetSpec,
+    relocatable: bool,
 ) -> Result<()> {
     // Validate backend exists
     let registry = build_backend_registry();
@@ -595,6 +603,7 @@ fn compile_command(
             backend,
             verify,
             target_spec,
+            relocatable,
         );
     }
 
@@ -1222,6 +1231,7 @@ fn compile_all_exports(
     backend: &dyn Backend,
     verify: bool,
     target_spec: &TargetSpec,
+    relocatable: bool,
 ) -> Result<()> {
     let path = input.context("--all-exports requires an input file")?;
 
@@ -1428,8 +1438,18 @@ fn compile_all_exports(
     // When there are relocations, produce a relocatable object (.o) instead of
     // an executable. This lets the output be linked with the Kiln bridge crate
     // (which provides __meld_dispatch_import and __meld_get_memory_base).
-    let elf_data = if has_relocations {
-        info!("Module has import calls — producing relocatable object (ET_REL)");
+    // The --relocatable flag forces ET_REL output even when the wasm has no
+    // imports, for linking into a host build system (e.g. Zephyr).
+    let elf_data = if has_relocations || relocatable {
+        let total_relocs: usize = compiled_funcs.iter().map(|f| f.relocations.len()).sum();
+        if has_relocations {
+            info!(
+                "Producing relocatable object (ET_REL): {} import call relocations",
+                total_relocs
+            );
+        } else {
+            info!("Producing relocatable object (ET_REL): forced by --relocatable");
+        }
         build_relocatable_elf(&compiled_funcs, &all_imports)?
     } else if cortex_m {
         build_multi_func_cortex_m_elf(&compiled_funcs, &all_memories, target_spec)?

From 01a079fce9b99f7e3579656308fd441ab39cbac3 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Tue, 28 Apr 2026 19:54:56 +0200
Subject: [PATCH 2/4] fix(no-optimize): allocate stack frame for non-param
 locals + handle i64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs in select_with_stack's spilled-local handling, both surfaced
running gale (formally-verified Zephyr kernel primitives) through synth
on real Cortex-M code:

1. **i64 local storage only writes one half.** LocalSet/LocalTee/LocalGet
   for spilled locals always emitted plain Str/Ldr (4 bytes), even for
   i64 locals. The upper half was never stored, so local.get returned
   garbage for 32 high bits. For gale's u64-packed FFI decision structs
   this corrupted the action field, breaking every ISR-driven semaphore
   operation in the engine bench (count=0 / drain_timeout at every step).

2. **Locals area aliased the callee-saved spill area.** The legacy
   offset formula `(local_idx - 4) * 4` was hardcoded for num_params==4
   and produced negative offsets for other configurations, which the
   I64Ldr/I64Str encoder silently clamped to 0. With offset 0, locals
   landed exactly where stmdb had just saved r4/r5 — corrupting the
   callee-saved register spill and propagating wrong values back to the
   caller after ldmia. Pure AAPCS violation.

Fix:

- Add `compute_local_layout(wasm_ops, num_params) -> LocalLayout` that
  walks the wasm op stream once to determine each non-param local's
  width (i32/i64) and assigns proper stack offsets from a base of 0.
  Uses `infer_i64_locals` (also new) which simulates a width vstack
  to classify locals based on what gets stored into them.

- Prologue emits `sub sp, sp, #frame_size` after the callee-saved
  push, allocating the locals area below the saved-register spills.

- Epilogue emits `add sp, sp, #frame_size` before the pop, restoring
  SP to the callee-saved spills before they get popped. Also applied
  to the explicit Return opcode handler.

- LocalGet/LocalSet/LocalTee dispatch on the layout entry: i64 locals
  use I64Ldr/I64Str (which already emit two 32-bit memory ops at offset
  N and N+4); i32 locals use plain Ldr/Str. Offsets come from layout,
  not from the legacy formula. Frame size is rounded up to 8 bytes for
  AAPCS SP alignment at call sites.

Verified locally:

  /tmp/repro_i64.wat (1 i32 param + 1 i64 local round-trip):
    Before: str.w r0, [sp]; (no upper store) — upper half lost.
    After:  sub.w sp, sp, #8;
            str.w r0, [sp]; str.w r1, [sp, #4]; (both halves stored)
            ldr.w r2, [sp]; ldr.w r3, [sp, #4]; (both halves loaded)
            add.w sp, sp, #8; ldmia.

  gale_k_sem_give_decide (3 i32 params + 1 i64 local, the function
  whose runtime miscompilation hung the engine bench in CI):
    Before: str.w r3, [sp]; str.w r5, [sp, #4]; — clobbered the saved
            r4/r5 from stmdb, AAPCS-violating.
    After:  sub.w sp, sp, #8 → str into the locals frame, not the spill;
            add.w sp, sp, #8 before ldmia — proper AAPCS.

Engine-bench build with GALE_USE_SYNTH=ON now produces a 22768 B
zephyr.elf (was 22692 B with the buggy synth). Renode validation
pending in CI.

Out of scope:
- The optimized regalloc bug in optimizer_bridge.rs (clobbers r0..r3
  parameter registers — see /tmp/match_gale.wat repro). This fix lets
  --no-optimize work; the optimized path needs a separate pass.
- Implicit-return-to-R0 elision in select_with_stack: small functions
  whose result lands in a non-R0 temp don't get a final mov to R0 in
  --no-optimize mode. Pre-existing, unrelated to this fix; affects
  i32-returning functions with a single intermediate value. Doesn't
  affect i64 returns (which use the natural R0/R1 pair).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/instruction_selector.rs               | 295 +++++++++++++++++-
 1 file changed, 290 insertions(+), 5 deletions(-)

diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs
index 4f43edb..9806664 100644
--- a/crates/synth-synthesis/src/instruction_selector.rs
+++ b/crates/synth-synthesis/src/instruction_selector.rs
@@ -156,6 +156,169 @@ fn i64_pair_hi(lo_reg: Reg) -> Result<Reg> {
     )))
 }
 
+/// Per-function stack-frame layout for non-parameter locals.
+///
+/// `offsets[idx]` gives the byte offset (relative to SP after the frame
+/// allocation) where local `idx` lives. `frame_size` is the total bytes
+/// to allocate via `sub sp, sp, #frame_size` in the prologue.
+///
+/// i32/i64 locals each occupy 4/8 bytes respectively. i64 locals are
+/// 8-byte aligned per AAPCS. The total frame is rounded up to 8 bytes
+/// to keep SP 8-byte aligned at call sites.
+struct LocalLayout {
+    /// idx -> (offset_from_sp, is_i64)
+    locals: std::collections::HashMap<u32, (i32, bool)>,
+    frame_size: i32,
+}
+
+/// Compute the stack-frame layout for non-parameter locals in a function.
+///
+/// Walks the wasm op stream once to:
+/// 1. Identify which non-param local indices are referenced (LocalGet/Set/Tee).
+/// 2. Determine each local's width via `infer_i64_locals` (i32 vs i64).
+/// 3. Lay them out in ascending-index order with i64 locals 8-byte aligned.
+///
+/// The result drives:
+/// - Prologue: `sub sp, sp, #frame_size` after pushing callee-saved regs.
+/// - LocalGet/Set/Tee: use `offsets[idx]` instead of the legacy
+///   `(idx - 4) * 4` formula (which only happened to work when num_params==4
+///   AND the formula's negative result was silently clamped to 0 by the
+///   encoder, in both cases corrupting the caller's stack or the callee's
+///   own callee-saved-register spill).
+/// - Epilogue: `add sp, sp, #frame_size` before popping registers.
+fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout {
+    use std::collections::{BTreeSet, HashMap};
+    let i64_set = infer_i64_locals(wasm_ops);
+
+    // Collect non-param local indices, in ascending order for deterministic layout.
+    let mut used: BTreeSet<u32> = BTreeSet::new();
+    for op in wasm_ops {
+        match op {
+            WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
+                if *idx >= num_params {
+                    used.insert(*idx);
+                }
+            }
+            _ => {}
+        }
+    }
+
+    let mut locals: HashMap<u32, (i32, bool)> = HashMap::new();
+    let mut offset: i32 = 0;
+    for &idx in &used {
+        let is_i64 = i64_set.contains(&idx);
+        // i64 locals require 8-byte alignment.
+        if is_i64 && (offset % 8) != 0 {
+            offset += 4;
+        }
+        locals.insert(idx, (offset, is_i64));
+        offset += if is_i64 { 8 } else { 4 };
+    }
+    // Round frame to 8-byte multiple for AAPCS SP alignment.
+    let frame_size = (offset + 7) & !7;
+
+    LocalLayout {
+        locals,
+        frame_size,
+    }
+}
+
+/// Infer which non-parameter wasm locals are i64 (8-byte) values.
+///
+/// The wasm decoder discards local-declaration type info, so we re-derive
+/// it from the operation stream by simulating a virtual stack of widths
+/// (1 = 32-bit, 2 = 64-bit). On each `LocalSet`/`LocalTee` we record the
+/// width of the value being stored. WASM type rules guarantee a local's
+/// width is invariant for its lifetime, so the first store wins.
+///
+/// Without this, the spilled-local store/load path would emit a single
+/// 4-byte STR/LDR for i64 locals, dropping the upper half — corrupting
+/// any function that returns or uses a u64-packed FFI struct.
+fn infer_i64_locals(wasm_ops: &[WasmOp]) -> std::collections::HashSet<u32> {
+    use WasmOp::*;
+    let mut i64_locals: std::collections::HashSet<u32> = std::collections::HashSet::new();
+    let mut vstack: Vec<bool> = Vec::new(); // true = i64
+
+    let is_i64_producer = |op: &WasmOp| -> bool {
+        matches!(
+            op,
+            I64Add
+                | I64Sub
+                | I64Mul
+                | I64DivS
+                | I64DivU
+                | I64RemS
+                | I64RemU
+                | I64And
+                | I64Or
+                | I64Xor
+                | I64Shl
+                | I64ShrS
+                | I64ShrU
+                | I64Rotl
+                | I64Rotr
+                | I64Clz
+                | I64Ctz
+                | I64Popcnt
+                | I64Const(_)
+                | I64Load { .. }
+                | I64Load8S { .. }
+                | I64Load8U { .. }
+                | I64Load16S { .. }
+                | I64Load16U { .. }
+                | I64Load32S { .. }
+                | I64Load32U { .. }
+                | I64ExtendI32S
+                | I64ExtendI32U
+                | I64Extend8S
+                | I64Extend16S
+                | I64Extend32S
+        )
+    };
+
+    for op in wasm_ops {
+        match op {
+            LocalGet(idx) => {
+                let is_i64 = i64_locals.contains(idx);
+                vstack.push(is_i64);
+            }
+            LocalSet(idx) => {
+                if let Some(is_i64) = vstack.pop() {
+                    if is_i64 {
+                        i64_locals.insert(*idx);
+                    }
+                }
+            }
+            LocalTee(idx) => {
+                if let Some(&is_i64) = vstack.last() {
+                    if is_i64 {
+                        i64_locals.insert(*idx);
+                    }
+                }
+            }
+            Select => {
+                // pops [val1, val2, cond], pushes one value with width of val1/val2
+                let _cond = vstack.pop();
+                let v2 = vstack.pop();
+                let v1 = vstack.pop();
+                vstack.push(v1.or(v2).unwrap_or(false));
+            }
+            _ => {
+                let (pops, pushes) = wasm_stack_effect(op);
+                for _ in 0..pops {
+                    vstack.pop();
+                }
+                let push_width = is_i64_producer(op);
+                for _ in 0..pushes {
+                    vstack.push(push_width);
+                }
+            }
+        }
+    }
+
+    i64_locals
+}
+
 /// Return the (pops, pushes) stack effect for a WASM op.
 ///
 /// Used by the wildcard fallthrough in select_with_stack to maintain
@@ -3220,9 +3383,12 @@ impl InstructionSelector {
 
         let mut instructions = Vec::new();
 
-        // Function prologue: save callee-saved registers and LR.
+        // Function prologue: save callee-saved registers and LR, then
+        // allocate the local-variable frame.
+        //
         // AAPCS requires 8-byte aligned SP at call sites. Pushing an even
-        // number of registers (6: R4-R8, LR) maintains alignment.
+        // number of registers (6: R4-R8, LR) maintains alignment, and the
+        // frame_size below is rounded to 8 to preserve it.
         instructions.push(ArmInstruction {
             op: ArmOp::Push {
                 regs: vec![Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8, Reg::LR],
@@ -3230,6 +3396,22 @@ impl InstructionSelector {
             source_line: None,
         });
 
+        // Compute non-param local layout (offsets + total frame size).
+        let layout = compute_local_layout(wasm_ops, num_params);
+        // Allocate stack space for non-param locals so they don't alias the
+        // callee-saved-register spill area (which immediately follows SP
+        // after Push above).
+        if layout.frame_size > 0 {
+            instructions.push(ArmInstruction {
+                op: ArmOp::Sub {
+                    rd: Reg::SP,
+                    rn: Reg::SP,
+                    op2: Operand2::Imm(layout.frame_size),
+                },
+                source_line: None,
+            });
+        }
+
         // Virtual stack holds register indices
         let mut stack: Vec<Reg> = Vec::new();
         // Next available register for temporaries (start after params)
@@ -3255,11 +3437,42 @@ impl InstructionSelector {
         for (idx, op) in wasm_ops.iter().enumerate() {
             match op {
                 LocalGet(local_idx) => {
-                    // Get the register for this local
+                    // Get the register for this local. Three cases:
+                    //  1. Param in register — use the cached mapping.
+                    //  2. Spilled i64 local — load both halves via I64Ldr.
+                    //  3. Spilled i32 local — single Ldr.
                     let reg = if let Some(&r) = local_to_reg.get(local_idx) {
                         r
+                    } else if let Some(&(off, true)) = layout.locals.get(local_idx) {
+                        // i64 local — load both 32-bit halves into a consecutive
+                        // register pair via the I64Ldr pseudo-op. Convention
+                        // matches I64Const: push only dst_lo on the stack;
+                        // dst_hi is recovered later via i64_pair_hi(lo).
+                        let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
+                        let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::I64Ldr {
+                                rdlo: dst_lo,
+                                rdhi: dst_hi,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        dst_lo
+                    } else if let Some(&(off, false)) = layout.locals.get(local_idx) {
+                        // i32 local: single 4-byte load from the locals frame.
+                        let dst = alloc_temp_safe(&mut next_temp, &stack)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Ldr {
+                                rd: dst,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        dst
                     } else {
-                        // Local not in register (spilled to stack) - load it
+                        // Local not in layout (shouldn't happen for valid wasm,
+                        // but fall back to legacy behaviour for compatibility).
                         let dst = alloc_temp_safe(&mut next_temp, &stack)?;
                         instructions.push(ArmInstruction {
                             op: ArmOp::Ldr {
@@ -4324,6 +4537,20 @@ impl InstructionSelector {
                         });
                         cf.add_instruction();
                     }
+                    // Deallocate the local frame before popping callee-saved
+                    // registers; otherwise the pop would read from the locals
+                    // area instead of the saved-register slots.
+                    if layout.frame_size > 0 {
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Add {
+                                rd: Reg::SP,
+                                rn: Reg::SP,
+                                op2: Operand2::Imm(layout.frame_size),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    }
                     // Restore callee-saved registers and return via PC
                     instructions.push(ArmInstruction {
                         op: ArmOp::Pop {
@@ -4475,7 +4702,32 @@ impl InstructionSelector {
                             cf.add_instruction();
                         }
                         local_to_reg.insert(*local_idx, target);
+                    } else if let Some(&(off, true)) = layout.locals.get(local_idx) {
+                        // i64 spilled local: store BOTH 32-bit halves
+                        // (lower at offset N, upper at N+4) via the I64Str
+                        // pseudo-op. Without this we drop the upper half.
+                        let val_hi = i64_pair_hi(val)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::I64Str {
+                                rdlo: val,
+                                rdhi: val_hi,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    } else if let Some(&(off, false)) = layout.locals.get(local_idx) {
+                        // i32 spilled local: single 4-byte store.
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Str {
+                                rd: val,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
                     } else {
+                        // Fall-through for compatibility (shouldn't happen).
                         instructions.push(ArmInstruction {
                             op: ArmOp::Str {
                                 rd: val,
@@ -4507,7 +4759,29 @@ impl InstructionSelector {
                             cf.add_instruction();
                         }
                         local_to_reg.insert(*local_idx, target);
+                    } else if let Some(&(off, true)) = layout.locals.get(local_idx) {
+                        // i64 spilled local: store both halves like LocalSet.
+                        let val_hi = i64_pair_hi(val)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::I64Str {
+                                rdlo: val,
+                                rdhi: val_hi,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    } else if let Some(&(off, false)) = layout.locals.get(local_idx) {
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Str {
+                                rd: val,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
                     } else {
+                        // Fall-through for compatibility.
                         instructions.push(ArmInstruction {
                             op: ArmOp::Str {
                                 rd: val,
@@ -5055,7 +5329,18 @@ impl InstructionSelector {
             }
         }
 
-        // Function epilogue: restore callee-saved registers and return via PC
+        // Function epilogue: deallocate the local frame, then restore
+        // callee-saved registers and return via PC.
+        if layout.frame_size > 0 {
+            instructions.push(ArmInstruction {
+                op: ArmOp::Add {
+                    rd: Reg::SP,
+                    rn: Reg::SP,
+                    op2: Operand2::Imm(layout.frame_size),
+                },
+                source_line: None,
+            });
+        }
         // POP {R4-R8, PC} restores registers and returns (PC = saved LR)
         instructions.push(ArmInstruction {
             op: ArmOp::Pop {

From 1e1e01b7b15342a07f36ab2f1e6f23829e133544 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Tue, 28 Apr 2026 22:19:45 +0200
Subject: [PATCH 3/4] fix: alloc consecutive register pairs for i64 operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #85 fixed i64 local STORAGE (both halves stored to consecutive stack
slots) but introduced a follow-on bug: when the i64 register pair gets
allocated via two separate alloc_temp_safe calls, the resulting pair
can be NON-CONSECUTIVE in ALLOCATABLE_REGS if a register in between is
live on the wasm stack.

Subsequent i64 ops downstream call i64_pair_hi(rdlo) to recover the
high register, which assumes the pair is consecutive. With a
non-consecutive pair from earlier, i64_pair_hi returns the WRONG
register and the op reads garbage.

Witnessed on gale_k_sem_give_decide:

  ldr.w r5, [sp]      ; LocalGet i64 lo - r5 picked
  ldr.w r6, [sp, #4]  ; LocalGet i64 hi - r6 picked (consecutive ✓)
  ...later...
  orr.w r0, r0, r2    ; i64.or expected (r5,r6) but used (r2,r3)

Fix: add `alloc_consecutive_pair` helper that ensures two consecutive
free entries in ALLOCATABLE_REGS. Use it everywhere a pair is allocated
for an i64 result: I64Const (line 4567), I64Add/Sub/Mul result allocs
(lines 4914+, 4958+, 4996+), and the new i64-LocalGet path from PR #85.

Verified locally: /tmp/repro_i64.wat round-trips correctly with consecutive
register pair (lo→r2, hi→r3). gale_k_sem_give_decide's LocalGet 3 now
loads to consecutive r5/r6.

Note: the engine bench in Renode still hangs after this fix. Further
diagnostic shows i64.or's ARM lowering uses register pairs (r0,r1) and
(r2,r3) regardless of what's on the wasm-tracked stack — a separate
bug in synth's wildcard fallthrough for I64Or in select_with_stack.
That fix is out of scope for this PR; tracked separately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/instruction_selector.rs               | 83 +++++++++++++++----
 1 file changed, 69 insertions(+), 14 deletions(-)

diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs
index 9806664..82d174e 100644
--- a/crates/synth-synthesis/src/instruction_selector.rs
+++ b/crates/synth-synthesis/src/instruction_selector.rs
@@ -132,6 +132,41 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result<Reg> {
     ))
 }
 
+/// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS,
+/// neither of which is currently on the wasm stack.
+///
+/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values
+/// because if a register between them is live on the wasm stack, the second
+/// call skips it and the resulting pair is non-consecutive. Subsequent code
+/// that uses [`i64_pair_hi`]`(rdlo)` to recover the high register then gets
+/// the wrong register and reads garbage.
+///
+/// This helper ensures both halves come from consecutive ALLOCATABLE_REGS
+/// entries (no wraparound) so the pair convention holds throughout the
+/// function body.
+fn alloc_consecutive_pair(next_temp: &mut u8, stack: &[Reg]) -> Result<(Reg, Reg)> {
+    let n = ALLOCATABLE_REGS.len();
+    for _ in 0..n {
+        let lo_idx = (*next_temp as usize) % n;
+        let hi_idx = lo_idx + 1;
+        // Wraparound is invalid: i64_pair_hi requires hi_idx < n.
+        if hi_idx < n {
+            let lo_reg = ALLOCATABLE_REGS[lo_idx];
+            let hi_reg = ALLOCATABLE_REGS[hi_idx];
+            if !stack.contains(&lo_reg) && !stack.contains(&hi_reg) {
+                *next_temp = ((hi_idx + 1) % n) as u8;
+                return Ok((lo_reg, hi_reg));
+            }
+        }
+        *next_temp = ((*next_temp as usize + 1) % n) as u8;
+    }
+    Err(synth_core::Error::synthesis(
+        "register exhaustion: no consecutive pair of free registers for i64 — \
+         function too complex for current register allocator"
+            .to_string(),
+    ))
+}
+
 /// Given the low register of an i64 register pair, return the high register.
 ///
 /// Convention: i64 values on 32-bit ARM use two consecutive registers.
@@ -3448,8 +3483,13 @@ impl InstructionSelector {
                         // register pair via the I64Ldr pseudo-op. Convention
                         // matches I64Const: push only dst_lo on the stack;
                         // dst_hi is recovered later via i64_pair_hi(lo).
-                        let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                        let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                        // The pair MUST be consecutive in ALLOCATABLE_REGS
+                        // — i64_pair_hi assumes that. Two separate calls to
+                        // alloc_temp_safe can return non-consecutive registers
+                        // when something in between is live, breaking the
+                        // pair convention.
+                        let (dst_lo, dst_hi) =
+                            alloc_consecutive_pair(&mut next_temp, &stack)?;
                         instructions.push(ArmInstruction {
                             op: ArmOp::I64Ldr {
                                 rdlo: dst_lo,
@@ -4835,9 +4875,12 @@ impl InstructionSelector {
                 // Pairs are allocated as two consecutive temp registers.
                 // =========================================================
                 I64Const(val) => {
-                    // Allocate a register pair for the 64-bit constant
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate a CONSECUTIVE register pair for the 64-bit
+                    // constant. Two separate alloc_temp_safe calls can return
+                    // non-consecutive registers if something in between is
+                    // live on the wasm stack, which then breaks the
+                    // i64_pair_hi convention used by every i64 op downstream.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack)?;
 
                     instructions.push(ArmInstruction {
                         op: ArmOp::I64Const {
@@ -4867,9 +4910,13 @@ impl InstructionSelector {
                     let b_hi = i64_pair_hi(b_lo)?;
                     let a_hi = i64_pair_hi(a_lo)?;
 
-                    // Allocate result register pair
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate result register pair. MUST be consecutive
+                    // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
+                    // and is called by every i64 op downstream to recover
+                    // the high register. Two separate alloc_temp_safe calls
+                    // skip live registers and produce non-consecutive pairs.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack)?;
 
                     // ADDS dst_lo, a_lo, b_lo  (sets carry flag)
                     instructions.push(ArmInstruction {
@@ -4911,9 +4958,13 @@ impl InstructionSelector {
                     let b_hi = i64_pair_hi(b_lo)?;
                     let a_hi = i64_pair_hi(a_lo)?;
 
-                    // Allocate result register pair
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate result register pair. MUST be consecutive
+                    // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
+                    // and is called by every i64 op downstream to recover
+                    // the high register. Two separate alloc_temp_safe calls
+                    // skip live registers and produce non-consecutive pairs.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack)?;
 
                     // SUBS dst_lo, a_lo, b_lo  (sets borrow flag)
                     instructions.push(ArmInstruction {
@@ -4949,9 +5000,13 @@ impl InstructionSelector {
                         )
                     })?;
 
-                    // Allocate result register pair
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate result register pair. MUST be consecutive
+                    // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
+                    // and is called by every i64 op downstream to recover
+                    // the high register. Two separate alloc_temp_safe calls
+                    // skip live registers and produce non-consecutive pairs.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack)?;
 
                     // Generate bounds-checked i64 load into the allocated pair
                     let load_ops =

From b8da214f1a2d3c17d6a70535ff961f53d6594ff7 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Wed, 29 Apr 2026 06:25:39 +0200
Subject: [PATCH 4/4] fix(no-optimize): explicit i64 op handlers + extra_avoid
 in alloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three deeper bugs surfaced when running gale_k_sem_give_decide on
Renode after PR #85 + the consecutive-pair fix:

1. **i64 ops fall through to select_default** in select_with_stack,
   which assumes inputs are in R0:R1 / R2:R3. Wasm-stack-tracked
   pairs from earlier ops never make it. Symptoms: i64.or used
   register pairs from previous shift ops instead of the just-loaded
   LocalGet result, producing a corrupted return value.

   Fix: explicit handlers for I64Or / I64And / I64Xor / I64ExtendI32U /
   I64ExtendI32S / I64Shl / I64ShrU / I64ShrS in select_with_stack,
   each popping the wasm-tracked pair, deriving its hi via
   i64_pair_hi, and emitting the right ARM instructions / pseudo-ops
   with arbitrary registers (the existing I64Shl/etc. ArmOp pseudo-ops
   accept arbitrary register operands).

2. **alloc_consecutive_pair didn't reserve implicit hi registers**.
   The wasm stack only tracks the lo register of each i64; the hi is
   conventional but invisible to a naive scan. A fresh allocation
   could overwrite an earlier i64's implicit hi, breaking subsequent
   i64_pair_hi lookups. Witnessed: I64Const 32 allocated (r1, r2),
   clobbering the hi of a previously-extended i64 in (r0, r1).

   Fix: alloc_consecutive_pair now scans every stack entry and
   reserves both lo AND its conventional pair_hi (over-reserves for
   i32 stack entries — safe).

3. **alloc_consecutive_pair didn't reserve just-popped operands**.
   When an i64 op popped operands and then allocated a result pair,
   the popped values were temporarily off the stack. The allocation
   could pick a register that's about to be read by the op's own
   second instruction (e.g. dst_lo == a_hi means the lo Or write
   clobbers a_hi before the hi Or reads it).

   Fix: extra_avoid parameter on alloc_consecutive_pair. I64Add /
   I64Sub / I64Or / I64Shl / I64Load / extends pass their popped
   operand registers via extra_avoid.

Verified locally: gale_k_sem_give_decide now produces:
  orr r0, r6, r8   ; lo = shift_result_lo OR local3_lo
  orr r1, r7, ip   ; hi = shift_result_hi OR local3_hi
matching the wasm semantics for i64.or. Engine bench builds clean
with 22644 B FLASH.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/instruction_selector.rs               | 289 ++++++++++++++++--
 1 file changed, 263 insertions(+), 26 deletions(-)

diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs
index 82d174e..e35608b 100644
--- a/crates/synth-synthesis/src/instruction_selector.rs
+++ b/crates/synth-synthesis/src/instruction_selector.rs
@@ -133,27 +133,52 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result<Reg> {
 }
 
 /// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS,
-/// neither of which is currently on the wasm stack.
+/// neither of which is currently in use.
 ///
-/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values
-/// because if a register between them is live on the wasm stack, the second
-/// call skips it and the resulting pair is non-consecutive. Subsequent code
-/// that uses [`i64_pair_hi`]`(rdlo)` to recover the high register then gets
-/// the wrong register and reads garbage.
+/// "In use" means:
+/// 1. On the wasm stack (the explicit `Vec<Reg>` tracking).
+/// 2. The implicit *high* register of any i64 value on the stack — for every
+///    `lo` in `stack`, [`i64_pair_hi`]`(lo)` is also reserved. The wasm stack
+///    only tracks the lo register of each i64; the hi is reserved by
+///    convention but invisible to a naive scan. If we ignored that, a fresh
+///    `alloc_consecutive_pair` could return the implicit-hi of an earlier
+///    i64, clobbering it on the next i64 op that reads it via i64_pair_hi.
+/// 3. Any explicit registers in `extra_avoid` — used by i64-op handlers to
+///    keep the just-popped operand pairs alive across the destination
+///    allocation (e.g. for I64Or, the popped a_lo/a_hi/b_lo/b_hi are still
+///    live until the OR is emitted).
 ///
-/// This helper ensures both halves come from consecutive ALLOCATABLE_REGS
-/// entries (no wraparound) so the pair convention holds throughout the
-/// function body.
-fn alloc_consecutive_pair(next_temp: &mut u8, stack: &[Reg]) -> Result<(Reg, Reg)> {
+/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values:
+/// if a register between them is live, the second call skips it and the
+/// resulting pair is non-consecutive, breaking [`i64_pair_hi`]'s contract.
+fn alloc_consecutive_pair(
+    next_temp: &mut u8,
+    stack: &[Reg],
+    extra_avoid: &[Reg],
+) -> Result<(Reg, Reg)> {
+    // Build a "live" Vec: every stack entry, plus its conventional
+    // pair_hi (over-reserves for i32 stack entries but that's safe), plus
+    // any explicit extras the caller specifies. Using Vec rather than
+    // HashSet because Reg in this crate does not derive Hash.
+    let mut live: Vec<Reg> = Vec::with_capacity(stack.len() * 2 + extra_avoid.len());
+    for &reg in stack {
+        live.push(reg);
+        if let Ok(hi) = i64_pair_hi(reg) {
+            live.push(hi);
+        }
+    }
+    for &reg in extra_avoid {
+        live.push(reg);
+    }
+
     let n = ALLOCATABLE_REGS.len();
     for _ in 0..n {
         let lo_idx = (*next_temp as usize) % n;
         let hi_idx = lo_idx + 1;
-        // Wraparound is invalid: i64_pair_hi requires hi_idx < n.
         if hi_idx < n {
             let lo_reg = ALLOCATABLE_REGS[lo_idx];
             let hi_reg = ALLOCATABLE_REGS[hi_idx];
-            if !stack.contains(&lo_reg) && !stack.contains(&hi_reg) {
+            if !live.contains(&lo_reg) && !live.contains(&hi_reg) {
                 *next_temp = ((hi_idx + 1) % n) as u8;
                 return Ok((lo_reg, hi_reg));
             }
@@ -3489,7 +3514,7 @@ impl InstructionSelector {
                         // when something in between is live, breaking the
                         // pair convention.
                         let (dst_lo, dst_hi) =
-                            alloc_consecutive_pair(&mut next_temp, &stack)?;
+                            alloc_consecutive_pair(&mut next_temp, &stack, &[])?;
                         instructions.push(ArmInstruction {
                             op: ArmOp::I64Ldr {
                                 rdlo: dst_lo,
@@ -4880,7 +4905,7 @@ impl InstructionSelector {
                     // non-consecutive registers if something in between is
                     // live on the wasm stack, which then breaks the
                     // i64_pair_hi convention used by every i64 op downstream.
-                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack)?;
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?;
 
                     instructions.push(ArmInstruction {
                         op: ArmOp::I64Const {
@@ -4915,8 +4940,14 @@ impl InstructionSelector {
                     // and is called by every i64 op downstream to recover
                     // the high register. Two separate alloc_temp_safe calls
                     // skip live registers and produce non-consecutive pairs.
-                    let (dst_lo, dst_hi) =
-                        alloc_consecutive_pair(&mut next_temp, &stack)?;
+                    // Avoid clobbering the just-popped operand pairs before
+                    // the ADC reads them — passing them in extra_avoid
+                    // ensures dst doesn't overlap any of a_lo/a_hi/b_lo/b_hi.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(
+                        &mut next_temp,
+                        &stack,
+                        &[a_lo, a_hi, b_lo, b_hi],
+                    )?;
 
                     // ADDS dst_lo, a_lo, b_lo  (sets carry flag)
                     instructions.push(ArmInstruction {
@@ -4958,13 +4989,13 @@ impl InstructionSelector {
                     let b_hi = i64_pair_hi(b_lo)?;
                     let a_hi = i64_pair_hi(a_lo)?;
 
-                    // Allocate result register pair. MUST be consecutive
-                    // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
-                    // and is called by every i64 op downstream to recover
-                    // the high register. Two separate alloc_temp_safe calls
-                    // skip live registers and produce non-consecutive pairs.
-                    let (dst_lo, dst_hi) =
-                        alloc_consecutive_pair(&mut next_temp, &stack)?;
+                    // See I64Add for why extra_avoid carries a_*/b_* —
+                    // dst must not overlap any operand half before SBC reads it.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(
+                        &mut next_temp,
+                        &stack,
+                        &[a_lo, a_hi, b_lo, b_hi],
+                    )?;
 
                     // SUBS dst_lo, a_lo, b_lo  (sets borrow flag)
                     instructions.push(ArmInstruction {
@@ -4991,6 +5022,212 @@ impl InstructionSelector {
                     stack.push(dst_lo);
                 }
 
+                // ============================================================
+                // i64 bitwise ops (I64Or / I64And / I64Xor)
+                //
+                // Each pops two i64 register pairs from the wasm stack and
+                // emits two ARM ops (low-half then high-half) into a freshly
+                // allocated consecutive pair. This replaces the wildcard
+                // fallthrough to select_default, which assumed inputs in
+                // R0:R1 and R2:R3 — incorrect when the wasm stack tracks
+                // arbitrary register pairs from earlier ops.
+                // ============================================================
+                I64Or | I64And | I64Xor => {
+                    let b_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in i64 bitwise op".to_string(),
+                        )
+                    })?;
+                    let a_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in i64 bitwise op".to_string(),
+                        )
+                    })?;
+                    let b_hi = i64_pair_hi(b_lo)?;
+                    let a_hi = i64_pair_hi(a_lo)?;
+                    // dst must not overlap any popped operand's half — the
+                    // hi instruction reads a_hi and b_hi after the lo
+                    // instruction writes dst_lo.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(
+                        &mut next_temp,
+                        &stack,
+                        &[a_lo, a_hi, b_lo, b_hi],
+                    )?;
+                    let (lo_op, hi_op) = match op {
+                        I64Or => (
+                            ArmOp::Orr {
+                                rd: dst_lo,
+                                rn: a_lo,
+                                op2: Operand2::Reg(b_lo),
+                            },
+                            ArmOp::Orr {
+                                rd: dst_hi,
+                                rn: a_hi,
+                                op2: Operand2::Reg(b_hi),
+                            },
+                        ),
+                        I64And => (
+                            ArmOp::And {
+                                rd: dst_lo,
+                                rn: a_lo,
+                                op2: Operand2::Reg(b_lo),
+                            },
+                            ArmOp::And {
+                                rd: dst_hi,
+                                rn: a_hi,
+                                op2: Operand2::Reg(b_hi),
+                            },
+                        ),
+                        I64Xor => (
+                            ArmOp::Eor {
+                                rd: dst_lo,
+                                rn: a_lo,
+                                op2: Operand2::Reg(b_lo),
+                            },
+                            ArmOp::Eor {
+                                rd: dst_hi,
+                                rn: a_hi,
+                                op2: Operand2::Reg(b_hi),
+                            },
+                        ),
+                        _ => unreachable!(),
+                    };
+                    instructions.push(ArmInstruction {
+                        op: lo_op,
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    instructions.push(ArmInstruction {
+                        op: hi_op,
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
+                // ============================================================
+                // i32 -> i64 extension (I64ExtendI32U / I64ExtendI32S)
+                //
+                // Pops one i32, allocates a consecutive i64 pair, places the
+                // i32 in the low half. For unsigned: high = 0. For signed:
+                // high = arithmetic-shift-right by 31 (sign-extension).
+                // ============================================================
+                I64ExtendI32U => {
+                    let val = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in I64ExtendI32U".to_string(),
+                        )
+                    })?;
+                    // val must stay alive until the Mov reads it; dst_hi
+                    // must not be val (we'd write the zero high before
+                    // moving val to dst_lo).
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[val])?;
+                    if val != dst_lo {
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Mov {
+                                rd: dst_lo,
+                                op2: Operand2::Reg(val),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    }
+                    instructions.push(ArmInstruction {
+                        op: ArmOp::Movw {
+                            rd: dst_hi,
+                            imm16: 0,
+                        },
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
+                I64ExtendI32S => {
+                    let val = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in I64ExtendI32S".to_string(),
+                        )
+                    })?;
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[val])?;
+                    instructions.push(ArmInstruction {
+                        op: ArmOp::I64ExtendI32S {
+                            rdlo: dst_lo,
+                            rdhi: dst_hi,
+                            rn: val,
+                        },
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
+                // ============================================================
+                // i64 variable shifts (I64Shl / I64ShrU / I64ShrS)
+                //
+                // Use the existing I64Shl/I64ShrU/I64ShrS pseudo-ops (which
+                // expand to the variable-shift logic in arm_encoder.rs) but
+                // pass the actual stack-tracked register pairs rather than
+                // assuming R0:R1 / R2:R3.
+                // ============================================================
+                I64Shl | I64ShrU | I64ShrS => {
+                    let b_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in i64 shift".to_string(),
+                        )
+                    })?;
+                    let a_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in i64 shift".to_string(),
+                        )
+                    })?;
+                    let b_hi = i64_pair_hi(b_lo)?;
+                    let a_hi = i64_pair_hi(a_lo)?;
+                    // dst must not overlap any popped operand's half — the
+                    // shift pseudo-op reads all four (rn_lo/rn_hi/rm_lo/rm_hi)
+                    // before writing the destination.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(
+                        &mut next_temp,
+                        &stack,
+                        &[a_lo, a_hi, b_lo, b_hi],
+                    )?;
+                    let shift_op = match op {
+                        I64Shl => ArmOp::I64Shl {
+                            rd_lo: dst_lo,
+                            rd_hi: dst_hi,
+                            rn_lo: a_lo,
+                            rn_hi: a_hi,
+                            rm_lo: b_lo,
+                            rm_hi: b_hi,
+                        },
+                        I64ShrU => ArmOp::I64ShrU {
+                            rd_lo: dst_lo,
+                            rd_hi: dst_hi,
+                            rn_lo: a_lo,
+                            rn_hi: a_hi,
+                            rm_lo: b_lo,
+                            rm_hi: b_hi,
+                        },
+                        I64ShrS => ArmOp::I64ShrS {
+                            rd_lo: dst_lo,
+                            rd_hi: dst_hi,
+                            rn_lo: a_lo,
+                            rn_hi: a_hi,
+                            rm_lo: b_lo,
+                            rm_hi: b_hi,
+                        },
+                        _ => unreachable!(),
+                    };
+                    instructions.push(ArmInstruction {
+                        op: shift_op,
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
                 I64Load { offset, .. } => {
                     // Pop address from stack
                     let addr = stack.pop().ok_or_else(|| {
@@ -5003,10 +5240,10 @@ impl InstructionSelector {
                     // Allocate result register pair. MUST be consecutive
                     // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
                     // and is called by every i64 op downstream to recover
-                    // the high register. Two separate alloc_temp_safe calls
-                    // skip live registers and produce non-consecutive pairs.
+                    // the high register. Avoid clobbering addr before the
+                    // load uses it.
                     let (dst_lo, dst_hi) =
-                        alloc_consecutive_pair(&mut next_temp, &stack)?;
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?;
 
                     // Generate bounds-checked i64 load into the allocated pair
                     let load_ops =