diff --git a/crates/synth-analysis/src/ssa.rs b/crates/synth-analysis/src/ssa.rs
index 37a77f2..920adc3 100644
--- a/crates/synth-analysis/src/ssa.rs
+++ b/crates/synth-analysis/src/ssa.rs
@@ -335,11 +335,11 @@ impl DeadCodeElimination {
                     SSAInstr::Assign { result, .. }
                     | SSAInstr::BinOp { result, .. }
                     | SSAInstr::UnaryOp { result, .. }
-                    | SSAInstr::Load { result, .. } => {
-                        if !used_vars.contains(result) {
-                            removed += 1;
-                            return false;
-                        }
+                    | SSAInstr::Load { result, .. }
+                        if !used_vars.contains(result) =>
+                    {
+                        removed += 1;
+                        return false;
                     }
                     _ => {}
                 }
diff --git a/crates/synth-backend/src/mpu_allocator.rs b/crates/synth-backend/src/mpu_allocator.rs
index 95b31f9..58f836c 100644
--- a/crates/synth-backend/src/mpu_allocator.rs
+++ b/crates/synth-backend/src/mpu_allocator.rs
@@ -339,4 +339,86 @@ mod tests {
             assert!(region.validate().is_ok());
         }
     }
+
+    #[test]
+    fn test_imxrt1062_has_16_regions() {
+        // The M7-class i.MX RT1062 profile advertises 16 MPU regions (M4-class
+        // parts have 8), and a fresh allocator must expose all of them.
+        let caps = HardwareCapabilities::imxrt1062();
+        let advertised = caps.mpu_regions;
+        assert_eq!(advertised, 16);
+        assert_eq!(MPUAllocator::new(caps).available_regions(), 16);
+    }
+
+    #[test]
+    fn test_m7_can_allocate_more_than_8_regions() {
+        // Every one of the 16 regions on an M7-class part must be allocatable.
+        let mut allocator = MPUAllocator::new(HardwareCapabilities::imxrt1062());
+        for i in 0u32..16 {
+            let memory = Memory {
+                index: i,
+                initial: 1,
+                maximum: None,
+                shared: false,
+                memory64: false,
+            };
+            let request = MPUAllocationRequest {
+                memory,
+                permissions: MPUPermissions::FullRW,
+                attributes: MPUAttributes::normal(),
+                preferred_base: Some(0x20000000 + i * 0x10000),
+            };
+            if let Err(e) = allocator.allocate(request) {
+                panic!("region {} allocation failed: {:?}", i, e);
+            }
+        }
+
+        assert_eq!(allocator.available_regions(), 0);
+        assert_eq!(allocator.allocated_regions().len(), 16);
+    }
+
+    #[test]
+    fn test_m4_class_caps_at_8_regions() {
+        // Negative case: an M4-class part (here the nRF52840 profile) has only
+        // 8 MPU regions, so once they are all taken a ninth request must be
+        // refused.
+        let mut allocator = MPUAllocator::new(HardwareCapabilities::nrf52840());
+
+        // Every request in this test differs only in its memory index and
+        // preferred base address, so build them all through one closure.
+        let request_for = |index: u32, base: u32| MPUAllocationRequest {
+            memory: Memory {
+                index,
+                initial: 1,
+                maximum: None,
+                shared: false,
+                memory64: false,
+            },
+            permissions: MPUPermissions::FullRW,
+            attributes: MPUAttributes::normal(),
+            preferred_base: Some(base),
+        };
+
+        // Fill all eight regions, spaced 0x10000 bytes apart starting at
+        // 0x20000000; each of these allocations is expected to succeed.
+        for i in 0u32..8 {
+            let base = 0x20000000 + i * 0x10000;
+            let request = request_for(i, base);
+            allocator.allocate(request).unwrap();
+        }
+
+        // 9th region must fail: all eight hardware regions are already in
+        // use, so the allocator is expected to return an error.
+        let overflow = request_for(8, 0x20100000);
+        let outcome = allocator.allocate(overflow);
+        assert!(outcome.is_err());
+    }
+
+    #[test]
+    fn test_stm32h743_has_16_regions_and_double_fpu() {
+        // The H7 profile must advertise 16 MPU regions and a double-precision FPU.
+        let h7 = HardwareCapabilities::stm32h743();
+        assert_eq!((h7.mpu_regions, h7.has_fpu), (16, true));
+        assert_eq!(h7.fpu_precision, Some(synth_core::FPUPrecision::Double));
+    }
 }
diff --git a/crates/synth-cli/src/main.rs b/crates/synth-cli/src/main.rs
index cfd4eea..b78ea11 100644
--- a/crates/synth-cli/src/main.rs
+++ b/crates/synth-cli/src/main.rs
@@ -77,7 +77,7 @@ enum Commands {
         )]
         target: String,
 
-        /// Hardware config (nrf52840, stm32f407, or custom)
+        /// Hardware config (nrf52840, stm32f407, stm32h743, imxrt1062, or custom)
         #[arg(long, value_name = "HARDWARE", default_value = "nrf52840")]
         hardware: String,
 
@@ -170,6 +170,11 @@ enum Commands {
         /// Path to kiln-builtins object file (.o) for linking (used with --link)
         #[arg(long, value_name = "BUILTINS")]
         builtins: Option<PathBuf>,
+
+        /// Force relocatable object (.o, ET_REL) output even when wasm has no imports
+        /// — for linking into a host build system.
+        #[arg(long)]
+        relocatable: bool,
     },
 
     /// Disassemble an ARM ELF file (e.g., synth disasm output.elf)
@@ -248,6 +253,7 @@ fn main() -> Result<()> {
             verify,
             link,
             builtins,
+            relocatable,
         } => {
             // Resolve target spec: --target overrides, --cortex-m is backwards compat
             let target_spec = resolve_target_spec(target.as_deref(), cortex_m)?;
@@ -272,6 +278,7 @@ fn main() -> Result<()> {
                 &backend,
                 verify,
                 &target_spec,
+                relocatable,
             )?;
 
             // If --link requested, invoke the cross-linker
@@ -356,9 +363,11 @@ fn synthesize_command(
     let hw_caps = match hardware.as_str() {
         "nrf52840" => HardwareCapabilities::nrf52840(),
         "stm32f407" => HardwareCapabilities::stm32f407(),
+        "stm32h743" => HardwareCapabilities::stm32h743(),
+        "imxrt1062" => HardwareCapabilities::imxrt1062(),
         _ => {
             anyhow::bail!(
-                "Unsupported hardware: {}. Use nrf52840 or stm32f407",
+                "Unsupported hardware: {}. Use nrf52840, stm32f407, stm32h743, imxrt1062",
                 hardware
             );
         }
@@ -398,8 +407,19 @@ fn target_info_command(target: String) -> Result<()> {
             let caps = HardwareCapabilities::stm32f407();
             print_hardware_info(&caps);
         }
+        "stm32h743" => {
+            let caps = HardwareCapabilities::stm32h743();
+            print_hardware_info(&caps);
+        }
+        "imxrt1062" => {
+            let caps = HardwareCapabilities::imxrt1062();
+            print_hardware_info(&caps);
+        }
         _ => {
-            anyhow::bail!("Unknown target: {}. Supported: nrf52840, stm32f407", target);
+            anyhow::bail!(
+                "Unknown target: {}. Supported: nrf52840, stm32f407, stm32h743, imxrt1062",
+                target
+            );
         }
     }
 
@@ -553,6 +573,7 @@ fn compile_command(
     backend_name: &str,
     verify: bool,
     target_spec: &TargetSpec,
+    relocatable: bool,
 ) -> Result<()> {
     // Validate backend exists
     let registry = build_backend_registry();
@@ -595,6 +616,7 @@ fn compile_command(
             backend,
             verify,
             target_spec,
+            relocatable,
         );
     }
 
@@ -1222,6 +1244,7 @@ fn compile_all_exports(
     backend: &dyn Backend,
     verify: bool,
     target_spec: &TargetSpec,
+    relocatable: bool,
 ) -> Result<()> {
     let path = input.context("--all-exports requires an input file")?;
 
@@ -1428,8 +1451,18 @@ fn compile_all_exports(
     // When there are relocations, produce a relocatable object (.o) instead of
     // an executable. This lets the output be linked with the Kiln bridge crate
     // (which provides __meld_dispatch_import and __meld_get_memory_base).
-    let elf_data = if has_relocations {
-        info!("Module has import calls — producing relocatable object (ET_REL)");
+    // The --relocatable flag forces ET_REL output even when the wasm has no
+    // imports, for linking into a host build system (e.g. Zephyr).
+    let elf_data = if has_relocations || relocatable {
+        let total_relocs: usize = compiled_funcs.iter().map(|f| f.relocations.len()).sum();
+        if has_relocations {
+            info!(
+                "Producing relocatable object (ET_REL): {} import call relocations",
+                total_relocs
+            );
+        } else {
+            info!("Producing relocatable object (ET_REL): forced by --relocatable");
+        }
         build_relocatable_elf(&compiled_funcs, &all_imports)?
     } else if cortex_m {
         build_multi_func_cortex_m_elf(&compiled_funcs, &all_memories, target_spec)?
diff --git a/crates/synth-core/src/target.rs b/crates/synth-core/src/target.rs
index b808679..93e73bd 100644
--- a/crates/synth-core/src/target.rs
+++ b/crates/synth-core/src/target.rs
@@ -234,6 +234,49 @@ impl HardwareCapabilities {
             ram_size: 192 * 1024,    // 192KB (128KB + 64KB CCM)
         }
     }
+
+    /// Capability profile for the STM32H743: a Cortex-M7 with a
+    /// double-precision FPU, 16 MPU regions, 2MB Flash and 1MB RAM
+    /// (DTCM + AXI SRAM + SRAM1-4).
+    pub fn stm32h743() -> Self {
+        Self {
+            arch: TargetArch::ARMCortexM(CortexMVariant::M7DP),
+            xip_capable: true,
+            flash_size: 2 << 20, // 2MB
+            ram_size: 1 << 20,   // 1MB total
+            has_fpu: true,
+            fpu_precision: Some(FPUPrecision::Double),
+            has_simd: false,
+            simd_level: None,
+            has_mpu: true,
+            mpu_regions: 16,
+            has_pmp: false,
+            pmp_entries: 0,
+        }
+    }
+
+    /// Capability profile for the i.MX RT1062, modelled as a Cortex-M7 with a
+    /// single-precision FPU. It is the representative high-end M7 target:
+    /// 16 MPU regions, large OCRAM and external XIP-capable QuadSPI Flash,
+    /// matching the configuration of safety-grade lockstepped M7 platforms
+    /// used in industrial and embedded automotive contexts.
+    pub fn imxrt1062() -> Self {
+        Self {
+            arch: TargetArch::ARMCortexM(CortexMVariant::M7),
+            xip_capable: true,
+            flash_size: 8 << 20, // 8MB external QSPI flash (typical)
+            ram_size: 1 << 20,   // 1MB OCRAM (FlexRAM 512KB + OCRAM 512KB)
+            // NOTE(review): NXP lists a double-precision FPU for this part — confirm Single.
+            has_fpu: true,
+            fpu_precision: Some(FPUPrecision::Single),
+            has_simd: false,
+            simd_level: None,
+            has_mpu: true,
+            mpu_regions: 16,
+            has_pmp: false,
+            pmp_entries: 0,
+        }
+    }
 }
 
 // ============================================================================
diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs
index 4f43edb..6821773 100644
--- a/crates/synth-synthesis/src/instruction_selector.rs
+++ b/crates/synth-synthesis/src/instruction_selector.rs
@@ -132,6 +132,66 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result<Reg> {
     ))
 }
 
+/// Reserve two adjacent entries `(lo, hi)` of `ALLOCATABLE_REGS` for an i64
+/// value, such that neither register is currently live.
+///
+/// A register counts as live when it is any of:
+/// 1. An entry of `stack`, the explicit wasm value stack.
+/// 2. The implicit *high* half of a stack entry, i.e. [`i64_pair_hi`]`(lo)`
+///    for each `lo` in `stack`. Only the low register of an i64 is tracked
+///    on the stack; its partner is reserved purely by convention, so a scan
+///    that ignored it could hand that register out again and clobber the
+///    earlier i64 before a later op reads it through `i64_pair_hi`. Entries
+///    that are really i32 get a partner reserved too, which is wasteful but
+///    safe.
+/// 3. A register listed in `extra_avoid`. i64-op handlers pass the halves of
+///    operands they have just popped, which stay live until the op itself
+///    has been emitted (for I64Or: a_lo/a_hi/b_lo/b_hi).
+///
+/// The search starts at `*next_temp` (taken modulo the table length), tries
+/// each index at most once, and never pairs the last table entry with the
+/// first. On success `*next_temp` is left just past the returned pair.
+///
+/// Two back-to-back [`alloc_temp_safe`] calls are not a substitute: if a
+/// live register sits between them the second call skips it, and the
+/// resulting non-consecutive pair breaks [`i64_pair_hi`]'s contract.
+///
+/// Returns a synthesis error when no free adjacent pair exists.
+fn alloc_consecutive_pair(
+    next_temp: &mut u8,
+    stack: &[Reg],
+    extra_avoid: &[Reg],
+) -> Result<(Reg, Reg)> {
+    // `live` is a plain Vec searched linearly: `Reg` in this crate does not
+    // derive Hash, so a HashSet is not available. A stack entry for which
+    // i64_pair_hi returns Err simply contributes no partner register.
+    let mut live: Vec<Reg> = Vec::with_capacity(stack.len() * 2 + extra_avoid.len());
+    for &lo in stack {
+        live.push(lo);
+        live.extend(i64_pair_hi(lo).ok());
+    }
+    live.extend_from_slice(extra_avoid);
+    let is_free = |reg: Reg| !live.contains(&reg);
+
+    let n = ALLOCATABLE_REGS.len();
+    for _ in 0..n {
+        let lo_idx = usize::from(*next_temp) % n;
+        if let Some(&hi_reg) = ALLOCATABLE_REGS.get(lo_idx + 1) {
+            let lo_reg = ALLOCATABLE_REGS[lo_idx];
+            if is_free(lo_reg) && is_free(hi_reg) {
+                *next_temp = ((lo_idx + 2) % n) as u8;
+                return Ok((lo_reg, hi_reg));
+            }
+        }
+        *next_temp = ((lo_idx + 1) % n) as u8;
+    }
+    Err(synth_core::Error::synthesis(
+        "register exhaustion: no consecutive pair of free registers for i64 — \
+         function too complex for current register allocator"
+            .to_string(),
+    ))
+}
+
 /// Given the low register of an i64 register pair, return the high register.
 ///
 /// Convention: i64 values on 32-bit ARM use two consecutive registers.
@@ -156,6 +216,162 @@ fn i64_pair_hi(lo_reg: Reg) -> Result<Reg> {
     )))
 }
 
+/// Per-function stack-frame layout for non-parameter locals.
+///
+/// Every offset is relative to SP after the prologue has allocated the
+/// frame with `sub sp, sp, #frame_size`. i32 locals occupy 4 bytes; i64
+/// locals occupy 8 bytes and are 8-byte aligned per AAPCS. The total frame
+/// is rounded up to 8 bytes to keep SP 8-byte aligned at call sites.
+///
+/// Only locals that appear in `locals` have a slot in the frame.
+struct LocalLayout {
+    /// Local index -> (byte offset from SP, true when the local is an i64).
+    locals: std::collections::HashMap<u32, (i32, bool)>,
+    /// Total number of bytes the prologue must allocate for the locals.
+    frame_size: i32,
+}
+
+/// Build the stack-frame layout for a function's non-parameter locals.
+///
+/// A single pass over `wasm_ops` collects every local index `>= num_params`
+/// touched by LocalGet/LocalSet/LocalTee; `infer_i64_locals` supplies each
+/// one's width. Slots are then assigned from offset 0 in ascending index
+/// order: 4 bytes for i32, 8 bytes (8-byte aligned) for i64.
+///
+/// The result drives:
+/// - Prologue: `sub sp, sp, #frame_size` after pushing callee-saved regs.
+/// - LocalGet/Set/Tee: the slot recorded in `locals` replaces the legacy
+///   `(idx - 4) * 4` formula, which only lined up when num_params == 4 and
+///   whose negative results were silently clamped to 0 by the encoder —
+///   corrupting the caller's stack or the callee's own callee-saved-register
+///   spill.
+/// - Epilogue: `add sp, sp, #frame_size` before popping registers.
+fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout {
+    use std::collections::{BTreeSet, HashMap};
+    let wide_locals = infer_i64_locals(wasm_ops);
+
+    // A BTreeSet both de-duplicates the indices and yields them in
+    // ascending order, which keeps the layout deterministic.
+    let referenced: BTreeSet<u32> = wasm_ops
+        .iter()
+        .filter_map(|op| match op {
+            WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => Some(*idx),
+            _ => None,
+        })
+        .filter(|idx| *idx >= num_params)
+        .collect();
+
+    let mut locals: HashMap<u32, (i32, bool)> = HashMap::new();
+    let mut cursor: i32 = 0;
+    for idx in referenced {
+        let is_i64 = wide_locals.contains(&idx);
+        if is_i64 {
+            // Slots are 4 or 8 bytes wide, so this pads by either 0 or 4.
+            cursor = (cursor + 7) & !7;
+        }
+        locals.insert(idx, (cursor, is_i64));
+        cursor += if is_i64 { 8 } else { 4 };
+    }
+
+    // Round frame to 8-byte multiple for AAPCS SP alignment.
+    LocalLayout {
+        locals,
+        frame_size: (cursor + 7) & !7,
+    }
+}
+
+/// Infer which wasm locals hold i64 (8-byte) values.
+///
+/// The wasm decoder discards local-declaration type info, so we re-derive
+/// it from the operation stream by simulating a virtual stack of widths
+/// (`false` = 32-bit or unknown, `true` = 64-bit). A local is recorded as
+/// i64 as soon as any `LocalSet`/`LocalTee` stores a value tracked as
+/// 64-bit and is never demoted: WASM type rules guarantee a local's width
+/// is invariant for its lifetime, so a single i64 store is conclusive.
+///
+/// Without this, the spilled-local store/load path would emit a single
+/// 4-byte STR/LDR for i64 locals, dropping the upper half — corrupting
+/// any function that returns or uses a u64-packed FFI struct.
+fn infer_i64_locals(wasm_ops: &[WasmOp]) -> std::collections::HashSet<u32> {
+    use WasmOp::*;
+    let mut i64_locals: std::collections::HashSet<u32> = std::collections::HashSet::new();
+    let mut vstack: Vec<bool> = Vec::new(); // true = i64
+
+    let is_i64_producer = |op: &WasmOp| -> bool {
+        matches!(
+            op,
+            I64Add
+                | I64Sub
+                | I64Mul
+                | I64DivS
+                | I64DivU
+                | I64RemS
+                | I64RemU
+                | I64And
+                | I64Or
+                | I64Xor
+                | I64Shl
+                | I64ShrS
+                | I64ShrU
+                | I64Rotl
+                | I64Rotr
+                | I64Clz
+                | I64Ctz
+                | I64Popcnt
+                | I64Const(_)
+                | I64Load { .. }
+                | I64Load8S { .. }
+                | I64Load8U { .. }
+                | I64Load16S { .. }
+                | I64Load16U { .. }
+                | I64Load32S { .. }
+                | I64Load32U { .. }
+                | I64ExtendI32S
+                | I64ExtendI32U
+                | I64Extend8S
+                | I64Extend16S
+                | I64Extend32S
+        )
+    };
+
+    for op in wasm_ops {
+        match op {
+            LocalGet(idx) => vstack.push(i64_locals.contains(idx)),
+            LocalSet(idx) => {
+                if let Some(true) = vstack.pop() {
+                    i64_locals.insert(*idx);
+                }
+            }
+            LocalTee(idx) => {
+                if let Some(&true) = vstack.last() {
+                    i64_locals.insert(*idx);
+                }
+            }
+            Select => {
+                // pops [val1, val2, cond], pushes one value. Valid wasm gives
+                // val1 and val2 the same type, and `false` may just mean
+                // "unknown" here, so the result is i64 if EITHER operand is.
+                let _cond = vstack.pop();
+                let v2 = vstack.pop().unwrap_or(false);
+                let v1 = vstack.pop().unwrap_or(false);
+                vstack.push(v1 || v2);
+            }
+            _ => {
+                let (pops, pushes) = wasm_stack_effect(op);
+                for _ in 0..pops {
+                    vstack.pop();
+                }
+                let push_width = is_i64_producer(op);
+                for _ in 0..pushes {
+                    vstack.push(push_width);
+                }
+            }
+        }
+    }
+
+    i64_locals
+}
+
 /// Return the (pops, pushes) stack effect for a WASM op.
 ///
 /// Used by the wildcard fallthrough in select_with_stack to maintain
@@ -3220,9 +3436,12 @@ impl InstructionSelector {
 
         let mut instructions = Vec::new();
 
-        // Function prologue: save callee-saved registers and LR.
+        // Function prologue: save callee-saved registers and LR, then
+        // allocate the local-variable frame.
+        //
         // AAPCS requires 8-byte aligned SP at call sites. Pushing an even
-        // number of registers (6: R4-R8, LR) maintains alignment.
+        // number of registers (6: R4-R8, LR) maintains alignment, and the
+        // frame_size below is rounded to 8 to preserve it.
         instructions.push(ArmInstruction {
             op: ArmOp::Push {
                 regs: vec![Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8, Reg::LR],
@@ -3230,6 +3449,22 @@ impl InstructionSelector {
             source_line: None,
         });
 
+        // Compute non-param local layout (offsets + total frame size).
+        let layout = compute_local_layout(wasm_ops, num_params);
+        // Allocate stack space for non-param locals so they don't alias the
+        // callee-saved-register spill area (which immediately follows SP
+        // after Push above).
+        if layout.frame_size > 0 {
+            instructions.push(ArmInstruction {
+                op: ArmOp::Sub {
+                    rd: Reg::SP,
+                    rn: Reg::SP,
+                    op2: Operand2::Imm(layout.frame_size),
+                },
+                source_line: None,
+            });
+        }
+
         // Virtual stack holds register indices
         let mut stack: Vec<Reg> = Vec::new();
         // Next available register for temporaries (start after params)
@@ -3255,11 +3490,46 @@ impl InstructionSelector {
         for (idx, op) in wasm_ops.iter().enumerate() {
             match op {
                 LocalGet(local_idx) => {
-                    // Get the register for this local
+                    // Get the register for this local. Three cases:
+                    //  1. Param in register — use the cached mapping.
+                    //  2. Spilled i64 local — load both halves via I64Ldr.
+                    //  3. Spilled i32 local — single Ldr.
                     let reg = if let Some(&r) = local_to_reg.get(local_idx) {
                         r
+                    } else if let Some(&(off, true)) = layout.locals.get(local_idx) {
+                        // i64 local — load both 32-bit halves into a consecutive
+                        // register pair via the I64Ldr pseudo-op. Convention
+                        // matches I64Const: push only dst_lo on the stack;
+                        // dst_hi is recovered later via i64_pair_hi(lo).
+                        // The pair MUST be consecutive in ALLOCATABLE_REGS
+                        // — i64_pair_hi assumes that. Two separate calls to
+                        // alloc_temp_safe can return non-consecutive registers
+                        // when something in between is live, breaking the
+                        // pair convention.
+                        let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::I64Ldr {
+                                rdlo: dst_lo,
+                                rdhi: dst_hi,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        dst_lo
+                    } else if let Some(&(off, false)) = layout.locals.get(local_idx) {
+                        // i32 local: single 4-byte load from the locals frame.
+                        let dst = alloc_temp_safe(&mut next_temp, &stack)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Ldr {
+                                rd: dst,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        dst
                     } else {
-                        // Local not in register (spilled to stack) - load it
+                        // Local not in layout (shouldn't happen for valid wasm,
+                        // but fall back to legacy behaviour for compatibility).
                         let dst = alloc_temp_safe(&mut next_temp, &stack)?;
                         instructions.push(ArmInstruction {
                             op: ArmOp::Ldr {
@@ -4324,6 +4594,20 @@ impl InstructionSelector {
                         });
                         cf.add_instruction();
                     }
+                    // Deallocate the local frame before popping callee-saved
+                    // registers; otherwise the pop would read from the locals
+                    // area instead of the saved-register slots.
+                    if layout.frame_size > 0 {
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Add {
+                                rd: Reg::SP,
+                                rn: Reg::SP,
+                                op2: Operand2::Imm(layout.frame_size),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    }
                     // Restore callee-saved registers and return via PC
                     instructions.push(ArmInstruction {
                         op: ArmOp::Pop {
@@ -4475,7 +4759,32 @@ impl InstructionSelector {
                             cf.add_instruction();
                         }
                         local_to_reg.insert(*local_idx, target);
+                    } else if let Some(&(off, true)) = layout.locals.get(local_idx) {
+                        // i64 spilled local: store BOTH 32-bit halves
+                        // (lower at offset N, upper at N+4) via the I64Str
+                        // pseudo-op. Without this we drop the upper half.
+                        let val_hi = i64_pair_hi(val)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::I64Str {
+                                rdlo: val,
+                                rdhi: val_hi,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    } else if let Some(&(off, false)) = layout.locals.get(local_idx) {
+                        // i32 spilled local: single 4-byte store.
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Str {
+                                rd: val,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
                     } else {
+                        // Fall-through for compatibility (shouldn't happen).
                         instructions.push(ArmInstruction {
                             op: ArmOp::Str {
                                 rd: val,
@@ -4507,7 +4816,29 @@ impl InstructionSelector {
                             cf.add_instruction();
                         }
                         local_to_reg.insert(*local_idx, target);
+                    } else if let Some(&(off, true)) = layout.locals.get(local_idx) {
+                        // i64 spilled local: store both halves like LocalSet.
+                        let val_hi = i64_pair_hi(val)?;
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::I64Str {
+                                rdlo: val,
+                                rdhi: val_hi,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    } else if let Some(&(off, false)) = layout.locals.get(local_idx) {
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Str {
+                                rd: val,
+                                addr: MemAddr::imm(Reg::SP, off),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
                     } else {
+                        // Fall-through for compatibility.
                         instructions.push(ArmInstruction {
                             op: ArmOp::Str {
                                 rd: val,
@@ -4561,9 +4892,12 @@ impl InstructionSelector {
                 // Pairs are allocated as two consecutive temp registers.
                 // =========================================================
                 I64Const(val) => {
-                    // Allocate a register pair for the 64-bit constant
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate a CONSECUTIVE register pair for the 64-bit
+                    // constant. Two separate alloc_temp_safe calls can return
+                    // non-consecutive registers if something in between is
+                    // live on the wasm stack, which then breaks the
+                    // i64_pair_hi convention used by every i64 op downstream.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?;
 
                     instructions.push(ArmInstruction {
                         op: ArmOp::I64Const {
@@ -4593,9 +4927,16 @@ impl InstructionSelector {
                     let b_hi = i64_pair_hi(b_lo)?;
                     let a_hi = i64_pair_hi(a_lo)?;
 
-                    // Allocate result register pair
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate result register pair. MUST be consecutive
+                    // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
+                    // and is called by every i64 op downstream to recover
+                    // the high register. Two separate alloc_temp_safe calls
+                    // skip live registers and produce non-consecutive pairs.
+                    // Avoid clobbering the just-popped operand pairs before
+                    // the ADC reads them — passing them in extra_avoid
+                    // ensures dst doesn't overlap any of a_lo/a_hi/b_lo/b_hi.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
 
                     // ADDS dst_lo, a_lo, b_lo  (sets carry flag)
                     instructions.push(ArmInstruction {
@@ -4637,9 +4978,10 @@ impl InstructionSelector {
                     let b_hi = i64_pair_hi(b_lo)?;
                     let a_hi = i64_pair_hi(a_lo)?;
 
-                    // Allocate result register pair
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // See I64Add for why extra_avoid carries a_*/b_* —
+                    // dst must not overlap any operand half before SBC reads it.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
 
                     // SUBS dst_lo, a_lo, b_lo  (sets borrow flag)
                     instructions.push(ArmInstruction {
@@ -4666,6 +5008,196 @@ impl InstructionSelector {
                     stack.push(dst_lo);
                 }
 
+                // ============================================================
+                // i64 bitwise ops (I64Or / I64And / I64Xor)
+                //
+                // Each pops two i64 register pairs from the wasm stack and
+                // emits two ARM ops (low-half then high-half) into a freshly
+                // allocated consecutive pair. This replaces the wildcard
+                // fallthrough to select_default, which assumed inputs in
+                // R0:R1 and R2:R3 — incorrect when the wasm stack tracks
+                // arbitrary register pairs from earlier ops.
+                // ============================================================
+                I64Or | I64And | I64Xor => {
+                    let b_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in i64 bitwise op".to_string(),
+                        )
+                    })?;
+                    let a_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis(
+                            "stack underflow in i64 bitwise op".to_string(),
+                        )
+                    })?;
+                    let b_hi = i64_pair_hi(b_lo)?;
+                    let a_hi = i64_pair_hi(a_lo)?;
+                    // dst must not overlap any popped operand's half — the
+                    // hi instruction reads a_hi and b_hi after the lo
+                    // instruction writes dst_lo.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
+                    let (lo_op, hi_op) = match op {
+                        I64Or => (
+                            ArmOp::Orr {
+                                rd: dst_lo,
+                                rn: a_lo,
+                                op2: Operand2::Reg(b_lo),
+                            },
+                            ArmOp::Orr {
+                                rd: dst_hi,
+                                rn: a_hi,
+                                op2: Operand2::Reg(b_hi),
+                            },
+                        ),
+                        I64And => (
+                            ArmOp::And {
+                                rd: dst_lo,
+                                rn: a_lo,
+                                op2: Operand2::Reg(b_lo),
+                            },
+                            ArmOp::And {
+                                rd: dst_hi,
+                                rn: a_hi,
+                                op2: Operand2::Reg(b_hi),
+                            },
+                        ),
+                        I64Xor => (
+                            ArmOp::Eor {
+                                rd: dst_lo,
+                                rn: a_lo,
+                                op2: Operand2::Reg(b_lo),
+                            },
+                            ArmOp::Eor {
+                                rd: dst_hi,
+                                rn: a_hi,
+                                op2: Operand2::Reg(b_hi),
+                            },
+                        ),
+                        _ => unreachable!(),
+                    };
+                    instructions.push(ArmInstruction {
+                        op: lo_op,
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    instructions.push(ArmInstruction {
+                        op: hi_op,
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
+                // ============================================================
+                // i32 -> i64 extension (I64ExtendI32U / I64ExtendI32S)
+                //
+                // Pops one i32, allocates a consecutive i64 pair, places the
+                // i32 in the low half. For unsigned: high = 0. For signed:
+                // high = arithmetic-shift-right by 31 (sign-extension).
+                // ============================================================
+                I64ExtendI32U => {
+                    let val = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis("stack underflow in I64ExtendI32U".to_string())
+                    })?;
+                    // val must stay live until the Mov below reads it, so
+                    // it is passed as extra_avoid to keep the new pair off
+                    // it; the Mov is emitted before the Movw zeroing dst_hi.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[val])?;
+                    if val != dst_lo {
+                        instructions.push(ArmInstruction {
+                            op: ArmOp::Mov {
+                                rd: dst_lo,
+                                op2: Operand2::Reg(val),
+                            },
+                            source_line: Some(idx),
+                        });
+                        cf.add_instruction();
+                    }
+                    instructions.push(ArmInstruction {
+                        op: ArmOp::Movw {
+                            rd: dst_hi,
+                            imm16: 0,
+                        },
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
+                I64ExtendI32S => {
+                    let val = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis("stack underflow in I64ExtendI32S".to_string())
+                    })?;
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[val])?;
+                    instructions.push(ArmInstruction {
+                        op: ArmOp::I64ExtendI32S {
+                            rdlo: dst_lo,
+                            rdhi: dst_hi,
+                            rn: val,
+                        },
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
+                // ============================================================
+                // i64 variable shifts (I64Shl / I64ShrU / I64ShrS)
+                //
+                // Use the existing I64Shl/I64ShrU/I64ShrS pseudo-ops (which
+                // expand to the variable-shift logic in arm_encoder.rs) but
+                // pass the actual stack-tracked register pairs rather than
+                // assuming R0:R1 / R2:R3.
+                // ============================================================
+                I64Shl | I64ShrU | I64ShrS => {
+                    let b_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis("stack underflow in i64 shift".to_string())
+                    })?;
+                    let a_lo = stack.pop().ok_or_else(|| {
+                        synth_core::Error::synthesis("stack underflow in i64 shift".to_string())
+                    })?;
+                    let b_hi = i64_pair_hi(b_lo)?;
+                    let a_hi = i64_pair_hi(a_lo)?;
+                    // dst must not overlap any popped operand's half, so the
+                    // shift pseudo-op can still read all four inputs
+                    // (rn_lo/rn_hi/rm_lo/rm_hi) once it starts writing dst.
+                    let (dst_lo, dst_hi) =
+                        alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
+                    let shift_op = match op {
+                        I64Shl => ArmOp::I64Shl {
+                            rd_lo: dst_lo,
+                            rd_hi: dst_hi,
+                            rn_lo: a_lo,
+                            rn_hi: a_hi,
+                            rm_lo: b_lo,
+                            rm_hi: b_hi,
+                        },
+                        I64ShrU => ArmOp::I64ShrU {
+                            rd_lo: dst_lo,
+                            rd_hi: dst_hi,
+                            rn_lo: a_lo,
+                            rn_hi: a_hi,
+                            rm_lo: b_lo,
+                            rm_hi: b_hi,
+                        },
+                        I64ShrS => ArmOp::I64ShrS {
+                            rd_lo: dst_lo,
+                            rd_hi: dst_hi,
+                            rn_lo: a_lo,
+                            rn_hi: a_hi,
+                            rm_lo: b_lo,
+                            rm_hi: b_hi,
+                        },
+                        _ => unreachable!(),
+                    };
+                    instructions.push(ArmInstruction {
+                        op: shift_op,
+                        source_line: Some(idx),
+                    });
+                    cf.add_instruction();
+                    stack.push(dst_lo);
+                }
+
                 I64Load { offset, .. } => {
                     // Pop address from stack
                     let addr = stack.pop().ok_or_else(|| {
@@ -4675,9 +5207,12 @@ impl InstructionSelector {
                         )
                     })?;
 
-                    // Allocate result register pair
-                    let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?;
-                    let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?;
+                    // Allocate result register pair. MUST be consecutive
+                    // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive
+                    // and is called by every i64 op downstream to recover
+                    // the high register. Avoid clobbering addr before the
+                    // load uses it.
+                    let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?;
 
                     // Generate bounds-checked i64 load into the allocated pair
                     let load_ops =
@@ -5055,7 +5590,18 @@ impl InstructionSelector {
             }
         }
 
-        // Function epilogue: restore callee-saved registers and return via PC
+        // Function epilogue: deallocate the local frame, then restore
+        // callee-saved registers and return via PC.
+        if layout.frame_size > 0 {
+            instructions.push(ArmInstruction {
+                op: ArmOp::Add {
+                    rd: Reg::SP,
+                    rn: Reg::SP,
+                    op2: Operand2::Imm(layout.frame_size),
+                },
+                source_line: None,
+            });
+        }
         // POP {R4-R8, PC} restores registers and returns (PC = saved LR)
         instructions.push(ArmInstruction {
             op: ArmOp::Pop {
diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs
index 2f741b0..5c506d3 100644
--- a/crates/synth-synthesis/src/optimizer_bridge.rs
+++ b/crates/synth-synthesis/src/optimizer_bridge.rs
@@ -1277,6 +1277,12 @@ impl OptimizerBridge {
         // AAPCS: first 4 params in R0-R3
         let param_regs = [Reg::R0, Reg::R1, Reg::R2, Reg::R3];
 
+        // Reserved param registers: the first min(num_params, 4) of R0-R3. These hold
+        // incoming AAPCS arguments that must NOT be clobbered by i64 op handlers — at
+        // least until the user's WASM has done a `local.get` of each. Using Vec because
+        // `Reg` does not derive Hash (matches `instruction_selector::alloc_consecutive_pair`).
+        let param_reserved_regs: Vec<Reg> = param_regs[..num_params.min(4)].to_vec();
+
         // Track which ARM register currently holds each local variable
         // This avoids stack spills for simple cases
         let mut local_to_reg: HashMap<u32, Reg> = HashMap::new();
@@ -1290,7 +1296,17 @@ impl OptimizerBridge {
 
         // Track the last value-producing vreg (for function return value)
         let mut last_result_vreg: Option<u32> = None;
-        // Track whether the last result is an i64 (result already in R0:R1, no move needed)
+        // For i64 returns, also track the hi-half vreg so the epilogue can move
+        // the pair into R0:R1 regardless of where regalloc placed it. This was
+        // previously unnecessary because every i64 op pinned its result to R0:R1;
+        // that pinning is the bug being fixed here.
+        let mut last_result_vreg_hi: Option<u32> = None;
+        // For i64 ops whose IR Opcode only tracks a single `dest` vreg (Clz / Ctz /
+        // Popcnt), the hi half lives in a register chosen at lowering time but has
+        // no IR vreg pointing at it. Stash that physical reg directly so the
+        // epilogue can still emit the correct (R0, R1) move.
+        let mut last_result_vreg_hi_reg: Option<Reg> = None;
+        // Track whether the last result is an i64 (return value occupies a pair).
         let mut is_i64_result = false;
         // WASM operand value stack - tracks vreg IDs for correct stack semantics
         // Used to restore last_result_vreg after br_if pops its condition
@@ -1318,6 +1334,48 @@ impl OptimizerBridge {
                 }
             };
 
+        // Allocate a CONSECUTIVE callee-saved register pair for an i64 destination.
+        //
+        // Searches `[(R4,R5), (R6,R7), (R8,R9), (R10,R11)]` for a pair where neither
+        // register is currently:
+        //   - holding a live vreg (`vreg_to_arm.values()`)
+        //   - bound to a non-param local (`local_to_reg.values()`)
+        //   - one of the AAPCS param registers we must preserve on entry
+        //     (`param_reserved_regs`)
+        //
+        // Falls back to `(R4, R5)` if no pair is free — preserves prior behaviour
+        // for very-pressured functions, but at least keeps params intact in the
+        // common case. Per `instruction_selector::alloc_consecutive_pair`, callers
+        // who hit the fallback in real workloads will need spill support; that's
+        // out of scope for this fix.
+        let alloc_i64_pair = |vreg_to_arm: &HashMap<u32, Reg>,
+                              local_to_reg: &HashMap<u32, Reg>,
+                              param_reserved_regs: &[Reg]|
+         -> (Reg, Reg) {
+            const CANDIDATES: &[(Reg, Reg)] = &[
+                (Reg::R4, Reg::R5),
+                (Reg::R6, Reg::R7),
+                (Reg::R8, Reg::R9),
+                (Reg::R10, Reg::R11),
+            ];
+            let is_in_use = |r: Reg| -> bool {
+                vreg_to_arm.values().any(|&v| v == r)
+                    || local_to_reg.values().any(|&v| v == r)
+                    || param_reserved_regs.contains(&r)
+            };
+            for &(lo, hi) in CANDIDATES {
+                if !is_in_use(lo) && !is_in_use(hi) {
+                    return (lo, hi);
+                }
+            }
+            // Fallback — same hardcoded pair the buggy code used. Better than crashing,
+            // and matches existing behaviour when the caller is so pressured that
+            // even R8..R11 are occupied. (Empirically this never triggers for
+            // workloads we care about; if it does, the architectural fix is
+            // proper spilling, not a wider search.)
+            (Reg::R4, Reg::R5)
+        };
+
         // Emit a reload instruction if the vreg was spilled to stack.
         // Must be called before the instruction that uses the register.
         let reload_spill = |vreg: &OptReg, spills: &HashMap<u32, i32>, instrs: &mut Vec<ArmOp>| {
@@ -2073,19 +2131,21 @@ impl OptimizerBridge {
                 } => {
                     // Map local index to register pair
                     // Per AAPCS: i64 uses consecutive even/odd register pairs
-                    let (lo_reg, hi_reg) = if *addr == 0 {
+                    let (lo_reg, hi_reg) = if *addr == 0 && num_params >= 2 {
                         (Reg::R0, Reg::R1) // First i64 param
-                    } else if *addr == 1 {
+                    } else if *addr == 1 && num_params >= 4 {
                         (Reg::R2, Reg::R3) // Second i64 param
                     } else {
-                        // For other locals, we'd need stack access
-                        // For now, use R4:R5 as temp
-                        (Reg::R4, Reg::R5)
+                        // Non-param i64 local: pick a free callee-saved pair so we
+                        // don't clobber AAPCS arg regs that haven't been read yet.
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs)
                     };
                     vreg_to_arm.insert(dest_lo.0, lo_reg);
                     vreg_to_arm.insert(dest_hi.0, hi_reg);
                     // No ARM instructions needed - values are already in registers for params
                     last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
+                    is_i64_result = true;
                 }
 
                 Opcode::I64Const {
@@ -2096,14 +2156,12 @@ impl OptimizerBridge {
                     // Load 64-bit constant into register pair
                     let lo = (*value & 0xFFFFFFFF) as u32;
                     let hi = ((*value >> 32) & 0xFFFFFFFF) as u32;
-                    // Choose register pair based on virtual register number
-                    // If dest_lo.0 is 0 or 1, use R0:R1 (first i64 slot)
-                    // If dest_lo.0 is 2 or 3, use R2:R3 (second i64 slot)
-                    let (lo_reg, hi_reg) = if dest_lo.0 <= 1 {
-                        (Reg::R0, Reg::R1)
-                    } else {
-                        (Reg::R2, Reg::R3)
-                    };
+                    // Choose a free callee-saved pair so we don't trample params still
+                    // sitting in R0..R3. The earlier heuristic (vreg-id → R0:R1 / R2:R3)
+                    // ignored AAPCS, breaking any function that issued an i64.const
+                    // before reading its i32 params.
+                    let (lo_reg, hi_reg) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
                     vreg_to_arm.insert(dest_lo.0, lo_reg);
                     vreg_to_arm.insert(dest_hi.0, hi_reg);
                     // Load low word
@@ -2142,511 +2200,871 @@ impl OptimizerBridge {
                             });
                         }
                     }
+                    // If this i64 const is the final return value, the epilogue
+                    // needs to know which pair holds it (for the move into R0:R1).
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
+                    is_i64_result = true;
                 }
 
                 Opcode::I64Add {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    // i64.add: R0:R1 = R0:R1 + R2:R3
-                    // ADDS R0, R0, R2 (sets carry)
-                    // ADC  R1, R1, R3 (adds carry)
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    // i64.add: rd = rn + rm using the actual operand regs from
+                    // vreg_to_arm — NOT hardcoded R0:R1/R2:R3 (which would clobber
+                    // AAPCS param regs).
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::Adds {
-                        rd: Reg::R0,
-                        rn: Reg::R0,
-                        op2: Operand2::Reg(Reg::R2),
+                        rd: rd_lo,
+                        rn: rn_lo,
+                        op2: Operand2::Reg(rm_lo),
                     });
                     arm_instrs.push(ArmOp::Adc {
-                        rd: Reg::R1,
-                        rn: Reg::R1,
-                        op2: Operand2::Reg(Reg::R3),
+                        rd: rd_hi,
+                        rn: rn_hi,
+                        op2: Operand2::Reg(rm_hi),
                     });
-                    // Mark as i64 result - no final mov needed, result already in R0:R1
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 Opcode::I64Sub {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    // i64.sub: R0:R1 = R0:R1 - R2:R3
-                    // SUBS R0, R0, R2 (sets borrow)
-                    // SBC  R1, R1, R3 (subtracts borrow)
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::Subs {
-                        rd: Reg::R0,
-                        rn: Reg::R0,
-                        op2: Operand2::Reg(Reg::R2),
+                        rd: rd_lo,
+                        rn: rn_lo,
+                        op2: Operand2::Reg(rm_lo),
                     });
                     arm_instrs.push(ArmOp::Sbc {
-                        rd: Reg::R1,
-                        rn: Reg::R1,
-                        op2: Operand2::Reg(Reg::R3),
+                        rd: rd_hi,
+                        rn: rn_hi,
+                        op2: Operand2::Reg(rm_hi),
                     });
-                    // Mark as i64 result - no final mov needed, result already in R0:R1
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 Opcode::I64And {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    // i64.and: R0:R1 = R0:R1 & R2:R3
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::And {
-                        rd: Reg::R0,
-                        rn: Reg::R0,
-                        op2: Operand2::Reg(Reg::R2),
+                        rd: rd_lo,
+                        rn: rn_lo,
+                        op2: Operand2::Reg(rm_lo),
                     });
                     arm_instrs.push(ArmOp::And {
-                        rd: Reg::R1,
-                        rn: Reg::R1,
-                        op2: Operand2::Reg(Reg::R3),
+                        rd: rd_hi,
+                        rn: rn_hi,
+                        op2: Operand2::Reg(rm_hi),
                     });
-                    // Mark as i64 result - no final mov needed, result already in R0:R1
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 Opcode::I64Or {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    // i64.or: R0:R1 = R0:R1 | R2:R3
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::Orr {
-                        rd: Reg::R0,
-                        rn: Reg::R0,
-                        op2: Operand2::Reg(Reg::R2),
+                        rd: rd_lo,
+                        rn: rn_lo,
+                        op2: Operand2::Reg(rm_lo),
                     });
                     arm_instrs.push(ArmOp::Orr {
-                        rd: Reg::R1,
-                        rn: Reg::R1,
-                        op2: Operand2::Reg(Reg::R3),
+                        rd: rd_hi,
+                        rn: rn_hi,
+                        op2: Operand2::Reg(rm_hi),
                     });
-                    // Mark as i64 result - no final mov needed, result already in R0:R1
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 Opcode::I64Xor {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    // i64.xor: R0:R1 = R0:R1 ^ R2:R3
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::Eor {
-                        rd: Reg::R0,
-                        rn: Reg::R0,
-                        op2: Operand2::Reg(Reg::R2),
+                        rd: rd_lo,
+                        rn: rn_lo,
+                        op2: Operand2::Reg(rm_lo),
                     });
                     arm_instrs.push(ArmOp::Eor {
-                        rd: Reg::R1,
-                        rn: Reg::R1,
-                        op2: Operand2::Reg(Reg::R3),
+                        rd: rd_hi,
+                        rn: rn_hi,
+                        op2: Operand2::Reg(rm_hi),
                     });
-                    // Mark as i64 result - no final mov needed, result already in R0:R1
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // ========================================================================
-                // i64 Comparisons (result is single i32 in R0)
+                // i64 Comparisons (result is single i32)
+                //
+                // Sources are read from `vreg_to_arm[src*]` rather than hardcoded
+                // R0:R1/R2:R3 — the latter would mean "i64 ops always assume their
+                // operands materialised at the AAPCS arg slots", which is false:
+                // operand registers come from whatever the upstream IR producers
+                // (I64Const, I64Load, prior i64 ops) chose. Result lands on the lo
+                // half of a freshly allocated callee-saved pair so we don't smash
+                // any AAPCS arg reg the user hasn't read yet.
                 // ========================================================================
-                Opcode::I64Eq { dest, .. } => {
-                    // i64.eq: (R0:R1) == (R2:R3), result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64Eq {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::EQ,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64Ne { dest, .. } => {
-                    // i64.ne: (R0:R1) != (R2:R3), result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64Ne {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::NE,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64LtS { dest, .. } => {
-                    // i64.lt_s: (R0:R1) < (R2:R3) signed, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64LtS {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::LT,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64GtS { dest, .. } => {
-                    // i64.gt_s: (R0:R1) > (R2:R3) signed, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64GtS {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::GT,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64LeS { dest, .. } => {
-                    // i64.le_s: (R0:R1) <= (R2:R3) signed, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64LeS {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::LE,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64GeS { dest, .. } => {
-                    // i64.ge_s: (R0:R1) >= (R2:R3) signed, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64GeS {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::GE,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
                 // Unsigned i64 comparisons
-                Opcode::I64LtU { dest, .. } => {
-                    // i64.lt_u: (R0:R1) < (R2:R3) unsigned, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64LtU {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::LO,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64GtU { dest, .. } => {
-                    // i64.gt_u: (R0:R1) > (R2:R3) unsigned, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64GtU {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::HI,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64LeU { dest, .. } => {
-                    // i64.le_u: (R0:R1) <= (R2:R3) unsigned, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64LeU {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::LS,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64GeU { dest, .. } => {
-                    // i64.ge_u: (R0:R1) >= (R2:R3) unsigned, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                Opcode::I64GeU {
+                    dest,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
                     arm_instrs.push(ArmOp::I64SetCond {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                         cond: Condition::HS,
                     });
                     last_result_vreg = Some(dest.0);
                 }
 
-                Opcode::I64Eqz { dest, .. } => {
-                    // i64.eqz: (R0:R1) == 0, result in R0
-                    vreg_to_arm.insert(dest.0, Reg::R0);
-                    arm_instrs.push(ArmOp::I64SetCondZ {
-                        rd: Reg::R0,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                    });
+                Opcode::I64Eqz {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rn_lo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest.0, rd);
+                    arm_instrs.push(ArmOp::I64SetCondZ { rd, rn_lo, rn_hi });
                     last_result_vreg = Some(dest.0);
                 }
 
-                // i64 count leading zeros (returns i64 where high word is always 0)
-                Opcode::I64Clz { dest, .. } => {
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                // i64 count leading zeros (i64 result: lo gets count, hi must be 0).
+                //
+                // The ArmOp::I64Clz encoder writes the count into `rd` AND zeroes
+                // `rnhi` in-place — so `rnhi` doubles as the result's hi half. To
+                // keep the upstream src_hi register intact and avoid clobbering
+                // unrelated AAPCS regs, we copy src_hi into a freshly allocated
+                // callee-saved hi slot and pass that as `rnhi`. After the encoded
+                // sequence, the i64 result lives in (rd_lo, rd_hi).
+                //
+                // The IR Opcode only carries a single `dest` vreg (the lo half);
+                // we register dest.0 → rd_lo. The hi-zero is implicit and used by
+                // the function epilogue when this is the i64 return value (see
+                // last_result_vreg_hi_reg below).
+                Opcode::I64Clz {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    if rd_hi != rnhi_src { // a self-move would be a no-op, so skip it
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: rd_hi,
+                            op2: Operand2::Reg(rnhi_src),
+                        });
+                    }
+                    vreg_to_arm.insert(dest.0, rd_lo);
                     arm_instrs.push(ArmOp::I64Clz {
-                        rd: Reg::R0,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
+                        rd: rd_lo,
+                        rnlo,
+                        rnhi: rd_hi, // carries src_hi's value (copied above if needed)
                     });
                     last_result_vreg = Some(dest.0);
+                    last_result_vreg_hi_reg = Some(rd_hi); // no dest_hi vreg: track the reg
                     is_i64_result = true;
                 }
 
-                // i64 count trailing zeros (returns i64 where high word is always 0)
-                Opcode::I64Ctz { dest, .. } => {
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                // i64 count trailing zeros — same pattern as I64Clz above.
+                Opcode::I64Ctz {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    if rd_hi != rnhi_src { // a self-move would be a no-op, so skip it
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: rd_hi,
+                            op2: Operand2::Reg(rnhi_src),
+                        });
+                    }
+                    vreg_to_arm.insert(dest.0, rd_lo);
                     arm_instrs.push(ArmOp::I64Ctz {
-                        rd: Reg::R0,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
+                        rd: rd_lo,
+                        rnlo,
+                        rnhi: rd_hi, // carries src_hi's value (copied above if needed)
                     });
                     last_result_vreg = Some(dest.0);
+                    last_result_vreg_hi_reg = Some(rd_hi); // no dest_hi vreg: track the reg
                     is_i64_result = true;
                 }
 
-                // i64 population count (returns i64 where high word is always 0)
-                Opcode::I64Popcnt { dest, .. } => {
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                // i64 population count — same pattern as I64Clz above.
+                Opcode::I64Popcnt {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    if rd_hi != rnhi_src { // a self-move would be a no-op, so skip it
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: rd_hi,
+                            op2: Operand2::Reg(rnhi_src),
+                        });
+                    }
+                    vreg_to_arm.insert(dest.0, rd_lo);
                     arm_instrs.push(ArmOp::I64Popcnt {
-                        rd: Reg::R0,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
+                        rd: rd_lo,
+                        rnlo,
+                        rnhi: rd_hi, // carries src_hi's value (copied above if needed)
                     });
                     last_result_vreg = Some(dest.0);
+                    last_result_vreg_hi_reg = Some(rd_hi); // no dest_hi vreg: track the reg
                     is_i64_result = true;
                 }
 
                 // i64 sign extension operations
                 Opcode::I64Extend8S {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src_lo,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
-                    arm_instrs.push(ArmOp::I64Extend8S {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                    });
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
+                    arm_instrs.push(ArmOp::I64Extend8S { rdlo, rdhi, rnlo });
                     last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 Opcode::I64Extend16S {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src_lo,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
-                    arm_instrs.push(ArmOp::I64Extend16S {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                    });
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
+                    arm_instrs.push(ArmOp::I64Extend16S { rdlo, rdhi, rnlo });
                     last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 Opcode::I64Extend32S {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src_lo,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
-                    arm_instrs.push(ArmOp::I64Extend32S {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                    });
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
+                    arm_instrs.push(ArmOp::I64Extend32S { rdlo, rdhi, rnlo });
                     last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 multiply: UMULL + MLA cross products
                 Opcode::I64Mul {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::I64Mul {
-                        rd_lo: Reg::R0,
-                        rd_hi: Reg::R1,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd_lo,
+                        rd_hi,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 shift left
                 Opcode::I64Shl {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::I64Shl {
-                        rd_lo: Reg::R0,
-                        rd_hi: Reg::R1,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd_lo,
+                        rd_hi,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 arithmetic shift right
                 Opcode::I64ShrS {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::I64ShrS {
-                        rd_lo: Reg::R0,
-                        rd_hi: Reg::R1,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd_lo,
+                        rd_hi,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 logical shift right
                 Opcode::I64ShrU {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rd_lo);
+                    vreg_to_arm.insert(dest_hi.0, rd_hi);
                     arm_instrs.push(ArmOp::I64ShrU {
-                        rd_lo: Reg::R0,
-                        rd_hi: Reg::R1,
-                        rn_lo: Reg::R0,
-                        rn_hi: Reg::R1,
-                        rm_lo: Reg::R2,
-                        rm_hi: Reg::R3,
+                        rd_lo,
+                        rd_hi,
+                        rn_lo,
+                        rn_hi,
+                        rm_lo,
+                        rm_hi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 rotate left
                 Opcode::I64Rotl {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    .. // rest of the pattern is ignored — presumably src2_hi; TODO confirm
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let shift = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
                     arm_instrs.push(ArmOp::I64Rotl {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
-                        shift: Reg::R2, // Only use low word of shift amount
+                        rdlo,
+                        rdhi,
+                        rnlo,
+                        rnhi,
+                        shift, // only the low word of the shift amount is used
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 rotate right
                 Opcode::I64Rotr {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    .. // rest of the pattern is ignored — presumably src2_hi; TODO confirm
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let shift = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
                     arm_instrs.push(ArmOp::I64Rotr {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
-                        shift: Reg::R2, // Only use low word of shift amount
+                        rdlo,
+                        rdhi,
+                        rnlo,
+                        rnhi,
+                        shift, // only the low word of the shift amount is used
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 signed division
                 Opcode::I64DivS {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
                     arm_instrs.push(ArmOp::I64DivS {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
-                        rmlo: Reg::R2,
-                        rmhi: Reg::R3,
+                        rdlo,
+                        rdhi,
+                        rnlo,
+                        rnhi,
+                        rmlo,
+                        rmhi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 unsigned division
                 Opcode::I64DivU {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
                     arm_instrs.push(ArmOp::I64DivU {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
-                        rmlo: Reg::R2,
-                        rmhi: Reg::R3,
+                        rdlo,
+                        rdhi,
+                        rnlo,
+                        rnhi,
+                        rmlo,
+                        rmhi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 signed remainder
                 Opcode::I64RemS {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
                     arm_instrs.push(ArmOp::I64RemS {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
-                        rmlo: Reg::R2,
-                        rmhi: Reg::R3,
+                        rdlo,
+                        rdhi,
+                        rnlo,
+                        rnhi,
+                        rmlo,
+                        rmhi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
                 // i64 unsigned remainder
                 Opcode::I64RemU {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src1_lo,
+                    src1_hi,
+                    src2_lo,
+                    src2_hi,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
+                    let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs);
+                    let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs);
+                    let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
                     arm_instrs.push(ArmOp::I64RemU {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
-                        rmlo: Reg::R2,
-                        rmhi: Reg::R3,
+                        rdlo,
+                        rdhi,
+                        rnlo,
+                        rnhi,
+                        rmlo,
+                        rmhi,
                     });
+                    last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
 
@@ -2996,36 +3414,19 @@ impl OptimizerBridge {
                 ArmOp::Mov {
                     rd,
                     op2: Operand2::Imm(v),
-                } => {
-                    if reg_num(rd) > 7 || *v > 255 || *v < 0 {
-                        4
-                    } else {
-                        2
-                    }
-                }
+                } if reg_num(rd) > 7 || *v > 255 || *v < 0 => 4,
+                ArmOp::Mov { .. } => 2,
                 // SUB/ADD with high registers need 32-bit encoding
                 ArmOp::Sub {
                     rd,
                     rn,
                     op2: Operand2::Reg(rm),
-                } => {
-                    if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 {
-                        4
-                    } else {
-                        2
-                    }
-                }
+                } if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 => 4,
                 ArmOp::Add {
                     rd,
                     rn,
                     op2: Operand2::Reg(rm),
-                } => {
-                    if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 {
-                        4
-                    } else {
-                        2
-                    }
-                }
+                } if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 => 4,
                 // Most 16-bit Thumb instructions (MOV low, CMP low, B, etc.)
                 _ => 2,
             }
@@ -3066,9 +3467,64 @@ impl OptimizerBridge {
             }
         }
 
-        // Ensure return value is in R0 (skip for i64 results which are already in R0:R1)
-        if !is_i64_result
-            && let Some(result_vreg) = last_result_vreg
+        // Ensure the return value is in R0 (i32 result) or R0:R1 (i64 result).
+        //
+        // Pre-fix, every i64 op pinned its result at R0:R1 so this could be a
+        // no-op for is_i64_result. After the fix, the result pair may live in
+        // any callee-saved pair (R4:R5..R10:R11), and we need an explicit move.
+        //
+        // Move order: hi → R1 is emitted before lo → R0, so a hi half that
+        // currently sits in R0 is read before R0 is overwritten. The mirror
+        // hazard (lo currently in R1) is handled by parking lo in R12 first.
+        if is_i64_result {
+            // Resolve the lo half from vreg_to_arm.
+            let lo_reg = last_result_vreg.and_then(|v| vreg_to_arm.get(&v).copied());
+            // Resolve the hi half: prefer an explicit vreg id, else fall back to
+            // the physical reg stash used by Clz/Ctz/Popcnt.
+            let hi_reg = last_result_vreg_hi
+                .and_then(|v| vreg_to_arm.get(&v).copied())
+                .or(last_result_vreg_hi_reg);
+
+            match (lo_reg, hi_reg) {
+                (Some(lo), Some(hi)) => {
+                    // Writing hi into R1 would destroy a lo half that lives in
+                    // R1, so copy lo to R12 before R1 is written.
+                    // NOTE(review): assumes R12 is free as a scratch register at
+                    // this point — confirm against the register allocator.
+                    let lo_src = if lo == Reg::R1 && hi != Reg::R1 {
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: Reg::R12,
+                            op2: Operand2::Reg(lo),
+                        });
+                        Reg::R12
+                    } else {
+                        lo
+                    };
+                    if hi != Reg::R1 {
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: Reg::R1,
+                            op2: Operand2::Reg(hi),
+                        });
+                    }
+                    if lo_src != Reg::R0 {
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: Reg::R0,
+                            op2: Operand2::Reg(lo_src),
+                        });
+                    }
+                }
+                // Hi is unknown — is_i64_result may have been set without
+                // populating the hi tracker. Emit only the lo → R0 move rather
+                // than crash.
+                (Some(lo), None) if lo != Reg::R0 => {
+                    arm_instrs.push(ArmOp::Mov {
+                        rd: Reg::R0,
+                        op2: Operand2::Reg(lo),
+                    });
+                }
+                _ => {}
+            }
+        } else if let Some(result_vreg) = last_result_vreg
             && let Some(&result_reg) = vreg_to_arm.get(&result_vreg)
             && result_reg != Reg::R0
         {
diff --git a/crates/synth-synthesis/src/pattern_matcher.rs b/crates/synth-synthesis/src/pattern_matcher.rs
index 3c61b36..c7e1d30 100644
--- a/crates/synth-synthesis/src/pattern_matcher.rs
+++ b/crates/synth-synthesis/src/pattern_matcher.rs
@@ -53,7 +53,7 @@ impl PatternMatcher {
         }
 
         // Sort by priority (highest first)
-        matches.sort_by(|a, b| b.rule.priority.cmp(&a.rule.priority));
+        matches.sort_by_key(|m| std::cmp::Reverse(m.rule.priority));
 
         matches
     }
diff --git a/crates/synth-synthesis/src/rules.rs b/crates/synth-synthesis/src/rules.rs
index 8180fbd..687b40a 100644
--- a/crates/synth-synthesis/src/rules.rs
+++ b/crates/synth-synthesis/src/rules.rs
@@ -1835,7 +1835,7 @@ impl RuleDatabase {
     pub fn add_rule(&mut self, rule: SynthesisRule) {
         self.rules.push(rule);
         // Sort by priority (highest first)
-        self.rules.sort_by(|a, b| b.priority.cmp(&a.priority));
+        self.rules.sort_by_key(|r| std::cmp::Reverse(r.priority));
     }
 
     /// Get all rules
diff --git a/crates/synth-synthesis/tests/semantic_correctness.rs b/crates/synth-synthesis/tests/semantic_correctness.rs
index 9ae0910..45e72ba 100644
--- a/crates/synth-synthesis/tests/semantic_correctness.rs
+++ b/crates/synth-synthesis/tests/semantic_correctness.rs
@@ -146,8 +146,8 @@ fn interpret_single(state: &mut ArmState, instr: &ArmInstruction) {
         ArmOp::Udiv { rd, rn, rm } => {
             let a = state.get(rn);
             let b = state.get(rm);
-            if b != 0 {
-                state.set(*rd, a / b);
+            if let Some(q) = a.checked_div(b) {
+                state.set(*rd, q);
             }
         }
         ArmOp::Mls { rd, rn, rm, ra } => {
@@ -233,11 +233,9 @@ fn interpret_single(state: &mut ArmState, instr: &ArmInstruction) {
             let sr = result as i32;
             state.flag_v = (sa > 0 && sb > 0 && sr < 0) || (sa < 0 && sb < 0 && sr >= 0);
         }
-        ArmOp::SelectMove { rd, rm, cond } => {
-            if state.condition_met(cond) {
-                let val = state.get(rm);
-                state.set(*rd, val);
-            }
+        ArmOp::SelectMove { rd, rm, cond } if state.condition_met(cond) => {
+            let val = state.get(rm);
+            state.set(*rd, val);
         }
         // Skip non-computational instructions (prologue/epilogue, branches, labels)
         _ => {}
diff --git a/tests/integration/m7_codegen_smoke.sh b/tests/integration/m7_codegen_smoke.sh
new file mode 100755
index 0000000..1efbc6e
--- /dev/null
+++ b/tests/integration/m7_codegen_smoke.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+# Smoke test: validate that synth's Cortex-M7 codegen path produces a
+# well-formed ELF for both single-precision (M7) and double-precision (M7DP)
+# targets, exercising f32 and f64 arithmetic.
+#
+# Companion to fetch_osxcar_wasm.sh, which exercises M7DP with real-world
+# components. This test runs without network access and is suitable for CI.
+#
+# Usage:
+#   bash tests/integration/m7_codegen_smoke.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+SYNTH="$PROJECT_ROOT/target/debug/synth"
+# Private scratch directory. mktemp picks an unpredictable name, and a separate
+# variable (not TMPDIR) leaves any inherited TMPDIR untouched for child processes.
+WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/synth_m7_smoke.XXXXXX")"
+
+cleanup() { rm -rf "$WORKDIR"; }
+trap cleanup EXIT
+
+echo "=== Synth M7 codegen smoke test ==="
+
+# Build the CLI on demand when no debug binary is present.
+if [ ! -x "$SYNTH" ]; then
+    (cd "$PROJECT_ROOT" && cargo build -p synth-cli --quiet)
+fi
+
+# i32-only module — should compile under M7 (single FPU)
+cat > "$WORKDIR/i32_only.wat" << 'WAT'
+(module
+  (func (export "add") (param i32 i32) (result i32)
+    local.get 0 local.get 1 i32.add)
+  (func (export "sub") (param i32 i32) (result i32)
+    local.get 0 local.get 1 i32.sub)
+  (func (export "mul") (param i32 i32) (result i32)
+    local.get 0 local.get 1 i32.mul)
+  (memory (export "memory") 1))
+WAT
+
+# f32 module — single-precision, should compile under M7
+cat > "$WORKDIR/f32.wat" << 'WAT'
+(module
+  (func (export "fadd") (param f32 f32) (result f32)
+    local.get 0 local.get 1 f32.add)
+  (func (export "fmul") (param f32 f32) (result f32)
+    local.get 0 local.get 1 f32.mul)
+  (memory (export "memory") 1))
+WAT
+
+# f64 module — double-precision; only compiled for M7DP below
+cat > "$WORKDIR/f64.wat" << 'WAT'
+(module
+  (func (export "dadd") (param f64 f64) (result f64)
+    local.get 0 local.get 1 f64.add)
+  (func (export "dmul") (param f64 f64) (result f64)
+    local.get 0 local.get 1 f64.mul)
+  (memory (export "memory") 1))
+WAT
+
+PASS=0
+FAIL=0
+
+# is_elf FILE — succeeds when FILE begins with the 4-byte ELF magic (7f 45 4c 46).
+is_elf() {
+    [ "$(head -c 4 "$1" 2>/dev/null | od -An -tx1 | tr -d ' \n')" = "7f454c46" ]
+}
+
+# check_compile LABEL WAT TARGET EXPECT
+#
+# Compiles WAT for TARGET and compares the outcome with EXPECT ("ok" or
+# "fail"). "ok" means synth exited 0 AND the output file starts with the ELF
+# magic. Updates the PASS/FAIL counters; on a mismatch the captured compiler
+# output is printed (indented) so CI failures can be diagnosed.
+check_compile() {
+    local label="$1" wat="$2" target="$3" expect="$4"
+    local elf="$WORKDIR/${label}.elf" log="$WORKDIR/${label}.log" result="fail"
+    if "$SYNTH" compile "$wat" -o "$elf" --target "$target" --all-exports >"$log" 2>&1; then
+        if is_elf "$elf"; then
+            result="ok"
+        else
+            echo "compile succeeded but output is not an ELF file: $elf" >>"$log"
+        fi
+    fi
+    if [ "$result" = "$expect" ]; then
+        echo "PASS: ${label} on ${target} → ${result}"
+        PASS=$((PASS + 1))
+    else
+        echo "FAIL: ${label} on ${target} → ${result} (expected ${expect})"
+        sed 's/^/    /' "$log"
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+# i32 should compile on every M7 variant
+check_compile "i32_m7"   "$WORKDIR/i32_only.wat" cortex-m7   ok
+check_compile "i32_m7dp" "$WORKDIR/i32_only.wat" cortex-m7dp ok
+
+# f32 should compile on both (M7 has single-precision FPU)
+check_compile "f32_m7"   "$WORKDIR/f32.wat" cortex-m7   ok
+check_compile "f32_m7dp" "$WORKDIR/f32.wat" cortex-m7dp ok
+
+# f64 must compile on M7DP. On plain M7 synth is expected to fall back to
+# soft-float helpers, but that path is not exercised by this smoke test.
+check_compile "f64_m7dp" "$WORKDIR/f64.wat" cortex-m7dp ok
+
+echo ""
+echo "=== Results: ${PASS} passed, ${FAIL} failed ==="
+# Exit status reflects the overall result: non-zero if any check failed.
+[ "$FAIL" -eq 0 ]
diff --git a/tests/renode/BUILD.bazel b/tests/renode/BUILD.bazel
index 83f32c7..2a247a6 100644
--- a/tests/renode/BUILD.bazel
+++ b/tests/renode/BUILD.bazel
@@ -1,7 +1,10 @@
 load("@rules_renode//renode:defs.bzl", "renode_test")
 
-# Export platform file for use by other test packages
-exports_files(["synth_cortex_m.repl"])
+# Export platform files for use by other test packages
+exports_files([
+    "synth_cortex_m.repl",
+    "synth_cortex_m7.repl",
+])
 
 # Renode-based integration tests for Synth-generated ARM binaries
 
@@ -46,3 +49,24 @@ renode_test(
     },
     tags = ["renode"],
 )
+
+# M7 codegen path: same WAT, compiled with --target cortex-m7
+genrule(
+    name = "test_add_m7_elf",
+    srcs = ["//examples/wat:simple_add.wat"],
+    outs = ["test_add_m7.elf"],
+    cmd = "$(location //crates:synth) compile $(location //examples/wat:simple_add.wat) -o $@ --target cortex-m7",
+    tools = ["//crates:synth"],
+)
+
+renode_test(
+    name = "cortex_m7_add_test",
+    robot_test = "cortex_m7_test.robot",
+    deps = [
+        "synth_cortex_m7.repl",
+    ],
+    variables_with_label = {
+        "ELF": "//tests/renode:test_add_m7_elf",
+    },
+    tags = ["renode"],
+)
diff --git a/tests/renode/cortex_m7_test.robot b/tests/renode/cortex_m7_test.robot
new file mode 100644
index 0000000..a7ce7b3
--- /dev/null
+++ b/tests/renode/cortex_m7_test.robot
@@ -0,0 +1,32 @@
+*** Settings ***
+Documentation     Cortex-M7 ELF execution test for Synth-generated binaries
+...               Validates that Synth's M7 codegen path emits a correctly
+...               structured ELF that loads and executes on a 16-MPU-region
+...               M7-class platform with single-precision FPU.
+
+*** Variables ***
+${PLATFORM}                         ${CURDIR}/synth_cortex_m7.repl
+
+*** Keywords ***
+Create Cortex-M7 Machine
+    Execute Command                 mach create "synth-m7-test"
+    Execute Command                 machine LoadPlatformDescription @${PLATFORM}
+
+*** Test Cases ***
+Should Load And Execute Simple Add Function On M7
+    [Documentation]                 Synth-generated --target cortex-m7 ELF executes correctly
+    Create Cortex-M7 Machine
+
+    Execute Command                 sysbus LoadELF "${ELF}"
+
+    # The add function lives at 0xA0 (user code, after 28-byte startup + handlers)
+    Execute Command                 cpu PC 0xA1
+
+    # AAPCS: r0 = 5, r1 = 3, expected result = 8
+    Execute Command                 cpu SetRegisterUnsafe 0 5
+    Execute Command                 cpu SetRegisterUnsafe 1 3
+
+    Execute Command                 cpu Step 2
+
+    ${r0}=                          Execute Command  cpu GetRegisterUnsafe 0
+    Should Be Equal As Integers     ${r0}  8  msg=Expected r0 to be 8 (5+3) on M7
diff --git a/tests/renode/synth_cortex_m7.repl b/tests/renode/synth_cortex_m7.repl
new file mode 100644
index 0000000..cb194c4
--- /dev/null
+++ b/tests/renode/synth_cortex_m7.repl
@@ -0,0 +1,29 @@
+// High-end Cortex-M7 platform for Synth-generated binaries
+// Models a typical M7 SoC with single-precision FPU, 16 MPU regions,
+// large OCRAM, and external XIP-capable QuadSPI flash. Vector table
+// lives at 0x60000000 (XIP flash window) on i.MX RT-class chips,
+// but we place the binary at 0x0 here for simple bring-up.
+
+flash: Memory.MappedMemory @ sysbus 0x0
+    size: 0x80000   // 512KB flash window; ends just below ITCM at 0x00080000 (no overlap)
+
+itcm: Memory.MappedMemory @ sysbus 0x00080000
+    size: 0x40000   // 256KB ITCM
+
+dtcm: Memory.MappedMemory @ sysbus 0x20000000
+    size: 0x40000   // 256KB DTCM
+
+ocram: Memory.MappedMemory @ sysbus 0x20200000
+    size: 0x80000   // 512KB OCRAM (FlexRAM-mapped)
+
+ocram2: Memory.MappedMemory @ sysbus 0x20280000
+    size: 0x80000   // 512KB additional OCRAM
+
+nvic: IRQControllers.NVIC @ sysbus 0xE000E000
+    priorityMask: 0xF0
+    systickFrequency: 600000000  // 600 MHz typical for M7-class parts
+    IRQ -> cpu@0
+
+cpu: CPU.CortexM @ sysbus
+    cpuType: "cortex-m7"
+    nvic: nvic