diff --git a/crates/synth-analysis/src/ssa.rs b/crates/synth-analysis/src/ssa.rs index 37a77f2..920adc3 100644 --- a/crates/synth-analysis/src/ssa.rs +++ b/crates/synth-analysis/src/ssa.rs @@ -335,11 +335,11 @@ impl DeadCodeElimination { SSAInstr::Assign { result, .. } | SSAInstr::BinOp { result, .. } | SSAInstr::UnaryOp { result, .. } - | SSAInstr::Load { result, .. } => { - if !used_vars.contains(result) { - removed += 1; - return false; - } + | SSAInstr::Load { result, .. } + if !used_vars.contains(result) => + { + removed += 1; + return false; } _ => {} } diff --git a/crates/synth-backend/src/mpu_allocator.rs b/crates/synth-backend/src/mpu_allocator.rs index 95b31f9..58f836c 100644 --- a/crates/synth-backend/src/mpu_allocator.rs +++ b/crates/synth-backend/src/mpu_allocator.rs @@ -339,4 +339,86 @@ mod tests { assert!(region.validate().is_ok()); } } + + #[test] + fn test_imxrt1062_has_16_regions() { + // i.MX RT1062 (M7-class) has 16 MPU regions vs 8 on M4-class parts + let hw_caps = HardwareCapabilities::imxrt1062(); + assert_eq!(hw_caps.mpu_regions, 16); + + let allocator = MPUAllocator::new(hw_caps); + assert_eq!(allocator.available_regions(), 16); + } + + #[test] + fn test_m7_can_allocate_more_than_8_regions() { + // Validate that the allocator actually uses all 16 regions on M7 + let mut allocator = MPUAllocator::new(HardwareCapabilities::imxrt1062()); + + for i in 0u32..16 { + let request = MPUAllocationRequest { + memory: Memory { + index: i, + initial: 1, + maximum: None, + shared: false, + memory64: false, + }, + permissions: MPUPermissions::FullRW, + attributes: MPUAttributes::normal(), + preferred_base: Some(0x20000000 + i * 0x10000), + }; + allocator.allocate(request).unwrap_or_else(|e| { + panic!("region {} allocation failed: {:?}", i, e); + }); + } + + assert_eq!(allocator.available_regions(), 0); + assert_eq!(allocator.allocated_regions().len(), 16); + } + + #[test] + fn test_m4_class_caps_at_8_regions() { + // Negative — M4-class parts must reject 
the 9th region. + let mut allocator = MPUAllocator::new(HardwareCapabilities::nrf52840()); + + for i in 0u32..8 { + let request = MPUAllocationRequest { + memory: Memory { + index: i, + initial: 1, + maximum: None, + shared: false, + memory64: false, + }, + permissions: MPUPermissions::FullRW, + attributes: MPUAttributes::normal(), + preferred_base: Some(0x20000000 + i * 0x10000), + }; + allocator.allocate(request).unwrap(); + } + + // 9th region must fail + let overflow = MPUAllocationRequest { + memory: Memory { + index: 8, + initial: 1, + maximum: None, + shared: false, + memory64: false, + }, + permissions: MPUPermissions::FullRW, + attributes: MPUAttributes::normal(), + preferred_base: Some(0x20100000), + }; + assert!(allocator.allocate(overflow).is_err()); + } + + #[test] + fn test_stm32h743_has_16_regions_and_double_fpu() { + let caps = HardwareCapabilities::stm32h743(); + assert_eq!(caps.mpu_regions, 16); + assert!(caps.has_fpu); + assert_eq!(caps.fpu_precision, Some(synth_core::FPUPrecision::Double)); + } } diff --git a/crates/synth-cli/src/main.rs b/crates/synth-cli/src/main.rs index cfd4eea..b78ea11 100644 --- a/crates/synth-cli/src/main.rs +++ b/crates/synth-cli/src/main.rs @@ -77,7 +77,7 @@ enum Commands { )] target: String, - /// Hardware config (nrf52840, stm32f407, or custom) + /// Hardware config (nrf52840, stm32f407, stm32h743, imxrt1062, or custom) #[arg(long, value_name = "HARDWARE", default_value = "nrf52840")] hardware: String, @@ -170,6 +170,11 @@ enum Commands { /// Path to kiln-builtins object file (.o) for linking (used with --link) #[arg(long, value_name = "BUILTINS")] builtins: Option, + + /// Force relocatable object (.o, ET_REL) output even when wasm has no imports + /// — for linking into a host build system. 
+ #[arg(long)] + relocatable: bool, }, /// Disassemble an ARM ELF file (e.g., synth disasm output.elf) @@ -248,6 +253,7 @@ fn main() -> Result<()> { verify, link, builtins, + relocatable, } => { // Resolve target spec: --target overrides, --cortex-m is backwards compat let target_spec = resolve_target_spec(target.as_deref(), cortex_m)?; @@ -272,6 +278,7 @@ fn main() -> Result<()> { &backend, verify, &target_spec, + relocatable, )?; // If --link requested, invoke the cross-linker @@ -356,9 +363,11 @@ fn synthesize_command( let hw_caps = match hardware.as_str() { "nrf52840" => HardwareCapabilities::nrf52840(), "stm32f407" => HardwareCapabilities::stm32f407(), + "stm32h743" => HardwareCapabilities::stm32h743(), + "imxrt1062" => HardwareCapabilities::imxrt1062(), _ => { anyhow::bail!( - "Unsupported hardware: {}. Use nrf52840 or stm32f407", + "Unsupported hardware: {}. Use nrf52840, stm32f407, stm32h743, imxrt1062", hardware ); } @@ -398,8 +407,19 @@ fn target_info_command(target: String) -> Result<()> { let caps = HardwareCapabilities::stm32f407(); print_hardware_info(&caps); } + "stm32h743" => { + let caps = HardwareCapabilities::stm32h743(); + print_hardware_info(&caps); + } + "imxrt1062" => { + let caps = HardwareCapabilities::imxrt1062(); + print_hardware_info(&caps); + } _ => { - anyhow::bail!("Unknown target: {}. Supported: nrf52840, stm32f407", target); + anyhow::bail!( + "Unknown target: {}. 
Supported: nrf52840, stm32f407, stm32h743, imxrt1062", + target + ); } } @@ -553,6 +573,7 @@ fn compile_command( backend_name: &str, verify: bool, target_spec: &TargetSpec, + relocatable: bool, ) -> Result<()> { // Validate backend exists let registry = build_backend_registry(); @@ -595,6 +616,7 @@ fn compile_command( backend, verify, target_spec, + relocatable, ); } @@ -1222,6 +1244,7 @@ fn compile_all_exports( backend: &dyn Backend, verify: bool, target_spec: &TargetSpec, + relocatable: bool, ) -> Result<()> { let path = input.context("--all-exports requires an input file")?; @@ -1428,8 +1451,18 @@ fn compile_all_exports( // When there are relocations, produce a relocatable object (.o) instead of // an executable. This lets the output be linked with the Kiln bridge crate // (which provides __meld_dispatch_import and __meld_get_memory_base). - let elf_data = if has_relocations { - info!("Module has import calls — producing relocatable object (ET_REL)"); + // The --relocatable flag forces ET_REL output even when the wasm has no + // imports, for linking into a host build system (e.g. Zephyr). + let elf_data = if has_relocations || relocatable { + let total_relocs: usize = compiled_funcs.iter().map(|f| f.relocations.len()).sum(); + if has_relocations { + info!( + "Producing relocatable object (ET_REL): {} import call relocations", + total_relocs + ); + } else { + info!("Producing relocatable object (ET_REL): forced by --relocatable"); + } build_relocatable_elf(&compiled_funcs, &all_imports)? } else if cortex_m { build_multi_func_cortex_m_elf(&compiled_funcs, &all_memories, target_spec)? 
diff --git a/crates/synth-core/src/target.rs b/crates/synth-core/src/target.rs index b808679..93e73bd 100644 --- a/crates/synth-core/src/target.rs +++ b/crates/synth-core/src/target.rs @@ -234,6 +234,49 @@ impl HardwareCapabilities { ram_size: 192 * 1024, // 192KB (128KB + 64KB CCM) } } + + /// Create capabilities for STM32H743 (Cortex-M7 with double-precision FPU) + /// + /// 16 MPU regions, 2MB Flash, 1MB RAM (DTCM + AXI SRAM + SRAM1-4). + pub fn stm32h743() -> Self { + Self { + arch: TargetArch::ARMCortexM(CortexMVariant::M7DP), + has_mpu: true, + mpu_regions: 16, + has_pmp: false, + pmp_entries: 0, + has_fpu: true, + fpu_precision: Some(FPUPrecision::Double), + has_simd: false, + simd_level: None, + xip_capable: true, + flash_size: 2 * 1024 * 1024, // 2MB + ram_size: 1024 * 1024, // 1MB total + } + } + + /// Create capabilities for i.MX RT1062 (Cortex-M7 with single-precision FPU) + /// + /// Representative high-end M7 with 16 MPU regions, single-precision FPU, + /// large OCRAM, and external XIP-capable QuadSPI Flash. Matches the + /// configuration of safety-grade lockstepped M7 platforms used in + /// industrial and embedded automotive contexts. 
+ pub fn imxrt1062() -> Self { + Self { + arch: TargetArch::ARMCortexM(CortexMVariant::M7), + has_mpu: true, + mpu_regions: 16, + has_pmp: false, + pmp_entries: 0, + has_fpu: true, + fpu_precision: Some(FPUPrecision::Single), + has_simd: false, + simd_level: None, + xip_capable: true, + flash_size: 8 * 1024 * 1024, // 8MB external QSPI flash (typical) + ram_size: 1024 * 1024, // 1MB OCRAM (FlexRAM 512KB + OCRAM 512KB) + } + } } // ============================================================================ diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index 4f43edb..6821773 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -132,6 +132,66 @@ fn alloc_temp_safe(next_temp: &mut u8, stack: &[Reg]) -> Result { )) } +/// Allocate a CONSECUTIVE pair `(rN, rN+1)` of registers from ALLOCATABLE_REGS, +/// neither of which is currently in use. +/// +/// "In use" means: +/// 1. On the wasm stack (the explicit `Vec` tracking). +/// 2. The implicit *high* register of any i64 value on the stack — for every +/// `lo` in `stack`, [`i64_pair_hi`]`(lo)` is also reserved. The wasm stack +/// only tracks the lo register of each i64; the hi is reserved by +/// convention but invisible to a naive scan. If we ignored that, a fresh +/// `alloc_consecutive_pair` could return the implicit-hi of an earlier +/// i64, clobbering it on the next i64 op that reads it via i64_pair_hi. +/// 3. Any explicit registers in `extra_avoid` — used by i64-op handlers to +/// keep the just-popped operand pairs alive across the destination +/// allocation (e.g. for I64Or, the popped a_lo/a_hi/b_lo/b_hi are still +/// live until the OR is emitted). 
+/// +/// Calling [`alloc_temp_safe`] twice in succession is unsafe for i64 values: +/// if a register between them is live, the second call skips it and the +/// resulting pair is non-consecutive, breaking [`i64_pair_hi`]'s contract. +fn alloc_consecutive_pair( + next_temp: &mut u8, + stack: &[Reg], + extra_avoid: &[Reg], +) -> Result<(Reg, Reg)> { + // Build a "live" Vec: every stack entry, plus its conventional + // pair_hi (over-reserves for i32 stack entries but that's safe), plus + // any explicit extras the caller specifies. Using Vec rather than + // HashSet because Reg in this crate does not derive Hash. + let mut live: Vec<Reg> = Vec::with_capacity(stack.len() * 2 + extra_avoid.len()); + for &reg in stack { + live.push(reg); + if let Ok(hi) = i64_pair_hi(reg) { + live.push(hi); + } + } + for &reg in extra_avoid { + live.push(reg); + } + + let n = ALLOCATABLE_REGS.len(); + for _ in 0..n { + let lo_idx = (*next_temp as usize) % n; + let hi_idx = lo_idx + 1; + if hi_idx < n { + let lo_reg = ALLOCATABLE_REGS[lo_idx]; + let hi_reg = ALLOCATABLE_REGS[hi_idx]; + if !live.contains(&lo_reg) && !live.contains(&hi_reg) { + *next_temp = ((hi_idx + 1) % n) as u8; + return Ok((lo_reg, hi_reg)); + } + } + *next_temp = ((*next_temp as usize + 1) % n) as u8; + } + Err(synth_core::Error::synthesis( + "register exhaustion: no consecutive pair of free registers for i64 — \ + function too complex for current register allocator" + .to_string(), + )) +} + /// Given the low register of an i64 register pair, return the high register. /// /// Convention: i64 values on 32-bit ARM use two consecutive registers. @@ -156,6 +216,162 @@ fn i64_pair_hi(lo_reg: Reg) -> Result<Reg> { ))) } +/// Per-function stack-frame layout for non-parameter locals. +/// +/// `locals[idx]` gives the byte offset (relative to SP after the frame +/// allocation) where local `idx` lives, plus its i64 flag. `frame_size` is the total bytes +/// to allocate via `sub sp, sp, #frame_size` in the prologue.
+/// +/// i32/i64 locals each occupy 4/8 bytes respectively. i64 locals are +/// 8-byte aligned per AAPCS. The total frame is rounded up to a multiple of 8 bytes +/// to keep SP 8-byte aligned at call sites. +struct LocalLayout { + /// idx -> (offset_from_sp, is_i64) + locals: std::collections::HashMap<u32, (i32, bool)>, + frame_size: i32, +} + +/// Compute the stack-frame layout for non-parameter locals in a function. +/// +/// Walks the wasm op stream once to: +/// 1. Identify which non-param local indices are referenced (LocalGet/Set/Tee). +/// 2. Determine each local's width via `infer_i64_locals` (i32 vs i64). +/// 3. Lay them out in ascending-index order with i64 locals 8-byte aligned. +/// +/// The result drives: +/// - Prologue: `sub sp, sp, #frame_size` after pushing callee-saved regs. +/// - LocalGet/Set/Tee: use `locals[idx]` instead of the legacy +/// `(idx - 4) * 4` formula (which only happened to work when num_params==4 +/// AND the formula's negative result was silently clamped to 0 by the +/// encoder, in both cases corrupting the caller's stack or the callee's +/// own callee-saved-register spill). +/// - Epilogue: `add sp, sp, #frame_size` before popping registers. +fn compute_local_layout(wasm_ops: &[WasmOp], num_params: u32) -> LocalLayout { + use std::collections::{BTreeSet, HashMap}; + let i64_set = infer_i64_locals(wasm_ops); + + // Collect non-param local indices, in ascending order for deterministic layout. + let mut used: BTreeSet<u32> = BTreeSet::new(); + for op in wasm_ops { + match op { + WasmOp::LocalGet(idx) | WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) + if *idx >= num_params => + { + used.insert(*idx); + } + _ => {} + } + } + + let mut locals: HashMap<u32, (i32, bool)> = HashMap::new(); + let mut offset: i32 = 0; + for &idx in &used { + let is_i64 = i64_set.contains(&idx); + // i64 locals require 8-byte alignment.
+ if is_i64 && (offset % 8) != 0 { + offset += 4; + } + locals.insert(idx, (offset, is_i64)); + offset += if is_i64 { 8 } else { 4 }; + } + // Round frame to 8-byte multiple for AAPCS SP alignment. + let frame_size = (offset + 7) & !7; + + LocalLayout { locals, frame_size } +} + +/// Infer which non-parameter wasm locals are i64 (8-byte) values. +/// +/// The wasm decoder discards local-declaration type info, so we re-derive +/// it from the operation stream by simulating a virtual stack of width flags +/// (false = 32-bit, true = 64-bit). On each `LocalSet`/`LocalTee` we record the +/// width of the value being stored. WASM type rules guarantee a local's +/// width is invariant for its lifetime, so the first store wins. +/// +/// Without this, the spilled-local store/load path would emit a single +/// 4-byte STR/LDR for i64 locals, dropping the upper half — corrupting +/// any function that returns or uses a u64-packed FFI struct. +fn infer_i64_locals(wasm_ops: &[WasmOp]) -> std::collections::HashSet<u32> { + use WasmOp::*; + let mut i64_locals: std::collections::HashSet<u32> = std::collections::HashSet::new(); + let mut vstack: Vec<bool> = Vec::new(); // true = i64 + + let is_i64_producer = |op: &WasmOp| -> bool { + matches!( + op, + I64Add + | I64Sub + | I64Mul + | I64DivS + | I64DivU + | I64RemS + | I64RemU + | I64And + | I64Or + | I64Xor + | I64Shl + | I64ShrS + | I64ShrU + | I64Rotl + | I64Rotr + | I64Clz + | I64Ctz + | I64Popcnt + | I64Const(_) + | I64Load { .. } + | I64Load8S { .. } + | I64Load8U { .. } + | I64Load16S { .. } + | I64Load16U { .. } + | I64Load32S { .. } + | I64Load32U { ..
} + | I64ExtendI32S + | I64ExtendI32U + | I64Extend8S + | I64Extend16S + | I64Extend32S + ) + }; + + for op in wasm_ops { + match op { + LocalGet(idx) => { + let is_i64 = i64_locals.contains(idx); + vstack.push(is_i64); + } + LocalSet(idx) => { + if let Some(true) = vstack.pop() { + i64_locals.insert(*idx); + } + } + LocalTee(idx) => { + if let Some(&true) = vstack.last() { + i64_locals.insert(*idx); + } + } + Select => { + // pops [val1, val2, cond], pushes one value with width of val1/val2 + let _cond = vstack.pop(); + let v2 = vstack.pop(); + let v1 = vstack.pop(); + vstack.push(v1.or(v2).unwrap_or(false)); + } + _ => { + let (pops, pushes) = wasm_stack_effect(op); + for _ in 0..pops { + vstack.pop(); + } + let push_width = is_i64_producer(op); + for _ in 0..pushes { + vstack.push(push_width); + } + } + } + } + + i64_locals +} + /// Return the (pops, pushes) stack effect for a WASM op. /// /// Used by the wildcard fallthrough in select_with_stack to maintain @@ -3220,9 +3436,12 @@ impl InstructionSelector { let mut instructions = Vec::new(); - // Function prologue: save callee-saved registers and LR. + // Function prologue: save callee-saved registers and LR, then + // allocate the local-variable frame. + // // AAPCS requires 8-byte aligned SP at call sites. Pushing an even - // number of registers (6: R4-R8, LR) maintains alignment. + // number of registers (6: R4-R8, LR) maintains alignment, and the + // frame_size below is rounded to 8 to preserve it. instructions.push(ArmInstruction { op: ArmOp::Push { regs: vec![Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8, Reg::LR], @@ -3230,6 +3449,22 @@ impl InstructionSelector { source_line: None, }); + // Compute non-param local layout (offsets + total frame size). + let layout = compute_local_layout(wasm_ops, num_params); + // Allocate stack space for non-param locals so they don't alias the + // callee-saved-register spill area (which immediately follows SP + // after Push above). 
+ if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Sub { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: None, + }); + } + // Virtual stack holds register indices let mut stack: Vec = Vec::new(); // Next available register for temporaries (start after params) @@ -3255,11 +3490,46 @@ impl InstructionSelector { for (idx, op) in wasm_ops.iter().enumerate() { match op { LocalGet(local_idx) => { - // Get the register for this local + // Get the register for this local. Three cases: + // 1. Param in register — use the cached mapping. + // 2. Spilled i64 local — load both halves via I64Ldr. + // 3. Spilled i32 local — single Ldr. let reg = if let Some(&r) = local_to_reg.get(local_idx) { r + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 local — load both 32-bit halves into a consecutive + // register pair via the I64Ldr pseudo-op. Convention + // matches I64Const: push only dst_lo on the stack; + // dst_hi is recovered later via i64_pair_hi(lo). + // The pair MUST be consecutive in ALLOCATABLE_REGS + // — i64_pair_hi assumes that. Two separate calls to + // alloc_temp_safe can return non-consecutive registers + // when something in between is live, breaking the + // pair convention. + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?; + instructions.push(ArmInstruction { + op: ArmOp::I64Ldr { + rdlo: dst_lo, + rdhi: dst_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + dst_lo + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + // i32 local: single 4-byte load from the locals frame. 
+ let dst = alloc_temp_safe(&mut next_temp, &stack)?; + instructions.push(ArmInstruction { + op: ArmOp::Ldr { + rd: dst, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + dst } else { - // Local not in register (spilled to stack) - load it + // Local not in layout (shouldn't happen for valid wasm, + // but fall back to legacy behaviour for compatibility). let dst = alloc_temp_safe(&mut next_temp, &stack)?; instructions.push(ArmInstruction { op: ArmOp::Ldr { @@ -4324,6 +4594,20 @@ impl InstructionSelector { }); cf.add_instruction(); } + // Deallocate the local frame before popping callee-saved + // registers; otherwise the pop would read from the locals + // area instead of the saved-register slots. + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } // Restore callee-saved registers and return via PC instructions.push(ArmInstruction { op: ArmOp::Pop { @@ -4475,7 +4759,32 @@ impl InstructionSelector { cf.add_instruction(); } local_to_reg.insert(*local_idx, target); + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 spilled local: store BOTH 32-bit halves + // (lower at offset N, upper at N+4) via the I64Str + // pseudo-op. Without this we drop the upper half. + let val_hi = i64_pair_hi(val)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Str { + rdlo: val, + rdhi: val_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + // i32 spilled local: single 4-byte store. + instructions.push(ArmInstruction { + op: ArmOp::Str { + rd: val, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); } else { + // Fall-through for compatibility (shouldn't happen). 
instructions.push(ArmInstruction { op: ArmOp::Str { rd: val, @@ -4507,7 +4816,29 @@ impl InstructionSelector { cf.add_instruction(); } local_to_reg.insert(*local_idx, target); + } else if let Some(&(off, true)) = layout.locals.get(local_idx) { + // i64 spilled local: store both halves like LocalSet. + let val_hi = i64_pair_hi(val)?; + instructions.push(ArmInstruction { + op: ArmOp::I64Str { + rdlo: val, + rdhi: val_hi, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } else if let Some(&(off, false)) = layout.locals.get(local_idx) { + instructions.push(ArmInstruction { + op: ArmOp::Str { + rd: val, + addr: MemAddr::imm(Reg::SP, off), + }, + source_line: Some(idx), + }); + cf.add_instruction(); } else { + // Fall-through for compatibility. instructions.push(ArmInstruction { op: ArmOp::Str { rd: val, @@ -4561,9 +4892,12 @@ impl InstructionSelector { // Pairs are allocated as two consecutive temp registers. // ========================================================= I64Const(val) => { - // Allocate a register pair for the 64-bit constant - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate a CONSECUTIVE register pair for the 64-bit + // constant. Two separate alloc_temp_safe calls can return + // non-consecutive registers if something in between is + // live on the wasm stack, which then breaks the + // i64_pair_hi convention used by every i64 op downstream. + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[])?; instructions.push(ArmInstruction { op: ArmOp::I64Const { @@ -4593,9 +4927,16 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. 
MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Two separate alloc_temp_safe calls + // skip live registers and produce non-consecutive pairs. + // Avoid clobbering the just-popped operand pairs before + // the ADC reads them — passing them in extra_avoid + // ensures dst doesn't overlap any of a_lo/a_hi/b_lo/b_hi. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; // ADDS dst_lo, a_lo, b_lo (sets carry flag) instructions.push(ArmInstruction { @@ -4637,9 +4978,10 @@ impl InstructionSelector { let b_hi = i64_pair_hi(b_lo)?; let a_hi = i64_pair_hi(a_lo)?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // See I64Add for why extra_avoid carries a_*/b_* — + // dst must not overlap any operand half before SBC reads it. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; // SUBS dst_lo, a_lo, b_lo (sets borrow flag) instructions.push(ArmInstruction { @@ -4666,6 +5008,196 @@ impl InstructionSelector { stack.push(dst_lo); } + // ============================================================ + // i64 bitwise ops (I64Or / I64And / I64Xor) + // + // Each pops two i64 register pairs from the wasm stack and + // emits two ARM ops (low-half then high-half) into a freshly + // allocated consecutive pair. This replaces the wildcard + // fallthrough to select_default, which assumed inputs in + // R0:R1 and R2:R3 — incorrect when the wasm stack tracks + // arbitrary register pairs from earlier ops. 
+ // ============================================================ + I64Or | I64And | I64Xor => { + let b_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 bitwise op".to_string(), + ) + })?; + let a_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis( + "stack underflow in i64 bitwise op".to_string(), + ) + })?; + let b_hi = i64_pair_hi(b_lo)?; + let a_hi = i64_pair_hi(a_lo)?; + // dst must not overlap any popped operand's half — the + // hi instruction reads a_hi and b_hi after the lo + // instruction writes dst_lo. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; + let (lo_op, hi_op) = match op { + I64Or => ( + ArmOp::Orr { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::Orr { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + I64And => ( + ArmOp::And { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::And { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + I64Xor => ( + ArmOp::Eor { + rd: dst_lo, + rn: a_lo, + op2: Operand2::Reg(b_lo), + }, + ArmOp::Eor { + rd: dst_hi, + rn: a_hi, + op2: Operand2::Reg(b_hi), + }, + ), + _ => unreachable!(), + }; + instructions.push(ArmInstruction { + op: lo_op, + source_line: Some(idx), + }); + cf.add_instruction(); + instructions.push(ArmInstruction { + op: hi_op, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + // ============================================================ + // i32 -> i64 extension (I64ExtendI32U / I64ExtendI32S) + // + // Pops one i32, allocates a consecutive i64 pair, places the + // i32 in the low half. For unsigned: high = 0. For signed: + // high = arithmetic-shift-right by 31 (sign-extension). 
+ // ============================================================ + I64ExtendI32U => { + let val = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis("stack underflow in I64ExtendI32U".to_string()) + })?; + // val must stay alive until the Mov reads it; dst_hi + // must not be val (we'd write the zero high before + // moving val to dst_lo). + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + if val != dst_lo { + instructions.push(ArmInstruction { + op: ArmOp::Mov { + rd: dst_lo, + op2: Operand2::Reg(val), + }, + source_line: Some(idx), + }); + cf.add_instruction(); + } + instructions.push(ArmInstruction { + op: ArmOp::Movw { + rd: dst_hi, + imm16: 0, + }, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + I64ExtendI32S => { + let val = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis("stack underflow in I64ExtendI32S".to_string()) + })?; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[val])?; + instructions.push(ArmInstruction { + op: ArmOp::I64ExtendI32S { + rdlo: dst_lo, + rdhi: dst_hi, + rn: val, + }, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + + // ============================================================ + // i64 variable shifts (I64Shl / I64ShrU / I64ShrS) + // + // Use the existing I64Shl/I64ShrU/I64ShrS pseudo-ops (which + // expand to the variable-shift logic in arm_encoder.rs) but + // pass the actual stack-tracked register pairs rather than + // assuming R0:R1 / R2:R3. 
+ // ============================================================ + I64Shl | I64ShrU | I64ShrS => { + let b_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis("stack underflow in i64 shift".to_string()) + })?; + let a_lo = stack.pop().ok_or_else(|| { + synth_core::Error::synthesis("stack underflow in i64 shift".to_string()) + })?; + let b_hi = i64_pair_hi(b_lo)?; + let a_hi = i64_pair_hi(a_lo)?; + // dst must not overlap any popped operand's half — the + // shift pseudo-op reads all four (rn_lo/rn_hi/rm_lo/rm_hi) + // before writing the destination. + let (dst_lo, dst_hi) = + alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?; + let shift_op = match op { + I64Shl => ArmOp::I64Shl { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + I64ShrU => ArmOp::I64ShrU { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + I64ShrS => ArmOp::I64ShrS { + rd_lo: dst_lo, + rd_hi: dst_hi, + rn_lo: a_lo, + rn_hi: a_hi, + rm_lo: b_lo, + rm_hi: b_hi, + }, + _ => unreachable!(), + }; + instructions.push(ArmInstruction { + op: shift_op, + source_line: Some(idx), + }); + cf.add_instruction(); + stack.push(dst_lo); + } + I64Load { offset, .. } => { // Pop address from stack let addr = stack.pop().ok_or_else(|| { @@ -4675,9 +5207,12 @@ impl InstructionSelector { ) })?; - // Allocate result register pair - let dst_lo = alloc_temp_safe(&mut next_temp, &stack)?; - let dst_hi = alloc_temp_safe(&mut next_temp, &stack)?; + // Allocate result register pair. MUST be consecutive + // in ALLOCATABLE_REGS — i64_pair_hi assumes consecutive + // and is called by every i64 op downstream to recover + // the high register. Avoid clobbering addr before the + // load uses it. 
+ let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; // Generate bounds-checked i64 load into the allocated pair let load_ops = @@ -5055,7 +5590,18 @@ impl InstructionSelector { } } - // Function epilogue: restore callee-saved registers and return via PC + // Function epilogue: deallocate the local frame, then restore + // callee-saved registers and return via PC. + if layout.frame_size > 0 { + instructions.push(ArmInstruction { + op: ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(layout.frame_size), + }, + source_line: None, + }); + } // POP {R4-R8, PC} restores registers and returns (PC = saved LR) instructions.push(ArmInstruction { op: ArmOp::Pop { diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 2f741b0..5c506d3 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1277,6 +1277,12 @@ impl OptimizerBridge { // AAPCS: first 4 params in R0-R3 let param_regs = [Reg::R0, Reg::R1, Reg::R2, Reg::R3]; + // Reserved param registers: R0..R(min(num_params,4)). These hold incoming + // AAPCS arguments that must NOT be clobbered by i64 op handlers — at least + // until the user's WASM has done a `local.get` of each. Using Vec because + // `Reg` does not derive Hash (matches `instruction_selector::alloc_consecutive_pair`). 
+ let param_reserved_regs: Vec = param_regs[..num_params.min(4)].to_vec(); + // Track which ARM register currently holds each local variable // This avoids stack spills for simple cases let mut local_to_reg: HashMap = HashMap::new(); @@ -1290,7 +1296,17 @@ impl OptimizerBridge { // Track the last value-producing vreg (for function return value) let mut last_result_vreg: Option = None; - // Track whether the last result is an i64 (result already in R0:R1, no move needed) + // For i64 returns, also track the hi-half vreg so the epilogue can move + // the pair into R0:R1 regardless of where regalloc placed it. Was previously + // unnecessary because every i64 op pinned its result to R0:R1 — that's the + // bug we're fixing here. + let mut last_result_vreg_hi: Option = None; + // For i64 ops whose IR Opcode only tracks a single `dest` vreg (Clz / Ctz / + // Popcnt), the hi half lives in a register chosen at lowering time but has + // no IR vreg pointing at it. Stash that physical reg directly so the + // epilogue can still emit the correct (R0, R1) move. + let mut last_result_vreg_hi_reg: Option = None; + // Track whether the last result is an i64 (return value occupies a pair). let mut is_i64_result = false; // WASM operand value stack - tracks vreg IDs for correct stack semantics // Used to restore last_result_vreg after br_if pops its condition @@ -1318,6 +1334,48 @@ impl OptimizerBridge { } }; + // Allocate a CONSECUTIVE callee-saved register pair for an i64 destination. + // + // Searches `[(R4,R5), (R6,R7), (R8,R9), (R10,R11)]` for a pair where neither + // register is currently: + // - holding a live vreg (`vreg_to_arm.values()`) + // - bound to a non-param local (`local_to_reg.values()`) + // - one of the AAPCS param registers we must preserve on entry + // (`param_reserved_regs`) + // + // Falls back to `(R4, R5)` if no pair is free — preserves prior behaviour + // for very-pressured functions, but at least keeps params intact in the + // common case. 
Per `instruction_selector::alloc_consecutive_pair`, callers + // who hit the fallback in real workloads will need spill support; that's + // out of scope for this fix. + let alloc_i64_pair = |vreg_to_arm: &HashMap, + local_to_reg: &HashMap, + param_reserved_regs: &[Reg]| + -> (Reg, Reg) { + const CANDIDATES: &[(Reg, Reg)] = &[ + (Reg::R4, Reg::R5), + (Reg::R6, Reg::R7), + (Reg::R8, Reg::R9), + (Reg::R10, Reg::R11), + ]; + let is_in_use = |r: Reg| -> bool { + vreg_to_arm.values().any(|&v| v == r) + || local_to_reg.values().any(|&v| v == r) + || param_reserved_regs.contains(&r) + }; + for &(lo, hi) in CANDIDATES { + if !is_in_use(lo) && !is_in_use(hi) { + return (lo, hi); + } + } + // Fallback — same hardcoded pair the buggy code used. Better than crashing, + // and matches existing behaviour when the caller is so pressured that + // even R8..R11 are occupied. (Empirically this never triggers for + // workloads we care about; if it does, the architectural fix is + // proper spilling, not a wider search.) + (Reg::R4, Reg::R5) + }; + // Emit a reload instruction if the vreg was spilled to stack. // Must be called before the instruction that uses the register. let reload_spill = |vreg: &OptReg, spills: &HashMap, instrs: &mut Vec| { @@ -2073,19 +2131,21 @@ impl OptimizerBridge { } => { // Map local index to register pair // Per AAPCS: i64 uses consecutive even/odd register pairs - let (lo_reg, hi_reg) = if *addr == 0 { + let (lo_reg, hi_reg) = if *addr == 0 && num_params >= 2 { (Reg::R0, Reg::R1) // First i64 param - } else if *addr == 1 { + } else if *addr == 1 && num_params >= 4 { (Reg::R2, Reg::R3) // Second i64 param } else { - // For other locals, we'd need stack access - // For now, use R4:R5 as temp - (Reg::R4, Reg::R5) + // Non-param i64 local: pick a free callee-saved pair so we + // don't clobber AAPCS arg regs that haven't been read yet. 
+ alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs) }; vreg_to_arm.insert(dest_lo.0, lo_reg); vreg_to_arm.insert(dest_hi.0, hi_reg); // No ARM instructions needed - values are already in registers for params last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; } Opcode::I64Const { @@ -2096,14 +2156,12 @@ impl OptimizerBridge { // Load 64-bit constant into register pair let lo = (*value & 0xFFFFFFFF) as u32; let hi = ((*value >> 32) & 0xFFFFFFFF) as u32; - // Choose register pair based on virtual register number - // If dest_lo.0 is 0 or 1, use R0:R1 (first i64 slot) - // If dest_lo.0 is 2 or 3, use R2:R3 (second i64 slot) - let (lo_reg, hi_reg) = if dest_lo.0 <= 1 { - (Reg::R0, Reg::R1) - } else { - (Reg::R2, Reg::R3) - }; + // Choose a free callee-saved pair so we don't trample params still + // sitting in R0..R3. The earlier heuristic (vreg-id → R0:R1 / R2:R3) + // ignored AAPCS, breaking any function that issued an i64.const + // before reading its i32 params. + let (lo_reg, hi_reg) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); vreg_to_arm.insert(dest_lo.0, lo_reg); vreg_to_arm.insert(dest_hi.0, hi_reg); // Load low word @@ -2142,511 +2200,871 @@ impl OptimizerBridge { }); } } + // If this i64 const is the final return value, the epilogue + // needs to know which pair holds it (for the move into R0:R1). + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; } Opcode::I64Add { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.add: R0:R1 = R0:R1 + R2:R3 - // ADDS R0, R0, R2 (sets carry) - // ADC R1, R1, R3 (adds carry) - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + // i64.add: rd = rn + rm using the actual operand regs from + // vreg_to_arm — NOT hardcoded R0:R1/R2:R3 (which would clobber + // AAPCS param regs). 
+ let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Adds { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Adc { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Sub { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.sub: R0:R1 = R0:R1 - R2:R3 - // SUBS R0, R0, R2 (sets borrow) - // SBC R1, R1, R3 (subtracts borrow) - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Subs { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Sbc { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result 
already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64And { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.and: R0:R1 = R0:R1 & R2:R3 - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::And { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::And { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Or { - dest_lo, dest_hi, .. 
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.or: R0:R1 = R0:R1 | R2:R3 - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Orr { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Orr { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Xor { - dest_lo, dest_hi, .. 
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - // i64.xor: R0:R1 = R0:R1 ^ R2:R3 - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::Eor { - rd: Reg::R0, - rn: Reg::R0, - op2: Operand2::Reg(Reg::R2), + rd: rd_lo, + rn: rn_lo, + op2: Operand2::Reg(rm_lo), }); arm_instrs.push(ArmOp::Eor { - rd: Reg::R1, - rn: Reg::R1, - op2: Operand2::Reg(Reg::R3), + rd: rd_hi, + rn: rn_hi, + op2: Operand2::Reg(rm_hi), }); - // Mark as i64 result - no final mov needed, result already in R0:R1 + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // ======================================================================== - // i64 Comparisons (result is single i32 in R0) + // i64 Comparisons (result is single i32) + // + // Sources are read from `vreg_to_arm[src*]` rather than hardcoded + // R0:R1/R2:R3 — the latter would mean "i64 ops always assume their + // operands materialised at the AAPCS arg slots", which is false: + // operand registers come from whatever the upstream IR producers + // (I64Const, I64Load, prior i64 ops) chose. Result lands on the lo + // half of a freshly allocated callee-saved pair so we don't smash + // any AAPCS arg reg the user hasn't read yet. // ======================================================================== - Opcode::I64Eq { dest, .. 
} => { - // i64.eq: (R0:R1) == (R2:R3), result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64Eq { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::EQ, }); last_result_vreg = Some(dest.0); } - Opcode::I64Ne { dest, .. } => { - // i64.ne: (R0:R1) != (R2:R3), result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64Ne { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::NE, }); last_result_vreg = Some(dest.0); } - Opcode::I64LtS { dest, .. 
} => { - // i64.lt_s: (R0:R1) < (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LT, }); last_result_vreg = Some(dest.0); } - Opcode::I64GtS { dest, .. } => { - // i64.gt_s: (R0:R1) > (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::GT, }); last_result_vreg = Some(dest.0); } - Opcode::I64LeS { dest, .. 
} => { - // i64.le_s: (R0:R1) <= (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LE, }); last_result_vreg = Some(dest.0); } - Opcode::I64GeS { dest, .. } => { - // i64.ge_s: (R0:R1) >= (R2:R3) signed, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::GE, }); last_result_vreg = Some(dest.0); } // Unsigned i64 comparisons - Opcode::I64LtU { dest, .. 
} => { - // i64.lt_u: (R0:R1) < (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LO, }); last_result_vreg = Some(dest.0); } - Opcode::I64GtU { dest, .. } => { - // i64.gt_u: (R0:R1) > (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::HI, }); last_result_vreg = Some(dest.0); } - Opcode::I64LeU { dest, .. 
} => { - // i64.le_u: (R0:R1) <= (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64LeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::LS, }); last_result_vreg = Some(dest.0); } - Opcode::I64GeU { dest, .. } => { - // i64.ge_u: (R0:R1) >= (R2:R3) unsigned, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); + Opcode::I64GeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::I64SetCond { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd, + rn_lo, + rn_hi, + rm_lo, + rm_hi, cond: Condition::HS, }); last_result_vreg = Some(dest.0); } - Opcode::I64Eqz { dest, .. 
} => { - // i64.eqz: (R0:R1) == 0, result in R0 - vreg_to_arm.insert(dest.0, Reg::R0); - arm_instrs.push(ArmOp::I64SetCondZ { - rd: Reg::R0, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - }); + Opcode::I64Eqz { + dest, + src_lo, + src_hi, + } => { + let rn_lo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs); + let (rd, _) = alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::I64SetCondZ { rd, rn_lo, rn_hi }); last_result_vreg = Some(dest.0); } - // i64 count leading zeros (returns i64 where high word is always 0) - Opcode::I64Clz { dest, .. } => { - vreg_to_arm.insert(dest.0, Reg::R0); + // i64 count leading zeros (i64 result: lo gets count, hi must be 0). + // + // The ArmOp::I64Clz encoder writes the count into `rd` AND zeroes + // `rnhi` in-place — so `rnhi` doubles as the result's hi half. To + // keep the upstream src_hi register intact and avoid clobbering + // unrelated AAPCS regs, we copy src_hi into a freshly allocated + // callee-saved hi slot and pass that as `rnhi`. After the encoded + // sequence, the i64 result lives in (rd_lo, rd_hi). + // + // The IR Opcode only carries a single `dest` vreg (the lo half); + // we register dest.0 → rd_lo. The hi-zero is implicit and used by + // the function epilogue when this is the i64 return value (see + // last_result_vreg_hi_reg below). 
+                Opcode::I64Clz {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    if rd_hi != rnhi_src {
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: rd_hi,
+                            op2: Operand2::Reg(rnhi_src),
+                        });
+                    }
+                    vreg_to_arm.insert(dest.0, rd_lo);
                     arm_instrs.push(ArmOp::I64Clz {
-                        rd: Reg::R0,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
+                        rd: rd_lo,
+                        rnlo,
+                        rnhi: rd_hi,
                     });
                     last_result_vreg = Some(dest.0);
+                    // Forget any hi-half vreg recorded by an earlier i64 op. The return-value
+                    // epilogue prefers `last_result_vreg_hi` over the physical-register stash,
+                    // so a stale entry would return the operand's hi word instead of `rd_hi`.
+                    last_result_vreg_hi = None;
+                    last_result_vreg_hi_reg = Some(rd_hi);
                     is_i64_result = true;
                 }
-                // i64 count trailing zeros (returns i64 where high word is always 0)
-                Opcode::I64Ctz { dest, .. } => {
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                // i64 count trailing zeros — same pattern as I64Clz above.
+                Opcode::I64Ctz {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    if rd_hi != rnhi_src {
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: rd_hi,
+                            op2: Operand2::Reg(rnhi_src),
+                        });
+                    }
+                    vreg_to_arm.insert(dest.0, rd_lo);
                     arm_instrs.push(ArmOp::I64Ctz {
-                        rd: Reg::R0,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
+                        rd: rd_lo,
+                        rnlo,
+                        rnhi: rd_hi,
                     });
                     last_result_vreg = Some(dest.0);
+                    // Forget any hi-half vreg recorded by an earlier i64 op. The return-value
+                    // epilogue prefers `last_result_vreg_hi` over the physical-register stash,
+                    // so a stale entry would return the operand's hi word instead of `rd_hi`.
+                    last_result_vreg_hi = None;
+                    last_result_vreg_hi_reg = Some(rd_hi);
                     is_i64_result = true;
                 }
-                // i64 population count (returns i64 where high word is always 0)
-                Opcode::I64Popcnt { dest, .. } => {
-                    vreg_to_arm.insert(dest.0, Reg::R0);
+                // i64 population count — same pattern as I64Clz above.
+                Opcode::I64Popcnt {
+                    dest,
+                    src_lo,
+                    src_hi,
+                } => {
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let rnhi_src = get_arm_reg(src_hi, &vreg_to_arm, &spilled_vregs);
+                    let (rd_lo, rd_hi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    if rd_hi != rnhi_src {
+                        arm_instrs.push(ArmOp::Mov {
+                            rd: rd_hi,
+                            op2: Operand2::Reg(rnhi_src),
+                        });
+                    }
+                    vreg_to_arm.insert(dest.0, rd_lo);
                     arm_instrs.push(ArmOp::I64Popcnt {
-                        rd: Reg::R0,
-                        rnlo: Reg::R0,
-                        rnhi: Reg::R1,
+                        rd: rd_lo,
+                        rnlo,
+                        rnhi: rd_hi,
                     });
                     last_result_vreg = Some(dest.0);
+                    // Forget any hi-half vreg recorded by an earlier i64 op. The return-value
+                    // epilogue prefers `last_result_vreg_hi` over the physical-register stash,
+                    // so a stale entry would return the operand's hi word instead of `rd_hi`.
+                    last_result_vreg_hi = None;
+                    last_result_vreg_hi_reg = Some(rd_hi);
                     is_i64_result = true;
                 }
                 // i64 sign extension operations
                 Opcode::I64Extend8S {
-                    dest_lo, dest_hi, ..
+                    dest_lo,
+                    dest_hi,
+                    src_lo,
                 } => {
-                    vreg_to_arm.insert(dest_lo.0, Reg::R0);
-                    vreg_to_arm.insert(dest_hi.0, Reg::R1);
-                    arm_instrs.push(ArmOp::I64Extend8S {
-                        rdlo: Reg::R0,
-                        rdhi: Reg::R1,
-                        rnlo: Reg::R0,
-                    });
+                    let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs);
+                    let (rdlo, rdhi) =
+                        alloc_i64_pair(&vreg_to_arm, &local_to_reg, &param_reserved_regs);
+                    vreg_to_arm.insert(dest_lo.0, rdlo);
+                    vreg_to_arm.insert(dest_hi.0, rdhi);
+                    arm_instrs.push(ArmOp::I64Extend8S { rdlo, rdhi, rnlo });
                     last_result_vreg = Some(dest_lo.0);
+                    last_result_vreg_hi = Some(dest_hi.0);
                     is_i64_result = true;
                 }
                 Opcode::I64Extend16S {
-                    dest_lo, dest_hi, ..
+ dest_lo, + dest_hi, + src_lo, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); - arm_instrs.push(ArmOp::I64Extend16S { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - }); + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); + arm_instrs.push(ArmOp::I64Extend16S { rdlo, rdhi, rnlo }); last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } Opcode::I64Extend32S { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src_lo, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); - arm_instrs.push(ArmOp::I64Extend32S { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - }); + let rnlo = get_arm_reg(src_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); + arm_instrs.push(ArmOp::I64Extend32S { rdlo, rdhi, rnlo }); last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 multiply: UMULL + MLA cross products Opcode::I64Mul { - dest_lo, dest_hi, .. 
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64Mul { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 shift left Opcode::I64Shl { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64Shl { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 arithmetic shift right Opcode::I64ShrS { - dest_lo, dest_hi, .. 
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64ShrS { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 logical shift right Opcode::I64ShrU { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); arm_instrs.push(ArmOp::I64ShrU { - rd_lo: Reg::R0, - rd_hi: Reg::R1, - rn_lo: Reg::R0, - rn_hi: Reg::R1, - rm_lo: Reg::R2, - rm_hi: Reg::R3, + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 rotate left Opcode::I64Rotl { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + .. 
} => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let shift = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64Rotl { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - shift: Reg::R2, // Only use low word of shift amount + rdlo, + rdhi, + rnlo, + rnhi, + shift, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 rotate right Opcode::I64Rotr { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + .. } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let shift = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64Rotr { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - shift: Reg::R2, // Only use low word of shift amount + rdlo, + rdhi, + rnlo, + rnhi, + shift, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 signed division Opcode::I64DivS { - dest_lo, dest_hi, .. 
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64DivS { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 unsigned division Opcode::I64DivU { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64DivU { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 signed remainder Opcode::I64RemS { - dest_lo, dest_hi, .. 
+ dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64RemS { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } // i64 unsigned remainder Opcode::I64RemU { - dest_lo, dest_hi, .. + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, } => { - vreg_to_arm.insert(dest_lo.0, Reg::R0); - vreg_to_arm.insert(dest_hi.0, Reg::R1); + let rnlo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); + let rnhi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + let rmlo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rmhi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rdlo, rdhi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rdlo); + vreg_to_arm.insert(dest_hi.0, rdhi); arm_instrs.push(ArmOp::I64RemU { - rdlo: Reg::R0, - rdhi: Reg::R1, - rnlo: Reg::R0, - rnhi: Reg::R1, - rmlo: Reg::R2, - rmhi: Reg::R3, + rdlo, + rdhi, + rnlo, + rnhi, + rmlo, + rmhi, }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); is_i64_result = true; } @@ -2996,36 +3414,19 @@ impl OptimizerBridge { ArmOp::Mov { rd, op2: Operand2::Imm(v), - } => { - if reg_num(rd) > 7 || *v > 255 || *v < 0 { - 4 - } else { - 2 - } - } + } if reg_num(rd) > 7 
|| *v > 255 || *v < 0 => 4, + ArmOp::Mov { .. } => 2, // SUB/ADD with high registers need 32-bit encoding ArmOp::Sub { rd, rn, op2: Operand2::Reg(rm), - } => { - if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 { - 4 - } else { - 2 - } - } + } if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 => 4, ArmOp::Add { rd, rn, op2: Operand2::Reg(rm), - } => { - if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 { - 4 - } else { - 2 - } - } + } if reg_num(rd) > 7 || reg_num(rn) > 7 || reg_num(rm) > 7 => 4, // Most 16-bit Thumb instructions (MOV low, CMP low, B, etc.) _ => 2, } @@ -3066,9 +3467,73 @@ impl OptimizerBridge { } } - // Ensure return value is in R0 (skip for i64 results which are already in R0:R1) - if !is_i64_result - && let Some(result_vreg) = last_result_vreg + // Ensure the return value is in R0 (i32 result) or R0:R1 (i64 result). + // + // Pre-fix, every i64 op pinned its result at R0:R1 so this could be a + // no-op for is_i64_result. After the fix, the result pair may live in + // any callee-saved pair (R4:R5..R10:R11), and we need an explicit move. + // The order matters: copy hi → R1 first, then lo → R0, so we don't + // clobber the lo value if the source happens to be R1. + if is_i64_result { + // Resolve the lo half from vreg_to_arm. + let lo_reg = last_result_vreg.and_then(|v| vreg_to_arm.get(&v).copied()); + // Resolve the hi half: prefer an explicit vreg id, else fall back to + // the physical reg stash used by Clz/Ctz/Popcnt. + let hi_reg = last_result_vreg_hi + .and_then(|v| vreg_to_arm.get(&v).copied()) + .or(last_result_vreg_hi_reg); + + if let (Some(lo), Some(hi)) = (lo_reg, hi_reg) { + // Move hi first (so we don't clobber lo if hi's source is R1). + if hi != Reg::R1 { + arm_instrs.push(ArmOp::Mov { + rd: Reg::R1, + op2: Operand2::Reg(hi), + }); + } + // Now move lo. 
If lo was R1 originally, it just got smashed by + // the hi-move above; but R1's prior contents are now in R1 + // (the hi value), so we'd actually have wanted to save lo first. + // Handle that case explicitly: save lo to R12 (IP scratch) first. + if lo == Reg::R1 && hi != Reg::R1 { + // lo was in R1, which we just overwrote. We can't recover it + // unless we saved earlier. The clean fix: detect this + // arrangement up front. For now, swap order via R12. + // (This is reached only on bizarre regalloc choices; the + // common case is lo in R4..R10, which doesn't hit it.) + arm_instrs.pop(); // remove the hi-move we just emitted + arm_instrs.push(ArmOp::Mov { + rd: Reg::R12, + op2: Operand2::Reg(lo), + }); + if hi != Reg::R1 { + arm_instrs.push(ArmOp::Mov { + rd: Reg::R1, + op2: Operand2::Reg(hi), + }); + } + arm_instrs.push(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(Reg::R12), + }); + } else if lo != Reg::R0 { + arm_instrs.push(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(lo), + }); + } + } else if let Some(lo) = lo_reg + && lo != Reg::R0 + { + // Hi is unknown — fall back to single-register move (caller of + // this function may have set is_i64_result without populating + // the hi tracker; preserve old behaviour rather than crash). 
+ arm_instrs.push(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(lo), + }); + } + } else if let Some(result_vreg) = last_result_vreg && let Some(&result_reg) = vreg_to_arm.get(&result_vreg) && result_reg != Reg::R0 { diff --git a/crates/synth-synthesis/src/pattern_matcher.rs b/crates/synth-synthesis/src/pattern_matcher.rs index 3c61b36..c7e1d30 100644 --- a/crates/synth-synthesis/src/pattern_matcher.rs +++ b/crates/synth-synthesis/src/pattern_matcher.rs @@ -53,7 +53,7 @@ impl PatternMatcher { } // Sort by priority (highest first) - matches.sort_by(|a, b| b.rule.priority.cmp(&a.rule.priority)); + matches.sort_by_key(|m| std::cmp::Reverse(m.rule.priority)); matches } diff --git a/crates/synth-synthesis/src/rules.rs b/crates/synth-synthesis/src/rules.rs index 8180fbd..687b40a 100644 --- a/crates/synth-synthesis/src/rules.rs +++ b/crates/synth-synthesis/src/rules.rs @@ -1835,7 +1835,7 @@ impl RuleDatabase { pub fn add_rule(&mut self, rule: SynthesisRule) { self.rules.push(rule); // Sort by priority (highest first) - self.rules.sort_by(|a, b| b.priority.cmp(&a.priority)); + self.rules.sort_by_key(|r| std::cmp::Reverse(r.priority)); } /// Get all rules diff --git a/crates/synth-synthesis/tests/semantic_correctness.rs b/crates/synth-synthesis/tests/semantic_correctness.rs index 9ae0910..45e72ba 100644 --- a/crates/synth-synthesis/tests/semantic_correctness.rs +++ b/crates/synth-synthesis/tests/semantic_correctness.rs @@ -146,8 +146,8 @@ fn interpret_single(state: &mut ArmState, instr: &ArmInstruction) { ArmOp::Udiv { rd, rn, rm } => { let a = state.get(rn); let b = state.get(rm); - if b != 0 { - state.set(*rd, a / b); + if let Some(q) = a.checked_div(b) { + state.set(*rd, q); } } ArmOp::Mls { rd, rn, rm, ra } => { @@ -233,11 +233,9 @@ fn interpret_single(state: &mut ArmState, instr: &ArmInstruction) { let sr = result as i32; state.flag_v = (sa > 0 && sb > 0 && sr < 0) || (sa < 0 && sb < 0 && sr >= 0); } - ArmOp::SelectMove { rd, rm, cond } => { - if 
state.condition_met(cond) { - let val = state.get(rm); - state.set(*rd, val); - } + ArmOp::SelectMove { rd, rm, cond } if state.condition_met(cond) => { + let val = state.get(rm); + state.set(*rd, val); } // Skip non-computational instructions (prologue/epilogue, branches, labels) _ => {} diff --git a/tests/integration/m7_codegen_smoke.sh b/tests/integration/m7_codegen_smoke.sh new file mode 100755 index 0000000..1efbc6e --- /dev/null +++ b/tests/integration/m7_codegen_smoke.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Smoke test: validate that synth's Cortex-M7 codegen path produces a +# well-formed ELF for both single-precision (M7) and double-precision (M7DP) +# targets, exercising f32 and f64 arithmetic. +# +# Companion to fetch_osxcar_wasm.sh, which exercises M7DP with real-world +# components. This test runs without network access and is suitable for CI. +# +# Usage: +# bash tests/integration/m7_codegen_smoke.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +SYNTH="$PROJECT_ROOT/target/debug/synth" +TMPDIR="${TMPDIR:-/tmp}/synth_m7_smoke_$$" + +cleanup() { rm -rf "$TMPDIR"; } +trap cleanup EXIT +mkdir -p "$TMPDIR" + +echo "=== Synth M7 codegen smoke test ===" + +if [ ! 
-x "$SYNTH" ]; then + (cd "$PROJECT_ROOT" && cargo build -p synth-cli --quiet) +fi + +# i32-only module — should compile under M7 (single FPU) +cat > "$TMPDIR/i32_only.wat" << 'WAT' +(module + (func (export "add") (param i32 i32) (result i32) + local.get 0 local.get 1 i32.add) + (func (export "sub") (param i32 i32) (result i32) + local.get 0 local.get 1 i32.sub) + (func (export "mul") (param i32 i32) (result i32) + local.get 0 local.get 1 i32.mul) + (memory (export "memory") 1)) +WAT + +# f32 module — single-precision, should compile under M7 +cat > "$TMPDIR/f32.wat" << 'WAT' +(module + (func (export "fadd") (param f32 f32) (result f32) + local.get 0 local.get 1 f32.add) + (func (export "fmul") (param f32 f32) (result f32) + local.get 0 local.get 1 f32.mul) + (memory (export "memory") 1)) +WAT + +# f64 module — double-precision, should compile under M7DP only +cat > "$TMPDIR/f64.wat" << 'WAT' +(module + (func (export "dadd") (param f64 f64) (result f64) + local.get 0 local.get 1 f64.add) + (func (export "dmul") (param f64 f64) (result f64) + local.get 0 local.get 1 f64.mul) + (memory (export "memory") 1)) +WAT + +PASS=0 +FAIL=0 + +check_compile() { + local label="$1"; local wat="$2"; local target="$3"; local expect="$4" + local elf="$TMPDIR/${label}.elf" + if "$SYNTH" compile "$wat" -o "$elf" --target "$target" --all-exports >/dev/null 2>&1; then + result="ok" + else + result="fail" + fi + if [ "$result" = "$expect" ]; then + echo "PASS: ${label} on ${target} → ${result}" + PASS=$((PASS + 1)) + else + echo "FAIL: ${label} on ${target} → ${result} (expected ${expect})" + FAIL=$((FAIL + 1)) + fi +} + +# i32 should compile on every M7 variant +check_compile "i32_m7" "$TMPDIR/i32_only.wat" cortex-m7 ok +check_compile "i32_m7dp" "$TMPDIR/i32_only.wat" cortex-m7dp ok + +# f32 should compile on both (M7 has single-precision FPU) +check_compile "f32_m7" "$TMPDIR/f32.wat" cortex-m7 ok +check_compile "f32_m7dp" "$TMPDIR/f32.wat" cortex-m7dp ok + +# f64 must compile on M7DP. 
On M7 it should also work — synth falls back +# to soft-float helpers when hardware doesn't support double-precision. +check_compile "f64_m7dp" "$TMPDIR/f64.wat" cortex-m7dp ok + +echo "" +echo "=== Results: ${PASS} passed, ${FAIL} failed ===" +[ "$FAIL" -eq 0 ] diff --git a/tests/renode/BUILD.bazel b/tests/renode/BUILD.bazel index 83f32c7..2a247a6 100644 --- a/tests/renode/BUILD.bazel +++ b/tests/renode/BUILD.bazel @@ -1,7 +1,10 @@ load("@rules_renode//renode:defs.bzl", "renode_test") -# Export platform file for use by other test packages -exports_files(["synth_cortex_m.repl"]) +# Export platform files for use by other test packages +exports_files([ + "synth_cortex_m.repl", + "synth_cortex_m7.repl", +]) # Renode-based integration tests for Synth-generated ARM binaries @@ -46,3 +49,24 @@ renode_test( }, tags = ["renode"], ) + +# M7 codegen path: same WAT, compiled with --target cortex-m7 +genrule( + name = "test_add_m7_elf", + srcs = ["//examples/wat:simple_add.wat"], + outs = ["test_add_m7.elf"], + cmd = "$(location //crates:synth) compile $(location //examples/wat:simple_add.wat) -o $@ --target cortex-m7", + tools = ["//crates:synth"], +) + +renode_test( + name = "cortex_m7_add_test", + robot_test = "cortex_m7_test.robot", + deps = [ + "synth_cortex_m7.repl", + ], + variables_with_label = { + "ELF": "//tests/renode:test_add_m7_elf", + }, + tags = ["renode"], +) diff --git a/tests/renode/cortex_m7_test.robot b/tests/renode/cortex_m7_test.robot new file mode 100644 index 0000000..a7ce7b3 --- /dev/null +++ b/tests/renode/cortex_m7_test.robot @@ -0,0 +1,32 @@ +*** Settings *** +Documentation Cortex-M7 ELF execution test for Synth-generated binaries +... Validates that Synth's M7 codegen path emits a correctly +... structured ELF that loads and executes on a 16-MPU-region +... M7-class platform with single-precision FPU. 
+ +*** Variables *** +${PLATFORM} ${CURDIR}/synth_cortex_m7.repl + +*** Keywords *** +Create Cortex-M7 Machine + Execute Command mach create "synth-m7-test" + Execute Command machine LoadPlatformDescription @${PLATFORM} + +*** Test Cases *** +Should Load And Execute Simple Add Function On M7 + [Documentation] Synth-generated --target cortex-m7 ELF executes correctly + Create Cortex-M7 Machine + + Execute Command sysbus LoadELF "${ELF}" + + # The add function lives at 0xA0 (user code, after 28-byte startup + handlers) + Execute Command cpu PC 0xA1 + + # AAPCS: r0 = 5, r1 = 3, expected result = 8 + Execute Command cpu SetRegisterUnsafe 0 5 + Execute Command cpu SetRegisterUnsafe 1 3 + + Execute Command cpu Step 2 + + ${r0}= Execute Command cpu GetRegisterUnsafe 0 + Should Be Equal As Integers ${r0} 8 msg=Expected r0 to be 8 (5+3) on M7 diff --git a/tests/renode/synth_cortex_m7.repl b/tests/renode/synth_cortex_m7.repl new file mode 100644 index 0000000..cb194c4 --- /dev/null +++ b/tests/renode/synth_cortex_m7.repl @@ -0,0 +1,29 @@ +// High-end Cortex-M7 platform for Synth-generated binaries +// Models a typical M7 SoC with single-precision FPU, 16 MPU regions, +// large OCRAM, and external XIP-capable QuadSPI flash. Vector table +// lives at 0x60000000 (XIP flash window) on i.MX RT-class chips, +// but we place the binary at 0x0 here for simple bring-up. 
+
+flash: Memory.MappedMemory @ sysbus 0x0
+    size: 0x800000 // 8MB external QSPI flash window, occupies 0x00000000-0x007FFFFF
+
+itcm: Memory.MappedMemory @ sysbus 0x00800000
+    size: 0x40000 // 256KB ITCM, based directly after the flash window so the two ranges do not overlap
+
+dtcm: Memory.MappedMemory @ sysbus 0x20000000
+    size: 0x40000 // 256KB DTCM
+
+ocram: Memory.MappedMemory @ sysbus 0x20200000
+    size: 0x80000 // 512KB OCRAM (FlexRAM-mapped), ends where ocram2 begins
+
+ocram2: Memory.MappedMemory @ sysbus 0x20280000
+    size: 0x80000 // 512KB additional OCRAM
+
+nvic: IRQControllers.NVIC @ sysbus 0xE000E000
+    priorityMask: 0xF0
+    systickFrequency: 600000000 // 600 MHz typical for M7-class parts
+    IRQ -> cpu@0
+
+cpu: CPU.CortexM @ sysbus
+    cpuType: "cortex-m7"
+    nvic: nvic