diff --git a/crates/synth-backend/src/arm_backend.rs b/crates/synth-backend/src/arm_backend.rs index b5a2370..9ad5070 100644 --- a/crates/synth-backend/src/arm_backend.rs +++ b/crates/synth-backend/src/arm_backend.rs @@ -404,4 +404,66 @@ mod tests { "f32 operations should fail on Cortex-M4 (no FPU)" ); } + + /// Issue #94: end-to-end byte-size check for the canonical u64-packed + /// FFI-return hi32 extract pattern. Compiles two near-identical + /// functions — one with the optimized shift-by-32, one with a generic + /// shift-by-7 — and asserts the optimized form is meaningfully smaller. + #[test] + fn test_issue94_hi32_extract_is_smaller_than_generic_shift() { + let backend = ArmBackend::new(); + let config = CompileConfig { + target: TargetSpec::cortex_m4f(), + ..CompileConfig::default() + }; + + // Optimized path: `(local.get 0) >>> 32; wrap_i64` + let ops_hi32 = vec![ + WasmOp::LocalGet(0), // i64 param in R0:R1 + WasmOp::I64Const(32), + WasmOp::I64ShrU, + WasmOp::I32WrapI64, + ]; + let func_hi32 = backend + .compile_function("hi32_extract", &ops_hi32, &config) + .unwrap(); + + // Generic path: `(local.get 0) >>> 7; wrap_i64` — same shape, but the + // shift amount is not a multiple of 32, so it falls through to the + // 38-byte runtime shift. + let ops_generic = vec![ + WasmOp::LocalGet(0), + WasmOp::I64Const(7), + WasmOp::I64ShrU, + WasmOp::I32WrapI64, + ]; + let func_generic = backend + .compile_function("generic_shr", &ops_generic, &config) + .unwrap(); + + let bytes_hi32 = func_hi32.code.len(); + let bytes_generic = func_generic.code.len(); + println!( + "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})", + bytes_hi32, + bytes_generic, + bytes_generic.saturating_sub(bytes_hi32) + ); + let hex: String = func_hi32 + .code + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "); + println!("[issue #94] hi32 bytes: {}", hex); + // We expect the optimized form to be at least 30 bytes smaller than + // the generic 64-bit shift sequence. (Empirically: 14 vs 50 bytes.) + assert!( + bytes_hi32 + 30 <= bytes_generic, + "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \ + expected optimized form to be at least 30 bytes smaller", + bytes_hi32, + bytes_generic, + ); + } } diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index f8446cd..1130614 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -1319,6 +1319,17 @@ impl OptimizerBridge { // Insertion order tracking for LRU eviction let mut vreg_alloc_order: Vec = Vec::new(); + // Known i64 constant values keyed by their `dest_lo` vreg id. Populated + // when we lower `Opcode::I64Const`; consulted by i64 shift / mask + // handlers to detect compile-time-constant operands and avoid emitting + // the full 38-byte runtime shift sequence. + // + // Issue #94: when a u64-packed FFI return is split via `i64.shr_u 32` + // followed by `i32.wrap_i64`, the hi32 field is already in the high + // register of the i64 pair — no shift instructions are needed, just a + // vreg rename plus a single `mov rd_hi, #0` for the (now-zero) hi half. + let mut known_i64_consts: HashMap = HashMap::new(); + // Helper to get ARM reg from virtual reg. // Also checks spill slots — if a vreg was spilled, returns R12 (IP scratch). // Callers should also call `reload_spill` to emit the actual load instruction. @@ -1387,6 +1398,106 @@ impl OptimizerBridge { } }; + // Pre-pass for issue #94: find i64 constants whose ONLY use is as the + // shift amount of a shr_u/shr_s where the value is 32, or as the mask + // operand of an `i64.and 0xFFFFFFFF` — those constants are folded away + // by the shift / and handlers below, so the MOVs that load them into + // registers are dead. This skip set is consulted in the I64Const + // handler to bypass emitting those MOVs (saves ~4 bytes per fold). + let mut skip_const_dest_lo: std::collections::HashSet = + std::collections::HashSet::new(); + { + // First, collect simple constant values produced by I64Const (we + // only need their `value` and `dest_lo` here — the rest is + // determined by the consumer instruction below). + let mut const_values: HashMap = HashMap::new(); + // Count uses of each const dest_lo across the IR. A const is + // skip-eligible only if it has exactly one use and that use is the + // folded shift / and pattern. + let mut use_count: HashMap = HashMap::new(); + for inst in instructions { + if let Opcode::I64Const { dest_lo, value, .. } = &inst.opcode { + const_values.insert(dest_lo.0, *value); + } + let mut bump = |v: u32| { + *use_count.entry(v).or_insert(0) += 1; + }; + match &inst.opcode { + Opcode::I64Add { + src1_lo, src2_lo, .. + } + | Opcode::I64Sub { + src1_lo, src2_lo, .. + } + | Opcode::I64Or { + src1_lo, src2_lo, .. + } + | Opcode::I64Xor { + src1_lo, src2_lo, .. + } + | Opcode::I64Mul { + src1_lo, src2_lo, .. + } + | Opcode::I64DivS { + src1_lo, src2_lo, .. + } + | Opcode::I64DivU { + src1_lo, src2_lo, .. + } + | Opcode::I64RemS { + src1_lo, src2_lo, .. + } + | Opcode::I64RemU { + src1_lo, src2_lo, .. + } + | Opcode::I64Shl { + src1_lo, src2_lo, .. + } + | Opcode::I64ShrU { + src1_lo, src2_lo, .. + } + | Opcode::I64ShrS { + src1_lo, src2_lo, .. + } + | Opcode::I64And { + src1_lo, src2_lo, .. + } + | Opcode::I64Eq { + src1_lo, src2_lo, .. + } + | Opcode::I64Ne { + src1_lo, src2_lo, .. + } => { + bump(src1_lo.0); + bump(src2_lo.0); + } + _ => {} + } + } + // Now mark constants whose only use is the folded pattern. + for inst in instructions { + match &inst.opcode { + Opcode::I64ShrU { src2_lo, .. } | Opcode::I64ShrS { src2_lo, .. } => { + if use_count.get(&src2_lo.0).copied() == Some(1) + && let Some(v) = const_values.get(&src2_lo.0) + && (*v as u64) & 0x3F == 32 + { + skip_const_dest_lo.insert(src2_lo.0); + } + } + Opcode::I64And { src2_lo, .. } => { + if use_count.get(&src2_lo.0).copied() == Some(1) + && let Some(v) = const_values.get(&src2_lo.0) + && (*v as u64) == 0xFFFF_FFFF + { + skip_const_dest_lo.insert(src2_lo.0); + } + } + _ => {} + } + } + } + // First pass: map Load instructions (local.get) to ARM registers for inst in instructions { if let Opcode::Load { dest, addr } = &inst.opcode { @@ -2153,6 +2264,16 @@ impl OptimizerBridge { dest_hi, value, } => { + // Record the constant value so downstream i64 shift / mask + // handlers can detect and special-case constant operands. + // See `known_i64_consts` declaration for the rationale (issue #94). + known_i64_consts.insert(dest_lo.0, *value); + // Issue #94: if the pre-pass marked this constant as + // dead-on-arrival (its sole use is folded by the + // shift/and handler), skip emitting it entirely. + if skip_const_dest_lo.contains(&dest_lo.0) { + continue; + } // Load 64-bit constant into register pair let lo = (*value & 0xFFFFFFFF) as u32; let hi = ((*value >> 32) & 0xFFFFFFFF) as u32; @@ -2282,6 +2403,33 @@ impl OptimizerBridge { } => { let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); + // Issue #94: `i64.and` against a constant low-half mask + // (typically `0xFFFFFFFF` to extract the lo32 of a u64-packed + // FFI return). The lo half collapses to a no-op rename, the + // hi half collapses to MOV #0 — saving the two AND.W + // instructions (8 bytes total) plus the MOVW/MOVT for the + // mask constant itself (deduped by ConstantFolding/CSE in + // most cases, but the ANDs always remain). + let mask = known_i64_consts.get(&src2_lo.0).copied(); + if let Some(m) = mask { + let m_u64 = m as u64; + if m_u64 == 0xFFFF_FFFF { + // Low 32 bits unchanged, high 32 bits zeroed. + let rd_hi = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs).1; + vreg_to_arm.insert(dest_lo.0, rn_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); + arm_instrs.push(ArmOp::Mov { + rd: rd_hi, + op2: Operand2::Imm(0), + }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; + // Skip the generic AND emit below. + continue; + } + } let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); let (rd_lo, rd_hi) = @@ -2843,23 +2991,47 @@ impl OptimizerBridge { } => { let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); - let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); - let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd_lo, rd_hi) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); - vreg_to_arm.insert(dest_lo.0, rd_lo); - vreg_to_arm.insert(dest_hi.0, rd_hi); - arm_instrs.push(ArmOp::I64ShrS { - rd_lo, - rd_hi, - rn_lo, - rn_hi, - rm_lo, - rm_hi, - }); - last_result_vreg = Some(dest_lo.0); - last_result_vreg_hi = Some(dest_hi.0); - is_i64_result = true; + // Issue #94 (signed variant): `i64.shr_s 32; i32.wrap_i64` + // also extracts the upper 32 bits unchanged. dest_lo = rn_hi, + // dest_hi = sign-extension of rn_hi (ASR #31 of rn_hi). + let shr_const = known_i64_consts + .get(&src2_lo.0) + .copied() + .map(|v| (v as u64) & 0x3F); + if shr_const == Some(32) { + let rd_hi = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs).1; + vreg_to_arm.insert(dest_lo.0, rn_hi); + vreg_to_arm.insert(dest_hi.0, rd_hi); + // ASR rd_hi, rn_hi, #31 — replicate the sign bit across + // the new hi half. 4-byte Thumb-2 encoding via Asr-imm. + arm_instrs.push(ArmOp::Asr { + rd: rd_hi, + rn: rn_hi, + shift: 31, + }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; + } else { + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); + arm_instrs.push(ArmOp::I64ShrS { + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, + }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; + } } // i64 logical shift right @@ -2873,23 +3045,51 @@ impl OptimizerBridge { } => { let rn_lo = get_arm_reg(src1_lo, &vreg_to_arm, &spilled_vregs); let rn_hi = get_arm_reg(src1_hi, &vreg_to_arm, &spilled_vregs); - let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); - let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); - let (rd_lo, rd_hi) = - alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); - vreg_to_arm.insert(dest_lo.0, rd_lo); - vreg_to_arm.insert(dest_hi.0, rd_hi); - arm_instrs.push(ArmOp::I64ShrU { - rd_lo, - rd_hi, - rn_lo, - rn_hi, - rm_lo, - rm_hi, - }); - last_result_vreg = Some(dest_lo.0); - last_result_vreg_hi = Some(dest_hi.0); - is_i64_result = true; + // Issue #94: u64-packed FFI return — `i64.shr_u 32` extracts + // the high 32 bits, which are already sitting in `rn_hi`. Skip + // the 38-byte runtime shift sequence; just rename `dest_lo` + // onto `rn_hi` and zero `dest_hi`. Total cost: a single + // `mov rd_hi, #0` (2-4 bytes) instead of 38 bytes. + // + // Per WASM semantics the shift amount is taken modulo 64, so + // any constant whose low 6 bits == 32 hits this fast path. + let shr_const = known_i64_consts + .get(&src2_lo.0) + .copied() + .map(|v| (v as u64) & 0x3F); + if shr_const == Some(32) { + let rd_hi = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs).1; + // dest_lo aliases the source's hi register (no instruction). + vreg_to_arm.insert(dest_lo.0, rn_hi); + // dest_hi is zero — emit a single MOV rd_hi, #0. + vreg_to_arm.insert(dest_hi.0, rd_hi); + arm_instrs.push(ArmOp::Mov { + rd: rd_hi, + op2: Operand2::Imm(0), + }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; + } else { + let rm_lo = get_arm_reg(src2_lo, &vreg_to_arm, &spilled_vregs); + let rm_hi = get_arm_reg(src2_hi, &vreg_to_arm, &spilled_vregs); + let (rd_lo, rd_hi) = + alloc_i64_pair(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs); + vreg_to_arm.insert(dest_lo.0, rd_lo); + vreg_to_arm.insert(dest_hi.0, rd_hi); + arm_instrs.push(ArmOp::I64ShrU { + rd_lo, + rd_hi, + rn_lo, + rn_hi, + rm_lo, + rm_hi, + }); + last_result_vreg = Some(dest_lo.0); + last_result_vreg_hi = Some(dest_hi.0); + is_i64_result = true; + } } // i64 rotate left @@ -4208,4 +4408,315 @@ mod tests { } } } + + // ======================================================================== + // Issue #94: u64-packed FFI return — direct hi/lo extraction + // ======================================================================== + // + // The canonical wasm sequence emitted by Rust → wasm for extracting the + // upper 32 bits of a u64-packed return value is: + // + // local.get $packed_i64 ;; the packed u64 in an i64 local + // i64.const 32 + // i64.shr_u + // i32.wrap_i64 + // + // Pre-fix, synth lowered this to ~38 bytes of generic 64-bit shift + // emulation. After the fix, it should collapse to a single + // `mov rd_hi, #0` plus a vreg rename — the hi half of the input pair + // is already in a register. + + #[cfg(test)] + fn count_arm_byte_size(arm: &[ArmOp]) -> usize { + use crate::rules::Operand2; + // Mirror the size table in OptimizerBridge::estimate_arm_byte_size + // for the ops we care about in these tests. Keep this small and + // local so the tests don't depend on encoder reachability — only + // a regression in the optimizer bridge should change these counts. + arm.iter() + .map(|op| match op { + ArmOp::I64ShrU { .. } | ArmOp::I64Shl { .. } => 38, + ArmOp::I64ShrS { .. } => 40, + ArmOp::And { .. } + | ArmOp::Orr { .. } + | ArmOp::Eor { .. } + | ArmOp::Asr { .. } + | ArmOp::Adc { .. } + | ArmOp::Sbc { .. } => 4, + ArmOp::Mov { rd, op2 } => { + // 16-bit Thumb encoding for MOV rd, #imm8 is available + // when rd is a low register (R0..R7) and the immediate + // fits in 8 bits. MOV rd, rm (register form) is 2 bytes + // for low-register destinations. + let rd_n = reg_idx(*rd); + match op2 { + Operand2::Imm(v) if rd_n < 8 && (0..=255).contains(v) => 2, + Operand2::Reg(_) if rd_n < 8 => 2, + _ => 4, + } + } + ArmOp::Movw { .. } | ArmOp::Movt { .. } => 4, + _ => 4, + }) + .sum() + } + + #[cfg(test)] + fn reg_idx(r: crate::rules::Reg) -> u32 { + match r { + crate::rules::Reg::R0 => 0, + crate::rules::Reg::R1 => 1, + crate::rules::Reg::R2 => 2, + crate::rules::Reg::R3 => 3, + crate::rules::Reg::R4 => 4, + crate::rules::Reg::R5 => 5, + crate::rules::Reg::R6 => 6, + crate::rules::Reg::R7 => 7, + crate::rules::Reg::R8 => 8, + crate::rules::Reg::R9 => 9, + crate::rules::Reg::R10 => 10, + crate::rules::Reg::R11 => 11, + crate::rules::Reg::R12 => 12, + crate::rules::Reg::SP => 13, + crate::rules::Reg::LR => 14, + crate::rules::Reg::PC => 15, + } + } + + /// Print before/after sizes for the canonical issue #94 pattern. This + /// "test" is informational — it passes regardless and is intended to be + /// run with `cargo test ... -- --nocapture` to surface the win in CI logs. + #[test] + fn test_issue94_size_demo() { + let bridge = OptimizerBridge::new(); + + // After-fix: shift-by-32 hits the constant-aware fast path. + let after_ops = vec![ + WasmOp::LocalGet(0), + WasmOp::I64Const(32), + WasmOp::I64ShrU, + WasmOp::I32WrapI64, + ]; + let (instrs, _, _) = bridge.optimize_full(&after_ops).unwrap(); + let arm_after = bridge.ir_to_arm(&instrs, 2); + let bytes_after = count_arm_byte_size(&arm_after); + + // Pre-fix proxy: shift-by-7 takes the generic ArmOp::I64ShrU path, + // which is byte-identical to the unoptimized shift-by-32 emit. + let before_ops = vec![ + WasmOp::LocalGet(0), + WasmOp::I64Const(7), + WasmOp::I64ShrU, + WasmOp::I32WrapI64, + ]; + let (instrs, _, _) = bridge.optimize_full(&before_ops).unwrap(); + let arm_before = bridge.ir_to_arm(&instrs, 2); + let bytes_before = count_arm_byte_size(&arm_before); + + println!( + "\n[issue #94 demo] hi32 extract: BEFORE = {} bytes, AFTER = {} bytes (saved {})", + bytes_before, + bytes_after, + bytes_before.saturating_sub(bytes_after) + ); + println!("[issue #94 demo] AFTER ops emitted = {}", arm_after.len()); + for op in &arm_after { + println!("[issue #94 demo] {:?}", op); + } + } + + /// Issue #94: `i64.shr_u 32` followed by `i32.wrap_i64` should NOT emit + /// a runtime 64-bit shift. The hi half of the source pair is already in + /// a register; we just need to rename + zero out the (now-unused) hi half. + #[test] + fn test_issue94_shr_u_32_lowers_to_register_rename() { + let bridge = OptimizerBridge::new(); + // (i64) -> i32, body = `local.get 0; i64.const 32; i64.shr_u; i32.wrap_i64` + let wasm_ops = vec![ + WasmOp::LocalGet(0), // i64 param in R0:R1 + WasmOp::I64Const(32), // shift amount + WasmOp::I64ShrU, + WasmOp::I32WrapI64, + ]; + + let (instrs, _, _) = bridge.optimize_full(&wasm_ops).unwrap(); + // num_params = 1 (one i64 param occupies R0:R1 per AAPCS — but we + // pass 2 to ir_to_arm because each i64 counts as two AAPCS slots + // in the codegen's accounting). + let arm = bridge.ir_to_arm(&instrs, 2); + + // After fix: NO ArmOp::I64ShrU should be emitted. The 38-byte runtime + // shift sequence is the bug we're fixing. + let has_runtime_shift = arm.iter().any(|op| matches!(op, ArmOp::I64ShrU { .. })); + assert!( + !has_runtime_shift, + "i64.shr_u with constant 32 should NOT emit ArmOp::I64ShrU; got: {:#?}", + arm + ); + + // Total emitted byte size should drop well below the 38-byte runtime + // shift alone. The pre-fix lower bound was 38 bytes for the shift + // plus several bytes for the constant-32 load and epilogue moves. + let bytes = count_arm_byte_size(&arm); + assert!( + bytes < 30, + "expected < 30 bytes after fix; got {} bytes: {:#?}", + bytes, + arm + ); + } + + /// Issue #94: signed variant. `i64.shr_s 32` extracts the upper 32 bits + /// just like `shr_u`, but the result is sign-extended into the (still- + /// unused) hi half via a single `asr rd_hi, rn_hi, #31`. + #[test] + fn test_issue94_shr_s_32_lowers_to_register_rename_with_sign_extend() { + let bridge = OptimizerBridge::new(); + let wasm_ops = vec![ + WasmOp::LocalGet(0), + WasmOp::I64Const(32), + WasmOp::I64ShrS, + WasmOp::I32WrapI64, + ]; + + let (instrs, _, _) = bridge.optimize_full(&wasm_ops).unwrap(); + let arm = bridge.ir_to_arm(&instrs, 2); + + let has_runtime_shift = arm.iter().any(|op| matches!(op, ArmOp::I64ShrS { .. })); + assert!( + !has_runtime_shift, + "i64.shr_s with constant 32 should NOT emit ArmOp::I64ShrS; got: {:#?}", + arm + ); + + // We expect exactly one ASR (for the sign-extend of the hi half). + let asr_count = arm + .iter() + .filter(|op| matches!(op, ArmOp::Asr { .. })) + .count(); + assert_eq!( + asr_count, 1, + "expected one ASR for sign extend; got: {:#?}", + arm + ); + + let bytes = count_arm_byte_size(&arm); + assert!( + bytes < 30, + "expected < 30 bytes after fix; got {} bytes: {:#?}", + bytes, + arm + ); + } + + /// Issue #94: `i64.and 0xFFFFFFFF` (lo32 mask) followed by `i32.wrap_i64` + /// should NOT emit two AND.W instructions; the lo half is unchanged and + /// the hi half is zero. Becomes a register rename + `mov rd_hi, #0`. + #[test] + fn test_issue94_and_mask_low32_lowers_to_register_rename() { + let bridge = OptimizerBridge::new(); + let wasm_ops = vec![ + WasmOp::LocalGet(0), + WasmOp::I64Const(0xFFFF_FFFF), + WasmOp::I64And, + WasmOp::I32WrapI64, + ]; + + let (instrs, _, _) = bridge.optimize_full(&wasm_ops).unwrap(); + let arm = bridge.ir_to_arm(&instrs, 2); + + // After fix: at most one AND should remain, and it shouldn't be the + // pair of ANDs from the generic i64.and lowering. (We allow zero ANDs + // because the lo half is unchanged via vreg rename.) + let and_count = arm + .iter() + .filter(|op| matches!(op, ArmOp::And { .. })) + .count(); + assert!( + and_count == 0, + "i64.and 0xFFFFFFFF should not emit any AND.W; got {} ANDs in {:#?}", + and_count, + arm + ); + } + + /// Sanity: a non-32 shift constant should still emit the full ArmOp::I64ShrU + /// (we don't want to silently regress the general case). + #[test] + fn test_issue94_shr_u_non_32_still_emits_runtime_shift() { + let bridge = OptimizerBridge::new(); + let wasm_ops = vec![ + WasmOp::LocalGet(0), + WasmOp::I64Const(7), // not 32 — generic path + WasmOp::I64ShrU, + WasmOp::I32WrapI64, + ]; + + let (instrs, _, _) = bridge.optimize_full(&wasm_ops).unwrap(); + let arm = bridge.ir_to_arm(&instrs, 2); + + let has_runtime_shift = arm.iter().any(|op| matches!(op, ArmOp::I64ShrU { .. })); + assert!( + has_runtime_shift, + "i64.shr_u with non-32 constant must still emit ArmOp::I64ShrU; got: {:#?}", + arm + ); + } + + /// Direct IR-level test (independent of wasm_to_ir's vreg numbering): + /// construct the IR by hand and verify the lowering eliminates I64ShrU. + #[test] + fn test_issue94_ir_level_shr_u_32() { + let bridge = OptimizerBridge::new(); + // Pattern: + // v0:v1 = I64Const 0x0123_4567_89AB_CDEF + // v2:v3 = I64Const 32 + // v4:v5 = I64ShrU v0:v1, v2:v3 + // + // Expected: no ArmOp::I64ShrU is emitted; v4 maps to the same ARM + // register as v1 (the hi of the source pair). + let instrs = vec![ + Instruction { + id: 0, + opcode: Opcode::I64Const { + dest_lo: OptReg(0), + dest_hi: OptReg(1), + value: 0x0123_4567_89AB_CDEFi64, + }, + block_id: 0, + is_dead: false, + }, + Instruction { + id: 2, + opcode: Opcode::I64Const { + dest_lo: OptReg(2), + dest_hi: OptReg(3), + value: 32, + }, + block_id: 0, + is_dead: false, + }, + Instruction { + id: 4, + opcode: Opcode::I64ShrU { + dest_lo: OptReg(4), + dest_hi: OptReg(5), + src1_lo: OptReg(0), + src1_hi: OptReg(1), + src2_lo: OptReg(2), + src2_hi: OptReg(3), + }, + block_id: 0, + is_dead: false, + }, + ]; + + let arm = bridge.ir_to_arm(&instrs, 0); + let has_runtime_shift = arm.iter().any(|op| matches!(op, ArmOp::I64ShrU { .. })); + assert!( + !has_runtime_shift, + "IR-level: i64.shr_u with constant 32 should NOT emit ArmOp::I64ShrU; got: {:#?}", + arm + ); + } }