From d3d83d1c31cfefffc45b0c819e5f7b3f18bcab7a Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Wed, 7 Oct 2020 15:47:02 +0200 Subject: [PATCH 1/2] machinst x64: use the (base,offset) addressing mode even in the presence of a uextend; --- cranelift/codegen/src/isa/x64/lower.rs | 29 ++++++++++++- .../filetests/isa/x64/amode-opt.clif | 41 +++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 cranelift/filetests/filetests/isa/x64/amode-opt.clif diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index a9a05177fb47..1da69c4d309b 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -449,6 +449,7 @@ fn lower_to_amode>(ctx: &mut C, spec: InsnInput, offset: i // We now either have an add that we must materialize, or some other input; as well as the // final offset. if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) { + debug_assert_eq!(ctx.output_ty(add, 0), types::I64); let add_inputs = &[ InsnInput { insn: add, @@ -480,7 +481,33 @@ fn lower_to_amode>(ctx: &mut C, spec: InsnInput, offset: i ) } else { for i in 0..=1 { - if let Some(cst) = ctx.get_input(add, i).constant { + let input = ctx.get_input(add, i); + + // Try to pierce through uextend. + if let Some(uextend) = matches_input( + ctx, + InsnInput { + insn: add, + input: i, + }, + Opcode::Uextend, + ) { + if let Some(cst) = ctx.get_input(uextend, 0).constant { + // Zero the upper bits. + let input_size = ctx.input_ty(uextend, 0).bits() as u64; + let shift: u64 = 64 - input_size; + let uext_cst: u64 = (cst << shift) >> shift; + + let final_offset = (offset as i64).wrapping_add(uext_cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base); + } + } + } + + // If it's a constant, add it directly! + if let Some(cst) = input.constant { let final_offset = (offset as i64).wrapping_add(cst as i64); if low32_will_sign_extend_to_64(final_offset as u64) { let base = put_input_in_reg(ctx, add_inputs[1 - i]); diff --git a/cranelift/filetests/filetests/isa/x64/amode-opt.clif b/cranelift/filetests/filetests/isa/x64/amode-opt.clif new file mode 100644 index 000000000000..dbeed5475eeb --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/amode-opt.clif @@ -0,0 +1,41 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %amode_add(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = iadd v0, v1 + v3 = load.i64 v2 + return v3 + ; check: movq 0(%rdi,%rsi,1), %r12 +} + +function %amode_add_imm(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 42 + v2 = iadd v0, v1 + v3 = load.i64 v2 + return v3 + ; check: movq 42(%rdi), %r12 +} + +;; Same as above, but add operands have been reversed. +function %amode_add_imm_order(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 42 + v2 = iadd v1, v0 + v3 = load.i64 v2 + return v3 + ; check: movq 42(%rdi), %r12 +} + +;; Make sure that uextend(cst) are ignored when the cst will naturally sign-extend. +function %amode_add_uext_imm(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 42 + v2 = uextend.i64 v1 + v3 = iadd v2, v0 + v4 = load.i64 v3 + return v4 + ; check: movq 42(%rdi), %r12 +} From a1a4e9a3a68e89c129bcd5a3cf431b310ce5a9ac Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Wed, 7 Oct 2020 16:31:16 +0200 Subject: [PATCH 2/2] machinst x64: avoid emitting movzx when the input is an ALU 32-bits operation; --- cranelift/codegen/src/isa/x64/inst/mod.rs | 6 +- cranelift/codegen/src/isa/x64/lower.rs | 73 +++++++++++++++---- .../filetests/isa/x64/uextend-elision.clif | 17 +++++ 3 files changed, 81 insertions(+), 15 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/uextend-elision.clif diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 5d4c409fda32..ec5861577350 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1168,7 +1168,11 @@ impl Inst { ) -> Inst { let rc = from_reg.get_class(); match rc { - RegClass::I64 => Inst::mov_r_m(ty.bytes() as u8, from_reg, to_addr, srcloc), + RegClass::I64 => { + // Always store the full register, to ensure that the high bits are properly set + // when doing a full reload. + Inst::mov_r_m(8 /* bytes */, from_reg, to_addr, srcloc) + } RegClass::V128 => { let opcode = match ty { types::F32 => SseOpcode::Movss, diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 1da69c4d309b..9ae014956545 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -70,6 +70,25 @@ fn matches_input>( }) } +/// Returns whether the given specified `input` is a result produced by an instruction with any of +/// the opcodes specified in `ops`. +fn matches_input_any>( + ctx: &mut C, + input: InsnInput, + ops: &[Opcode], +) -> Option { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + for &op in ops { + if data.opcode() == op { + return Some(src_inst); + } + } + None + }) +} + fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg { ctx.use_input_reg(input); input.reg @@ -1339,29 +1358,55 @@ fn lower_insn_to_regs>( let src_ty = ctx.input_ty(insn, 0); let dst_ty = ctx.output_ty(insn, 0); + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. + if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + let src = input_to_reg_mem(ctx, inputs[0]); let dst = get_output_reg(ctx, outputs[0]); let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); - assert!( - (src_ty.bits() < dst_ty.bits() && ext_mode.is_some()) || ext_mode.is_none(), + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), "unexpected extension: {} -> {}", src_ty, dst_ty ); - // All of these other opcodes are simply a move from a zero-extended source. Here - // is why this works, in each case: - // - // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we - // merely need to zero-extend here. - // - // - Breduce, Bextend: changing width of a boolean. We represent a - // bool as a 0 or 1, so again, this is a zero-extend / no-op. - // - // - Ireduce: changing width of an integer. Smaller ints are stored - // with undefined high-order bits, so we can simply do a copy. - if let Some(ext_mode) = ext_mode { if op == Opcode::Sextend { ctx.emit(Inst::movsx_rm_r( diff --git a/cranelift/filetests/filetests/isa/x64/uextend-elision.clif b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif new file mode 100644 index 000000000000..aed6068d428b --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif @@ -0,0 +1,17 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %elide_uextend_add(i32, i32) -> i64 { +block0(v0: i32, v1: i32): + ; check: pushq %rbp + ; check: movq %rsp, %rbp + v2 = iadd v0, v1 + ; check: addl %esi, %edi + v3 = uextend.i64 v2 + ; check: movq %rdi, %rax + ; check: movq %rbp, %rsp + ; check: popq %rbp + ; check: ret + return v3 +}