diff --git a/cranelift-codegen/meta/src/cdsl/types.rs b/cranelift-codegen/meta/src/cdsl/types.rs index f431bb3ed..92b9ab3a2 100644 --- a/cranelift-codegen/meta/src/cdsl/types.rs +++ b/cranelift-codegen/meta/src/cdsl/types.rs @@ -264,6 +264,27 @@ impl LaneType { ValueType::Vector(VectorType::new(*self, lanes.into())) } } + + pub fn is_float(&self) -> bool { + match self { + LaneType::FloatType(_) => true, + _ => false, + } + } + + pub fn is_int(&self) -> bool { + match self { + LaneType::IntType(_) => true, + _ => false, + } + } + + pub fn is_bool(&self) -> bool { + match self { + LaneType::BoolType(_) => true, + _ => false, + } + } } impl fmt::Display for LaneType { diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs index 2ae60896d..d773c2c66 100644 --- a/cranelift-codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs @@ -4,8 +4,8 @@ use std::collections::HashMap; use crate::cdsl::encodings::{Encoding, EncodingBuilder}; use crate::cdsl::instructions::{ - BoundInstruction, InstSpec, Instruction, InstructionGroup, InstructionPredicate, - InstructionPredicateNode, InstructionPredicateRegistry, + InstSpec, Instruction, InstructionGroup, InstructionPredicate, InstructionPredicateNode, + InstructionPredicateRegistry, }; use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes}; use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber}; @@ -234,7 +234,7 @@ impl PerCpuModeEncodings { } fn enc_both_isap( &mut self, - inst: BoundInstruction, + inst: impl Clone + Into, template: Template, isap: SettingPredicateNumber, ) { @@ -243,7 +243,7 @@ impl PerCpuModeEncodings { } fn enc_both_instp( &mut self, - inst: BoundInstruction, + inst: impl Clone + Into, template: Template, instp: InstructionPredicateNode, ) { @@ -279,6 +279,17 @@ impl PerCpuModeEncodings { } } + /// Add the same encoding/recipe pairing to both X86_32 and X86_64 + fn enc_32_64_rec( + &mut self, + inst: impl Clone + Into, + recipe: &EncodingRecipe, + bits: u16, + ) { + self.enc32_rec(inst.clone(), recipe, bits); + self.enc64_rec(inst, recipe, bits); + } + /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand /// binding) has already happened. fn enc_32_64_maybe_isap( @@ -356,7 +367,6 @@ pub(crate) fn define( let copy_to_ssa = shared.by_name("copy_to_ssa"); let ctz = shared.by_name("ctz"); let debugtrap = shared.by_name("debugtrap"); - let extractlane = shared.by_name("extractlane"); let f32const = shared.by_name("f32const"); let f64const = shared.by_name("f64const"); let fadd = shared.by_name("fadd"); @@ -386,7 +396,6 @@ pub(crate) fn define( let ifcmp_sp = shared.by_name("ifcmp_sp"); let imul = shared.by_name("imul"); let indirect_jump_table_br = shared.by_name("indirect_jump_table_br"); - let insertlane = shared.by_name("insertlane"); let ireduce = shared.by_name("ireduce"); let ishl = shared.by_name("ishl"); let ishl_imm = shared.by_name("ishl_imm"); @@ -459,7 +468,12 @@ pub(crate) fn define( let x86_cvtt2si = x86.by_name("x86_cvtt2si"); let x86_fmax = x86.by_name("x86_fmax"); let x86_fmin = x86.by_name("x86_fmin"); + let x86_insertps = x86.by_name("x86_insertps"); + let x86_movlhps = x86.by_name("x86_movlhps"); + let x86_movsd = x86.by_name("x86_movsd"); let x86_pop = x86.by_name("x86_pop"); + let x86_pextr = x86.by_name("x86_pextr"); + let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pshufd = x86.by_name("x86_pshufd"); let x86_pshufb = x86.by_name("x86_pshufb"); let x86_push = x86.by_name("x86_push"); @@ -490,6 +504,7 @@ pub(crate) fn define( let rec_f64imm_z = r.template("f64imm_z"); let rec_fa = r.template("fa"); let rec_fax = r.template("fax"); + let rec_fa_ib = r.template("fa_ib"); let rec_fcmp = r.template("fcmp"); let rec_fcscc = r.template("fcscc"); let rec_ffillnull = r.recipe("ffillnull"); @@ -1729,16 +1744,17 @@ pub(crate) fn define( e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e])); e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e])); - // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available + // SIMD vector size: eventually multiple vector sizes may be supported but for now only + // SSE-sized vectors are available. let sse_vector_size: u64 = 128; // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the - // value across the register + // value across the register. let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128; - // PSHUFB, 8-bit shuffle using two XMM registers + // PSHUFB, 8-bit shuffle using two XMM registers. for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size); let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]); @@ -1746,7 +1762,7 @@ pub(crate) fn define( e.enc64_isap(instruction, template, use_ssse3_simd); } - // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate + // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate. for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_fpr @@ -1761,64 +1777,84 @@ pub(crate) fn define( // written to the low doubleword of the register and the regiser is zero-extended to 128 bits." for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size); - let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ - if ty.lane_bits() < 64 { - // no 32-bit encodings for 64-bit widths - e.enc32(instruction.clone(), template.clone()); + if ty.is_float() { + e.enc_32_64_rec(instruction, rec_null_fpr, 0); + } else { + let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ + if ty.lane_bits() < 64 { + // no 32-bit encodings for 64-bit widths + e.enc32(instruction.clone(), template.clone()); + } + e.enc_x86_64(instruction, template); } - e.enc_x86_64(instruction, template); } // SIMD insertlane - let mut insertlane_mapping: HashMap, Option)> = + let mut x86_pinsr_mapping: HashMap, Option)> = HashMap::new(); - insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB - insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2 - insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD - insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64 + x86_pinsr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB + x86_pinsr_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2 + x86_pinsr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD + x86_pinsr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64 for ty in ValueType::all_lane_types().filter(allowed_simd_type) { - if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) { - let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size); + if let Some((opcode, isap)) = x86_pinsr_mapping.get(&ty.lane_bits()) { + let instruction = x86_pinsr.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_r.opcodes(opcode.clone()); if ty.lane_bits() < 64 { e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone()); } else { - // turns out the 64-bit widths have REX/W encodings and only are available on x86_64 + // It turns out the 64-bit widths have REX/W encodings and only are available on + // x86_64. e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone()); } } } + // For legalizing insertlane with floats, INSERTPS from SSE4.1. + { + let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size); + let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]); + e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + + // For legalizing insertlane with floats, MOVSD from SSE2. + { + let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size); + let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]); + e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2 + } + + // For legalizing insertlane with floats, MOVLHPS from SSE. + { + let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size); + let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]); + e.enc_32_64_maybe_isap(instruction, template, None); // from SSE + } + // SIMD extractlane - let mut extractlane_mapping: HashMap, Option)> = + let mut x86_pextr_mapping: HashMap, Option)> = HashMap::new(); - extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB - extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes - extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD - extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64 + x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB + x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes + x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD + x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64 for ty in ValueType::all_lane_types().filter(allowed_simd_type) { - if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { - let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size); + if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) { + let instruction = x86_pextr.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); if ty.lane_bits() < 64 { e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone()); } else { - // turns out the 64-bit widths have REX/W encodings and only are available on x86_64 + // It turns out the 64-bit widths have REX/W encodings and only are available on + // x86_64. e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone()); } } } - // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register - for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { - let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64); - e.enc32_rec(instruction.clone(), rec_null_fpr, 0); - e.enc64_rec(instruction, rec_null_fpr, 0); - } - - // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8) + // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8). for from_type in ValueType::all_lane_types().filter(allowed_simd_type) { for to_type in ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type) @@ -1826,8 +1862,28 @@ pub(crate) fn define( let instruction = raw_bitcast .bind_vector_from_lane(to_type, sse_vector_size) .bind_vector_from_lane(from_type, sse_vector_size); - e.enc32_rec(instruction.clone(), rec_null_fpr, 0); - e.enc64_rec(instruction, rec_null_fpr, 0); + e.enc_32_64_rec(instruction, rec_null_fpr, 0); + } + } + + // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an + // XMM register. + for float_type in &[F32, F64] { + for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) { + e.enc_32_64_rec( + raw_bitcast + .bind_vector_from_lane(lane_type, sse_vector_size) + .bind(*float_type), + rec_null_fpr, + 0, + ); + e.enc_32_64_rec( + raw_bitcast + .bind(*float_type) + .bind_vector_from_lane(lane_type, sse_vector_size), + rec_null_fpr, + 0, + ); } } diff --git a/cranelift-codegen/meta/src/isa/x86/instructions.rs b/cranelift-codegen/meta/src/isa/x86/instructions.rs index 03730cdea..b9f2496a8 100644 --- a/cranelift-codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift-codegen/meta/src/isa/x86/instructions.rs @@ -291,5 +291,101 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let Idx = &operand_doc("Idx", uimm8, "Lane index"); + let x = &operand("x", TxN); + let a = &operand("a", &TxN.lane_of()); + + ig.push( + Inst::new( + "x86_pextr", + r#" + Extract lane ``Idx`` from ``x``. + The lane index, ``Idx``, is an immediate value, not an SSA value. It + must indicate a valid lane index for the type of ``x``. + "#, + ) + .operands_in(vec![x, Idx]) + .operands_out(vec![a]), + ); + + let IBxN = &TypeVar::new( + "IBxN", + "A SIMD vector type containing only booleans and integers", + TypeSetBuilder::new() + .ints(Interval::All) + .bools(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let x = &operand("x", IBxN); + let y = &operand_doc("y", &IBxN.lane_of(), "New lane value"); + let a = &operand("a", IBxN); + + ig.push( + Inst::new( + "x86_pinsr", + r#" + Insert ``y`` into ``x`` at lane ``Idx``. + The lane index, ``Idx``, is an immediate value, not an SSA value. It + must indicate a valid lane index for the type of ``x``. + "#, + ) + .operands_in(vec![x, Idx, y]) + .operands_out(vec![a]), + ); + + let FxN = &TypeVar::new( + "FxN", + "A SIMD vector type containing floats", + TypeSetBuilder::new() + .floats(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let x = &operand("x", FxN); + let y = &operand_doc("y", &FxN.lane_of(), "New lane value"); + let a = &operand("a", FxN); + + ig.push( + Inst::new( + "x86_insertps", + r#" + Insert a lane of ``y`` into ``x`` at using ``Idx`` to encode both which lane the value is + extracted from and which it is inserted to. This is similar to x86_pinsr but inserts + floats, which are already stored in an XMM register. + "#, + ) + .operands_in(vec![x, Idx, y]) + .operands_out(vec![a]), + ); + + let x = &operand("x", FxN); + let y = &operand("y", FxN); + let a = &operand("a", FxN); + + ig.push( + Inst::new( + "x86_movsd", + r#" + Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x`` + "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_movlhps", + r#" + Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x`` + "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + ig.build() } diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs index 2fd160de3..555a93f9c 100644 --- a/cranelift-codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs @@ -20,10 +20,10 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct // List of instructions. let insts = &shared.instructions; let band = insts.by_name("band"); - let bitcast = insts.by_name("bitcast"); let bor = insts.by_name("bor"); let clz = insts.by_name("clz"); let ctz = insts.by_name("ctz"); + let extractlane = insts.by_name("extractlane"); let f64const = insts.by_name("f64const"); let fcmp = insts.by_name("fcmp"); let fcvt_from_uint = insts.by_name("fcvt_from_uint"); @@ -321,7 +321,9 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct // SIMD splat: 8-bits for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size); - let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64); + let bitcast_f64_to_any8x16 = raw_bitcast + .bind_vector_from_lane(ty, sse_vector_size) + .bind(F64); narrow.legalize( def!(y = splat_any8x16(x)), vec![ @@ -378,5 +380,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct ); } + narrow.custom_legalize(extractlane, "convert_extractlane"); + narrow.custom_legalize(insertlane, "convert_insertlane"); + narrow.build_and_add_to(&mut shared.transform_groups); } diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs index 3f14769de..8176effc4 100644 --- a/cranelift-codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs @@ -566,6 +566,27 @@ pub(crate) fn define<'shared>( ), ); + // XX /r with FPR ins and outs. A form with a byte immediate. + { + let format = formats.get(f_insert_lane); + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fa_ib", f_insert_lane, 2) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + format, "lane", 8, 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + let imm:i64 = lane.into(); + sink.put1(imm as u8); + "#, + ), + ); + } + // XX /n for a unary operation with extension bits. recipes.add_template_recipe( EncodingRecipeBuilder::new("ur", f_unary, 1) diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs index 843347ce9..3ebebfe18 100644 --- a/cranelift-codegen/meta/src/shared/instructions.rs +++ b/cranelift-codegen/meta/src/shared/instructions.rs @@ -1537,7 +1537,9 @@ pub(crate) fn define( Extract lane ``Idx`` from ``x``. The lane index, ``Idx``, is an immediate value, not an SSA value. It - must indicate a valid lane index for the type of ``x``. + must indicate a valid lane index for the type of ``x``. Note that the upper bits of ``a`` + may or may not be zeroed depending on the ISA but the type system should prevent using + ``a`` as anything other than the extracted value. "#, ) .operands_in(vec![x, Idx]) @@ -2782,9 +2784,11 @@ pub(crate) fn define( Inst::new( "scalar_to_vector", r#" - Scalar To Vector -- move a value out of a scalar register and into a vector - register; the scalar will be moved to the lowest-order bits of the vector - register and any higher bits will be zeroed. + Scalar To Vector -- move a value out of a scalar register and into a vector register; the + scalar will be moved to the lowest-order bits of the vector register. Note that this + instruction is intended as a low-level legalization instruction and frontends should prefer + insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some + types (e.g. integers) but not for others (e.g. floats). "#, ) .operands_in(vec![s]) diff --git a/cranelift-codegen/src/context.rs b/cranelift-codegen/src/context.rs index 704d89177..5350b3444 100644 --- a/cranelift-codegen/src/context.rs +++ b/cranelift-codegen/src/context.rs @@ -33,6 +33,7 @@ use crate::timing; use crate::unreachable_code::eliminate_unreachable_code; use crate::value_label::{build_value_labels_ranges, ComparableSourceLoc, ValueLabelsRanges}; use crate::verifier::{verify_context, verify_locations, VerifierErrors, VerifierResult}; +use log::debug; use std::vec::Vec; /// Persistent data structures and compilation pipeline. @@ -129,6 +130,7 @@ impl Context { pub fn compile(&mut self, isa: &dyn TargetIsa) -> CodegenResult { let _tt = timing::compile(); self.verify_if(isa)?; + debug!("Compiling:\n{}", self.func.display(isa)); self.compute_cfg(); if isa.flags().opt_level() != OptLevel::Fastest { @@ -158,7 +160,10 @@ impl Context { self.redundant_reload_remover(isa)?; self.shrink_instructions(isa)?; } - self.relax_branches(isa) + let result = self.relax_branches(isa); + + debug!("Compiled:\n{}", self.func.display(isa)); + result } /// Emit machine code directly into raw memory. @@ -256,6 +261,7 @@ impl Context { self.domtree.clear(); self.loop_analysis.clear(); legalize_function(&mut self.func, &mut self.cfg, isa); + debug!("Legalized:\n{}", self.func.display(isa)); self.verify_if(isa) } diff --git a/cranelift-codegen/src/isa/x86/enc_tables.rs b/cranelift-codegen/src/isa/x86/enc_tables.rs index e0fc05178..f67d7f0b6 100644 --- a/cranelift-codegen/src/isa/x86/enc_tables.rs +++ b/cranelift-codegen/src/isa/x86/enc_tables.rs @@ -5,6 +5,7 @@ use crate::bitset::BitSet; use crate::cursor::{Cursor, FuncCursor}; use crate::flowgraph::ControlFlowGraph; use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::types::*; use crate::ir::{self, Function, Inst, InstBuilder}; use crate::isa::constraints::*; use crate::isa::enc_tables::*; @@ -893,3 +894,121 @@ fn expand_fcvt_to_uint_sat( cfg.recompute_ebb(pos.func, uint_large_ebb); cfg.recompute_ebb(pos.func, done); } + +/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF +/// extractlane instruction +fn convert_extractlane( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::ExtractLane { + opcode: ir::Opcode::Extractlane, + arg, + lane, + } = pos.func.dfg[inst] + { + // NOTE: the following legalization assumes that the upper bits of the XMM register do + // not need to be zeroed during extractlane. + let value_type = pos.func.dfg.value_type(arg); + if value_type.lane_type().is_float() { + // Floats are already in XMM registers and can stay there. + let shuffled = if lane != 0 { + // Replace the extractlane with a PSHUFD to get the float in the right place. + match value_type { + F32X4 => { + // Move the selected lane to the 0 lane. + let shuffle_mask: u8 = 0b00_00_00_00 | lane; + pos.ins().x86_pshufd(arg, shuffle_mask) + } + F64X2 => { + assert_eq!(lane, 1); + // Because we know the lane == 1, we move the upper 64 bits to the lower + // 64 bits, leaving the top 64 bits as-is. + let shuffle_mask = 0b11_10_11_10; + let bitcast = pos.ins().raw_bitcast(F32X4, arg); + pos.ins().x86_pshufd(bitcast, shuffle_mask) + } + _ => unreachable!(), + } + } else { + // Remove the extractlane instruction, leaving the float where it is. + arg + }; + // Then we must bitcast to the right type. + pos.func + .dfg + .replace(inst) + .raw_bitcast(value_type.lane_type(), shuffled); + } else { + // For non-floats, lower with the usual PEXTR* instruction. + pos.func.dfg.replace(inst).x86_pextr(arg, lane); + } + } +} + +/// Because floats exist in XMM registers, we can keep them there when executing a CLIF +/// insertlane instruction +fn convert_insertlane( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::InsertLane { + opcode: ir::Opcode::Insertlane, + args: [vector, replacement], + lane, + } = pos.func.dfg[inst] + { + let value_type = pos.func.dfg.value_type(vector); + if value_type.lane_type().is_float() { + // Floats are already in XMM registers and can stay there. + match value_type { + F32X4 => { + assert!(lane > 0 && lane <= 3); + let immediate = 0b00_00_00_00 | lane << 4; + // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane + // shifted into bits 5:6). + pos.func + .dfg + .replace(inst) + .x86_insertps(vector, immediate, replacement) + } + F64X2 => { + let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types + if lane == 0 { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + pos.func + .dfg + .replace(inst) + .x86_movsd(vector, replacement_as_vector) + } else { + assert_eq!(lane, 1); + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + pos.func + .dfg + .replace(inst) + .x86_movlhps(vector, replacement_as_vector) + } + } + _ => unreachable!(), + }; + } else { + // For non-floats, lower with the usual PINSR* instruction. + pos.func + .dfg + .replace(inst) + .x86_pinsr(vector, lane, replacement); + } + } +} diff --git a/cranelift-codegen/src/verifier/locations.rs b/cranelift-codegen/src/verifier/locations.rs index bf1a4e186..cf17ae13d 100644 --- a/cranelift-codegen/src/verifier/locations.rs +++ b/cranelift-codegen/src/verifier/locations.rs @@ -107,8 +107,10 @@ impl<'a> LocationVerifier<'a> { fatal!( errors, inst, - "{} constraints not satisfied", - self.encinfo.display(enc) + "{} constraints not satisfied in: {}\n{}", + self.encinfo.display(enc), + self.func.dfg.display_inst(inst, self.isa), + self.func.display(self.isa) ) } diff --git a/cranelift-native/src/lib.rs b/cranelift-native/src/lib.rs index 0687e7017..cfa27cef8 100644 --- a/cranelift-native/src/lib.rs +++ b/cranelift-native/src/lib.rs @@ -59,6 +59,9 @@ fn parse_x86_cpuid(isa_builder: &mut isa::Builder) -> Result<(), &'static str> { if info.has_sse3() { isa_builder.enable("has_sse3").unwrap(); } + if info.has_ssse3() { + isa_builder.enable("has_ssse3").unwrap(); + } if info.has_sse41() { isa_builder.enable("has_sse41").unwrap(); } diff --git a/cranelift-wasm/src/code_translator.rs b/cranelift-wasm/src/code_translator.rs index 1801c6539..e5ce5a0c3 100644 --- a/cranelift-wasm/src/code_translator.rs +++ b/cranelift-wasm/src/code_translator.rs @@ -940,6 +940,16 @@ pub fn translate_operator( let splatted = builder.ins().splat(ty, value_to_splat); state.push1(splatted) } + Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => { + let vector = optionally_bitcast_vector(state.pop1(), type_of(op), builder); + let extracted = builder.ins().extractlane(vector, lane.clone()); + state.push1(builder.ins().sextend(I32, extracted)) + } + Operator::I8x16ExtractLaneU { lane } | Operator::I16x8ExtractLaneU { lane } => { + let vector = optionally_bitcast_vector(state.pop1(), type_of(op), builder); + state.push1(builder.ins().extractlane(vector, lane.clone())); + // on x86, PEXTRB zeroes the upper bits of the destination register of extractlane so uextend is elided; of course, this depends on extractlane being legalized to a PEXTRB + } Operator::I32x4ExtractLane { lane } | Operator::I64x2ExtractLane { lane } | Operator::F32x4ExtractLane { lane } @@ -967,10 +977,6 @@ pub fn translate_operator( } Operator::V128Load { .. } | Operator::V128Store { .. } - | Operator::I8x16ExtractLaneS { .. } - | Operator::I8x16ExtractLaneU { .. } - | Operator::I16x8ExtractLaneS { .. } - | Operator::I16x8ExtractLaneU { .. } | Operator::V8x16Shuffle { .. } | Operator::I8x16Eq | Operator::I8x16Ne diff --git a/filetests/isa/x86/extractlane-binemit.clif b/filetests/isa/x86/extractlane-binemit.clif new file mode 100644 index 000000000..0a3b776a9 --- /dev/null +++ b/filetests/isa/x86/extractlane-binemit.clif @@ -0,0 +1,38 @@ +test binemit +set enable_simd +target x86_64 haswell + +; for extractlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pextr +; which is manually placed in the IR so that it can be binemit-tested + +function %test_extractlane_b8() { +ebb0: +[-, %rax] v0 = bconst.b8 true +[-, %xmm0] v1 = splat.b8x16 v0 +[-, %rax] v2 = x86_pextr v1, 10 ; bin: 66 0f 3a 14 c0 0a + return +} + +function %test_extractlane_i16() { +ebb0: +[-, %rax] v0 = iconst.i16 4 +[-, %xmm1] v1 = splat.i16x8 v0 +[-, %rax] v2 = x86_pextr v1, 4 ; bin: 66 0f c5 c8 04 + return +} + +function %test_extractlane_i32() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %xmm4] v1 = splat.i32x4 v0 +[-, %rcx] v2 = x86_pextr v1, 2 ; bin: 66 0f 3a 16 e1 02 + return +} + +function %test_extractlane_b64() { +ebb0: +[-, %rax] v0 = bconst.b64 false +[-, %xmm2] v1 = splat.b64x2 v0 +[-, %rbx] v2 = x86_pextr v1, 1 ; bin: 66 48 0f 3a 16 d3 01 + return +} diff --git a/filetests/isa/x86/extractlane-run.clif b/filetests/isa/x86/extractlane-run.clif new file mode 100644 index 000000000..adb2e7b8e --- /dev/null +++ b/filetests/isa/x86/extractlane-run.clif @@ -0,0 +1,68 @@ +test run +set enable_simd + +function %test_extractlane_b8() -> b8 { +ebb0: + v1 = vconst.b8x16 [false false false false false false false false false false true false false + false false false] + v2 = extractlane v1, 10 + return v2 +} +; run + +function %test_extractlane_i16() -> b1 { +ebb0: + v0 = vconst.i16x8 0x00080007000600050004000300020001 + v1 = extractlane v0, 1 + v2 = icmp_imm eq v1, 2 + return v2 +} +; run + +function %test_extractlane_f32() -> b1 { +ebb0: + v0 = f32const 0x42.42 + v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42] + v2 = extractlane v1, 3 + v3 = fcmp eq v2, v0 + return v3 +} +; run + +function %test_extractlane_i32_with_vector_reuse() -> b1 { +ebb0: + v0 = iconst.i32 42 + v1 = iconst.i32 99 + + v2 = splat.i32x4 v0 + v3 = insertlane v2, 2, v1 + + v4 = extractlane v3, 3 + v5 = icmp eq v4, v0 + + v6 = extractlane v3, 2 + v7 = icmp eq v6, v1 + + v8 = band v5, v7 + return v8 +} +; run + +function %test_extractlane_f32_with_vector_reuse() -> b1 { +ebb0: + v0 = f32const 0x42.42 + v1 = f32const 0x99.99 + + v2 = splat.f32x4 v0 + v3 = insertlane v2, 2, v1 + + v4 = extractlane v3, 3 + v5 = fcmp eq v4, v0 + + v6 = extractlane v3, 2 + v7 = fcmp eq v6, v1 + + v8 = band v5, v7 + return v8 +} +; run diff --git a/filetests/isa/x86/extractlane.clif b/filetests/isa/x86/extractlane.clif deleted file mode 100644 index e7a1ea898..000000000 --- a/filetests/isa/x86/extractlane.clif +++ /dev/null @@ -1,35 +0,0 @@ -test binemit -set enable_simd -target x86_64 haswell - -function %test_extractlane_b8() { -ebb0: -[-, %rax] v0 = bconst.b8 true -[-, %xmm0] v1 = splat.b8x16 v0 -[-, %rax] v2 = extractlane v1, 10 ; bin: 66 0f 3a 14 c0 0a - return -} - -function %test_extractlane_i16() { -ebb0: -[-, %rax] v0 = iconst.i16 4 -[-, %xmm1] v1 = splat.i16x8 v0 -[-, %rax] v2 = extractlane v1, 4 ; bin: 66 0f c5 c8 04 - return -} - -function %test_extractlane_i32() { -ebb0: -[-, %rax] v0 = iconst.i32 42 -[-, %xmm4] v1 = splat.i32x4 v0 -[-, %rcx] v2 = extractlane v1, 2 ; bin: 66 0f 3a 16 e1 02 - return -} - -function %test_extractlane_f64() { -ebb0: -[-, %rax] v0 = f64const 0x0.0 -[-, %xmm2] v1 = splat.f64x2 v0 -[-, %rbx] v2 = extractlane v1, 1 ; bin: 66 48 0f 3a 16 d3 01 - return -} diff --git a/filetests/isa/x86/insertlane-binemit.clif b/filetests/isa/x86/insertlane-binemit.clif new file mode 100644 index 000000000..c388ed6fa --- /dev/null +++ b/filetests/isa/x86/insertlane-binemit.clif @@ -0,0 +1,42 @@ +test binemit +set enable_simd +target x86_64 haswell + +; for insertlane, floats are legalized differently than integers and booleans; integers and +; booleans use x86_pinsr which is manually placed in the IR so that it can be binemit-tested + +function %test_insertlane_b8() { +ebb0: +[-, %rax] v0 = bconst.b8 true +[-, %rbx] v1 = bconst.b8 false +[-, %xmm0] v2 = splat.b8x16 v0 +[-, %xmm0] v3 = x86_pinsr v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a + return +} + +function %test_insertlane_i16() { +ebb0: +[-, %rax] v0 = iconst.i16 4 +[-, %rbx] v1 = iconst.i16 5 +[-, %xmm1] v2 = splat.i16x8 v0 +[-, %xmm1] v3 = x86_pinsr v2, 4, v1 ; bin: 66 0f c4 cb 04 + return +} + +function %test_insertlane_i32() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %rbx] v1 = iconst.i32 99 +[-, %xmm4] v2 = splat.i32x4 v0 +[-, %xmm4] v3 = x86_pinsr v2, 2, v1 ; bin: 66 0f 3a 22 e3 02 + return +} + +function %test_insertlane_b64() { +ebb0: +[-, %rax] v0 = bconst.b64 true +[-, %rbx] v1 = bconst.b64 false +[-, %xmm2] v2 = splat.b64x2 v0 +[-, %xmm2] v3 = x86_pinsr v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01 + return +} diff --git a/filetests/isa/x86/insertlane-run.clif b/filetests/isa/x86/insertlane-run.clif new file mode 100644 index 000000000..92fb38202 --- /dev/null +++ b/filetests/isa/x86/insertlane-run.clif @@ -0,0 +1,48 @@ +test run +set enable_simd + +; TODO once SIMD vector comparison is implemented, remove use of extractlane below + +function %test_insertlane_b8() -> b8 { +ebb0: + v1 = bconst.b8 true + v2 = vconst.b8x16 [false false false false false false false false false false false false false + false false false] + v3 = insertlane v2, 10, v1 + v4 = extractlane v3, 10 + return v4 +} +; run + +function %test_insertlane_f32() -> b1 { +ebb0: + v0 = f32const 0x42.42 + v1 = vconst.f32x4 0x00 + v2 = insertlane v1, 1, v0 + v3 = extractlane v2, 1 + v4 = fcmp eq v3, v0 + return v4 +} +; run + +function %test_insertlane_f64_lane1() -> b1 { +ebb0: + v0 = f64const 0x42.42 + v1 = vconst.f64x2 0x00 + v2 = insertlane v1, 1, v0 + v3 = extractlane v2, 1 + v4 = fcmp eq v3, v0 + return v4 +} +; run + +function %test_insertlane_f64_lane0() -> b1 { +ebb0: + v0 = f64const 0x42.42 + v1 = vconst.f64x2 0x00 + v2 = insertlane v1, 0, v0 + v3 = extractlane v2, 0 + v4 = fcmp eq v3, v0 + return v4 +} +; run diff --git a/filetests/isa/x86/insertlane.clif b/filetests/isa/x86/insertlane.clif deleted file mode 100644 index c55dc4033..000000000 --- a/filetests/isa/x86/insertlane.clif +++ /dev/null @@ -1,39 +0,0 @@ -test binemit -set enable_simd -target x86_64 haswell - -function %test_insertlane_b8() { -ebb0: -[-, %rax] v0 = bconst.b8 true -[-, %rbx] v1 = bconst.b8 false -[-, %xmm0] v2 = splat.b8x16 v0 -[-, %xmm0] v3 = insertlane v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a - return -} - -function %test_insertlane_i16() { -ebb0: -[-, %rax] v0 = iconst.i16 4 -[-, %rbx] v1 = iconst.i16 5 -[-, %xmm1] v2 = splat.i16x8 v0 -[-, %xmm1] v3 = insertlane v2, 4, v1 ; bin: 66 0f c4 cb 04 - return -} - -function %test_insertlane_i32() { -ebb0: -[-, %rax] v0 = iconst.i32 42 -[-, %rbx] v1 = iconst.i32 99 -[-, %xmm4] v2 = splat.i32x4 v0 -[-, %xmm4] v3 = insertlane v2, 2, v1 ; bin: 66 0f 3a 22 e3 02 - return -} - -function %test_insertlane_f64() { -ebb0: -[-, %rax] v0 = f64const 0x0.0 -[-, %rbx] v1 = f64const 0x4.2 -[-, %xmm2] v2 = splat.f64x2 v0 -[-, %xmm2] v3 = insertlane v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01 - return -} diff --git a/filetests/isa/x86/legalize-splat.clif b/filetests/isa/x86/legalize-splat.clif index fa07f80c1..19d61d529 100644 --- a/filetests/isa/x86/legalize-splat.clif +++ b/filetests/isa/x86/legalize-splat.clif @@ -33,7 +33,7 @@ ebb0: ; check: ebb0: ; nextln: v0 = iconst.i64 42 ; nextln: v2 = scalar_to_vector.i64x2 v0 -; nextln: v1 = insertlane v2, 1, v0 +; nextln: v1 = x86_pinsr v2, 1, v0 ; nextln: return v1 @@ -48,7 +48,7 @@ ebb0: ; check: ebb0: ; nextln: v0 = bconst.b16 true ; nextln: v2 = scalar_to_vector.b16x8 v0 -; nextln: v3 = insertlane v2, 1, v0 +; nextln: v3 = x86_pinsr v2, 1, v0 ; nextln: v4 = raw_bitcast.i32x4 v3 ; nextln: v5 = x86_pshufd v4, 0 ; nextln: v1 = raw_bitcast.b16x8 v5 @@ -68,6 +68,6 @@ ebb0: ; nextln: v0 = ireduce.i8 v2 ; nextln: v3 = scalar_to_vector.i8x16 v0 ; nextln: v4 = f64const 0.0 -; nextln: v5 = bitcast.i8x16 v4 +; nextln: v5 = raw_bitcast.i8x16 v4 ; nextln: v1 = x86_pshufb v3, v5 ; nextln: return v1 diff --git a/filetests/isa/x86/scalar_to_vector.clif b/filetests/isa/x86/scalar_to_vector-binemit.clif similarity index 80% rename from filetests/isa/x86/scalar_to_vector.clif rename to filetests/isa/x86/scalar_to_vector-binemit.clif index 51ddea3e7..b26f3d2e6 100644 --- a/filetests/isa/x86/scalar_to_vector.clif +++ b/filetests/isa/x86/scalar_to_vector-binemit.clif @@ -17,10 +17,10 @@ ebb0: return } -function %test_scalar_to_vector_f32() { +function %test_scalar_to_vector_b32() { ebb0: -[-, %rcx] v0 = f32const 0x0.42 -[-, %xmm3] v1 = scalar_to_vector.f32x4 v0 ; bin: 66 0f 6e d9 +[-, %rcx] v0 = bconst.b32 false +[-, %xmm3] v1 = scalar_to_vector.b32x4 v0 ; bin: 66 0f 6e d9 return } diff --git a/filetests/isa/x86/scalar_to_vector-compile.clif b/filetests/isa/x86/scalar_to_vector-compile.clif new file mode 100644 index 000000000..2d2ab331f --- /dev/null +++ b/filetests/isa/x86/scalar_to_vector-compile.clif @@ -0,0 +1,19 @@ +test compile +set opt_level=best +set probestack_enabled=false +set enable_simd +target x86_64 + +; ensure that scalar_to_vector emits no instructions for floats (already exist in an XMM register) +function %test_scalar_to_vector_f32() -> f32x4 baldrdash_system_v { +ebb0: + v0 = f32const 0x0.42 + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; check: ebb0 +; nextln: v2 = iconst.i32 0x3e84_0000 +; nextln: v0 = bitcast.f32 v2 +; nextln: [null_fpr#00,%xmm0] v1 = scalar_to_vector.f32x4 v0 +; nextln: return v1