From 49f6d661dead0976668eea5fae79a821aed4fe78 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Thu, 11 Jul 2019 15:49:28 -0700 Subject: [PATCH 1/4] Add x86 implementation of extractlane instruction --- .../meta/src/isa/x86/encodings.rs | 29 +++++++++++++-- cranelift-codegen/meta/src/isa/x86/recipes.rs | 23 +++++++++++- filetests/isa/x86/extractlane.clif | 35 +++++++++++++++++++ 3 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 filetests/isa/x86/extractlane.clif diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs index eaa5614bd..ed4cf18d9 100644 --- a/cranelift-codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs @@ -318,6 +318,7 @@ pub fn define( let copy_special = shared.by_name("copy_special"); let ctz = shared.by_name("ctz"); let debugtrap = shared.by_name("debugtrap"); + let extractlane = shared.by_name("extractlane"); let f32const = shared.by_name("f32const"); let f64const = shared.by_name("f64const"); let fadd = shared.by_name("fadd"); @@ -498,7 +499,8 @@ pub fn define( let rec_pushq = r.template("pushq"); let rec_ret = r.template("ret"); let rec_r_ib = r.template("r_ib"); - let rec_r_ib_unsigned = r.template("r_ib_unsigned"); + let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr"); + let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr"); let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r"); let rec_r_id = r.template("r_id"); let rec_rcmp = r.template("rcmp"); @@ -1642,7 +1644,9 @@ pub fn define( for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { let number_of_lanes = 128 / ty.lane_bits(); let instruction = x86_pshufd.bind_vector(ty, number_of_lanes); - let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]); + let template = rec_r_ib_unsigned_fpr + .nonrex() + .opcodes(vec![0x66, 0x0f, 0x70]); e.enc32_isap(instruction.clone(), template.clone(), use_sse2); e.enc64_isap(instruction, 
template, use_sse2); } @@ -1682,6 +1686,27 @@ pub fn define( } } + // SIMD extractlane + let mut extractlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new(); + extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], use_sse41)); // PEXTRB + extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], use_sse2)); // PEXTRW, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes + extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRD + extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRQ, only x86_64 + + for ty in ValueType::all_lane_types() { + if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { + let number_of_lanes = 128 / ty.lane_bits(); + let instruction = extractlane.bind_vector(ty, number_of_lanes); + let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); + if ty.lane_bits() < 64 { + e.enc_32_64_isap(instruction, template.nonrex(), isap.clone()); + } else { + // turns out the 64-bit widths have REX/W encodings and only are available on x86_64 + e.enc64_isap(instruction, template.rex().w(), isap.clone()); + } + } + } + // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { let instruction = bitcast.bind_vector(ty, 16).bind(F64); diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs index b948c0c2e..623689cea 100644 --- a/cranelift-codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs @@ -800,7 +800,7 @@ pub fn define<'shared>( { let format = formats.get(f_extract_lane); recipes.add_template_recipe( - EncodingRecipeBuilder::new("r_ib_unsigned", f_extract_lane, 2) + EncodingRecipeBuilder::new("r_ib_unsigned_fpr", f_extract_lane, 2) .operands_in(vec![fpr]) .operands_out(vec![fpr]) 
.inst_predicate(InstructionPredicate::new_is_unsigned_int( @@ -817,6 +817,27 @@ pub fn define<'shared>( ); } + // XX /r ib with 8-bit unsigned immediate (e.g. for extractlane) + { + let format = formats.get(f_extract_lane); + recipes.add_template_recipe( + EncodingRecipeBuilder::new("r_ib_unsigned_gpr", f_extract_lane, 2) + .operands_in(vec![fpr]) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + format, "lane", 8, 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte + let imm:i64 = lane.into(); + sink.put1(imm as u8); + "#, + ), + ); + } + // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane) { let format = formats.get(f_insert_lane); diff --git a/filetests/isa/x86/extractlane.clif b/filetests/isa/x86/extractlane.clif new file mode 100644 index 000000000..e7a1ea898 --- /dev/null +++ b/filetests/isa/x86/extractlane.clif @@ -0,0 +1,35 @@ +test binemit +set enable_simd +target x86_64 haswell + +function %test_extractlane_b8() { +ebb0: +[-, %rax] v0 = bconst.b8 true +[-, %xmm0] v1 = splat.b8x16 v0 +[-, %rax] v2 = extractlane v1, 10 ; bin: 66 0f 3a 14 c0 0a + return +} + +function %test_extractlane_i16() { +ebb0: +[-, %rax] v0 = iconst.i16 4 +[-, %xmm1] v1 = splat.i16x8 v0 +[-, %rax] v2 = extractlane v1, 4 ; bin: 66 0f c5 c8 04 + return +} + +function %test_extractlane_i32() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %xmm4] v1 = splat.i32x4 v0 +[-, %rcx] v2 = extractlane v1, 2 ; bin: 66 0f 3a 16 e1 02 + return +} + +function %test_extractlane_f64() { +ebb0: +[-, %rax] v0 = f64const 0x0.0 +[-, %xmm2] v1 = splat.f64x2 v0 +[-, %rbx] v2 = extractlane v1, 1 ; bin: 66 48 0f 3a 16 d3 01 + return +} From ac74c3ae5cd9d75bb0457f021791b3cfb71fb240 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Thu, 11 Jul 2019 15:59:39 -0700 Subject: [PATCH 2/4] Avoid unnecessary lane calculations in codegen code This refactor moves the 
calculation of the number of lanes to code closer to where the Instruction/BoundInstruction is bound. --- .../meta/src/cdsl/instructions.rs | 29 +++++++++++++++---- .../meta/src/isa/x86/encodings.rs | 28 +++++++++--------- .../meta/src/isa/x86/legalize.rs | 23 ++++++++------- 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/cranelift-codegen/meta/src/cdsl/instructions.rs b/cranelift-codegen/meta/src/cdsl/instructions.rs index 43a879066..2a40d5128 100644 --- a/cranelift-codegen/meta/src/cdsl/instructions.rs +++ b/cranelift-codegen/meta/src/cdsl/instructions.rs @@ -181,8 +181,17 @@ impl Instruction { bind_ref(self.clone(), Some(reference_type.into()), Vec::new()) } - pub fn bind_vector(&self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction { - bind_vector(self.clone(), lane_type.into(), num_lanes, Vec::new()) + pub fn bind_vector_from_lane( + &self, + lane_type: impl Into<LaneType>, + vector_size_in_bits: u64, + ) -> BoundInstruction { + bind_vector( + self.clone(), + lane_type.into(), + vector_size_in_bits, + Vec::new(), + ) } pub fn bind_any(&self) -> BoundInstruction { @@ -414,8 +423,17 @@ impl BoundInstruction { bind_ref(self.inst, Some(reference_type.into()), self.value_types) } - pub fn bind_vector(self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction { - bind_vector(self.inst, lane_type.into(), num_lanes, self.value_types) + pub fn bind_vector_from_lane( + self, + lane_type: impl Into<LaneType>, + vector_size_in_bits: u64, + ) -> BoundInstruction { + bind_vector( + self.inst, + lane_type.into(), + vector_size_in_bits, + self.value_types, + ) } pub fn bind_any(self) -> BoundInstruction { @@ -1116,9 +1134,10 @@ fn bind_ref( fn bind_vector( inst: Instruction, lane_type: LaneType, - num_lanes: u64, + vector_size_in_bits: u64, mut value_types: Vec<ValueTypeOrAny>, ) -> BoundInstruction { + let num_lanes = vector_size_in_bits / lane_type.lane_bits(); let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes)); 
value_types.push(ValueTypeOrAny::ValueType(vector_type)); verify_polymorphic_binding(&inst, &value_types); diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs index ed4cf18d9..3b4be51a6 100644 --- a/cranelift-codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs @@ -1627,23 +1627,24 @@ pub fn define( e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e])); e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e])); + // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available + let sse_vector_size: u64 = 128; + // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the // value across the register // PSHUFB, 8-bit shuffle using two XMM registers for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { - let number_of_lanes = 128 / ty.lane_bits(); - let instruction = x86_pshufb.bind_vector(ty, number_of_lanes); - let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]); + let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size); + let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]); e.enc32_isap(instruction.clone(), template.clone(), use_ssse3); e.enc64_isap(instruction, template, use_ssse3); } // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { - let number_of_lanes = 128 / ty.lane_bits(); - let instruction = x86_pshufd.bind_vector(ty, number_of_lanes); + let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_fpr .nonrex() .opcodes(vec![0x66, 0x0f, 0x70]); @@ -1655,8 +1656,9 @@ pub fn define( // to the Intel manual: "When the destination operand is an XMM register, the source 
operand is // written to the low doubleword of the register and the regiser is zero-extended to 128 bits." for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) { - let number_of_lanes = 128 / ty.lane_bits(); - let instruction = scalar_to_vector.bind_vector(ty, number_of_lanes).bind(ty); + let instruction = scalar_to_vector + .bind_vector_from_lane(ty, sse_vector_size) + .bind(ty); let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ if ty.lane_bits() < 64 { // no 32-bit encodings for 64-bit widths @@ -1674,8 +1676,7 @@ pub fn define( for ty in ValueType::all_lane_types() { if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) { - let number_of_lanes = 128 / ty.lane_bits(); - let instruction = insertlane.bind_vector(ty, number_of_lanes); + let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_r.opcodes(opcode.clone()); if ty.lane_bits() < 64 { e.enc_32_64_isap(instruction, template.nonrex(), isap.clone()); @@ -1695,8 +1696,7 @@ pub fn define( for ty in ValueType::all_lane_types() { if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { - let number_of_lanes = 128 / ty.lane_bits(); - let instruction = extractlane.bind_vector(ty, number_of_lanes); + let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); if ty.lane_bits() < 64 { e.enc_32_64_isap(instruction, template.nonrex(), isap.clone()); @@ -1709,7 +1709,7 @@ pub fn define( // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { - let instruction = bitcast.bind_vector(ty, 16).bind(F64); + let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64); e.enc32_rec(instruction.clone(), rec_null_fpr, 0); e.enc64_rec(instruction, rec_null_fpr, 0); } @@ -1719,8 +1719,8 @@ 
pub fn define( for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type) { let instruction = raw_bitcast - .bind_vector(to_type, 128 / to_type.lane_bits()) - .bind_vector(from_type, 128 / from_type.lane_bits()); + .bind_vector_from_lane(to_type, sse_vector_size) + .bind_vector_from_lane(from_type, sse_vector_size); e.enc32_rec(instruction.clone(), rec_null_fpr, 0); e.enc64_rec(instruction, rec_null_fpr, 0); } diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs index 8423b1099..a64f63116 100644 --- a/cranelift-codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs @@ -320,12 +320,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou let c = var("c"); let d = var("d"); + // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available + let sse_vector_size: u64 = 128; + // SIMD splat: 8-bits for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { - let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits()); - let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64); + let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size); + let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64); narrow.legalize( - def!(y = splat_x8x16(x)), + def!(y = splat_any8x16(x)), vec![ def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0 @@ -337,13 +340,13 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou // SIMD splat: 16-bits for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) { - let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits()); + let splat_x16x8 = 
splat.bind_vector_from_lane(ty, sse_vector_size); let raw_bitcast_any16x8_to_i32x4 = raw_bitcast - .bind_vector(I32, 4) - .bind_vector(ty, 128 / ty.lane_bits()); + .bind_vector_from_lane(I32, sse_vector_size) + .bind_vector_from_lane(ty, sse_vector_size); let raw_bitcast_i32x4_to_any16x8 = raw_bitcast - .bind_vector(ty, 128 / ty.lane_bits()) - .bind_vector(I32, 4); + .bind_vector_from_lane(ty, sse_vector_size) + .bind_vector_from_lane(I32, sse_vector_size); narrow.legalize( def!(y = splat_x16x8(x)), vec![ @@ -358,7 +361,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou // SIMD splat: 32-bits for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { - let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits()); + let splat_any32x4 = splat.bind_vector_from_lane(ty, sse_vector_size); narrow.legalize( def!(y = splat_any32x4(x)), vec![ @@ -370,7 +373,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou // SIMD splat: 64-bits for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) { - let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits()); + let splat_any64x2 = splat.bind_vector_from_lane(ty, sse_vector_size); narrow.legalize( def!(y = splat_any64x2(x)), vec![ From 5fd8c91af9284626d37ba475f2bfc231c6f7b8cc Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Thu, 11 Jul 2019 16:10:42 -0700 Subject: [PATCH 3/4] Fix static analysis warnings --- cranelift-codegen/meta/src/cdsl/instructions.rs | 6 +++--- cranelift-codegen/meta/src/isa/x86/recipes.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cranelift-codegen/meta/src/cdsl/instructions.rs b/cranelift-codegen/meta/src/cdsl/instructions.rs index 2a40d5128..f31e53077 100644 --- a/cranelift-codegen/meta/src/cdsl/instructions.rs +++ b/cranelift-codegen/meta/src/cdsl/instructions.rs @@ -79,7 +79,7 @@ impl InstructionGroup { pub fn by_name(&self, name: &'static str) -> &Instruction { self.instructions 
.iter() - .find(|inst| inst.name == name) + .find(|inst| &inst.name == name) .expect(&format!("unexisting instruction with name {}", name)) } } @@ -155,7 +155,7 @@ impl ops::Deref for Instruction { impl Instruction { pub fn snake_name(&self) -> &str { - if self.name == "return" { + if &self.name == "return" { "return_" } else { &self.name @@ -800,7 +800,7 @@ impl InstructionPredicateNode { ret.extend(node.collect_leaves()); } } - _ => ret.push(&self), + _ => ret.push(self), } ret } diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs index 623689cea..3e773bc34 100644 --- a/cranelift-codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs @@ -53,7 +53,7 @@ impl<'builder> RecipeGroup<'builder> { pub fn recipe(&self, name: &str) -> &EncodingRecipe { self.recipes .iter() - .find(|recipe| recipe.name == name) + .find(|recipe| &recipe.name == name) .expect(&format!("unknown recipe name: {}. Try template?", name)) } pub fn template(&self, name: &str) -> &Template { From 8bd1ff4dc845478027eb228c126541bb0bd3a4c3 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 17 Jul 2019 09:45:58 -0700 Subject: [PATCH 4/4] Remove SSE2 setting for x86 In talking to @sunfishcode, he preferred to avoid the confusion of more ISA predicates by eliminating SSE2. SSE2 was released with the Pentium 4 in 2000 so it is unlikely that current CPUs would have SIMD enabled and not have this feature. I tried to note the SSE2-specific instructions with comments in the code. 
--- .../meta/src/isa/x86/encodings.rs | 71 +++++++++++++------ .../meta/src/isa/x86/settings.rs | 9 +-- filetests/isa/x86/pshufb.clif | 2 +- filetests/isa/x86/pshufd.clif | 2 +- filetests/isa/x86/scalar_to_vector.clif | 2 +- 5 files changed, 53 insertions(+), 33 deletions(-) diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs index 3b4be51a6..c5bbfe1a8 100644 --- a/cranelift-codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs @@ -268,14 +268,38 @@ impl PerCpuModeEncodings { } /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened - fn enc_32_64_isap( + fn enc_32_64_maybe_isap( &mut self, inst: BoundInstruction, template: Template, - isap: SettingPredicateNumber, + isap: Option<SettingPredicateNumber>, ) { - self.enc32_isap(inst.clone(), template.clone(), isap); - self.enc64_isap(inst, template, isap); + self.enc32_maybe_isap(inst.clone(), template.clone(), isap); + self.enc64_maybe_isap(inst, template, isap); + } + + fn enc32_maybe_isap( + &mut self, + inst: BoundInstruction, + template: Template, + isap: Option<SettingPredicateNumber>, + ) { + match isap { + None => self.enc32(inst, template), + Some(isap) => self.enc32_isap(inst, template, isap), + } + } + + fn enc64_maybe_isap( + &mut self, + inst: BoundInstruction, + template: Template, + isap: Option<SettingPredicateNumber>, + ) { + match isap { + None => self.enc64(inst, template), + Some(isap) => self.enc64_isap(inst, template, isap), + } } } @@ -559,7 +583,6 @@ pub fn define( let use_popcnt = settings.predicate_by_name("use_popcnt"); let use_lzcnt = settings.predicate_by_name("use_lzcnt"); let use_bmi1 = settings.predicate_by_name("use_bmi1"); - let use_sse2 = settings.predicate_by_name("use_sse2"); let use_ssse3 = settings.predicate_by_name("use_ssse3"); let use_sse41 = settings.predicate_by_name("use_sse41"); @@ -1648,8 +1671,8 @@ pub fn define( let template = rec_r_ib_unsigned_fpr .nonrex() .opcodes(vec![0x66, 
0x0f, 0x70]); - e.enc32_isap(instruction.clone(), template.clone(), use_sse2); - e.enc64_isap(instruction, template, use_sse2); + e.enc32(instruction.clone(), template.clone()); + e.enc64(instruction, template); } // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according @@ -1662,47 +1685,49 @@ pub fn define( let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ if ty.lane_bits() < 64 { // no 32-bit encodings for 64-bit widths - e.enc32_isap(instruction.clone(), template.clone(), use_sse2); + e.enc32(instruction.clone(), template.clone()); } - e.enc_x86_64_isap(instruction, template, use_sse2); + e.enc_x86_64(instruction, template); } // SIMD insertlane - let mut insertlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new(); - insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], use_sse41)); // PINSRB - insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], use_sse2)); // PINSRW - insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRD - insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRQ, only x86_64 + let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> = + HashMap::new(); + insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41))); // PINSRB + insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2 + insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41))); // PINSRD + insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41))); // PINSRQ, only x86_64 for ty in ValueType::all_lane_types() { if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) { let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_r.opcodes(opcode.clone()); if ty.lane_bits() < 64 { - e.enc_32_64_isap(instruction, template.nonrex(), isap.clone()); + e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone()); } 
else { // turns out the 64-bit widths have REX/W encodings and only are available on x86_64 - e.enc64_isap(instruction, template.rex().w(), isap.clone()); + e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone()); } } } // SIMD extractlane - let mut extractlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new(); - extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], use_sse41)); // PEXTRB - extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], use_sse2)); // PEXTRW, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes - extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRD - extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRQ, only x86_64 + let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> = + HashMap::new(); + extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB + extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes + extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD + extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64 for ty in ValueType::all_lane_types() { if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); if ty.lane_bits() < 64 { - e.enc_32_64_isap(instruction, template.nonrex(), isap.clone()); + e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone()); } else { // turns out the 64-bit widths have REX/W encodings and only are available on x86_64 - e.enc64_isap(instruction, template.rex().w(), isap.clone()); + e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone()); } } } diff --git a/cranelift-codegen/meta/src/isa/x86/settings.rs 
b/cranelift-codegen/meta/src/isa/x86/settings.rs index bc8c81f48..3a4255338 100644 --- a/cranelift-codegen/meta/src/isa/x86/settings.rs +++ b/cranelift-codegen/meta/src/isa/x86/settings.rs @@ -3,9 +3,6 @@ use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder}; pub fn define(shared: &SettingGroup) -> SettingGroup { let mut settings = SettingGroupBuilder::new("x86"); - // CPUID.01H:EDX - let has_sse2 = settings.add_bool("has_sse2", "SSE2: CPUID.01H:EDX.SSE2[bit 26]", false); - // CPUID.01H:ECX let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false); let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false); @@ -35,7 +32,6 @@ pub fn define(shared: &SettingGroup) -> SettingGroup { let shared_enable_simd = shared.get_bool("enable_simd"); - settings.add_predicate("use_sse2", predicate!(shared_enable_simd && has_sse2)); settings.add_predicate("use_ssse3", predicate!(shared_enable_simd && has_ssse3)); settings.add_predicate("use_sse41", predicate!(shared_enable_simd && has_sse41)); settings.add_predicate( @@ -69,7 +65,7 @@ pub fn define(shared: &SettingGroup) -> SettingGroup { settings.add_preset("baseline", preset!()); let nehalem = settings.add_preset( "nehalem", - preset!(has_sse2 && has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt), + preset!(has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt), ); let haswell = settings.add_preset( "haswell", @@ -82,8 +78,7 @@ pub fn define(shared: &SettingGroup) -> SettingGroup { settings.add_preset( "znver1", preset!( - has_sse2 - && has_sse3 + has_sse3 && has_ssse3 && has_sse41 && has_sse42 diff --git a/filetests/isa/x86/pshufb.clif b/filetests/isa/x86/pshufb.clif index 7c23c5ab6..6fb31b198 100644 --- a/filetests/isa/x86/pshufb.clif +++ b/filetests/isa/x86/pshufb.clif @@ -1,6 +1,6 @@ test binemit set enable_simd -target x86_64 has_sse2=true has_ssse3=true +target x86_64 has_ssse3=true function %test_pshufb() { ebb0: diff 
--git a/filetests/isa/x86/pshufd.clif b/filetests/isa/x86/pshufd.clif index 183af4fc0..6f4896d0d 100644 --- a/filetests/isa/x86/pshufd.clif +++ b/filetests/isa/x86/pshufd.clif @@ -1,6 +1,6 @@ test binemit set enable_simd -target x86_64 has_sse2=true +target x86_64 function %test_pshuf() { ebb0: diff --git a/filetests/isa/x86/scalar_to_vector.clif b/filetests/isa/x86/scalar_to_vector.clif index 6c77dfafd..51ddea3e7 100644 --- a/filetests/isa/x86/scalar_to_vector.clif +++ b/filetests/isa/x86/scalar_to_vector.clif @@ -1,7 +1,7 @@ test binemit set opt_level=best set enable_simd -target x86_64 has_sse2=true +target x86_64 function %test_scalar_to_vector_b8() { ebb0: