From 49f6d661dead0976668eea5fae79a821aed4fe78 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Thu, 11 Jul 2019 15:49:28 -0700
Subject: [PATCH 1/4] Add x86 implementation of extractlane instruction

---
 .../meta/src/isa/x86/encodings.rs             | 29 +++++++++++++--
 cranelift-codegen/meta/src/isa/x86/recipes.rs | 23 +++++++++++-
 filetests/isa/x86/extractlane.clif            | 35 +++++++++++++++++++
 3 files changed, 84 insertions(+), 3 deletions(-)
 create mode 100644 filetests/isa/x86/extractlane.clif
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index eaa5614bd..ed4cf18d9 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -318,6 +318,7 @@ pub fn define(
     let copy_special = shared.by_name("copy_special");
     let ctz = shared.by_name("ctz");
     let debugtrap = shared.by_name("debugtrap");
+    let extractlane = shared.by_name("extractlane");
     let f32const = shared.by_name("f32const");
     let f64const = shared.by_name("f64const");
     let fadd = shared.by_name("fadd");
@@ -498,7 +499,8 @@ pub fn define(
     let rec_pushq = r.template("pushq");
     let rec_ret = r.template("ret");
     let rec_r_ib = r.template("r_ib");
-    let rec_r_ib_unsigned = r.template("r_ib_unsigned");
+    let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr");
+    let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr");
     let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
     let rec_r_id = r.template("r_id");
     let rec_rcmp = r.template("rcmp");
@@ -1642,7 +1644,9 @@ pub fn define(
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
         let number_of_lanes = 128 / ty.lane_bits();
         let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
-        let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]);
+        let template = rec_r_ib_unsigned_fpr
+            .nonrex()
+            .opcodes(vec![0x66, 0x0f, 0x70]);
         e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
         e.enc64_isap(instruction, template, use_sse2);
     }
@@ -1682,6 +1686,27 @@ pub fn define(
         }
     }
 
+    // SIMD extractlane
+    let mut extractlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new();
+    extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], use_sse41)); // PEXTRB
+    extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], use_sse2)); // PEXTRW, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
+    extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRD
+    extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRQ, only x86_64
+
+    for ty in ValueType::all_lane_types() {
+        if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
+            let number_of_lanes = 128 / ty.lane_bits();
+            let instruction = extractlane.bind_vector(ty, number_of_lanes);
+            let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
+            if ty.lane_bits() < 64 {
+                e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
+            } else {
+                // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
+                e.enc64_isap(instruction, template.rex().w(), isap.clone());
+            }
+        }
+    }
+
     // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
         let instruction = bitcast.bind_vector(ty, 16).bind(F64);
diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs
index b948c0c2e..623689cea 100644
--- a/cranelift-codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs
@@ -800,7 +800,7 @@ pub fn define<'shared>(
     {
         let format = formats.get(f_extract_lane);
         recipes.add_template_recipe(
-            EncodingRecipeBuilder::new("r_ib_unsigned", f_extract_lane, 2)
+            EncodingRecipeBuilder::new("r_ib_unsigned_fpr", f_extract_lane, 2)
                 .operands_in(vec![fpr])
                 .operands_out(vec![fpr])
                 .inst_predicate(InstructionPredicate::new_is_unsigned_int(
@@ -817,6 +817,27 @@ pub fn define<'shared>(
         );
     }
 
+    // XX /r ib with 8-bit unsigned immediate (e.g. for extractlane)
+    {
+        let format = formats.get(f_extract_lane);
+        recipes.add_template_recipe(
+            EncodingRecipeBuilder::new("r_ib_unsigned_gpr", f_extract_lane, 2)
+                .operands_in(vec![fpr])
+                .operands_out(vec![gpr])
+                .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+                    format, "lane", 8, 0,
+                ))
+                .emit(
+                    r#"
+                    {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+                    modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte
+                    let imm:i64 = lane.into();
+                    sink.put1(imm as u8);
+                "#,
+                ),
+        );
+    }
+
     // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane)
     {
         let format = formats.get(f_insert_lane);
diff --git a/filetests/isa/x86/extractlane.clif b/filetests/isa/x86/extractlane.clif
new file mode 100644
index 000000000..e7a1ea898
--- /dev/null
+++ b/filetests/isa/x86/extractlane.clif
@@ -0,0 +1,35 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+function %test_extractlane_b8() {
+ebb0:
+[-, %rax]   v0 = bconst.b8 true
+[-, %xmm0]  v1 = splat.b8x16 v0
+[-, %rax]   v2 = extractlane v1, 10    ; bin: 66 0f 3a 14 c0 0a
+            return
+}
+
+function %test_extractlane_i16() {
+ebb0:
+[-, %rax]   v0 = iconst.i16 4
+[-, %xmm1]  v1 = splat.i16x8 v0
+[-, %rax]   v2 = extractlane v1, 4    ; bin: 66 0f c5 c8 04
+            return
+}
+
+function %test_extractlane_i32() {
+ebb0:
+[-, %rax]   v0 = iconst.i32 42
+[-, %xmm4]  v1 = splat.i32x4 v0
+[-, %rcx]   v2 = extractlane v1, 2    ; bin: 66 0f 3a 16 e1 02
+            return
+}
+
+function %test_extractlane_f64() {
+ebb0:
+[-, %rax]   v0 = f64const 0x0.0
+[-, %xmm2]  v1 = splat.f64x2 v0
+[-, %rbx]   v2 = extractlane v1, 1    ; bin: 66 48 0f 3a 16 d3 01
+            return
+}

From ac74c3ae5cd9d75bb0457f021791b3cfb71fb240 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Thu, 11 Jul 2019 15:59:39 -0700
Subject: [PATCH 2/4] Avoid unnecessary lane calculations in codegen code

This refactor moves the calculation of the number of lanes to code closer to where the Instruction/BoundInstruction is bound.
---
 .../meta/src/cdsl/instructions.rs             | 29 +++++++++++++++----
 .../meta/src/isa/x86/encodings.rs             | 28 +++++++++---------
 .../meta/src/isa/x86/legalize.rs              | 23 ++++++++-------
 3 files changed, 51 insertions(+), 29 deletions(-)

diff --git a/cranelift-codegen/meta/src/cdsl/instructions.rs b/cranelift-codegen/meta/src/cdsl/instructions.rs
index 43a879066..2a40d5128 100644
--- a/cranelift-codegen/meta/src/cdsl/instructions.rs
+++ b/cranelift-codegen/meta/src/cdsl/instructions.rs
@@ -181,8 +181,17 @@ impl Instruction {
         bind_ref(self.clone(), Some(reference_type.into()), Vec::new())
     }
 
-    pub fn bind_vector(&self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
-        bind_vector(self.clone(), lane_type.into(), num_lanes, Vec::new())
+    pub fn bind_vector_from_lane(
+        &self,
+        lane_type: impl Into<LaneType>,
+        vector_size_in_bits: u64,
+    ) -> BoundInstruction {
+        bind_vector(
+            self.clone(),
+            lane_type.into(),
+            vector_size_in_bits,
+            Vec::new(),
+        )
     }
 
     pub fn bind_any(&self) -> BoundInstruction {
@@ -414,8 +423,17 @@ impl BoundInstruction {
         bind_ref(self.inst, Some(reference_type.into()), self.value_types)
     }
 
-    pub fn bind_vector(self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
-        bind_vector(self.inst, lane_type.into(), num_lanes, self.value_types)
+    pub fn bind_vector_from_lane(
+        self,
+        lane_type: impl Into<LaneType>,
+        vector_size_in_bits: u64,
+    ) -> BoundInstruction {
+        bind_vector(
+            self.inst,
+            lane_type.into(),
+            vector_size_in_bits,
+            self.value_types,
+        )
     }
 
     pub fn bind_any(self) -> BoundInstruction {
@@ -1116,9 +1134,10 @@ fn bind_ref(
 fn bind_vector(
     inst: Instruction,
     lane_type: LaneType,
-    num_lanes: u64,
+    vector_size_in_bits: u64,
     mut value_types: Vec<ValueTypeOrAny>,
 ) -> BoundInstruction {
+    let num_lanes = vector_size_in_bits / lane_type.lane_bits();
     let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes));
     value_types.push(ValueTypeOrAny::ValueType(vector_type));
     verify_polymorphic_binding(&inst, &value_types);
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index ed4cf18d9..3b4be51a6 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -1627,23 +1627,24 @@ pub fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
+    // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
+    let sse_vector_size: u64 = 128;
+
     // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
     // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
     // value across the register
 
     // PSHUFB, 8-bit shuffle using two XMM registers
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let number_of_lanes = 128 / ty.lane_bits();
-        let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
-        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
+        let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
         e.enc32_isap(instruction.clone(), template.clone(), use_ssse3);
         e.enc64_isap(instruction, template, use_ssse3);
     }
 
     // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
-        let number_of_lanes = 128 / ty.lane_bits();
-        let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
+        let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_r_ib_unsigned_fpr
             .nonrex()
             .opcodes(vec![0x66, 0x0f, 0x70]);
@@ -1655,8 +1656,9 @@ pub fn define(
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
-        let number_of_lanes = 128 / ty.lane_bits();
-        let instruction = scalar_to_vector.bind_vector(ty, number_of_lanes).bind(ty);
+        let instruction = scalar_to_vector
+            .bind_vector_from_lane(ty, sse_vector_size)
+            .bind(ty);
         let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
         if ty.lane_bits() < 64 {
             // no 32-bit encodings for 64-bit widths
@@ -1674,8 +1676,7 @@ pub fn define(
 
     for ty in ValueType::all_lane_types() {
         if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
-            let number_of_lanes = 128 / ty.lane_bits();
-            let instruction = insertlane.bind_vector(ty, number_of_lanes);
+            let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
@@ -1695,8 +1696,7 @@ pub fn define(
 
     for ty in ValueType::all_lane_types() {
         if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
-            let number_of_lanes = 128 / ty.lane_bits();
-            let instruction = extractlane.bind_vector(ty, number_of_lanes);
+            let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
@@ -1709,7 +1709,7 @@ pub fn define(
 
     // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let instruction = bitcast.bind_vector(ty, 16).bind(F64);
+        let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
         e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
         e.enc64_rec(instruction, rec_null_fpr, 0);
     }
@@ -1719,8 +1719,8 @@ pub fn define(
         for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
         {
             let instruction = raw_bitcast
-                .bind_vector(to_type, 128 / to_type.lane_bits())
-                .bind_vector(from_type, 128 / from_type.lane_bits());
+                .bind_vector_from_lane(to_type, sse_vector_size)
+                .bind_vector_from_lane(from_type, sse_vector_size);
             e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
             e.enc64_rec(instruction, rec_null_fpr, 0);
         }
diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs
index 8423b1099..a64f63116 100644
--- a/cranelift-codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs
@@ -320,12 +320,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
     let c = var("c");
     let d = var("d");
 
+    // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
+    let sse_vector_size: u64 = 128;
+
     // SIMD splat: 8-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
-        let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
+        let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size);
+        let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
         narrow.legalize(
-            def!(y = splat_x8x16(x)),
+            def!(y = splat_any8x16(x)),
             vec![
                 def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
                 def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
@@ -337,13 +340,13 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     // SIMD splat: 16-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
-        let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let splat_x16x8 = splat.bind_vector_from_lane(ty, sse_vector_size);
         let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
-            .bind_vector(I32, 4)
-            .bind_vector(ty, 128 / ty.lane_bits());
+            .bind_vector_from_lane(I32, sse_vector_size)
+            .bind_vector_from_lane(ty, sse_vector_size);
         let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
-            .bind_vector(ty, 128 / ty.lane_bits())
-            .bind_vector(I32, 4);
+            .bind_vector_from_lane(ty, sse_vector_size)
+            .bind_vector_from_lane(I32, sse_vector_size);
         narrow.legalize(
             def!(y = splat_x16x8(x)),
             vec![
@@ -358,7 +361,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     // SIMD splat: 32-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
-        let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let splat_any32x4 = splat.bind_vector_from_lane(ty, sse_vector_size);
         narrow.legalize(
             def!(y = splat_any32x4(x)),
             vec![
@@ -370,7 +373,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     // SIMD splat: 64-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
-        let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let splat_any64x2 = splat.bind_vector_from_lane(ty, sse_vector_size);
         narrow.legalize(
             def!(y = splat_any64x2(x)),
             vec![

From 5fd8c91af9284626d37ba475f2bfc231c6f7b8cc Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Thu, 11 Jul 2019 16:10:42 -0700
Subject: [PATCH 3/4] Fix static analysis warnings

---
 cranelift-codegen/meta/src/cdsl/instructions.rs | 6 +++---
 cranelift-codegen/meta/src/isa/x86/recipes.rs   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cranelift-codegen/meta/src/cdsl/instructions.rs b/cranelift-codegen/meta/src/cdsl/instructions.rs
index 2a40d5128..f31e53077 100644
--- a/cranelift-codegen/meta/src/cdsl/instructions.rs
+++ b/cranelift-codegen/meta/src/cdsl/instructions.rs
@@ -79,7 +79,7 @@ impl InstructionGroup {
     pub fn by_name(&self, name: &'static str) -> &Instruction {
         self.instructions
             .iter()
-            .find(|inst| inst.name == name)
+            .find(|inst| &inst.name == name)
             .expect(&format!("unexisting instruction with name {}", name))
     }
 }
@@ -155,7 +155,7 @@ impl ops::Deref for Instruction {
 
 impl Instruction {
     pub fn snake_name(&self) -> &str {
-        if self.name == "return" {
+        if &self.name == "return" {
             "return_"
         } else {
             &self.name
@@ -800,7 +800,7 @@ impl InstructionPredicateNode {
                     ret.extend(node.collect_leaves());
                 }
             }
-            _ => ret.push(&self),
+            _ => ret.push(self),
         }
         ret
     }
diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs
index 623689cea..3e773bc34 100644
--- a/cranelift-codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs
@@ -53,7 +53,7 @@ impl<'builder> RecipeGroup<'builder> {
     pub fn recipe(&self, name: &str) -> &EncodingRecipe {
         self.recipes
             .iter()
-            .find(|recipe| recipe.name == name)
+            .find(|recipe| &recipe.name == name)
             .expect(&format!("unknown recipe name: {}. Try template?", name))
     }
     pub fn template(&self, name: &str) -> &Template {

From 8bd1ff4dc845478027eb228c126541bb0bd3a4c3 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 17 Jul 2019 09:45:58 -0700
Subject: [PATCH 4/4] Remove SSE2 setting for x86

When I talked with @sunfishcode, he preferred to avoid the confusion of additional ISA predicates by eliminating the SSE2 setting. SSE2 was introduced with the Pentium 4 in 2000, so it is unlikely that a current CPU would have SIMD enabled but lack this feature. I tried to mark the SSE2-specific instructions with comments in the code.
---
 .../meta/src/isa/x86/encodings.rs             | 71 +++++++++++++------
 .../meta/src/isa/x86/settings.rs              |  9 +--
 filetests/isa/x86/pshufb.clif                 |  2 +-
 filetests/isa/x86/pshufd.clif                 |  2 +-
 filetests/isa/x86/scalar_to_vector.clif       |  2 +-
 5 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 3b4be51a6..c5bbfe1a8 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -268,14 +268,38 @@ impl PerCpuModeEncodings {
     }
 
     /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened
-    fn enc_32_64_isap(
+    fn enc_32_64_maybe_isap(
         &mut self,
         inst: BoundInstruction,
         template: Template,
-        isap: SettingPredicateNumber,
+        isap: Option<SettingPredicateNumber>,
     ) {
-        self.enc32_isap(inst.clone(), template.clone(), isap);
-        self.enc64_isap(inst, template, isap);
+        self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
+        self.enc64_maybe_isap(inst, template, isap);
+    }
+
+    fn enc32_maybe_isap(
+        &mut self,
+        inst: BoundInstruction,
+        template: Template,
+        isap: Option<SettingPredicateNumber>,
+    ) {
+        match isap {
+            None => self.enc32(inst, template),
+            Some(isap) => self.enc32_isap(inst, template, isap),
+        }
+    }
+
+    fn enc64_maybe_isap(
+        &mut self,
+        inst: BoundInstruction,
+        template: Template,
+        isap: Option<SettingPredicateNumber>,
+    ) {
+        match isap {
+            None => self.enc64(inst, template),
+            Some(isap) => self.enc64_isap(inst, template, isap),
+        }
     }
 }
 
@@ -559,7 +583,6 @@ pub fn define(
     let use_popcnt = settings.predicate_by_name("use_popcnt");
     let use_lzcnt = settings.predicate_by_name("use_lzcnt");
     let use_bmi1 = settings.predicate_by_name("use_bmi1");
-    let use_sse2 = settings.predicate_by_name("use_sse2");
     let use_ssse3 = settings.predicate_by_name("use_ssse3");
     let use_sse41 = settings.predicate_by_name("use_sse41");
 
@@ -1648,8 +1671,8 @@ pub fn define(
         let template = rec_r_ib_unsigned_fpr
             .nonrex()
             .opcodes(vec![0x66, 0x0f, 0x70]);
-        e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
-        e.enc64_isap(instruction, template, use_sse2);
+        e.enc32(instruction.clone(), template.clone());
+        e.enc64(instruction, template);
     }
 
     // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
@@ -1662,47 +1685,49 @@ pub fn define(
         let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
         if ty.lane_bits() < 64 {
             // no 32-bit encodings for 64-bit widths
-            e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
+            e.enc32(instruction.clone(), template.clone());
         }
-        e.enc_x86_64_isap(instruction, template, use_sse2);
+        e.enc_x86_64(instruction, template);
     }
 
     // SIMD insertlane
-    let mut insertlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new();
-    insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], use_sse41)); // PINSRB
-    insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], use_sse2)); // PINSRW
-    insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRD
-    insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRQ, only x86_64
+    let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+        HashMap::new();
+    insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41))); // PINSRB
+    insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
+    insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41))); // PINSRD
+    insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41))); // PINSRQ, only x86_64
 
     for ty in ValueType::all_lane_types() {
         if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
             let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
-                e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
+                e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
             } else {
                 // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
-                e.enc64_isap(instruction, template.rex().w(), isap.clone());
+                e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
             }
         }
     }
 
     // SIMD extractlane
-    let mut extractlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new();
-    extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], use_sse41)); // PEXTRB
-    extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], use_sse2)); // PEXTRW, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
-    extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRD
-    extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRQ, only x86_64
+    let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+        HashMap::new();
+    extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB
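+    extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes; NOTE(review): 66 0F C5 /r appears to put the GPR in ModRM.reg and the XMM in ModRM.r/m, the reverse of what r_ib_unsigned_gpr emits -- confirm against the Intel SDM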
+    extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD
+    extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64
 
     for ty in ValueType::all_lane_types() {
         if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
             let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
-                e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
+                e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
             } else {
                 // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
-                e.enc64_isap(instruction, template.rex().w(), isap.clone());
+                e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
             }
         }
     }
diff --git a/cranelift-codegen/meta/src/isa/x86/settings.rs b/cranelift-codegen/meta/src/isa/x86/settings.rs
index bc8c81f48..3a4255338 100644
--- a/cranelift-codegen/meta/src/isa/x86/settings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/settings.rs
@@ -3,9 +3,6 @@ use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
 pub fn define(shared: &SettingGroup) -> SettingGroup {
     let mut settings = SettingGroupBuilder::new("x86");
 
-    // CPUID.01H:EDX
-    let has_sse2 = settings.add_bool("has_sse2", "SSE2: CPUID.01H:EDX.SSE2[bit 26]", false);
-
     // CPUID.01H:ECX
     let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
     let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);
@@ -35,7 +32,6 @@ pub fn define(shared: &SettingGroup) -> SettingGroup {
 
     let shared_enable_simd = shared.get_bool("enable_simd");
 
-    settings.add_predicate("use_sse2", predicate!(shared_enable_simd && has_sse2));
     settings.add_predicate("use_ssse3", predicate!(shared_enable_simd && has_ssse3));
     settings.add_predicate("use_sse41", predicate!(shared_enable_simd && has_sse41));
     settings.add_predicate(
@@ -69,7 +65,7 @@ pub fn define(shared: &SettingGroup) -> SettingGroup {
     settings.add_preset("baseline", preset!());
     let nehalem = settings.add_preset(
         "nehalem",
-        preset!(has_sse2 && has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt),
+        preset!(has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt),
     );
     let haswell = settings.add_preset(
         "haswell",
@@ -82,8 +78,7 @@ pub fn define(shared: &SettingGroup) -> SettingGroup {
     settings.add_preset(
         "znver1",
         preset!(
-            has_sse2
-                && has_sse3
+            has_sse3
                 && has_ssse3
                 && has_sse41
                 && has_sse42
diff --git a/filetests/isa/x86/pshufb.clif b/filetests/isa/x86/pshufb.clif
index 7c23c5ab6..6fb31b198 100644
--- a/filetests/isa/x86/pshufb.clif
+++ b/filetests/isa/x86/pshufb.clif
@@ -1,6 +1,6 @@
 test binemit
 set enable_simd
-target x86_64 has_sse2=true has_ssse3=true
+target x86_64 has_ssse3=true
 
 function %test_pshufb() {
 ebb0:
diff --git a/filetests/isa/x86/pshufd.clif b/filetests/isa/x86/pshufd.clif
index 183af4fc0..6f4896d0d 100644
--- a/filetests/isa/x86/pshufd.clif
+++ b/filetests/isa/x86/pshufd.clif
@@ -1,6 +1,6 @@
 test binemit
 set enable_simd
-target x86_64 has_sse2=true
+target x86_64
 
 function %test_pshuf() {
 ebb0:
diff --git a/filetests/isa/x86/scalar_to_vector.clif b/filetests/isa/x86/scalar_to_vector.clif
index 6c77dfafd..51ddea3e7 100644
--- a/filetests/isa/x86/scalar_to_vector.clif
+++ b/filetests/isa/x86/scalar_to_vector.clif
@@ -1,7 +1,7 @@
 test binemit
 set opt_level=best
 set enable_simd
-target x86_64 has_sse2=true
+target x86_64
 
 function %test_scalar_to_vector_b8() {
 ebb0: