diff --git a/cranelift-codegen/meta/src/cdsl/types.rs b/cranelift-codegen/meta/src/cdsl/types.rs
index f431bb3ed..92b9ab3a2 100644
--- a/cranelift-codegen/meta/src/cdsl/types.rs
+++ b/cranelift-codegen/meta/src/cdsl/types.rs
@@ -264,6 +264,33 @@ impl LaneType {
             ValueType::Vector(VectorType::new(*self, lanes.into()))
         }
     }
+
+    /// Return true if this lane type is the `FloatType` variant.
+    pub fn is_float(&self) -> bool {
+        if let LaneType::FloatType(_) = self {
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Return true if this lane type is the `IntType` variant.
+    pub fn is_int(&self) -> bool {
+        if let LaneType::IntType(_) = self {
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Return true if this lane type is the `BoolType` variant.
+    pub fn is_bool(&self) -> bool {
+        if let LaneType::BoolType(_) = self {
+            true
+        } else {
+            false
+        }
+    }
 }
 
 impl fmt::Display for LaneType {
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 2ae60896d..d773c2c66 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -4,8 +4,8 @@ use std::collections::HashMap;
 
 use crate::cdsl::encodings::{Encoding, EncodingBuilder};
 use crate::cdsl::instructions::{
-    BoundInstruction, InstSpec, Instruction, InstructionGroup, InstructionPredicate,
-    InstructionPredicateNode, InstructionPredicateRegistry,
+    InstSpec, Instruction, InstructionGroup, InstructionPredicate, InstructionPredicateNode,
+    InstructionPredicateRegistry,
 };
 use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
 use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
@@ -234,7 +234,7 @@ impl PerCpuModeEncodings {
     }
     fn enc_both_isap(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Clone + Into<InstSpec>,
         template: Template,
         isap: SettingPredicateNumber,
     ) {
@@ -243,7 +243,7 @@ impl PerCpuModeEncodings {
     }
     fn enc_both_instp(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Clone + Into<InstSpec>,
         template: Template,
         instp: InstructionPredicateNode,
     ) {
@@ -279,6 +279,17 @@ impl PerCpuModeEncodings {
         }
     }
 
+    /// Add `inst` with the same `recipe` and `bits` for both X86_32 and X86_64 (clones `inst` once).
+    fn enc_32_64_rec(
+        &mut self,
+        inst: impl Clone + Into<InstSpec>,
+        recipe: &EncodingRecipe,
+        bits: u16,
+    ) {
+        self.enc32_rec(inst.clone(), recipe, bits);
+        self.enc64_rec(inst, recipe, bits);
+    }
+
     /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
     /// binding) has already happened.
     fn enc_32_64_maybe_isap(
@@ -356,7 +367,6 @@ pub(crate) fn define(
     let copy_to_ssa = shared.by_name("copy_to_ssa");
     let ctz = shared.by_name("ctz");
     let debugtrap = shared.by_name("debugtrap");
-    let extractlane = shared.by_name("extractlane");
     let f32const = shared.by_name("f32const");
     let f64const = shared.by_name("f64const");
     let fadd = shared.by_name("fadd");
@@ -386,7 +396,6 @@ pub(crate) fn define(
     let ifcmp_sp = shared.by_name("ifcmp_sp");
     let imul = shared.by_name("imul");
     let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
-    let insertlane = shared.by_name("insertlane");
     let ireduce = shared.by_name("ireduce");
     let ishl = shared.by_name("ishl");
     let ishl_imm = shared.by_name("ishl_imm");
@@ -459,7 +468,12 @@ pub(crate) fn define(
     let x86_cvtt2si = x86.by_name("x86_cvtt2si");
     let x86_fmax = x86.by_name("x86_fmax");
     let x86_fmin = x86.by_name("x86_fmin");
+    let x86_insertps = x86.by_name("x86_insertps");
+    let x86_movlhps = x86.by_name("x86_movlhps");
+    let x86_movsd = x86.by_name("x86_movsd");
     let x86_pop = x86.by_name("x86_pop");
+    let x86_pextr = x86.by_name("x86_pextr");
+    let x86_pinsr = x86.by_name("x86_pinsr");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_push = x86.by_name("x86_push");
@@ -490,6 +504,7 @@ pub(crate) fn define(
     let rec_f64imm_z = r.template("f64imm_z");
     let rec_fa = r.template("fa");
     let rec_fax = r.template("fax");
+    let rec_fa_ib = r.template("fa_ib");
     let rec_fcmp = r.template("fcmp");
     let rec_fcscc = r.template("fcscc");
     let rec_ffillnull = r.recipe("ffillnull");
@@ -1729,16 +1744,17 @@ pub(crate) fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
-    // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
+    // SIMD vector size: eventually multiple vector sizes may be supported but for now only
+    // SSE-sized vectors are available.
     let sse_vector_size: u64 = 128;
 
     // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
     // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
-    // value across the register
+    // value across the register.
 
     let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
 
-    // PSHUFB, 8-bit shuffle using two XMM registers
+    // PSHUFB, 8-bit shuffle using two XMM registers.
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
         let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
@@ -1746,7 +1762,7 @@ pub(crate) fn define(
         e.enc64_isap(instruction, template, use_ssse3_simd);
     }
 
-    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
+    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate.
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
         let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_r_ib_unsigned_fpr
@@ -1761,64 +1777,84 @@ pub(crate) fn define(
     // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
         let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
-        let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
-        if ty.lane_bits() < 64 {
-            // no 32-bit encodings for 64-bit widths
-            e.enc32(instruction.clone(), template.clone());
+        if ty.is_float() {
+            e.enc_32_64_rec(instruction, rec_null_fpr, 0);
+        } else {
+            let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
+            if ty.lane_bits() < 64 {
+                // no 32-bit encodings for 64-bit widths
+                e.enc32(instruction.clone(), template.clone());
+            }
+            e.enc_x86_64(instruction, template);
         }
-        e.enc_x86_64(instruction, template);
     }
 
     // SIMD insertlane
-    let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+    let mut x86_pinsr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();
-    insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
-    insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
-    insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
-    insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
+    x86_pinsr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
+    x86_pinsr_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
+    x86_pinsr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
+    x86_pinsr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
 
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
-        if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
-            let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
+        if let Some((opcode, isap)) = x86_pinsr_mapping.get(&ty.lane_bits()) {
+            let instruction = x86_pinsr.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
             } else {
-                // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
+                // It turns out the 64-bit widths have REX/W encodings and only are available on
+                // x86_64.
                 e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
             }
         }
     }
 
+    // For legalizing insertlane with floats, INSERTPS from SSE4.1.
+    {
+        let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size);
+        let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]);
+        e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd));
+    }
+
+    // For legalizing insertlane with floats, MOVSD from SSE2.
+    {
+        let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2
+    }
+
+    // For legalizing insertlane with floats, MOVLHPS from SSE.
+    {
+        let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
+    }
+
     // SIMD extractlane
-    let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+    let mut x86_pextr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();
-    extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
-    extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
-    extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
-    extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
+    x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
+    x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
+    x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
+    x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
 
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
-        if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
-            let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
+        if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) {
+            let instruction = x86_pextr.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
             } else {
-                // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
+                // It turns out the 64-bit widths have REX/W encodings and only are available on
+                // x86_64.
                 e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
             }
         }
     }
 
-    // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
-    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
-        e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
-        e.enc64_rec(instruction, rec_null_fpr, 0);
-    }
-
-    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
+    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
         for to_type in
             ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
@@ -1826,8 +1862,28 @@ pub(crate) fn define(
             let instruction = raw_bitcast
                 .bind_vector_from_lane(to_type, sse_vector_size)
                 .bind_vector_from_lane(from_type, sse_vector_size);
-            e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
-            e.enc64_rec(instruction, rec_null_fpr, 0);
+            e.enc_32_64_rec(instruction, rec_null_fpr, 0);
+        }
+    }
+
+    // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an
+    // XMM register.
+    for float_type in &[F32, F64] {
+        for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
+            e.enc_32_64_rec(
+                raw_bitcast
+                    .bind_vector_from_lane(lane_type, sse_vector_size)
+                    .bind(*float_type),
+                rec_null_fpr,
+                0,
+            );
+            e.enc_32_64_rec(
+                raw_bitcast
+                    .bind(*float_type)
+                    .bind_vector_from_lane(lane_type, sse_vector_size),
+                rec_null_fpr,
+                0,
+            );
         }
     }
 
diff --git a/cranelift-codegen/meta/src/isa/x86/instructions.rs b/cranelift-codegen/meta/src/isa/x86/instructions.rs
index 03730cdea..b9f2496a8 100644
--- a/cranelift-codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift-codegen/meta/src/isa/x86/instructions.rs
@@ -291,5 +291,101 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let Idx = &operand_doc("Idx", uimm8, "Lane index");
+    let x = &operand("x", TxN);
+    let a = &operand("a", &TxN.lane_of());
+
+    ig.push(
+        Inst::new(
+            "x86_pextr",
+            r#"
+        Extract lane ``Idx`` from ``x``.
+        The lane index, ``Idx``, is an immediate value, not an SSA value. It
+        must indicate a valid lane index for the type of ``x``.
+        "#,
+        )
+        .operands_in(vec![x, Idx])
+        .operands_out(vec![a]),
+    );
+
+    let IBxN = &TypeVar::new(
+        "IBxN",
+        "A SIMD vector type containing only booleans and integers",
+        TypeSetBuilder::new()
+            .ints(Interval::All)
+            .bools(Interval::All)
+            .simd_lanes(Interval::All)
+            .includes_scalars(false)
+            .build(),
+    );
+    let x = &operand("x", IBxN);
+    let y = &operand_doc("y", &IBxN.lane_of(), "New lane value");
+    let a = &operand("a", IBxN);
+
+    ig.push(
+        Inst::new(
+            "x86_pinsr",
+            r#"
+        Insert ``y`` into ``x`` at lane ``Idx``.
+        The lane index, ``Idx``, is an immediate value, not an SSA value. It
+        must indicate a valid lane index for the type of ``x``.
+        "#,
+        )
+        .operands_in(vec![x, Idx, y])
+        .operands_out(vec![a]),
+    );
+
+    let FxN = &TypeVar::new(
+        "FxN",
+        "A SIMD vector type containing floats",
+        TypeSetBuilder::new()
+            .floats(Interval::All)
+            .simd_lanes(Interval::All)
+            .includes_scalars(false)
+            .build(),
+    );
+    let x = &operand("x", FxN);
+    let y = &operand_doc("y", &FxN.lane_of(), "New lane value");
+    let a = &operand("a", FxN);
+
+    ig.push(
+        Inst::new(
+            "x86_insertps",
+            r#"
+        Insert a lane of ``y`` into ``x`` at using ``Idx`` to encode both which lane the value is 
+        extracted from and which it is inserted to. This is similar to x86_pinsr but inserts 
+        floats, which are already stored in an XMM register.
+        "#,
+        )
+        .operands_in(vec![x, Idx, y])
+        .operands_out(vec![a]),
+    );
+
+    let x = &operand("x", FxN);
+    let y = &operand("y", FxN);
+    let a = &operand("a", FxN);
+
+    ig.push(
+        Inst::new(
+            "x86_movsd",
+            r#"
+        Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x``
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "x86_movlhps",
+            r#"
+        Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x``
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     ig.build()
 }
diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs
index 2fd160de3..555a93f9c 100644
--- a/cranelift-codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs
@@ -20,10 +20,10 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     // List of instructions.
     let insts = &shared.instructions;
     let band = insts.by_name("band");
-    let bitcast = insts.by_name("bitcast");
     let bor = insts.by_name("bor");
     let clz = insts.by_name("clz");
     let ctz = insts.by_name("ctz");
+    let extractlane = insts.by_name("extractlane");
     let f64const = insts.by_name("f64const");
     let fcmp = insts.by_name("fcmp");
     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
@@ -321,7 +321,9 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     // SIMD splat: 8-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
         let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size);
-        let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
+        let bitcast_f64_to_any8x16 = raw_bitcast
+            .bind_vector_from_lane(ty, sse_vector_size)
+            .bind(F64);
         narrow.legalize(
             def!(y = splat_any8x16(x)),
             vec![
@@ -378,5 +380,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
         );
     }
 
+    narrow.custom_legalize(extractlane, "convert_extractlane");
+    narrow.custom_legalize(insertlane, "convert_insertlane");
+
     narrow.build_and_add_to(&mut shared.transform_groups);
 }
diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs
index 3f14769de..8176effc4 100644
--- a/cranelift-codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs
@@ -566,6 +566,27 @@ pub(crate) fn define<'shared>(
             ),
     );
 
+    // XX /r with FPR ins and outs. A form with a byte immediate.
+    {
+        let format = formats.get(f_insert_lane);
+        recipes.add_template_recipe(
+            EncodingRecipeBuilder::new("fa_ib", f_insert_lane, 2)
+                .operands_in(vec![fpr, fpr])
+                .operands_out(vec![0])
+                .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+                    format, "lane", 8, 0,
+                ))
+                .emit(
+                    r#"
+                    {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+                    modrm_rr(in_reg1, in_reg0, sink);
+                    let imm:i64 = lane.into();
+                    sink.put1(imm as u8);
+                "#,
+                ),
+        );
+    }
+
     // XX /n for a unary operation with extension bits.
     recipes.add_template_recipe(
         EncodingRecipeBuilder::new("ur", f_unary, 1)
diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs
index 843347ce9..3ebebfe18 100644
--- a/cranelift-codegen/meta/src/shared/instructions.rs
+++ b/cranelift-codegen/meta/src/shared/instructions.rs
@@ -1537,7 +1537,9 @@ pub(crate) fn define(
         Extract lane ``Idx`` from ``x``.
 
         The lane index, ``Idx``, is an immediate value, not an SSA value. It
-        must indicate a valid lane index for the type of ``x``.
+        must indicate a valid lane index for the type of ``x``. Note that the upper bits of ``a``
+        may or may not be zeroed depending on the ISA but the type system should prevent using 
+        ``a`` as anything other than the extracted value.
         "#,
         )
         .operands_in(vec![x, Idx])
@@ -2782,9 +2784,11 @@ pub(crate) fn define(
         Inst::new(
             "scalar_to_vector",
             r#"
-    Scalar To Vector -- move a value out of a scalar register and into a vector
-    register; the scalar will be moved to the lowest-order bits of the vector
-    register and any higher bits will be zeroed.
+    Scalar To Vector -- move a value out of a scalar register and into a vector register; the 
+    scalar will be moved to the lowest-order bits of the vector register. Note that this 
+    instruction is intended as a low-level legalization instruction and frontends should prefer 
+    insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
+    types (e.g. integers) but not for others (e.g. floats).
     "#,
         )
         .operands_in(vec![s])
diff --git a/cranelift-codegen/src/context.rs b/cranelift-codegen/src/context.rs
index 704d89177..5350b3444 100644
--- a/cranelift-codegen/src/context.rs
+++ b/cranelift-codegen/src/context.rs
@@ -33,6 +33,7 @@ use crate::timing;
 use crate::unreachable_code::eliminate_unreachable_code;
 use crate::value_label::{build_value_labels_ranges, ComparableSourceLoc, ValueLabelsRanges};
 use crate::verifier::{verify_context, verify_locations, VerifierErrors, VerifierResult};
+use log::debug;
 use std::vec::Vec;
 
 /// Persistent data structures and compilation pipeline.
@@ -129,6 +130,7 @@ impl Context {
     pub fn compile(&mut self, isa: &dyn TargetIsa) -> CodegenResult<CodeInfo> {
         let _tt = timing::compile();
         self.verify_if(isa)?;
+        debug!("Compiling:\n{}", self.func.display(isa));
 
         self.compute_cfg();
         if isa.flags().opt_level() != OptLevel::Fastest {
@@ -158,7 +160,10 @@ impl Context {
             self.redundant_reload_remover(isa)?;
             self.shrink_instructions(isa)?;
         }
-        self.relax_branches(isa)
+        let result = self.relax_branches(isa);
+
+        debug!("Compiled:\n{}", self.func.display(isa));
+        result
     }
 
     /// Emit machine code directly into raw memory.
@@ -256,6 +261,7 @@ impl Context {
         self.domtree.clear();
         self.loop_analysis.clear();
         legalize_function(&mut self.func, &mut self.cfg, isa);
+        debug!("Legalized:\n{}", self.func.display(isa));
         self.verify_if(isa)
     }
 
diff --git a/cranelift-codegen/src/isa/x86/enc_tables.rs b/cranelift-codegen/src/isa/x86/enc_tables.rs
index e0fc05178..f67d7f0b6 100644
--- a/cranelift-codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift-codegen/src/isa/x86/enc_tables.rs
@@ -5,6 +5,7 @@ use crate::bitset::BitSet;
 use crate::cursor::{Cursor, FuncCursor};
 use crate::flowgraph::ControlFlowGraph;
 use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::types::*;
 use crate::ir::{self, Function, Inst, InstBuilder};
 use crate::isa::constraints::*;
 use crate::isa::enc_tables::*;
@@ -893,3 +894,121 @@ fn expand_fcvt_to_uint_sat(
     cfg.recompute_ebb(pos.func, uint_large_ebb);
     cfg.recompute_ebb(pos.func, done);
 }
+
+/// Custom legalization of CLIF `extractlane`: float lanes use an optional `x86_pshufd` and a
+/// `raw_bitcast` (no `x86_pextr`); every other lane type is rewritten to `x86_pextr`.
+fn convert_extractlane(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::ExtractLane {
+        opcode: ir::Opcode::Extractlane,
+        arg,
+        lane,
+    } = pos.func.dfg[inst]
+    {
+        // NOTE: the upper bits of the XMM register are never zeroed below; this legalization
+        // assumes that extractlane does not require them to be cleared.
+        let value_type = pos.func.dfg.value_type(arg);
+        if value_type.lane_type().is_float() {
+            // Floats are already in XMM registers and can stay there.
+            let shuffled = if lane != 0 {
+                // Use PSHUFD to bring the requested lane down to lane 0.
+                match value_type {
+                    F32X4 => {
+                        // The mask's low two bits route the chosen lane into lane 0.
+                        let shuffle_mask: u8 = 0b00_00_00_00 | lane;
+                        pos.ins().x86_pshufd(arg, shuffle_mask)
+                    }
+                    F64X2 => {
+                        assert_eq!(lane, 1);
+                        // Lane 1: the mask picks dwords [2, 3, 2, 3], copying the upper 64 bits
+                        // into the lower 64 bits and leaving the upper 64 bits unchanged.
+                        let shuffle_mask = 0b11_10_11_10;
+                        let bitcast = pos.ins().raw_bitcast(F32X4, arg);
+                        pos.ins().x86_pshufd(bitcast, shuffle_mask)
+                    }
+                    _ => unreachable!(),
+                }
+            } else {
+                // Lane 0 needs no shuffle; the vector value is used as-is.
+                arg
+            };
+            // Replace `inst` with a raw_bitcast of the vector to the scalar lane type.
+            pos.func
+                .dfg
+                .replace(inst)
+                .raw_bitcast(value_type.lane_type(), shuffled);
+        } else {
+            // For non-floats, lower with the usual PEXTR* instruction.
+            pos.func.dfg.replace(inst).x86_pextr(arg, lane);
+        }
+    }
+}
+
+/// Custom legalization of CLIF `insertlane`: float lanes use `x86_insertps` (F32X4) or
+/// `x86_movsd`/`x86_movlhps` (F64X2); every other lane type is rewritten to `x86_pinsr`.
+fn convert_insertlane(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::InsertLane {
+        opcode: ir::Opcode::Insertlane,
+        args: [vector, replacement],
+        lane,
+    } = pos.func.dfg[inst]
+    {
+        let value_type = pos.func.dfg.value_type(vector);
+        if value_type.lane_type().is_float() {
+            // Floats are already in XMM registers and can stay there.
+            match value_type {
+                F32X4 => {
+                    assert!(lane <= 3); // lane 0 is a valid destination as well
+                    let immediate = 0b00_00_00_00 | lane << 4;
+                    // INSERTPS imm8: bits 7:6 = source lane of replacement (0), bits 5:4 =
+                    // destination lane in vector (lane), bits 3:0 = zero mask (unused).
+                    pos.func
+                        .dfg
+                        .replace(inst)
+                        .x86_insertps(vector, immediate, replacement)
+                }
+                F64X2 => {
+                    let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types
+                    if lane == 0 {
+                        // Move the lowest quadword in replacement to vector without changing
+                        // the upper bits.
+                        pos.func
+                            .dfg
+                            .replace(inst)
+                            .x86_movsd(vector, replacement_as_vector)
+                    } else {
+                        assert_eq!(lane, 1);
+                        // Move the low 64 bits of replacement vector to the high 64 bits of the
+                        // vector.
+                        pos.func
+                            .dfg
+                            .replace(inst)
+                            .x86_movlhps(vector, replacement_as_vector)
+                    }
+                }
+                _ => unreachable!(),
+            };
+        } else {
+            // For non-floats, lower with the usual PINSR* instruction.
+            pos.func
+                .dfg
+                .replace(inst)
+                .x86_pinsr(vector, lane, replacement);
+        }
+    }
+}
diff --git a/cranelift-codegen/src/verifier/locations.rs b/cranelift-codegen/src/verifier/locations.rs
index bf1a4e186..cf17ae13d 100644
--- a/cranelift-codegen/src/verifier/locations.rs
+++ b/cranelift-codegen/src/verifier/locations.rs
@@ -107,8 +107,10 @@ impl<'a> LocationVerifier<'a> {
         fatal!(
             errors,
             inst,
-            "{} constraints not satisfied",
-            self.encinfo.display(enc)
+            "{} constraints not satisfied in: {}\n{}",
+            self.encinfo.display(enc),
+            self.func.dfg.display_inst(inst, self.isa),
+            self.func.display(self.isa)
         )
     }
 
diff --git a/cranelift-native/src/lib.rs b/cranelift-native/src/lib.rs
index 0687e7017..cfa27cef8 100644
--- a/cranelift-native/src/lib.rs
+++ b/cranelift-native/src/lib.rs
@@ -59,6 +59,9 @@ fn parse_x86_cpuid(isa_builder: &mut isa::Builder) -> Result<(), &'static str> {
         if info.has_sse3() {
             isa_builder.enable("has_sse3").unwrap();
         }
+        if info.has_ssse3() {
+            isa_builder.enable("has_ssse3").unwrap();
+        }
         if info.has_sse41() {
             isa_builder.enable("has_sse41").unwrap();
         }
diff --git a/cranelift-wasm/src/code_translator.rs b/cranelift-wasm/src/code_translator.rs
index 1801c6539..e5ce5a0c3 100644
--- a/cranelift-wasm/src/code_translator.rs
+++ b/cranelift-wasm/src/code_translator.rs
@@ -940,6 +940,16 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let splatted = builder.ins().splat(ty, value_to_splat);
             state.push1(splatted)
         }
+        Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
+            let vector = optionally_bitcast_vector(state.pop1(), type_of(op), builder);
+            let extracted = builder.ins().extractlane(vector, lane.clone());
+            state.push1(builder.ins().sextend(I32, extracted))
+        }
+        Operator::I8x16ExtractLaneU { lane } | Operator::I16x8ExtractLaneU { lane } => {
+            let vector = optionally_bitcast_vector(state.pop1(), type_of(op), builder);
+            state.push1(builder.ins().extractlane(vector, lane.clone()));
+            // on x86, PEXTRB zeroes the upper bits of the destination register of extractlane so uextend is elided; of course, this depends on extractlane being legalized to a PEXTRB
+        }
         Operator::I32x4ExtractLane { lane }
         | Operator::I64x2ExtractLane { lane }
         | Operator::F32x4ExtractLane { lane }
@@ -967,10 +977,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         }
         Operator::V128Load { .. }
         | Operator::V128Store { .. }
-        | Operator::I8x16ExtractLaneS { .. }
-        | Operator::I8x16ExtractLaneU { .. }
-        | Operator::I16x8ExtractLaneS { .. }
-        | Operator::I16x8ExtractLaneU { .. }
         | Operator::V8x16Shuffle { .. }
         | Operator::I8x16Eq
         | Operator::I8x16Ne
diff --git a/filetests/isa/x86/extractlane-binemit.clif b/filetests/isa/x86/extractlane-binemit.clif
new file mode 100644
index 000000000..0a3b776a9
--- /dev/null
+++ b/filetests/isa/x86/extractlane-binemit.clif
@@ -0,0 +1,38 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+; extractlane is legalized differently for floats than for integers and booleans: integers and booleans
+; use x86_pextr, which is written directly into the IR below so that its encoding can be binemit-tested
+
+function %test_extractlane_b8() {
+ebb0:
+[-, %rax]   v0 = bconst.b8 true
+[-, %xmm0]  v1 = splat.b8x16 v0
+[-, %rax]   v2 = x86_pextr v1, 10    ; bin: 66 0f 3a 14 c0 0a
+            return
+}
+
+function %test_extractlane_i16() {
+ebb0:
+[-, %rax]   v0 = iconst.i16 4
+[-, %xmm1]  v1 = splat.i16x8 v0
+[-, %rax]   v2 = x86_pextr v1, 4    ; bin: 66 0f c5 c8 04
+            return
+}
+
+function %test_extractlane_i32() {
+ebb0:
+[-, %rax]   v0 = iconst.i32 42
+[-, %xmm4]  v1 = splat.i32x4 v0
+[-, %rcx]   v2 = x86_pextr v1, 2    ; bin: 66 0f 3a 16 e1 02
+            return
+}
+
+function %test_extractlane_b64() {
+ebb0:
+[-, %rax]   v0 = bconst.b64 false
+[-, %xmm2]  v1 = splat.b64x2 v0
+[-, %rbx]   v2 = x86_pextr v1, 1    ; bin: 66 48 0f 3a 16 d3 01
+            return
+}
diff --git a/filetests/isa/x86/extractlane-run.clif b/filetests/isa/x86/extractlane-run.clif
new file mode 100644
index 000000000..adb2e7b8e
--- /dev/null
+++ b/filetests/isa/x86/extractlane-run.clif
@@ -0,0 +1,68 @@
+test run
+set enable_simd
+
+function %test_extractlane_b8() -> b8 {
+ebb0:
+    v1 = vconst.b8x16 [false false false false false false false false false false true false false
+    false false false]
+    v2 = extractlane v1, 10
+    return v2
+}
+; run
+
+function %test_extractlane_i16() -> b1 {
+ebb0:
+    v0 = vconst.i16x8 0x00080007000600050004000300020001
+    v1 = extractlane v0, 1
+    v2 = icmp_imm eq v1, 2
+    return v2
+}
+; run
+
+function %test_extractlane_f32() -> b1 {
+ebb0:
+    v0 = f32const 0x42.42
+    v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42]
+    v2 = extractlane v1, 3
+    v3 = fcmp eq v2, v0
+    return v3
+}
+; run
+
+function %test_extractlane_i32_with_vector_reuse() -> b1 {
+ebb0:
+    v0 = iconst.i32 42
+    v1 = iconst.i32 99
+
+    v2 = splat.i32x4 v0
+    v3 = insertlane v2, 2, v1
+
+    v4 = extractlane v3, 3
+    v5 = icmp eq v4, v0
+
+    v6 = extractlane v3, 2
+    v7 = icmp eq v6, v1
+
+    v8 = band v5, v7
+    return v8
+}
+; run
+
+function %test_extractlane_f32_with_vector_reuse() -> b1 {
+ebb0:
+    v0 = f32const 0x42.42
+    v1 = f32const 0x99.99
+
+    v2 = splat.f32x4 v0
+    v3 = insertlane v2, 2, v1
+
+    v4 = extractlane v3, 3
+    v5 = fcmp eq v4, v0
+
+    v6 = extractlane v3, 2
+    v7 = fcmp eq v6, v1
+
+    v8 = band v5, v7
+    return v8
+}
+; run
diff --git a/filetests/isa/x86/extractlane.clif b/filetests/isa/x86/extractlane.clif
deleted file mode 100644
index e7a1ea898..000000000
--- a/filetests/isa/x86/extractlane.clif
+++ /dev/null
@@ -1,35 +0,0 @@
-test binemit
-set enable_simd
-target x86_64 haswell
-
-function %test_extractlane_b8() {
-ebb0:
-[-, %rax]   v0 = bconst.b8 true
-[-, %xmm0]  v1 = splat.b8x16 v0
-[-, %rax]   v2 = extractlane v1, 10    ; bin: 66 0f 3a 14 c0 0a
-            return
-}
-
-function %test_extractlane_i16() {
-ebb0:
-[-, %rax]   v0 = iconst.i16 4
-[-, %xmm1]  v1 = splat.i16x8 v0
-[-, %rax]   v2 = extractlane v1, 4    ; bin: 66 0f c5 c8 04
-            return
-}
-
-function %test_extractlane_i32() {
-ebb0:
-[-, %rax]   v0 = iconst.i32 42
-[-, %xmm4]  v1 = splat.i32x4 v0
-[-, %rcx]   v2 = extractlane v1, 2    ; bin: 66 0f 3a 16 e1 02
-            return
-}
-
-function %test_extractlane_f64() {
-ebb0:
-[-, %rax]   v0 = f64const 0x0.0
-[-, %xmm2]  v1 = splat.f64x2 v0
-[-, %rbx]   v2 = extractlane v1, 1    ; bin: 66 48 0f 3a 16 d3 01
-            return
-}
diff --git a/filetests/isa/x86/insertlane-binemit.clif b/filetests/isa/x86/insertlane-binemit.clif
new file mode 100644
index 000000000..c388ed6fa
--- /dev/null
+++ b/filetests/isa/x86/insertlane-binemit.clif
@@ -0,0 +1,42 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+; insertlane is legalized differently for floats than for integers and booleans: integers and booleans
+; use x86_pinsr, which is written directly into the IR below so that its encoding can be binemit-tested
+
+function %test_insertlane_b8() {
+ebb0:
+[-, %rax]   v0 = bconst.b8 true
+[-, %rbx]   v1 = bconst.b8 false
+[-, %xmm0]  v2 = splat.b8x16 v0
+[-, %xmm0]  v3 = x86_pinsr v2, 10, v1    ; bin: 66 0f 3a 20 c3 0a
+            return
+}
+
+function %test_insertlane_i16() {
+ebb0:
+[-, %rax]   v0 = iconst.i16 4
+[-, %rbx]   v1 = iconst.i16 5
+[-, %xmm1]  v2 = splat.i16x8 v0
+[-, %xmm1]  v3 = x86_pinsr v2, 4, v1    ; bin: 66 0f c4 cb 04
+            return
+}
+
+function %test_insertlane_i32() {
+ebb0:
+[-, %rax]   v0 = iconst.i32 42
+[-, %rbx]   v1 = iconst.i32 99
+[-, %xmm4]  v2 = splat.i32x4 v0
+[-, %xmm4]  v3 = x86_pinsr v2, 2, v1    ; bin: 66 0f 3a 22 e3 02
+            return
+}
+
+function %test_insertlane_b64() {
+ebb0:
+[-, %rax]   v0 = bconst.b64 true
+[-, %rbx]   v1 = bconst.b64 false
+[-, %xmm2]  v2 = splat.b64x2 v0
+[-, %xmm2]  v3 = x86_pinsr v2, 1, v1    ; bin: 66 48 0f 3a 22 d3 01
+            return
+}
diff --git a/filetests/isa/x86/insertlane-run.clif b/filetests/isa/x86/insertlane-run.clif
new file mode 100644
index 000000000..92fb38202
--- /dev/null
+++ b/filetests/isa/x86/insertlane-run.clif
@@ -0,0 +1,48 @@
+test run
+set enable_simd
+
+; TODO: once SIMD vector comparison is implemented, compare whole vectors instead of reading lanes back with extractlane
+
+function %test_insertlane_b8() -> b8 {
+ebb0:
+    v1 = bconst.b8 true
+    v2 = vconst.b8x16 [false false false false false false false false false false false false false
+     false false false]
+    v3 = insertlane v2, 10, v1
+    v4 = extractlane v3, 10
+    return v4
+}
+; run
+
+function %test_insertlane_f32() -> b1 {
+ebb0:
+    v0 = f32const 0x42.42
+    v1 = vconst.f32x4 0x00
+    v2 = insertlane v1, 1, v0
+    v3 = extractlane v2, 1
+    v4 = fcmp eq v3, v0
+    return v4
+}
+; run
+
+function %test_insertlane_f64_lane1() -> b1 {
+ebb0:
+    v0 = f64const 0x42.42
+    v1 = vconst.f64x2 0x00
+    v2 = insertlane v1, 1, v0
+    v3 = extractlane v2, 1
+    v4 = fcmp eq v3, v0
+    return v4
+}
+; run
+
+function %test_insertlane_f64_lane0() -> b1 {
+ebb0:
+    v0 = f64const 0x42.42
+    v1 = vconst.f64x2 0x00
+    v2 = insertlane v1, 0, v0
+    v3 = extractlane v2, 0
+    v4 = fcmp eq v3, v0
+    return v4
+}
+; run
diff --git a/filetests/isa/x86/insertlane.clif b/filetests/isa/x86/insertlane.clif
deleted file mode 100644
index c55dc4033..000000000
--- a/filetests/isa/x86/insertlane.clif
+++ /dev/null
@@ -1,39 +0,0 @@
-test binemit
-set enable_simd
-target x86_64 haswell
-
-function %test_insertlane_b8() {
-ebb0:
-[-, %rax]   v0 = bconst.b8 true
-[-, %rbx]   v1 = bconst.b8 false
-[-, %xmm0]  v2 = splat.b8x16 v0
-[-, %xmm0]  v3 = insertlane v2, 10, v1    ; bin: 66 0f 3a 20 c3 0a
-            return
-}
-
-function %test_insertlane_i16() {
-ebb0:
-[-, %rax]   v0 = iconst.i16 4
-[-, %rbx]   v1 = iconst.i16 5
-[-, %xmm1]  v2 = splat.i16x8 v0
-[-, %xmm1]  v3 = insertlane v2, 4, v1    ; bin: 66 0f c4 cb 04
-            return
-}
-
-function %test_insertlane_i32() {
-ebb0:
-[-, %rax]   v0 = iconst.i32 42
-[-, %rbx]   v1 = iconst.i32 99
-[-, %xmm4]  v2 = splat.i32x4 v0
-[-, %xmm4]  v3 = insertlane v2, 2, v1    ; bin: 66 0f 3a 22 e3 02
-            return
-}
-
-function %test_insertlane_f64() {
-ebb0:
-[-, %rax]   v0 = f64const 0x0.0
-[-, %rbx]   v1 = f64const 0x4.2
-[-, %xmm2]  v2 = splat.f64x2 v0
-[-, %xmm2]  v3 = insertlane v2, 1, v1    ; bin: 66 48 0f 3a 22 d3 01
-            return
-}
diff --git a/filetests/isa/x86/legalize-splat.clif b/filetests/isa/x86/legalize-splat.clif
index fa07f80c1..19d61d529 100644
--- a/filetests/isa/x86/legalize-splat.clif
+++ b/filetests/isa/x86/legalize-splat.clif
@@ -33,7 +33,7 @@ ebb0:
 ; check:   ebb0:
 ; nextln:     v0 = iconst.i64 42
 ; nextln:     v2 = scalar_to_vector.i64x2 v0
-; nextln:     v1 = insertlane v2, 1, v0
+; nextln:     v1 = x86_pinsr v2, 1, v0
 ; nextln:     return v1
 
 
@@ -48,7 +48,7 @@ ebb0:
 ; check:   ebb0:
 ; nextln:     v0 = bconst.b16 true
 ; nextln:     v2 = scalar_to_vector.b16x8 v0
-; nextln:     v3 = insertlane v2, 1, v0
+; nextln:     v3 = x86_pinsr v2, 1, v0
 ; nextln:     v4 = raw_bitcast.i32x4 v3
 ; nextln:     v5 = x86_pshufd v4, 0
 ; nextln:     v1 = raw_bitcast.b16x8 v5
@@ -68,6 +68,6 @@ ebb0:
 ; nextln:     v0 = ireduce.i8 v2
 ; nextln:     v3 = scalar_to_vector.i8x16 v0
 ; nextln:     v4 = f64const 0.0
-; nextln:     v5 = bitcast.i8x16 v4
+; nextln:     v5 = raw_bitcast.i8x16 v4
 ; nextln:     v1 = x86_pshufb v3, v5
 ; nextln:     return v1
diff --git a/filetests/isa/x86/scalar_to_vector.clif b/filetests/isa/x86/scalar_to_vector-binemit.clif
similarity index 80%
rename from filetests/isa/x86/scalar_to_vector.clif
rename to filetests/isa/x86/scalar_to_vector-binemit.clif
index 51ddea3e7..b26f3d2e6 100644
--- a/filetests/isa/x86/scalar_to_vector.clif
+++ b/filetests/isa/x86/scalar_to_vector-binemit.clif
@@ -17,10 +17,10 @@ ebb0:
             return
 }
 
-function %test_scalar_to_vector_f32() {
+function %test_scalar_to_vector_b32() {
 ebb0:
-[-, %rcx]   v0 = f32const 0x0.42
-[-, %xmm3]  v1 = scalar_to_vector.f32x4 v0    ; bin: 66 0f 6e d9
+[-, %rcx]   v0 = bconst.b32 false
+[-, %xmm3]  v1 = scalar_to_vector.b32x4 v0    ; bin: 66 0f 6e d9
             return
 }
 
diff --git a/filetests/isa/x86/scalar_to_vector-compile.clif b/filetests/isa/x86/scalar_to_vector-compile.clif
new file mode 100644
index 000000000..2d2ab331f
--- /dev/null
+++ b/filetests/isa/x86/scalar_to_vector-compile.clif
@@ -0,0 +1,19 @@
+test compile
+set opt_level=best
+set probestack_enabled=false
+set enable_simd
+target x86_64
+
+; ensure that scalar_to_vector emits no instructions for floats (the scalar is already in an XMM register)
+function %test_scalar_to_vector_f32() -> f32x4 baldrdash_system_v {
+ebb0:
+    v0 = f32const 0x0.42
+    v1 = scalar_to_vector.f32x4 v0
+    return v1
+}
+
+; check: ebb0
+; nextln: v2 = iconst.i32 0x3e84_0000
+; nextln: v0 = bitcast.f32 v2
+; nextln: [null_fpr#00,%xmm0]                 v1 = scalar_to_vector.f32x4 v0
+; nextln: return v1