From 4e33dc292f5684a9a42bd0127d2882cd39320113 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 21 Aug 2019 14:07:50 -0700
Subject: [PATCH 1/8] Log compiled and legalized functions

---
 cranelift-codegen/src/context.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/cranelift-codegen/src/context.rs b/cranelift-codegen/src/context.rs
index 704d89177..5350b3444 100644
--- a/cranelift-codegen/src/context.rs
+++ b/cranelift-codegen/src/context.rs
@@ -33,6 +33,7 @@ use crate::timing;
 use crate::unreachable_code::eliminate_unreachable_code;
 use crate::value_label::{build_value_labels_ranges, ComparableSourceLoc, ValueLabelsRanges};
 use crate::verifier::{verify_context, verify_locations, VerifierErrors, VerifierResult};
+use log::debug;
 use std::vec::Vec;
 
 /// Persistent data structures and compilation pipeline.
@@ -129,6 +130,7 @@ impl Context {
     pub fn compile(&mut self, isa: &dyn TargetIsa) -> CodegenResult<CodeInfo> {
         let _tt = timing::compile();
         self.verify_if(isa)?;
+        debug!("Compiling:\n{}", self.func.display(isa));
 
         self.compute_cfg();
         if isa.flags().opt_level() != OptLevel::Fastest {
@@ -158,7 +160,10 @@ impl Context {
             self.redundant_reload_remover(isa)?;
             self.shrink_instructions(isa)?;
         }
-        self.relax_branches(isa)
+        let result = self.relax_branches(isa);
+
+        debug!("Compiled:\n{}", self.func.display(isa));
+        result
     }
 
     /// Emit machine code directly into raw memory.
@@ -256,6 +261,7 @@ impl Context {
         self.domtree.clear();
         self.loop_analysis.clear();
         legalize_function(&mut self.func, &mut self.cfg, isa);
+        debug!("Legalized:\n{}", self.func.display(isa));
         self.verify_if(isa)
     }
 

From 80af95df5ab0691d2a2ecf1e6beac2c374abfc8b Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 21 Aug 2019 14:59:17 -0700
Subject: [PATCH 2/8] Enable SSSE3 setting when detected on CPU

---
 cranelift-native/src/lib.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cranelift-native/src/lib.rs b/cranelift-native/src/lib.rs
index 0687e7017..cfa27cef8 100644
--- a/cranelift-native/src/lib.rs
+++ b/cranelift-native/src/lib.rs
@@ -59,6 +59,9 @@ fn parse_x86_cpuid(isa_builder: &mut isa::Builder) -> Result<(), &'static str> {
         if info.has_sse3() {
             isa_builder.enable("has_sse3").unwrap();
         }
+        if info.has_ssse3() {
+            isa_builder.enable("has_ssse3").unwrap();
+        }
         if info.has_sse41() {
             isa_builder.enable("has_sse41").unwrap();
         }

From 9c24a3568160e25e8788a37a1d2352e90d91f063 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 21 Aug 2019 15:15:27 -0700
Subject: [PATCH 3/8] Use raw_bitcast when legalizing splat

raw_bitcast matches the intent of this legalization more clearly (to simply change the CLIF type without changing any bits) and the additional null encodings added are necessary for later instructions
---
 .../meta/src/isa/x86/encodings.rs             | 37 +++++++++++++------
 .../meta/src/isa/x86/legalize.rs              |  5 ++-
 filetests/isa/x86/legalize-splat.clif         |  2 +-
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 2ae60896d..4a04ef574 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -234,7 +234,7 @@ impl PerCpuModeEncodings {
     }
     fn enc_both_isap(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Clone + Into<InstSpec>,
         template: Template,
         isap: SettingPredicateNumber,
     ) {
@@ -243,7 +243,7 @@ impl PerCpuModeEncodings {
     }
     fn enc_both_instp(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Clone + Into<InstSpec>,
         template: Template,
         instp: InstructionPredicateNode,
     ) {
@@ -1811,23 +1811,38 @@ pub(crate) fn define(
         }
     }
 
-    // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
-    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
+    // helper for generating null encodings for FPRs on both 32- and 64-bit architectures
+    let mut null_encode_32_64 = |instruction: BoundInstruction| {
         e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
         e.enc64_rec(instruction, rec_null_fpr, 0);
-    }
+    };
 
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
         for to_type in
             ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
         {
-            let instruction = raw_bitcast
-                .bind_vector_from_lane(to_type, sse_vector_size)
-                .bind_vector_from_lane(from_type, sse_vector_size);
-            e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
-            e.enc64_rec(instruction, rec_null_fpr, 0);
+            null_encode_32_64(
+                raw_bitcast
+                    .bind_vector_from_lane(to_type, sse_vector_size)
+                    .bind_vector_from_lane(from_type, sse_vector_size),
+            );
+        }
+    }
+
+    // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an XMM register
+    for float_type in &[F32, F64] {
+        for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
+            null_encode_32_64(
+                raw_bitcast
+                    .bind_vector_from_lane(lane_type, sse_vector_size)
+                    .bind(*float_type),
+            );
+            null_encode_32_64(
+                raw_bitcast
+                    .bind(*float_type)
+                    .bind_vector_from_lane(lane_type, sse_vector_size),
+            );
         }
     }
 
diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs
index 2fd160de3..d56beb802 100644
--- a/cranelift-codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs
@@ -20,7 +20,6 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     // List of instructions.
     let insts = &shared.instructions;
     let band = insts.by_name("band");
-    let bitcast = insts.by_name("bitcast");
     let bor = insts.by_name("bor");
     let clz = insts.by_name("clz");
     let ctz = insts.by_name("ctz");
@@ -321,7 +320,9 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     // SIMD splat: 8-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
         let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size);
-        let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
+        let bitcast_f64_to_any8x16 = raw_bitcast
+            .bind_vector_from_lane(ty, sse_vector_size)
+            .bind(F64);
         narrow.legalize(
             def!(y = splat_any8x16(x)),
             vec![
diff --git a/filetests/isa/x86/legalize-splat.clif b/filetests/isa/x86/legalize-splat.clif
index fa07f80c1..c0fc83ebe 100644
--- a/filetests/isa/x86/legalize-splat.clif
+++ b/filetests/isa/x86/legalize-splat.clif
@@ -68,6 +68,6 @@ ebb0:
 ; nextln:     v0 = ireduce.i8 v2
 ; nextln:     v3 = scalar_to_vector.i8x16 v0
 ; nextln:     v4 = f64const 0.0
-; nextln:     v5 = bitcast.i8x16 v4
+; nextln:     v5 = raw_bitcast.i8x16 v4
 ; nextln:     v1 = x86_pshufb v3, v5
 ; nextln:     return v1

From 5216c2f156ca41eec57638f575e6c3cb400d0a7b Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 21 Aug 2019 15:21:18 -0700
Subject: [PATCH 4/8] Translate the sign-extended and zero-extended versions of
 extract_lane

---
 cranelift-wasm/src/code_translator.rs | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/cranelift-wasm/src/code_translator.rs b/cranelift-wasm/src/code_translator.rs
index 1801c6539..e5ce5a0c3 100644
--- a/cranelift-wasm/src/code_translator.rs
+++ b/cranelift-wasm/src/code_translator.rs
@@ -940,6 +940,16 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let splatted = builder.ins().splat(ty, value_to_splat);
             state.push1(splatted)
         }
+        Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
+            let vector = optionally_bitcast_vector(state.pop1(), type_of(op), builder);
+            let extracted = builder.ins().extractlane(vector, lane.clone());
+            state.push1(builder.ins().sextend(I32, extracted))
+        }
+        Operator::I8x16ExtractLaneU { lane } | Operator::I16x8ExtractLaneU { lane } => {
+            let vector = optionally_bitcast_vector(state.pop1(), type_of(op), builder);
+            state.push1(builder.ins().extractlane(vector, lane.clone()));
+            // on x86, PEXTRB zeroes the upper bits of the destination register of extractlane so uextend is elided; of course, this depends on extractlane being legalized to a PEXTRB
+        }
         Operator::I32x4ExtractLane { lane }
         | Operator::I64x2ExtractLane { lane }
         | Operator::F32x4ExtractLane { lane }
@@ -967,10 +977,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         }
         Operator::V128Load { .. }
         | Operator::V128Store { .. }
-        | Operator::I8x16ExtractLaneS { .. }
-        | Operator::I8x16ExtractLaneU { .. }
-        | Operator::I16x8ExtractLaneS { .. }
-        | Operator::I16x8ExtractLaneU { .. }
         | Operator::V8x16Shuffle { .. }
         | Operator::I8x16Eq
         | Operator::I8x16Ne

From b5e6a6caa850fb5c8750992dc22390c651d72df1 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 21 Aug 2019 16:56:11 -0700
Subject: [PATCH 5/8] Avoid extra register movement when lowering the x86
 extractlane of a float vector

This commit is based on the assumption that floats are already stored in XMM registers in x86. When extracting a lane, cranelift was moving the float to a regular register and back to an XMM register; this change avoids this by shuffling the float value to the lowest bits of the XMM register. It also assumes that the upper bits can be left as is (instead of zeroing them out).
---
 .../meta/src/isa/x86/encodings.rs             | 16 +++---
 .../meta/src/isa/x86/instructions.rs          | 17 ++++++
 .../meta/src/isa/x86/legalize.rs              |  3 +
 cranelift-codegen/src/isa/x86/enc_tables.rs   | 57 +++++++++++++++++++
 filetests/isa/x86/extractlane-binemit.clif    | 38 +++++++++++++
 filetests/isa/x86/extractlane-run.clif        | 31 ++++++++++
 filetests/isa/x86/extractlane.clif            | 35 ------------
 7 files changed, 154 insertions(+), 43 deletions(-)
 create mode 100644 filetests/isa/x86/extractlane-binemit.clif
 create mode 100644 filetests/isa/x86/extractlane-run.clif
 delete mode 100644 filetests/isa/x86/extractlane.clif

diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 4a04ef574..30246b85a 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -356,7 +356,6 @@ pub(crate) fn define(
     let copy_to_ssa = shared.by_name("copy_to_ssa");
     let ctz = shared.by_name("ctz");
     let debugtrap = shared.by_name("debugtrap");
-    let extractlane = shared.by_name("extractlane");
     let f32const = shared.by_name("f32const");
     let f64const = shared.by_name("f64const");
     let fadd = shared.by_name("fadd");
@@ -460,6 +459,7 @@ pub(crate) fn define(
     let x86_fmax = x86.by_name("x86_fmax");
     let x86_fmin = x86.by_name("x86_fmin");
     let x86_pop = x86.by_name("x86_pop");
+    let x86_pextr = x86.by_name("x86_pextr");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_push = x86.by_name("x86_push");
@@ -1791,16 +1791,16 @@ pub(crate) fn define(
     }
 
     // SIMD extractlane
-    let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+    let mut x86_pextr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();
-    extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
-    extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
-    extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
-    extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
+    x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB
+    x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
+    x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD
+    x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64
 
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
-        if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
-            let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
+        if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) {
+            let instruction = x86_pextr.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
diff --git a/cranelift-codegen/meta/src/isa/x86/instructions.rs b/cranelift-codegen/meta/src/isa/x86/instructions.rs
index 03730cdea..3f583c6ed 100644
--- a/cranelift-codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift-codegen/meta/src/isa/x86/instructions.rs
@@ -291,5 +291,22 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let Idx = &operand_doc("Idx", uimm8, "Lane index");
+    let x = &operand("x", TxN);
+    let a = &operand("a", &TxN.lane_of());
+
+    ig.push(
+        Inst::new(
+            "x86_pextr",
+            r#"
+        Extract lane ``Idx`` from ``x``.
+        The lane index, ``Idx``, is an immediate value, not an SSA value. It
+        must indicate a valid lane index for the type of ``x``.
+        "#,
+        )
+        .operands_in(vec![x, Idx])
+        .operands_out(vec![a]),
+    );
+
     ig.build()
 }
diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs
index d56beb802..4c2ebaefd 100644
--- a/cranelift-codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs
@@ -23,6 +23,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let bor = insts.by_name("bor");
     let clz = insts.by_name("clz");
     let ctz = insts.by_name("ctz");
+    let extractlane = insts.by_name("extractlane");
     let f64const = insts.by_name("f64const");
     let fcmp = insts.by_name("fcmp");
     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
@@ -379,5 +380,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
         );
     }
 
+    narrow.custom_legalize(extractlane, "convert_extractlane");
+
     narrow.build_and_add_to(&mut shared.transform_groups);
 }
diff --git a/cranelift-codegen/src/isa/x86/enc_tables.rs b/cranelift-codegen/src/isa/x86/enc_tables.rs
index e0fc05178..39bdb5784 100644
--- a/cranelift-codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift-codegen/src/isa/x86/enc_tables.rs
@@ -5,6 +5,7 @@ use crate::bitset::BitSet;
 use crate::cursor::{Cursor, FuncCursor};
 use crate::flowgraph::ControlFlowGraph;
 use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::types::*;
 use crate::ir::{self, Function, Inst, InstBuilder};
 use crate::isa::constraints::*;
 use crate::isa::enc_tables::*;
@@ -893,3 +894,59 @@ fn expand_fcvt_to_uint_sat(
     cfg.recompute_ebb(pos.func, uint_large_ebb);
     cfg.recompute_ebb(pos.func, done);
 }
+
+/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
+/// extractlane instruction
+fn convert_extractlane(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::ExtractLane {
+        opcode: ir::Opcode::Extractlane,
+        arg,
+        lane,
+    } = pos.func.dfg[inst]
+    {
+        // NOTE: the following legalization assumes that the upper bits of the XMM register do
+        // not need to be zeroed during extractlane.
+        let value_type = pos.func.dfg.value_type(arg);
+        if value_type.lane_type().is_float() {
+            // Floats are already in XMM registers and can stay there.
+            let shuffled = if lane != 0 {
+                // Replace the extractlane with a PSHUFD to get the float in the right place.
+                match value_type {
+                    F32X4 => {
+                        // Move the selected lane to the 0 lane.
+                        let shuffle_mask: u8 = 0b00_00_00_00 | lane;
+                        pos.ins().x86_pshufd(arg, shuffle_mask)
+                    }
+                    F64X2 => {
+                        assert_eq!(lane, 1);
+                        // Because we know the lane == 1, we move the upper 64 bits to the lower
+                        // 64 bits, leaving the top 64 bits as-is.
+                        let shuffle_mask = 0b11_10_11_10;
+                        let bitcast = pos.ins().raw_bitcast(F32X4, arg);
+                        pos.ins().x86_pshufd(bitcast, shuffle_mask)
+                    }
+                    _ => unreachable!(),
+                }
+            } else {
+                // Remove the extractlane instruction, leaving the float where it is.
+                arg
+            };
+            // Then we must bitcast to the right type.
+            pos.func
+                .dfg
+                .replace(inst)
+                .raw_bitcast(value_type.lane_type(), shuffled);
+        } else {
+            // For non-floats, lower with the usual PEXTR* instruction.
+            pos.func.dfg.replace(inst).x86_pextr(arg, lane);
+        }
+    }
+}
diff --git a/filetests/isa/x86/extractlane-binemit.clif b/filetests/isa/x86/extractlane-binemit.clif
new file mode 100644
index 000000000..0a3b776a9
--- /dev/null
+++ b/filetests/isa/x86/extractlane-binemit.clif
@@ -0,0 +1,38 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+; for extractlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pextr
+; which is manually placed in the IR so that it can be binemit-tested
+
+function %test_extractlane_b8() {
+ebb0:
+[-, %rax]   v0 = bconst.b8 true
+[-, %xmm0]  v1 = splat.b8x16 v0
+[-, %rax]   v2 = x86_pextr v1, 10    ; bin: 66 0f 3a 14 c0 0a
+            return
+}
+
+function %test_extractlane_i16() {
+ebb0:
+[-, %rax]   v0 = iconst.i16 4
+[-, %xmm1]  v1 = splat.i16x8 v0
+[-, %rax]   v2 = x86_pextr v1, 4    ; bin: 66 0f c5 c8 04
+            return
+}
+
+function %test_extractlane_i32() {
+ebb0:
+[-, %rax]   v0 = iconst.i32 42
+[-, %xmm4]  v1 = splat.i32x4 v0
+[-, %rcx]   v2 = x86_pextr v1, 2    ; bin: 66 0f 3a 16 e1 02
+            return
+}
+
+function %test_extractlane_b64() {
+ebb0:
+[-, %rax]   v0 = bconst.b64 false
+[-, %xmm2]  v1 = splat.b64x2 v0
+[-, %rbx]   v2 = x86_pextr v1, 1    ; bin: 66 48 0f 3a 16 d3 01
+            return
+}
diff --git a/filetests/isa/x86/extractlane-run.clif b/filetests/isa/x86/extractlane-run.clif
new file mode 100644
index 000000000..ce8c00a93
--- /dev/null
+++ b/filetests/isa/x86/extractlane-run.clif
@@ -0,0 +1,31 @@
+test run
+set enable_simd
+
+function %test_extractlane_b8() -> b8 {
+ebb0:
+    v1 = vconst.b8x16 [false false false false false false false false false false true false false
+    false false false]
+    v2 = extractlane v1, 10
+    return v2
+}
+; run
+
+function %test_extractlane_i16() -> b1 {
+ebb0:
+    v0 = vconst.i16x8 0x00080007000600050004000300020001
+    v1 = extractlane v0, 1
+    v2 = icmp_imm eq v1, 2
+    return v2
+}
+; run
+
+function %test_extractlane_f32() -> b1 {
+ebb0:
+    v0 = f32const 0x42.42
+    v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42]
+    v2 = extractlane v1, 3
+    v10 = f32const 0x42.42  ; TODO this should not be necessary, v0 should be re-usable
+    v3 = fcmp eq v2, v10
+    return v3
+}
+; run
diff --git a/filetests/isa/x86/extractlane.clif b/filetests/isa/x86/extractlane.clif
deleted file mode 100644
index e7a1ea898..000000000
--- a/filetests/isa/x86/extractlane.clif
+++ /dev/null
@@ -1,35 +0,0 @@
-test binemit
-set enable_simd
-target x86_64 haswell
-
-function %test_extractlane_b8() {
-ebb0:
-[-, %rax]   v0 = bconst.b8 true
-[-, %xmm0]  v1 = splat.b8x16 v0
-[-, %rax]   v2 = extractlane v1, 10    ; bin: 66 0f 3a 14 c0 0a
-            return
-}
-
-function %test_extractlane_i16() {
-ebb0:
-[-, %rax]   v0 = iconst.i16 4
-[-, %xmm1]  v1 = splat.i16x8 v0
-[-, %rax]   v2 = extractlane v1, 4    ; bin: 66 0f c5 c8 04
-            return
-}
-
-function %test_extractlane_i32() {
-ebb0:
-[-, %rax]   v0 = iconst.i32 42
-[-, %xmm4]  v1 = splat.i32x4 v0
-[-, %rcx]   v2 = extractlane v1, 2    ; bin: 66 0f 3a 16 e1 02
-            return
-}
-
-function %test_extractlane_f64() {
-ebb0:
-[-, %rax]   v0 = f64const 0x0.0
-[-, %xmm2]  v1 = splat.f64x2 v0
-[-, %rbx]   v2 = extractlane v1, 1    ; bin: 66 48 0f 3a 16 d3 01
-            return
-}

From 2a03f241257ac644da3ef46dc81b63d1593fd72f Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Thu, 22 Aug 2019 13:59:32 -0700
Subject: [PATCH 6/8] Avoid extra register movement when lowering the x86
 scalar_to_vector of a float value

---
 cranelift-codegen/meta/src/cdsl/types.rs      | 21 ++++++++
 .../meta/src/isa/x86/encodings.rs             | 52 ++++++++++++-------
 filetests/isa/x86/extractlane-run.clif        |  3 +-
 ...tor.clif => scalar_to_vector-binemit.clif} |  6 +--
 .../isa/x86/scalar_to_vector-compile.clif     | 19 +++++++
 5 files changed, 76 insertions(+), 25 deletions(-)
 rename filetests/isa/x86/{scalar_to_vector.clif => scalar_to_vector-binemit.clif} (80%)
 create mode 100644 filetests/isa/x86/scalar_to_vector-compile.clif

diff --git a/cranelift-codegen/meta/src/cdsl/types.rs b/cranelift-codegen/meta/src/cdsl/types.rs
index f431bb3ed..92b9ab3a2 100644
--- a/cranelift-codegen/meta/src/cdsl/types.rs
+++ b/cranelift-codegen/meta/src/cdsl/types.rs
@@ -264,6 +264,27 @@ impl LaneType {
             ValueType::Vector(VectorType::new(*self, lanes.into()))
         }
     }
+
+    pub fn is_float(&self) -> bool {
+        match self {
+            LaneType::FloatType(_) => true,
+            _ => false,
+        }
+    }
+
+    pub fn is_int(&self) -> bool {
+        match self {
+            LaneType::IntType(_) => true,
+            _ => false,
+        }
+    }
+
+    pub fn is_bool(&self) -> bool {
+        match self {
+            LaneType::BoolType(_) => true,
+            _ => false,
+        }
+    }
 }
 
 impl fmt::Display for LaneType {
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 30246b85a..b95705f9b 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -4,8 +4,8 @@ use std::collections::HashMap;
 
 use crate::cdsl::encodings::{Encoding, EncodingBuilder};
 use crate::cdsl::instructions::{
-    BoundInstruction, InstSpec, Instruction, InstructionGroup, InstructionPredicate,
-    InstructionPredicateNode, InstructionPredicateRegistry,
+    InstSpec, Instruction, InstructionGroup, InstructionPredicate, InstructionPredicateNode,
+    InstructionPredicateRegistry,
 };
 use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
 use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
@@ -279,6 +279,17 @@ impl PerCpuModeEncodings {
         }
     }
 
+    /// Add the same encoding/recipe pairing to both X86_32 and X86_64
+    fn enc_32_64_rec(
+        &mut self,
+        inst: impl Clone + Into<InstSpec>,
+        recipe: &EncodingRecipe,
+        bits: u16,
+    ) {
+        self.enc32_rec(inst.clone(), recipe, bits);
+        self.enc64_rec(inst, recipe, bits);
+    }
+
     /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
     /// binding) has already happened.
     fn enc_32_64_maybe_isap(
@@ -1761,12 +1772,16 @@ pub(crate) fn define(
     // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
         let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
-        let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
-        if ty.lane_bits() < 64 {
-            // no 32-bit encodings for 64-bit widths
-            e.enc32(instruction.clone(), template.clone());
+        if ty.is_float() {
+            e.enc_32_64_rec(instruction, rec_null_fpr, 0);
+        } else {
+            let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
+            if ty.lane_bits() < 64 {
+                // no 32-bit encodings for 64-bit widths
+                e.enc32(instruction.clone(), template.clone());
+            }
+            e.enc_x86_64(instruction, template);
         }
-        e.enc_x86_64(instruction, template);
     }
 
     // SIMD insertlane
@@ -1811,37 +1826,34 @@ pub(crate) fn define(
         }
     }
 
-    // helper for generating null encodings for FPRs on both 32- and 64-bit architectures
-    let mut null_encode_32_64 = |instruction: BoundInstruction| {
-        e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
-        e.enc64_rec(instruction, rec_null_fpr, 0);
-    };
-
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
         for to_type in
             ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
         {
-            null_encode_32_64(
-                raw_bitcast
-                    .bind_vector_from_lane(to_type, sse_vector_size)
-                    .bind_vector_from_lane(from_type, sse_vector_size),
-            );
+            let instruction = raw_bitcast
+                .bind_vector_from_lane(to_type, sse_vector_size)
+                .bind_vector_from_lane(from_type, sse_vector_size);
+            e.enc_32_64_rec(instruction, rec_null_fpr, 0);
         }
     }
 
     // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an XMM register
     for float_type in &[F32, F64] {
         for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
-            null_encode_32_64(
+            e.enc_32_64_rec(
                 raw_bitcast
                     .bind_vector_from_lane(lane_type, sse_vector_size)
                     .bind(*float_type),
+                rec_null_fpr,
+                0,
             );
-            null_encode_32_64(
+            e.enc_32_64_rec(
                 raw_bitcast
                     .bind(*float_type)
                     .bind_vector_from_lane(lane_type, sse_vector_size),
+                rec_null_fpr,
+                0,
             );
         }
     }
diff --git a/filetests/isa/x86/extractlane-run.clif b/filetests/isa/x86/extractlane-run.clif
index ce8c00a93..4590bd067 100644
--- a/filetests/isa/x86/extractlane-run.clif
+++ b/filetests/isa/x86/extractlane-run.clif
@@ -24,8 +24,7 @@ ebb0:
     v0 = f32const 0x42.42
     v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42]
     v2 = extractlane v1, 3
-    v10 = f32const 0x42.42  ; TODO this should not be necessary, v0 should be re-usable
-    v3 = fcmp eq v2, v10
+    v3 = fcmp eq v2, v0
     return v3
 }
 ; run
diff --git a/filetests/isa/x86/scalar_to_vector.clif b/filetests/isa/x86/scalar_to_vector-binemit.clif
similarity index 80%
rename from filetests/isa/x86/scalar_to_vector.clif
rename to filetests/isa/x86/scalar_to_vector-binemit.clif
index 51ddea3e7..b26f3d2e6 100644
--- a/filetests/isa/x86/scalar_to_vector.clif
+++ b/filetests/isa/x86/scalar_to_vector-binemit.clif
@@ -17,10 +17,10 @@ ebb0:
             return
 }
 
-function %test_scalar_to_vector_f32() {
+function %test_scalar_to_vector_b32() {
 ebb0:
-[-, %rcx]   v0 = f32const 0x0.42
-[-, %xmm3]  v1 = scalar_to_vector.f32x4 v0    ; bin: 66 0f 6e d9
+[-, %rcx]   v0 = bconst.b32 false
+[-, %xmm3]  v1 = scalar_to_vector.b32x4 v0    ; bin: 66 0f 6e d9
             return
 }
 
diff --git a/filetests/isa/x86/scalar_to_vector-compile.clif b/filetests/isa/x86/scalar_to_vector-compile.clif
new file mode 100644
index 000000000..2d2ab331f
--- /dev/null
+++ b/filetests/isa/x86/scalar_to_vector-compile.clif
@@ -0,0 +1,19 @@
+test compile
+set opt_level=best
+set probestack_enabled=false
+set enable_simd
+target x86_64
+
+; ensure that scalar_to_vector emits no instructions for floats (already exist in an XMM register)
+function %test_scalar_to_vector_f32() -> f32x4 baldrdash_system_v {
+ebb0:
+    v0 = f32const 0x0.42
+    v1 = scalar_to_vector.f32x4 v0
+    return v1
+}
+
+; check: ebb0
+; nextln: v2 = iconst.i32 0x3e84_0000
+; nextln: v0 = bitcast.f32 v2
+; nextln: [null_fpr#00,%xmm0]                 v1 = scalar_to_vector.f32x4 v0
+; nextln: return v1

From ef7bf77e7ec97f424fb5f39897b79ee3a58b2c9a Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Fri, 23 Aug 2019 11:38:29 -0700
Subject: [PATCH 7/8] Avoid extra register movement when lowering an x86
 insertlane to a float vector

---
 .../meta/src/isa/x86/encodings.rs             | 49 +++++++++---
 .../meta/src/isa/x86/instructions.rs          | 79 +++++++++++++++++++
 .../meta/src/isa/x86/legalize.rs              |  1 +
 cranelift-codegen/meta/src/isa/x86/recipes.rs | 21 +++++
 cranelift-codegen/src/isa/x86/enc_tables.rs   | 62 +++++++++++++++
 cranelift-codegen/src/verifier/locations.rs   |  6 +-
 filetests/isa/x86/extractlane-run.clif        | 38 +++++++++
 filetests/isa/x86/insertlane-binemit.clif     | 42 ++++++++++
 filetests/isa/x86/insertlane-run.clif         | 48 +++++++++++
 filetests/isa/x86/insertlane.clif             | 39 ---------
 filetests/isa/x86/legalize-splat.clif         |  4 +-
 11 files changed, 334 insertions(+), 55 deletions(-)
 create mode 100644 filetests/isa/x86/insertlane-binemit.clif
 create mode 100644 filetests/isa/x86/insertlane-run.clif
 delete mode 100644 filetests/isa/x86/insertlane.clif

diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index b95705f9b..14b3c0eea 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -396,7 +396,6 @@ pub(crate) fn define(
     let ifcmp_sp = shared.by_name("ifcmp_sp");
     let imul = shared.by_name("imul");
     let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
-    let insertlane = shared.by_name("insertlane");
     let ireduce = shared.by_name("ireduce");
     let ishl = shared.by_name("ishl");
     let ishl_imm = shared.by_name("ishl_imm");
@@ -469,8 +468,12 @@ pub(crate) fn define(
     let x86_cvtt2si = x86.by_name("x86_cvtt2si");
     let x86_fmax = x86.by_name("x86_fmax");
     let x86_fmin = x86.by_name("x86_fmin");
+    let x86_insertps = x86.by_name("x86_insertps");
+    let x86_movlhps = x86.by_name("x86_movlhps");
+    let x86_movsd = x86.by_name("x86_movsd");
     let x86_pop = x86.by_name("x86_pop");
     let x86_pextr = x86.by_name("x86_pextr");
+    let x86_pinsr = x86.by_name("x86_pinsr");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_push = x86.by_name("x86_push");
@@ -501,6 +504,7 @@ pub(crate) fn define(
     let rec_f64imm_z = r.template("f64imm_z");
     let rec_fa = r.template("fa");
     let rec_fax = r.template("fax");
+    let rec_fa_ib = r.template("fa_ib");
     let rec_fcmp = r.template("fcmp");
     let rec_fcscc = r.template("fcscc");
     let rec_ffillnull = r.recipe("ffillnull");
@@ -1785,16 +1789,16 @@ pub(crate) fn define(
     }
 
     // SIMD insertlane
-    let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
+    let mut x86_pinsr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();
-    insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
-    insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
-    insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
-    insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
+    x86_pinsr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
+    x86_pinsr_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
+    x86_pinsr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
+    x86_pinsr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
 
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
-        if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
-            let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
+        if let Some((opcode, isap)) = x86_pinsr_mapping.get(&ty.lane_bits()) {
+            let instruction = x86_pinsr.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
@@ -1805,13 +1809,34 @@ pub(crate) fn define(
         }
     }
 
+    // for legalizing insertlane with floats, INSERTPS from SSE4.1
+    {
+        let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size);
+        let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]);
+        e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd));
+    }
+
+    // for legalizing insertlane with floats,  MOVSD from SSE2
+    {
+        let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2
+    }
+
+    // for legalizing insertlane with floats, MOVLHPS from SSE
+    {
+        let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
+    }
+
     // SIMD extractlane
     let mut x86_pextr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();
-    x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB
-    x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
-    x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD
-    x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64
+    x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
+    x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
+    x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
+    x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
 
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
         if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) {
diff --git a/cranelift-codegen/meta/src/isa/x86/instructions.rs b/cranelift-codegen/meta/src/isa/x86/instructions.rs
index 3f583c6ed..b9f2496a8 100644
--- a/cranelift-codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift-codegen/meta/src/isa/x86/instructions.rs
@@ -308,5 +308,84 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let IBxN = &TypeVar::new(
+        "IBxN",
+        "A SIMD vector type containing only booleans and integers",
+        TypeSetBuilder::new()
+            .ints(Interval::All)
+            .bools(Interval::All)
+            .simd_lanes(Interval::All)
+            .includes_scalars(false)
+            .build(),
+    );
+    let x = &operand("x", IBxN);
+    let y = &operand_doc("y", &IBxN.lane_of(), "New lane value");
+    let a = &operand("a", IBxN);
+
+    ig.push(
+        Inst::new(
+            "x86_pinsr",
+            r#"
+        Insert ``y`` into ``x`` at lane ``Idx``.
+        The lane index, ``Idx``, is an immediate value, not an SSA value. It
+        must indicate a valid lane index for the type of ``x``.
+        "#,
+        )
+        .operands_in(vec![x, Idx, y])
+        .operands_out(vec![a]),
+    );
+
+    let FxN = &TypeVar::new(
+        "FxN",
+        "A SIMD vector type containing floats",
+        TypeSetBuilder::new()
+            .floats(Interval::All)
+            .simd_lanes(Interval::All)
+            .includes_scalars(false)
+            .build(),
+    );
+    let x = &operand("x", FxN);
+    let y = &operand_doc("y", &FxN.lane_of(), "New lane value");
+    let a = &operand("a", FxN);
+
+    ig.push(
+        Inst::new(
+            "x86_insertps",
+            r#"
+        Insert a lane of ``y`` into ``x`` at using ``Idx`` to encode both which lane the value is 
+        extracted from and which it is inserted to. This is similar to x86_pinsr but inserts 
+        floats, which are already stored in an XMM register.
+        "#,
+        )
+        .operands_in(vec![x, Idx, y])
+        .operands_out(vec![a]),
+    );
+
+    let x = &operand("x", FxN);
+    let y = &operand("y", FxN);
+    let a = &operand("a", FxN);
+
+    ig.push(
+        Inst::new(
+            "x86_movsd",
+            r#"
+        Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x``
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "x86_movlhps",
+            r#"
+        Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x``
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     ig.build()
 }
diff --git a/cranelift-codegen/meta/src/isa/x86/legalize.rs b/cranelift-codegen/meta/src/isa/x86/legalize.rs
index 4c2ebaefd..555a93f9c 100644
--- a/cranelift-codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift-codegen/meta/src/isa/x86/legalize.rs
@@ -381,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     }
 
     narrow.custom_legalize(extractlane, "convert_extractlane");
+    narrow.custom_legalize(insertlane, "convert_insertlane");
 
     narrow.build_and_add_to(&mut shared.transform_groups);
 }
diff --git a/cranelift-codegen/meta/src/isa/x86/recipes.rs b/cranelift-codegen/meta/src/isa/x86/recipes.rs
index 3f14769de..8176effc4 100644
--- a/cranelift-codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift-codegen/meta/src/isa/x86/recipes.rs
@@ -566,6 +566,27 @@ pub(crate) fn define<'shared>(
             ),
     );
 
+    // XX /r with FPR ins and outs. A form with a byte immediate.
+    {
+        let format = formats.get(f_insert_lane);
+        recipes.add_template_recipe(
+            EncodingRecipeBuilder::new("fa_ib", f_insert_lane, 2)
+                .operands_in(vec![fpr, fpr])
+                .operands_out(vec![0])
+                .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+                    format, "lane", 8, 0,
+                ))
+                .emit(
+                    r#"
+                    {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+                    modrm_rr(in_reg1, in_reg0, sink);
+                    let imm:i64 = lane.into();
+                    sink.put1(imm as u8);
+                "#,
+                ),
+        );
+    }
+
     // XX /n for a unary operation with extension bits.
     recipes.add_template_recipe(
         EncodingRecipeBuilder::new("ur", f_unary, 1)
diff --git a/cranelift-codegen/src/isa/x86/enc_tables.rs b/cranelift-codegen/src/isa/x86/enc_tables.rs
index 39bdb5784..f67d7f0b6 100644
--- a/cranelift-codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift-codegen/src/isa/x86/enc_tables.rs
@@ -950,3 +950,65 @@ fn convert_extractlane(
         }
     }
 }
+
+/// Because floats exist in XMM registers, we can keep them there when executing a CLIF
+/// insertlane instruction
+fn convert_insertlane(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::InsertLane {
+        opcode: ir::Opcode::Insertlane,
+        args: [vector, replacement],
+        lane,
+    } = pos.func.dfg[inst]
+    {
+        let value_type = pos.func.dfg.value_type(vector);
+        if value_type.lane_type().is_float() {
+            // Floats are already in XMM registers and can stay there.
+            match value_type {
+                F32X4 => {
+                    assert!(lane > 0 && lane <= 3);
+                    let immediate = 0b00_00_00_00 | lane << 4;
+                    // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
+                    // shifted into bits 5:6).
+                    pos.func
+                        .dfg
+                        .replace(inst)
+                        .x86_insertps(vector, immediate, replacement)
+                }
+                F64X2 => {
+                    let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types
+                    if lane == 0 {
+                        // Move the lowest quadword in replacement to vector without changing
+                        // the upper bits.
+                        pos.func
+                            .dfg
+                            .replace(inst)
+                            .x86_movsd(vector, replacement_as_vector)
+                    } else {
+                        assert_eq!(lane, 1);
+                        // Move the low 64 bits of replacement vector to the high 64 bits of the
+                        // vector.
+                        pos.func
+                            .dfg
+                            .replace(inst)
+                            .x86_movlhps(vector, replacement_as_vector)
+                    }
+                }
+                _ => unreachable!(),
+            };
+        } else {
+            // For non-floats, lower with the usual PINSR* instruction.
+            pos.func
+                .dfg
+                .replace(inst)
+                .x86_pinsr(vector, lane, replacement);
+        }
+    }
+}
diff --git a/cranelift-codegen/src/verifier/locations.rs b/cranelift-codegen/src/verifier/locations.rs
index bf1a4e186..cf17ae13d 100644
--- a/cranelift-codegen/src/verifier/locations.rs
+++ b/cranelift-codegen/src/verifier/locations.rs
@@ -107,8 +107,10 @@ impl<'a> LocationVerifier<'a> {
         fatal!(
             errors,
             inst,
-            "{} constraints not satisfied",
-            self.encinfo.display(enc)
+            "{} constraints not satisfied in: {}\n{}",
+            self.encinfo.display(enc),
+            self.func.dfg.display_inst(inst, self.isa),
+            self.func.display(self.isa)
         )
     }
 
diff --git a/filetests/isa/x86/extractlane-run.clif b/filetests/isa/x86/extractlane-run.clif
index 4590bd067..adb2e7b8e 100644
--- a/filetests/isa/x86/extractlane-run.clif
+++ b/filetests/isa/x86/extractlane-run.clif
@@ -28,3 +28,41 @@ ebb0:
     return v3
 }
 ; run
+
+function %test_extractlane_i32_with_vector_reuse() -> b1 {
+ebb0:
+    v0 = iconst.i32 42
+    v1 = iconst.i32 99
+
+    v2 = splat.i32x4 v0
+    v3 = insertlane v2, 2, v1
+
+    v4 = extractlane v3, 3
+    v5 = icmp eq v4, v0
+
+    v6 = extractlane v3, 2
+    v7 = icmp eq v6, v1
+
+    v8 = band v5, v7
+    return v8
+}
+; run
+
+function %test_extractlane_f32_with_vector_reuse() -> b1 {
+ebb0:
+    v0 = f32const 0x42.42
+    v1 = f32const 0x99.99
+
+    v2 = splat.f32x4 v0
+    v3 = insertlane v2, 2, v1
+
+    v4 = extractlane v3, 3
+    v5 = fcmp eq v4, v0
+
+    v6 = extractlane v3, 2
+    v7 = fcmp eq v6, v1
+
+    v8 = band v5, v7
+    return v8
+}
+; run
diff --git a/filetests/isa/x86/insertlane-binemit.clif b/filetests/isa/x86/insertlane-binemit.clif
new file mode 100644
index 000000000..49048130c
--- /dev/null
+++ b/filetests/isa/x86/insertlane-binemit.clif
@@ -0,0 +1,42 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+; for insertlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pinsr
+; which is manually placed in the IR so that it can be binemit-tested
+
+function %test_insertlane_b8() {
+ebb0:
+[-, %rax]   v0 = bconst.b8 true
+[-, %rbx]   v1 = bconst.b8 false
+[-, %xmm0]  v2 = splat.b8x16 v0
+[-, %xmm0]  v3 = x86_pinsr v2, 10, v1    ; bin: 66 0f 3a 20 c3 0a
+            return
+}
+
+function %test_insertlane_i16() {
+ebb0:
+[-, %rax]   v0 = iconst.i16 4
+[-, %rbx]   v1 = iconst.i16 5
+[-, %xmm1]  v2 = splat.i16x8 v0
+[-, %xmm1]  v3 = x86_pinsr v2, 4, v1    ; bin: 66 0f c4 cb 04
+            return
+}
+
+function %test_insertlane_i32() {
+ebb0:
+[-, %rax]   v0 = iconst.i32 42
+[-, %rbx]   v1 = iconst.i32 99
+[-, %xmm4]  v2 = splat.i32x4 v0
+[-, %xmm4]  v3 = x86_pinsr v2, 2, v1    ; bin: 66 0f 3a 22 e3 02
+            return
+}
+
+function %test_insertlane_b64() {
+ebb0:
+[-, %rax]   v0 = bconst.b64 true
+[-, %rbx]   v1 = bconst.b64 false
+[-, %xmm2]  v2 = splat.b64x2 v0
+[-, %xmm2]  v3 = x86_pinsr v2, 1, v1    ; bin: 66 48 0f 3a 22 d3 01
+            return
+}
diff --git a/filetests/isa/x86/insertlane-run.clif b/filetests/isa/x86/insertlane-run.clif
new file mode 100644
index 000000000..92fb38202
--- /dev/null
+++ b/filetests/isa/x86/insertlane-run.clif
@@ -0,0 +1,48 @@
+test run
+set enable_simd
+
+; TODO once SIMD vector comparison is implemented, remove use of extractlane below
+
+function %test_insertlane_b8() -> b8 {
+ebb0:
+    v1 = bconst.b8 true
+    v2 = vconst.b8x16 [false false false false false false false false false false false false false
+     false false false]
+    v3 = insertlane v2, 10, v1
+    v4 = extractlane v3, 10
+    return v4
+}
+; run
+
+function %test_insertlane_f32() -> b1 {
+ebb0:
+    v0 = f32const 0x42.42
+    v1 = vconst.f32x4 0x00
+    v2 = insertlane v1, 1, v0
+    v3 = extractlane v2, 1
+    v4 = fcmp eq v3, v0
+    return v4
+}
+; run
+
+function %test_insertlane_f64_lane1() -> b1 {
+ebb0:
+    v0 = f64const 0x42.42
+    v1 = vconst.f64x2 0x00
+    v2 = insertlane v1, 1, v0
+    v3 = extractlane v2, 1
+    v4 = fcmp eq v3, v0
+    return v4
+}
+; run
+
+function %test_insertlane_f64_lane0() -> b1 {
+ebb0:
+    v0 = f64const 0x42.42
+    v1 = vconst.f64x2 0x00
+    v2 = insertlane v1, 0, v0
+    v3 = extractlane v2, 0
+    v4 = fcmp eq v3, v0
+    return v4
+}
+; run
diff --git a/filetests/isa/x86/insertlane.clif b/filetests/isa/x86/insertlane.clif
deleted file mode 100644
index c55dc4033..000000000
--- a/filetests/isa/x86/insertlane.clif
+++ /dev/null
@@ -1,39 +0,0 @@
-test binemit
-set enable_simd
-target x86_64 haswell
-
-function %test_insertlane_b8() {
-ebb0:
-[-, %rax]   v0 = bconst.b8 true
-[-, %rbx]   v1 = bconst.b8 false
-[-, %xmm0]  v2 = splat.b8x16 v0
-[-, %xmm0]  v3 = insertlane v2, 10, v1    ; bin: 66 0f 3a 20 c3 0a
-            return
-}
-
-function %test_insertlane_i16() {
-ebb0:
-[-, %rax]   v0 = iconst.i16 4
-[-, %rbx]   v1 = iconst.i16 5
-[-, %xmm1]  v2 = splat.i16x8 v0
-[-, %xmm1]  v3 = insertlane v2, 4, v1    ; bin: 66 0f c4 cb 04
-            return
-}
-
-function %test_insertlane_i32() {
-ebb0:
-[-, %rax]   v0 = iconst.i32 42
-[-, %rbx]   v1 = iconst.i32 99
-[-, %xmm4]  v2 = splat.i32x4 v0
-[-, %xmm4]  v3 = insertlane v2, 2, v1    ; bin: 66 0f 3a 22 e3 02
-            return
-}
-
-function %test_insertlane_f64() {
-ebb0:
-[-, %rax]   v0 = f64const 0x0.0
-[-, %rbx]   v1 = f64const 0x4.2
-[-, %xmm2]  v2 = splat.f64x2 v0
-[-, %xmm2]  v3 = insertlane v2, 1, v1    ; bin: 66 48 0f 3a 22 d3 01
-            return
-}
diff --git a/filetests/isa/x86/legalize-splat.clif b/filetests/isa/x86/legalize-splat.clif
index c0fc83ebe..19d61d529 100644
--- a/filetests/isa/x86/legalize-splat.clif
+++ b/filetests/isa/x86/legalize-splat.clif
@@ -33,7 +33,7 @@ ebb0:
 ; check:   ebb0:
 ; nextln:     v0 = iconst.i64 42
 ; nextln:     v2 = scalar_to_vector.i64x2 v0
-; nextln:     v1 = insertlane v2, 1, v0
+; nextln:     v1 = x86_pinsr v2, 1, v0
 ; nextln:     return v1
 
 
@@ -48,7 +48,7 @@ ebb0:
 ; check:   ebb0:
 ; nextln:     v0 = bconst.b16 true
 ; nextln:     v2 = scalar_to_vector.b16x8 v0
-; nextln:     v3 = insertlane v2, 1, v0
+; nextln:     v3 = x86_pinsr v2, 1, v0
 ; nextln:     v4 = raw_bitcast.i32x4 v3
 ; nextln:     v5 = x86_pshufd v4, 0
 ; nextln:     v1 = raw_bitcast.b16x8 v5

From 8255ef05d3bb648f2546f9661de500b56bf3cd6d Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Thu, 5 Sep 2019 09:26:11 -0700
Subject: [PATCH 8/8] Fix documentation

---
 .../meta/src/isa/x86/encodings.rs             | 26 +++++++++++--------
 .../meta/src/shared/instructions.rs           | 12 ++++++---
 filetests/isa/x86/insertlane-binemit.clif     |  4 +--
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 14b3c0eea..d773c2c66 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -1744,16 +1744,17 @@ pub(crate) fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
-    // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
+    // SIMD vector size: eventually multiple vector sizes may be supported but for now only
+    // SSE-sized vectors are available.
     let sse_vector_size: u64 = 128;
 
     // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
     // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
-    // value across the register
+    // value across the register.
 
     let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
 
-    // PSHUFB, 8-bit shuffle using two XMM registers
+    // PSHUFB, 8-bit shuffle using two XMM registers.
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
         let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
@@ -1761,7 +1762,7 @@ pub(crate) fn define(
         e.enc64_isap(instruction, template, use_ssse3_simd);
     }
 
-    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
+    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate.
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
         let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_r_ib_unsigned_fpr
@@ -1803,27 +1804,28 @@ pub(crate) fn define(
             if ty.lane_bits() < 64 {
                 e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
             } else {
-                // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
+                // It turns out the 64-bit widths have REX/W encodings and only are available on
+                // x86_64.
                 e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
             }
         }
     }
 
-    // for legalizing insertlane with floats, INSERTPS from SSE4.1
+    // For legalizing insertlane with floats, INSERTPS from SSE4.1.
     {
         let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size);
         let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]);
         e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd));
     }
 
-    // for legalizing insertlane with floats,  MOVSD from SSE2
+    // For legalizing insertlane with floats,  MOVSD from SSE2.
     {
         let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size);
         let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]);
         e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2
     }
 
-    // for legalizing insertlane with floats, MOVLHPS from SSE
+    // For legalizing insertlane with floats, MOVLHPS from SSE.
     {
         let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size);
         let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]);
@@ -1845,13 +1847,14 @@ pub(crate) fn define(
             if ty.lane_bits() < 64 {
                 e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
             } else {
-                // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
+                // It turns out the 64-bit widths have REX/W encodings and only are available on
+                // x86_64.
                 e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
             }
         }
     }
 
-    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
+    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
         for to_type in
             ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
@@ -1863,7 +1866,8 @@ pub(crate) fn define(
         }
     }
 
-    // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an XMM register
+    // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an
+    // XMM register.
     for float_type in &[F32, F64] {
         for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
             e.enc_32_64_rec(
diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs
index 843347ce9..3ebebfe18 100644
--- a/cranelift-codegen/meta/src/shared/instructions.rs
+++ b/cranelift-codegen/meta/src/shared/instructions.rs
@@ -1537,7 +1537,9 @@ pub(crate) fn define(
         Extract lane ``Idx`` from ``x``.
 
         The lane index, ``Idx``, is an immediate value, not an SSA value. It
-        must indicate a valid lane index for the type of ``x``.
+        must indicate a valid lane index for the type of ``x``. Note that the upper bits of ``a``
+        may or may not be zeroed depending on the ISA but the type system should prevent using 
+        ``a`` as anything other than the extracted value.
         "#,
         )
         .operands_in(vec![x, Idx])
@@ -2782,9 +2784,11 @@ pub(crate) fn define(
         Inst::new(
             "scalar_to_vector",
             r#"
-    Scalar To Vector -- move a value out of a scalar register and into a vector
-    register; the scalar will be moved to the lowest-order bits of the vector
-    register and any higher bits will be zeroed.
+    Scalar To Vector -- move a value out of a scalar register and into a vector register; the 
+    scalar will be moved to the lowest-order bits of the vector register. Note that this 
+    instruction is intended as a low-level legalization instruction and frontends should prefer 
+    insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
+    types (e.g. integers) but not for others (e.g. floats).
     "#,
         )
         .operands_in(vec![s])
diff --git a/filetests/isa/x86/insertlane-binemit.clif b/filetests/isa/x86/insertlane-binemit.clif
index 49048130c..c388ed6fa 100644
--- a/filetests/isa/x86/insertlane-binemit.clif
+++ b/filetests/isa/x86/insertlane-binemit.clif
@@ -2,8 +2,8 @@ test binemit
 set enable_simd
 target x86_64 haswell
 
-; for insertlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pinsr
-; which is manually placed in the IR so that it can be binemit-tested
+; for insertlane, floats are legalized differently than integers and booleans; integers and
+; booleans use x86_pinsr which is manually placed in the IR so that it can be binemit-tested
 
 function %test_insertlane_b8() {
 ebb0: