Skip to content
This repository was archived by the owner on Jun 26, 2020. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions cranelift-codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,20 @@ impl PerCpuModeEncodings {
self.enc64(inst.bind(I64), template.rex().w());
}

/// Register the `b32`/`b64` encodings of `inst` built from `template`:
///
/// - `inst.b32` on X86_32, using the REX-less form of the template;
/// - `inst.b32` on X86_64, both with and without a REX prefix;
/// - `inst.b64` on X86_64, with a REX.W prefix.
fn enc_b32_b64(&mut self, inst: impl Into<InstSpec>, template: Template) {
    let spec: InstSpec = inst.into();

    // 32-bit mode only gets the REX-less form.
    self.enc32(spec.bind(B32), template.nonrex());

    // Ordering matters for the 64-bit `b32` forms: the REX encoding is registered first so that
    // the REX-less one is not picked by default. Otherwise reg-alloc would never use r8 and up.
    self.enc64(spec.bind(B32), template.rex());
    self.enc64(spec.bind(B32), template.nonrex());

    // The 64-bit wide boolean uses the REX.W form.
    self.enc64(spec.bind(B64), template.rex().w());
}

/// Add encodings for `inst.i32` to X86_32.
/// Add encodings for `inst.i32` to X86_64 with a REX prefix.
/// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
Expand Down Expand Up @@ -658,11 +672,15 @@ pub(crate) fn define(
e.enc_i32_i64(isub_ifborrow, rec_rio.opcodes(vec![0x19]));

e.enc_i32_i64(band, rec_rr.opcodes(vec![0x21]));
e.enc_b32_b64(band, rec_rr.opcodes(vec![0x21]));
e.enc_i32_i64(bor, rec_rr.opcodes(vec![0x09]));
e.enc_b32_b64(bor, rec_rr.opcodes(vec![0x09]));
e.enc_i32_i64(bxor, rec_rr.opcodes(vec![0x31]));
e.enc_b32_b64(bxor, rec_rr.opcodes(vec![0x31]));

// x86 has a bitwise not instruction NOT.
e.enc_i32_i64(bnot, rec_ur.opcodes(vec![0xf7]).rrr(2));
e.enc_b32_b64(bnot, rec_ur.opcodes(vec![0xf7]).rrr(2));

// Also add `b1` encodings for the logic instructions.
// TODO: Should this be done with 8-bit instructions? It would improve partial register
Expand Down Expand Up @@ -690,7 +708,12 @@ pub(crate) fn define(
e.enc32(regmove.bind(ty), rec_rmov.opcodes(vec![0x89]));
e.enc64(regmove.bind(ty), rec_rmov.opcodes(vec![0x89]).rex());
}
for &ty in &[B8, B16, B32] {
e.enc32(regmove.bind(ty), rec_rmov.opcodes(vec![0x89]));
e.enc64(regmove.bind(ty), rec_rmov.opcodes(vec![0x89]).rex());
}
e.enc64(regmove.bind(I64), rec_rmov.opcodes(vec![0x89]).rex().w());
e.enc64(regmove.bind(B64), rec_rmov.opcodes(vec![0x89]).rex().w());
e.enc_both(regmove.bind(B1), rec_rmov.opcodes(vec![0x89]));
e.enc_both(regmove.bind(I8), rec_rmov.opcodes(vec![0x89]));
e.enc32(regmove.bind_ref(R32), rec_rmov.opcodes(vec![0x89]));
Expand Down Expand Up @@ -1785,7 +1808,7 @@ pub(crate) fn define(
let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;

// PSHUFB, 8-bit shuffle using two XMM registers.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
e.enc32_isap(instruction.clone(), template.clone(), use_ssse3_simd);
Expand All @@ -1804,7 +1827,7 @@ pub(crate) fn define(

// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
if ty.is_float() {
Expand Down Expand Up @@ -1929,6 +1952,13 @@ pub(crate) fn define(
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
}

// SIMD bor using ORPS
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = bor.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x56]);
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
}

// Reference type instructions

// Null references implemented as iconst 0.
Expand Down
2 changes: 2 additions & 0 deletions cranelift-codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let selectif = insts.by_name("selectif");
let smulhi = insts.by_name("smulhi");
let splat = insts.by_name("splat");
let shuffle = insts.by_name("shuffle");
let srem = insts.by_name("srem");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
Expand Down Expand Up @@ -380,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
}

narrow.custom_legalize(shuffle, "convert_shuffle");
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");

Expand Down
6 changes: 3 additions & 3 deletions cranelift-codegen/meta/src/isa/x86/recipes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,11 +396,11 @@ pub(crate) fn define<'shared>(
let f_trap = formats.by_name("Trap");
let f_unary = formats.by_name("Unary");
let f_unary_bool = formats.by_name("UnaryBool");
let f_unary_const = formats.by_name("UnaryConst");
let f_unary_global_value = formats.by_name("UnaryGlobalValue");
let f_unary_ieee32 = formats.by_name("UnaryIeee32");
let f_unary_ieee64 = formats.by_name("UnaryIeee64");
let f_unary_imm = formats.by_name("UnaryImm");
let f_unary_imm128 = formats.by_name("UnaryImm128");

// Predicates shorthands.
let use_sse41 = settings.predicate_by_name("use_sse41");
Expand Down Expand Up @@ -2437,14 +2437,14 @@ pub(crate) fn define<'shared>(
);

recipes.add_template_recipe(
EncodingRecipeBuilder::new("vconst", f_unary_imm128, 5)
EncodingRecipeBuilder::new("vconst", f_unary_const, 5)
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(0, out_reg0), sink);
modrm_riprel(out_reg0, sink);
const_disp4(imm, func, sink);
const_disp4(constant_handle, func, sink);
"#,
),
);
Expand Down
8 changes: 7 additions & 1 deletion cranelift-codegen/meta/src/shared/formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry

registry.insert(Builder::new("Unary").value());
registry.insert(Builder::new("UnaryImm").imm(&imm.imm64));
registry.insert(Builder::new("UnaryImm128").imm(&imm.uimm128));
registry.insert(Builder::new("UnaryIeee32").imm(&imm.ieee32));
registry.insert(Builder::new("UnaryIeee64").imm(&imm.ieee64));
registry.insert(Builder::new("UnaryBool").imm(&imm.boolean));
registry.insert(Builder::new("UnaryConst").imm(&imm.pool_constant));
registry.insert(Builder::new("UnaryGlobalValue").imm(&entities.global_value));

registry.insert(Builder::new("Binary").value().value());
Expand Down Expand Up @@ -43,6 +43,12 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry
.value()
.imm_with_name("lane", &imm.uimm8),
);
registry.insert(
Builder::new("Shuffle")
.value()
.value()
.imm_with_name("mask", &imm.uimm128),
);

registry.insert(Builder::new("IntCompare").imm(&imm.intcc).value().value());
registry.insert(
Expand Down
12 changes: 12 additions & 0 deletions cranelift-codegen/meta/src/shared/immediates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ pub(crate) struct Immediates {
/// const.
pub uimm128: OperandKind,

/// A constant stored in the constant pool.
///
/// This operand is used to pass constants to instructions like vconst while storing the
/// actual bytes in the constant pool.
pub pool_constant: OperandKind,

/// A 32-bit immediate signed offset.
///
/// This is used to represent an immediate address offset in load/store instructions.
Expand Down Expand Up @@ -84,6 +90,12 @@ impl Immediates {

uimm128: Builder::new_imm("uimm128")
.doc("A 128-bit immediate unsigned integer.")
.rust_type("ir::Immediate")
.build(),

pool_constant: Builder::new_imm("poolConstant")
.doc("A constant stored in the constant pool.")
.default_member("constant_handle")
.rust_type("ir::Constant")
.build(),

Expand Down
37 changes: 36 additions & 1 deletion cranelift-codegen/meta/src/shared/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1090,7 +1090,7 @@ pub(crate) fn define(

let N = &operand_doc(
"N",
&imm.uimm128,
&imm.pool_constant,
"The 16 immediate bytes of a 128-bit vector",
);
let a = &operand_doc("a", TxN, "A constant vector value");
Expand All @@ -1108,6 +1108,41 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

let mask = &operand_doc(
"mask",
&imm.uimm128,
"The 16 immediate bytes used for selecting the elements to shuffle",
);
let Tx16 = &TypeVar::new(
"Tx16",
"A SIMD vector with exactly 16 lanes of 8-bit values; eventually this may support other \
lane counts and widths",
TypeSetBuilder::new()
.ints(8..8)
.bools(8..8)
.simd_lanes(16..16)
.includes_scalars(false)
.build(),
);
let a = &operand_doc("a", Tx16, "A vector value");
let b = &operand_doc("b", Tx16, "A vector value");

ig.push(
Inst::new(
"shuffle",
r#"
SIMD vector shuffle.

Shuffle two vectors using the given immediate bytes. For each of the 16 bytes of the
immediate, a value i of 0-15 selects the i-th element of the first vector and a value i of
Comment thread
abrown marked this conversation as resolved.
16-31 selects the (i-16)th element of the second vector. Immediate values outside of the
0-31 range place a 0 in the resulting vector lane.
"#,
)
.operands_in(vec![a, b, mask])
.operands_out(vec![a]),
);

let a = &operand_doc("a", Ref, "A constant reference null value");

ig.push(
Expand Down
10 changes: 8 additions & 2 deletions cranelift-codegen/src/ir/dfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::ir;
use crate::ir::builder::ReplaceBuilder;
use crate::ir::extfunc::ExtFuncData;
use crate::ir::instructions::{BranchInfo, CallInfo, InstructionData};
use crate::ir::{types, ConstantPool};
use crate::ir::{types, ConstantPool, Immediate};
use crate::ir::{
Ebb, FuncRef, Inst, SigRef, Signature, Type, Value, ValueLabelAssignments, ValueList,
ValueListPool,
Expand All @@ -19,6 +19,7 @@ use core::mem;
use core::ops::{Index, IndexMut};
use core::u16;
use std::collections::HashMap;
use std::vec::Vec;

/// A data flow graph defines all instructions and extended basic blocks in a function as well as
/// the data flow dependencies between them. The DFG also tracks values which can be either
Expand Down Expand Up @@ -70,6 +71,9 @@ pub struct DataFlowGraph {

/// Constants used within the function
pub constants: ConstantPool,

/// Stores large immediates that otherwise will not fit on InstructionData
pub immediates: PrimaryMap<Immediate, Vec<u8>>,
}

impl DataFlowGraph {
Expand All @@ -85,6 +89,7 @@ impl DataFlowGraph {
ext_funcs: PrimaryMap::new(),
values_labels: None,
constants: ConstantPool::new(),
immediates: PrimaryMap::new(),
}
}

Expand All @@ -98,7 +103,8 @@ impl DataFlowGraph {
self.signatures.clear();
self.ext_funcs.clear();
self.values_labels = None;
self.constants.clear()
self.constants.clear();
self.immediates.clear();
}

/// Get the total number of instructions created in this function, whether they are currently
Expand Down
23 changes: 23 additions & 0 deletions cranelift-codegen/src/ir/entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,29 @@ impl Constant {
}
}

/// An opaque handle naming an immediate value that is stored outside the instruction itself.
///
/// Some immediates (e.g. SIMD shuffle masks) are too large to store in the
/// [`InstructionData`](super::instructions::InstructionData) struct and therefore must be
/// tracked separately in [`DataFlowGraph::immediates`](super::dfg::DataFlowGraph). An
/// `Immediate` is the key used to look such a value up there.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub struct Immediate(u32);
entity_impl!(Immediate, "imm");

impl Immediate {
    /// Build an immediate reference from its raw number.
    ///
    /// Returns `None` when `n` is `u32::MAX`; every other number yields `Some`.
    /// NOTE(review): `u32::MAX` is presumably kept free as the entity's reserved sentinel value;
    /// confirm against `entity_impl!`.
    ///
    /// This method is for use by the parser.
    pub fn with_number(n: u32) -> Option<Self> {
        if n == u32::MAX {
            return None;
        }
        Some(Immediate(n))
    }
}

/// An opaque reference to a [jump table](https://en.wikipedia.org/wiki/Branch_table).
///
/// `JumpTable`s are used for indirect branching and are specialized for dense,
Expand Down
3 changes: 2 additions & 1 deletion cranelift-codegen/src/ir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ pub use crate::ir::builder::{InsertBuilder, InstBuilder, InstBuilderBase, InstIn
pub use crate::ir::constant::{ConstantData, ConstantOffset, ConstantPool};
pub use crate::ir::dfg::{DataFlowGraph, ValueDef};
pub use crate::ir::entities::{
Constant, Ebb, FuncRef, GlobalValue, Heap, Inst, JumpTable, SigRef, StackSlot, Table, Value,
Constant, Ebb, FuncRef, GlobalValue, Heap, Immediate, Inst, JumpTable, SigRef, StackSlot,
Table, Value,
};
pub use crate::ir::extfunc::{
AbiParam, ArgumentExtension, ArgumentPurpose, ExtFuncData, Signature,
Expand Down
74 changes: 74 additions & 0 deletions cranelift-codegen/src/isa/x86/enc_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,80 @@ fn expand_fcvt_to_uint_sat(
cfg.recompute_ebb(pos.func, done);
}

/// Convert shuffle instructions.
///
/// Rewrites a CLIF `shuffle` at `inst` into a sequence built from `x86_pshufb`:
///
/// - If both vector operands resolve to the same value, a single `x86_pshufb` over that value
///   replaces the instruction; mask bytes 16-31 are folded down to 0-15 first since they select
///   the same lanes.
/// - Otherwise each operand is shuffled with its own mask (lanes that belong to the other
///   operand, or to neither, are zeroed) and the two results are combined with `bor`, which
///   replaces the original instruction.
///
/// The per-operand masks are inserted into the function's constant pool and materialized with
/// `vconst`.
///
/// # Panics
///
/// Panics if the shuffle's mask handle has no entry in `dfg.immediates`.
fn convert_shuffle(
    inst: ir::Inst,
    func: &mut ir::Function,
    _cfg: &mut ControlFlowGraph,
    _isa: &dyn TargetIsa,
) {
    let mut pos = FuncCursor::new(func).at_inst(inst);
    pos.use_srcloc(inst);

    if let ir::InstructionData::Shuffle { args, mask, .. } = pos.func.dfg[inst] {
        // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a 1
        // in the most significant position zeroes the lane.
        let zero_unknown_lane_index = |lane: u8| if lane > 15 { 0b10000000 } else { lane };

        // We only have to worry about aliasing here because copies will be introduced later (in
        // regalloc).
        let a = pos.func.dfg.resolve_aliases(args[0]);
        let b = pos.func.dfg.resolve_aliases(args[1]);

        // Take an owned copy of the mask bytes: the DFG is mutated below while they are in use.
        let mask = pos
            .func
            .dfg
            .immediates
            .get(mask)
            .expect("The shuffle immediate should have been recorded before this point")
            .clone();

        if a == b {
            // PSHUFB the first argument (since it is the same as the second).
            let constructed_mask = mask
                .iter()
                // If the mask is greater than 15 it still may be referring to a lane in b.
                .map(|&lane| if lane > 15 { lane.wrapping_sub(16) } else { lane })
                .map(zero_unknown_lane_index)
                .collect();
            let handle = pos.func.dfg.constants.insert(constructed_mask);
            // Move the built mask into another XMM register.
            let a_type = pos.func.dfg.value_type(a);
            let mask_value = pos.ins().vconst(a_type, handle);
            // Shuffle the single incoming argument.
            pos.func.dfg.replace(inst).x86_pshufb(a, mask_value);
        } else {
            // PSHUFB the first argument, placing zeroes for unused lanes.
            let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
            let handle = pos.func.dfg.constants.insert(constructed_mask);
            // Move the built mask into another XMM register.
            let a_type = pos.func.dfg.value_type(a);
            let mask_value = pos.ins().vconst(a_type, handle);
            // Shuffle the first argument.
            let shuffled_first_arg = pos.ins().x86_pshufb(a, mask_value);

            // PSHUFB the second argument, placing zeroes for unused lanes. Subtracting 16 (with
            // wrap-around) maps 16-31 onto 0-15 and pushes every other value above 15, where it
            // is then zeroed.
            let constructed_mask = mask
                .iter()
                .map(|&lane| lane.wrapping_sub(16))
                .map(zero_unknown_lane_index)
                .collect();
            let handle = pos.func.dfg.constants.insert(constructed_mask);
            // Move the built mask into another XMM register.
            let b_type = pos.func.dfg.value_type(b);
            let mask_value = pos.ins().vconst(b_type, handle);
            // Shuffle the second argument.
            let shuffled_second_arg = pos.ins().x86_pshufb(b, mask_value);

            // OR the vectors together to form the final shuffled value.
            pos.func
                .dfg
                .replace(inst)
                .bor(shuffled_first_arg, shuffled_second_arg);

            // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
        }
    }
}

/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
/// extractlane instruction
fn convert_extractlane(
Expand Down
Loading