Skip to content
This repository was archived by the owner on Jun 26, 2020. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 27 additions & 8 deletions cranelift-codegen/meta/src/cdsl/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl InstructionGroup {
pub fn by_name(&self, name: &'static str) -> &Instruction {
self.instructions
.iter()
.find(|inst| inst.name == name)
.find(|inst| &inst.name == name)
.expect(&format!("unexisting instruction with name {}", name))
}
}
Expand Down Expand Up @@ -155,7 +155,7 @@ impl ops::Deref for Instruction {

impl Instruction {
pub fn snake_name(&self) -> &str {
if self.name == "return" {
if &self.name == "return" {
"return_"
} else {
&self.name
Expand All @@ -181,8 +181,17 @@ impl Instruction {
bind_ref(self.clone(), Some(reference_type.into()), Vec::new())
}

pub fn bind_vector(&self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
bind_vector(self.clone(), lane_type.into(), num_lanes, Vec::new())
pub fn bind_vector_from_lane(
&self,
lane_type: impl Into<LaneType>,
vector_size_in_bits: u64,
Comment thread
abrown marked this conversation as resolved.
) -> BoundInstruction {
bind_vector(
self.clone(),
lane_type.into(),
vector_size_in_bits,
Vec::new(),
)
}

pub fn bind_any(&self) -> BoundInstruction {
Expand Down Expand Up @@ -414,8 +423,17 @@ impl BoundInstruction {
bind_ref(self.inst, Some(reference_type.into()), self.value_types)
}

pub fn bind_vector(self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
bind_vector(self.inst, lane_type.into(), num_lanes, self.value_types)
/// Bind one more vector type to this already-bound instruction.
///
/// The vector is described by its lane type and its total width in bits; the
/// instruction and the value types bound so far are handed over to `bind_vector`.
pub fn bind_vector_from_lane(
    self,
    lane_type: impl Into<LaneType>,
    vector_size_in_bits: u64,
) -> BoundInstruction {
    let lane: LaneType = lane_type.into();
    bind_vector(self.inst, lane, vector_size_in_bits, self.value_types)
}

pub fn bind_any(self) -> BoundInstruction {
Expand Down Expand Up @@ -782,7 +800,7 @@ impl InstructionPredicateNode {
ret.extend(node.collect_leaves());
}
}
_ => ret.push(&self),
_ => ret.push(self),
}
ret
}
Expand Down Expand Up @@ -1116,9 +1134,10 @@ fn bind_ref(
fn bind_vector(
inst: Instruction,
lane_type: LaneType,
num_lanes: u64,
vector_size_in_bits: u64,
mut value_types: Vec<ValueTypeOrAny>,
) -> BoundInstruction {
let num_lanes = vector_size_in_bits / lane_type.lane_bits();
let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes));
value_types.push(ValueTypeOrAny::ValueType(vector_type));
verify_polymorphic_binding(&inst, &value_types);
Expand Down
110 changes: 80 additions & 30 deletions cranelift-codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,14 +268,38 @@ impl PerCpuModeEncodings {
}

/// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened
fn enc_32_64_isap(
fn enc_32_64_maybe_isap(
&mut self,
inst: BoundInstruction,
template: Template,
isap: SettingPredicateNumber,
isap: Option<SettingPredicateNumber>,
) {
self.enc32_isap(inst.clone(), template.clone(), isap);
self.enc64_isap(inst, template, isap);
self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
self.enc64_maybe_isap(inst, template, isap);
}

/// Register a 32-bit-mode encoding whose ISA predicate is optional.
///
/// With `Some(..)` the call is forwarded to `enc32_isap`; with `None` it is
/// forwarded to `enc32`, i.e. no ISA predicate is attached.
fn enc32_maybe_isap(
    &mut self,
    inst: BoundInstruction,
    template: Template,
    isap: Option<SettingPredicateNumber>,
) {
    if let Some(predicate) = isap {
        self.enc32_isap(inst, template, predicate);
    } else {
        self.enc32(inst, template);
    }
}

/// Register a 64-bit-mode encoding whose ISA predicate is optional.
///
/// With `Some(..)` the call is forwarded to `enc64_isap`; with `None` it is
/// forwarded to `enc64`, i.e. no ISA predicate is attached.
fn enc64_maybe_isap(
    &mut self,
    inst: BoundInstruction,
    template: Template,
    isap: Option<SettingPredicateNumber>,
) {
    if let Some(predicate) = isap {
        self.enc64_isap(inst, template, predicate);
    } else {
        self.enc64(inst, template);
    }
}
}

Expand Down Expand Up @@ -318,6 +342,7 @@ pub fn define(
let copy_special = shared.by_name("copy_special");
let ctz = shared.by_name("ctz");
let debugtrap = shared.by_name("debugtrap");
let extractlane = shared.by_name("extractlane");
let f32const = shared.by_name("f32const");
let f64const = shared.by_name("f64const");
let fadd = shared.by_name("fadd");
Expand Down Expand Up @@ -498,7 +523,8 @@ pub fn define(
let rec_pushq = r.template("pushq");
let rec_ret = r.template("ret");
let rec_r_ib = r.template("r_ib");
let rec_r_ib_unsigned = r.template("r_ib_unsigned");
let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr");
let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr");
let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
let rec_r_id = r.template("r_id");
let rec_rcmp = r.template("rcmp");
Expand Down Expand Up @@ -557,7 +583,6 @@ pub fn define(
let use_popcnt = settings.predicate_by_name("use_popcnt");
let use_lzcnt = settings.predicate_by_name("use_lzcnt");
let use_bmi1 = settings.predicate_by_name("use_bmi1");
let use_sse2 = settings.predicate_by_name("use_sse2");
let use_ssse3 = settings.predicate_by_name("use_ssse3");
let use_sse41 = settings.predicate_by_name("use_sse41");

Expand Down Expand Up @@ -1625,66 +1650,91 @@ pub fn define(
e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));

// SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
let sse_vector_size: u64 = 128;

// SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
// legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
// value across the register

// PSHUFB, 8-bit shuffle using two XMM registers
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
e.enc32_isap(instruction.clone(), template.clone(), use_ssse3);
e.enc64_isap(instruction, template, use_ssse3);
}

// PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]);
e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
e.enc64_isap(instruction, template, use_sse2);
let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_fpr
.nonrex()
.opcodes(vec![0x66, 0x0f, 0x70]);
e.enc32(instruction.clone(), template.clone());
e.enc64(instruction, template);
}

// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = scalar_to_vector.bind_vector(ty, number_of_lanes).bind(ty);
let instruction = scalar_to_vector
.bind_vector_from_lane(ty, sse_vector_size)
.bind(ty);
let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
if ty.lane_bits() < 64 {
// no 32-bit encodings for 64-bit widths
e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
e.enc32(instruction.clone(), template.clone());
}
e.enc_x86_64_isap(instruction, template, use_sse2);
e.enc_x86_64(instruction, template);
}

// SIMD insertlane
let mut insertlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new();
insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], use_sse41)); // PINSRB
insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], use_sse2)); // PINSRW
insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRD
insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRQ, only x86_64
let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
HashMap::new();
insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41))); // PINSRB
insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41))); // PINSRD
insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41))); // PINSRQ, only x86_64

for ty in ValueType::all_lane_types() {
if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = insertlane.bind_vector(ty, number_of_lanes);
let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
} else {
// turns out the 64-bit widths have REX/W encodings and only are available on x86_64
e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
}
}
}

// SIMD extractlane
// Maps a lane width in bits to the opcode bytes used for that width and an optional ISA
// predicate; `None` means the encoding is added without an ISA predicate.
let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
HashMap::new();
extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB
extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2; SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD
extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64

for ty in ValueType::all_lane_types() {
if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
} else {
// turns out the 64-bit widths have REX/W encodings and only are available on x86_64
e.enc64_isap(instruction, template.rex().w(), isap.clone());
e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
}
}
}

// SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let instruction = bitcast.bind_vector(ty, 16).bind(F64);
let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
e.enc64_rec(instruction, rec_null_fpr, 0);
}
Expand All @@ -1694,8 +1744,8 @@ pub fn define(
for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
{
let instruction = raw_bitcast
.bind_vector(to_type, 128 / to_type.lane_bits())
.bind_vector(from_type, 128 / from_type.lane_bits());
.bind_vector_from_lane(to_type, sse_vector_size)
.bind_vector_from_lane(from_type, sse_vector_size);
e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
e.enc64_rec(instruction, rec_null_fpr, 0);
}
Expand Down
23 changes: 13 additions & 10 deletions cranelift-codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,12 +320,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
let c = var("c");
let d = var("d");

// SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
let sse_vector_size: u64 = 128;

// SIMD splat: 8-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size);
let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
narrow.legalize(
def!(y = splat_x8x16(x)),
def!(y = splat_any8x16(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
Expand All @@ -337,13 +340,13 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

// SIMD splat: 16-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
let splat_x16x8 = splat.bind_vector_from_lane(ty, sse_vector_size);
let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
.bind_vector(I32, 4)
.bind_vector(ty, 128 / ty.lane_bits());
.bind_vector_from_lane(I32, sse_vector_size)
.bind_vector_from_lane(ty, sse_vector_size);
let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
.bind_vector(ty, 128 / ty.lane_bits())
.bind_vector(I32, 4);
.bind_vector_from_lane(ty, sse_vector_size)
.bind_vector_from_lane(I32, sse_vector_size);
narrow.legalize(
def!(y = splat_x16x8(x)),
vec![
Expand All @@ -358,7 +361,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

// SIMD splat: 32-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
let splat_any32x4 = splat.bind_vector_from_lane(ty, sse_vector_size);
narrow.legalize(
def!(y = splat_any32x4(x)),
vec![
Expand All @@ -370,7 +373,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

// SIMD splat: 64-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
let splat_any64x2 = splat.bind_vector_from_lane(ty, sse_vector_size);
narrow.legalize(
def!(y = splat_any64x2(x)),
vec![
Expand Down
25 changes: 23 additions & 2 deletions cranelift-codegen/meta/src/isa/x86/recipes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ impl<'builder> RecipeGroup<'builder> {
pub fn recipe(&self, name: &str) -> &EncodingRecipe {
self.recipes
.iter()
.find(|recipe| recipe.name == name)
.find(|recipe| &recipe.name == name)
.expect(&format!("unknown recipe name: {}. Try template?", name))
}
pub fn template(&self, name: &str) -> &Template {
Expand Down Expand Up @@ -800,7 +800,7 @@ pub fn define<'shared>(
{
let format = formats.get(f_extract_lane);
recipes.add_template_recipe(
EncodingRecipeBuilder::new("r_ib_unsigned", f_extract_lane, 2)
EncodingRecipeBuilder::new("r_ib_unsigned_fpr", f_extract_lane, 2)
.operands_in(vec![fpr])
.operands_out(vec![fpr])
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
Expand All @@ -817,6 +817,27 @@ pub fn define<'shared>(
);
}

// XX /r ib with 8-bit unsigned immediate (e.g. for extractlane); this variant takes its input
// operand from an FPR and writes its output operand to a GPR
{
let format = formats.get(f_extract_lane);
recipes.add_template_recipe(
EncodingRecipeBuilder::new("r_ib_unsigned_gpr", f_extract_lane, 2)
.operands_in(vec![fpr])
.operands_out(vec![gpr])
// predicate on the `lane` immediate field; presumably restricts it to an unsigned 8-bit
// value -- TODO confirm against InstructionPredicate::new_is_unsigned_int
.inst_predicate(InstructionPredicate::new_is_unsigned_int(
format, "lane", 8, 0,
))
.emit(
r#"
{{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte
let imm:i64 = lane.into();
sink.put1(imm as u8);
"#,
),
);
}

// XX /r ib with 8-bit unsigned immediate (e.g. for insertlane)
{
let format = formats.get(f_insert_lane);
Expand Down
Loading