Skip to content
This repository was archived by the owner on Jun 26, 2020. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions cranelift-codegen/meta/src/cdsl/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,27 @@ impl LaneType {
ValueType::Vector(VectorType::new(*self, lanes.into()))
}
}

/// Returns `true` when this lane type is the `FloatType` variant, `false` otherwise.
pub fn is_float(&self) -> bool {
    if let LaneType::FloatType(_) = self {
        true
    } else {
        false
    }
}

/// Returns `true` when this lane type is the `IntType` variant, `false` otherwise.
pub fn is_int(&self) -> bool {
    if let LaneType::IntType(_) = self {
        true
    } else {
        false
    }
}

/// Returns `true` when this lane type is the `BoolType` variant, `false` otherwise.
pub fn is_bool(&self) -> bool {
    if let LaneType::BoolType(_) = self {
        true
    } else {
        false
    }
}
}

impl fmt::Display for LaneType {
Expand Down
138 changes: 97 additions & 41 deletions cranelift-codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use std::collections::HashMap;

use crate::cdsl::encodings::{Encoding, EncodingBuilder};
use crate::cdsl::instructions::{
BoundInstruction, InstSpec, Instruction, InstructionGroup, InstructionPredicate,
InstructionPredicateNode, InstructionPredicateRegistry,
InstSpec, Instruction, InstructionGroup, InstructionPredicate, InstructionPredicateNode,
InstructionPredicateRegistry,
};
use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
Expand Down Expand Up @@ -234,7 +234,7 @@ impl PerCpuModeEncodings {
}
fn enc_both_isap(
&mut self,
inst: BoundInstruction,
inst: impl Clone + Into<InstSpec>,
template: Template,
isap: SettingPredicateNumber,
) {
Expand All @@ -243,7 +243,7 @@ impl PerCpuModeEncodings {
}
fn enc_both_instp(
&mut self,
inst: BoundInstruction,
inst: impl Clone + Into<InstSpec>,
template: Template,
instp: InstructionPredicateNode,
) {
Expand Down Expand Up @@ -279,6 +279,17 @@ impl PerCpuModeEncodings {
}
}

/// Register `inst` with the given `recipe` and encoding `bits` for both x86 modes: first via
/// `enc32_rec`, then via `enc64_rec`.
fn enc_32_64_rec(
    &mut self,
    inst: impl Clone + Into<InstSpec>,
    recipe: &EncodingRecipe,
    bits: u16,
) {
    // The 64-bit call takes `inst` by value, so give the 32-bit call its own copy first.
    let inst_for_32 = inst.clone();
    self.enc32_rec(inst_for_32, recipe, bits);
    self.enc64_rec(inst, recipe, bits);
}

/// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
/// binding) has already happened.
fn enc_32_64_maybe_isap(
Expand Down Expand Up @@ -356,7 +367,6 @@ pub(crate) fn define(
let copy_to_ssa = shared.by_name("copy_to_ssa");
let ctz = shared.by_name("ctz");
let debugtrap = shared.by_name("debugtrap");
let extractlane = shared.by_name("extractlane");
let f32const = shared.by_name("f32const");
let f64const = shared.by_name("f64const");
let fadd = shared.by_name("fadd");
Expand Down Expand Up @@ -386,7 +396,6 @@ pub(crate) fn define(
let ifcmp_sp = shared.by_name("ifcmp_sp");
let imul = shared.by_name("imul");
let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
let insertlane = shared.by_name("insertlane");
let ireduce = shared.by_name("ireduce");
let ishl = shared.by_name("ishl");
let ishl_imm = shared.by_name("ishl_imm");
Expand Down Expand Up @@ -459,7 +468,12 @@ pub(crate) fn define(
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
let x86_fmax = x86.by_name("x86_fmax");
let x86_fmin = x86.by_name("x86_fmin");
let x86_insertps = x86.by_name("x86_insertps");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
let x86_pop = x86.by_name("x86_pop");
let x86_pextr = x86.by_name("x86_pextr");
let x86_pinsr = x86.by_name("x86_pinsr");
let x86_pshufd = x86.by_name("x86_pshufd");
let x86_pshufb = x86.by_name("x86_pshufb");
let x86_push = x86.by_name("x86_push");
Expand Down Expand Up @@ -490,6 +504,7 @@ pub(crate) fn define(
let rec_f64imm_z = r.template("f64imm_z");
let rec_fa = r.template("fa");
let rec_fax = r.template("fax");
let rec_fa_ib = r.template("fa_ib");
let rec_fcmp = r.template("fcmp");
let rec_fcscc = r.template("fcscc");
let rec_ffillnull = r.recipe("ffillnull");
Expand Down Expand Up @@ -1729,24 +1744,25 @@ pub(crate) fn define(
e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));

// SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
// SIMD vector size: eventually multiple vector sizes may be supported but for now only
// SSE-sized vectors are available.
let sse_vector_size: u64 = 128;

// SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
// legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
// value across the register
// value across the register.

let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;

// PSHUFB, 8-bit shuffle using two XMM registers
// PSHUFB, 8-bit shuffle using two XMM registers.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
e.enc32_isap(instruction.clone(), template.clone(), use_ssse3_simd);
e.enc64_isap(instruction, template, use_ssse3_simd);
}

// PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
// PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_fpr
Expand All @@ -1761,73 +1777,113 @@ pub(crate) fn define(
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
if ty.lane_bits() < 64 {
// no 32-bit encodings for 64-bit widths
e.enc32(instruction.clone(), template.clone());
if ty.is_float() {
e.enc_32_64_rec(instruction, rec_null_fpr, 0);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I do:

v0 = f32const 0x42.42
v1 = splat.f32x4 v0
v2 = extractlane 0 v1
v3 = scalar_to_vector v2

Then my understanding is that, because of the previous commit and this change, the high bits of v3 will be non-zero, contrary to what the comment on scalar_to_vector suggests. Am I understanding correctly? If so, I think this code should be reworked, and it'd be nice to add a test.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, good point; I think I added this because when legalizing splat I needed to get ints/booleans into an XMM register but I realized that floats already were in the right place. Perhaps the scalar_to_vector definition should change to not specify that the higher bits are zeroed?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good question: it would introduce more nondeterminism, which wouldn't be confusing if it's properly documented.

One question is, if we decided to do this, what would be the difference between insertlane 0 and scalar_to_vector? I think they'd do the same thing, conceptually, so we could:

  • either remove scalar_to_vector, and require the use of insertlane 0 instead. If people want to have the higher bits zeroed, they can generate a 0 constant first.
  • or keep scalar_to_vector, and it would do the zeroing on behalf of the user.

For the sake of having a minimal IR, and if having a different instruction doesn't bring any new optimization opportunity (I can't think of any brought by scalar_to_vector with zeroing semantics), I think removing scalar_to_vector would be fine. @sunfishcode, any opinions here?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree that scalar_to_vector zeroing the high lanes has overlap with insertlane.

What if we make scalar_to_vector leave nondeterministic values in the high lanes, but document it as a low-level legalization instruction, and that frontends should generally prefer insertlane 0? Nondeterminism is worth avoiding when we can, but there are a variety of situations where it's useful to be able to get a scalar value into a vector register, where one knows that subsequent operations won't care about the high lanes of the vector, and the extra zeroing would be needless overhead.

@abrown abrown Sep 5, 2019

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I changed the documentation of scalar_to_vector to look like:

    Scalar To Vector -- move a value out of a scalar register and into a vector register; the 
    scalar will be moved to the lowest-order bits of the vector register. Note that this 
    instruction is intended as a low-level legalization instruction and frontends should prefer 
    insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
    types (e.g. integers) but not for others (e.g. floats).

See b2e9e09#diff-7c1b843a5d2e8c61f75e0d350b3f3914R2774-R2778.

} else {
let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
if ty.lane_bits() < 64 {
// no 32-bit encodings for 64-bit widths
e.enc32(instruction.clone(), template.clone());
}
e.enc_x86_64(instruction, template);
}
e.enc_x86_64(instruction, template);
}

// SIMD insertlane
let mut insertlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
let mut x86_pinsr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
HashMap::new();
insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64
x86_pinsr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB
x86_pinsr_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2
x86_pinsr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD
x86_pinsr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64

for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
if let Some((opcode, isap)) = x86_pinsr_mapping.get(&ty.lane_bits()) {
let instruction = x86_pinsr.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
} else {
// turns out the 64-bit widths have REX/W encodings and only are available on x86_64
// It turns out the 64-bit widths have REX/W encodings and only are available on
// x86_64.
e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
}
}
}

// For legalizing insertlane with floats, INSERTPS from SSE4.1.
{
let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size);
let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]);
e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd));
Comment thread
abrown marked this conversation as resolved.
}

// For legalizing insertlane with floats, MOVSD from SSE2.
{
let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]);
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2
}

// For legalizing insertlane with floats, MOVLHPS from SSE.
{
let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]);
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
}

// SIMD extractlane
let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
let mut x86_pextr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
HashMap::new();
extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64

for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) {
let instruction = x86_pextr.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
} else {
// turns out the 64-bit widths have REX/W encodings and only are available on x86_64
// It turns out the 64-bit widths have REX/W encodings and only are available on
// x86_64.
e.enc64_maybe_isap(instruction, template.rex().w(), isap.clone());
}
}
}

// SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
e.enc64_rec(instruction, rec_null_fpr, 0);
}

// SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
// SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
for to_type in
ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
{
let instruction = raw_bitcast
.bind_vector_from_lane(to_type, sse_vector_size)
.bind_vector_from_lane(from_type, sse_vector_size);
e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
e.enc64_rec(instruction, rec_null_fpr, 0);
e.enc_32_64_rec(instruction, rec_null_fpr, 0);
}
}

// SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an
// XMM register.
for float_type in &[F32, F64] {
for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
e.enc_32_64_rec(
Comment thread
abrown marked this conversation as resolved.
raw_bitcast
.bind_vector_from_lane(lane_type, sse_vector_size)
.bind(*float_type),
rec_null_fpr,
0,
);
e.enc_32_64_rec(
raw_bitcast
.bind(*float_type)
.bind_vector_from_lane(lane_type, sse_vector_size),
rec_null_fpr,
0,
);
}
}

Expand Down
96 changes: 96 additions & 0 deletions cranelift-codegen/meta/src/isa/x86/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,5 +291,101 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

let Idx = &operand_doc("Idx", uimm8, "Lane index");
let x = &operand("x", TxN);
let a = &operand("a", &TxN.lane_of());

ig.push(
Inst::new(
"x86_pextr",
r#"
Extract lane ``Idx`` from ``x``.
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
Comment thread
abrown marked this conversation as resolved.
"#,
)
.operands_in(vec![x, Idx])
.operands_out(vec![a]),
);

let IBxN = &TypeVar::new(
"IBxN",
"A SIMD vector type containing only booleans and integers",
TypeSetBuilder::new()
.ints(Interval::All)
.bools(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let x = &operand("x", IBxN);
let y = &operand_doc("y", &IBxN.lane_of(), "New lane value");
let a = &operand("a", IBxN);

ig.push(
Inst::new(
"x86_pinsr",
r#"
Insert ``y`` into ``x`` at lane ``Idx``.
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
)
.operands_in(vec![x, Idx, y])
Comment thread
abrown marked this conversation as resolved.
.operands_out(vec![a]),
);

let FxN = &TypeVar::new(
"FxN",
"A SIMD vector type containing floats",
TypeSetBuilder::new()
.floats(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let x = &operand("x", FxN);
let y = &operand_doc("y", &FxN.lane_of(), "New lane value");
let a = &operand("a", FxN);

// Define `x86_insertps`: inputs are the vector `x`, the immediate `Idx` and the lane value `y`;
// the single output is the vector `a`.
ig.push(
Inst::new(
"x86_insertps",
r#"
Insert a lane of ``y`` into ``x``, using ``Idx`` to encode both which lane the value is
extracted from and which lane it is inserted to. This is similar to x86_pinsr but inserts
floats, which are already stored in an XMM register.
"#,
)
.operands_in(vec![x, Idx, y])
.operands_out(vec![a]),
);

let x = &operand("x", FxN);
let y = &operand("y", FxN);
let a = &operand("a", FxN);

ig.push(
Inst::new(
"x86_movsd",
r#"
Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x``
Comment thread
abrown marked this conversation as resolved.
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);

ig.push(
Inst::new(
"x86_movlhps",
r#"
Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x``
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);

ig.build()
}
Loading