Skip to content
Merged
38 changes: 38 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1571,6 +1571,44 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
#endif // DEBUG
}
}
else if (node->GetHWIntrinsicId() == NI_AVX512_TernaryLogic)
{
// These are the control bytes used for TernaryLogic

const uint8_t A = 0xF0;
const uint8_t B = 0xCC;
const uint8_t C = 0xAA;

uint8_t control = static_cast<uint8_t>(ival);
const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
TernaryLogicUseFlags useFlags = info.GetAllUseFlags();

if (useFlags == TernaryLogicUseFlags::ABC)
{
// We're using all the operands and can potentially have any
// operand overlap with the target register. So we need to
// detect those cases and adjust the control byte accordingly.

if (targetReg == op2Reg)
{
std::swap(op1, op2);
std::swap(op1Reg, op2Reg);

control = TernaryLogicInfo::GetTernaryControlByte(info, B, A, C);
ival = static_cast<int8_t>(control);
}
else if ((targetReg == op3->GetRegNum()) && !op3->isUsedFromSpillTemp())
{
assert(!op3->isContained());

std::swap(op1, op3);
op1Reg = op1->GetRegNum();

control = TernaryLogicInfo::GetTernaryControlByte(info, C, B, A);
ival = static_cast<int8_t>(control);
}
}
Comment thread
tannergooding marked this conversation as resolved.
}

assert(targetReg != REG_NA);
assert(op1Reg != REG_NA);
Expand Down
174 changes: 93 additions & 81 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2520,99 +2520,36 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
case NI_AVX512_FusedMultiplySubtractNegated:
case NI_AVX512_FusedMultiplySubtractNegatedScalar:
{
// While this operation is RMW, it is also almost freely reorderable
// and so we do not need to set the operands as delay free unless
// we are copying the upper bits and therefore op1 must be the target

assert((numArgs == 3) || (intrinsicTree->OperIsEmbRoundingEnabled()));
assert(isRMW);
assert(HWIntrinsicInfo::IsFmaIntrinsic(intrinsicId));

const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

LIR::Use use;
GenTree* user = nullptr;

if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use))
{
user = use.User();
}
unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3);

unsigned containedOpNum = 0;

// containedOpNum remains 0 when no operand is contained or regOptional
if (op1->isContained() || op1->IsRegOptional())
{
containedOpNum = 1;
}
else if (op2->isContained() || op2->IsRegOptional())
{
containedOpNum = 2;
}
else if (op3->isContained() || op3->IsRegOptional())
{
containedOpNum = 3;
}

GenTree* emitOp1 = op1;
GenTree* emitOp2 = op2;
GenTree* emitOp3 = op3;

// Intrinsics with CopyUpperBits semantics must have op1 as target
assert(containedOpNum != 1 || !copiesUpperBits);

// We need to keep this in sync with hwintrinsiccodegenxarch.cpp
// Ideally we'd actually swap the operands here and simplify codegen
// but it's a bit more complicated to do so for many operands as well
// as being complicated to tell codegen how to pick the right instruction

if (containedOpNum == 1)
if (op1->isContained() || op1->IsRegOptional())
{
// https://github.com/dotnet/runtime/issues/62215
// resultOpNum might change between lowering and lsra, comment out assertion for now.
// assert(containedOpNum != resultOpNum);
// resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3
assert(!copiesUpperBits);
std::swap(emitOp1, emitOp3);

if (resultOpNum == 2)
{
// op2 = ([op1] * op2) + op3
std::swap(emitOp1, emitOp2);
}
}
else if (containedOpNum == 3)
{
// assert(containedOpNum != resultOpNum);
if (resultOpNum == 2 && !copiesUpperBits)
{
// op2 = (op1 * op2) + [op3]
std::swap(emitOp1, emitOp2);
}
// else: op1/? = (op1 * op2) + [op3]
}
else if (containedOpNum == 2)
else if (op2->isContained() || op2->IsRegOptional())
{
// assert(containedOpNum != resultOpNum);

// op1/? = (op1 * [op2]) + op3
std::swap(emitOp2, emitOp3);
if (resultOpNum == 3 && !copiesUpperBits)
{
// op3 = (op1 * [op2]) + op3
std::swap(emitOp1, emitOp2);
}
}
else
{
// containedOpNum == 0
// no extra work when resultOpNum is 0 or 1
if (resultOpNum == 2)
{
std::swap(emitOp1, emitOp2);
}
else if (resultOpNum == 3)
{
std::swap(emitOp1, emitOp3);
}
}

// We don't have different intrinsic IDs for the 3 instruction forms and so,
// unlike other intrinsics, the operand order and the emit order are not
// consistent and have not been adjusted by lowering. Because of this, we
// need to track what the expected emit order is to know how to build the use
// but still need to build the uses in the order they exist in LIR.

GenTree* ops[] = {op1, op2, op3};
for (GenTree* op : ops)
{
Expand All @@ -2623,14 +2560,33 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
else if (op == emitOp2)
{
srcCount += BuildDelayFreeUses(op, emitOp1);
if (copiesUpperBits)
{
srcCount += BuildDelayFreeUses(op, emitOp1);
}
else
{
tgtPrefUse2 = BuildUse(op);
srcCount++;
}
}
else if (op == emitOp3)
{
SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op, RBM_NONE, canHWIntrinsicUseApxRegs);
srcCount += op->isContained() ? BuildOperandUses(op, apxAwareRegCandidates)
: BuildDelayFreeUses(op, emitOp1);
if (op->isContained())
{
SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op, RBM_NONE, canHWIntrinsicUseApxRegs);
srcCount += BuildOperandUses(op, apxAwareRegCandidates);
}
else if (copiesUpperBits)
{
srcCount += BuildDelayFreeUses(op, emitOp1);
}
else
{
tgtPrefUse3 = BuildUse(op);
srcCount++;
}
}
}

Expand Down Expand Up @@ -2748,6 +2704,62 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}

case NI_AVX512_TernaryLogic:
{
// While this operation can be RMW when all operands are used, it
// is also almost freely reorderable and so we do not need to set
// the operands as delay free unless the control byte is unknown.

assert(numArgs == 4);

if (!op4->isContainedIntOrIImmed())
{
break;
}

// We have a bit of a special consideration where not all operands may be "used"
// and where the leading operands can be contained vector constants that will be
// ignored in codegen. Only op3 can be the "true" contained operand from memory.

if (op1->isContained())
{
srcCount += BuildOperandUses(op1);
}
else
{
tgtPrefUse = BuildUse(op1);
srcCount++;
}

if (op2->isContained())
{
srcCount += BuildOperandUses(op2);
}
else
{
tgtPrefUse2 = BuildUse(op2);
srcCount++;
}

if (op3->isContained())
{
SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op3, RBM_NONE, canHWIntrinsicUseApxRegs);
srcCount += BuildOperandUses(op3, apxAwareRegCandidates);
Comment thread
tannergooding marked this conversation as resolved.
}
else
{
tgtPrefUse3 = BuildUse(op3);
srcCount++;
}

assert(op4->isContained());
srcCount += BuildOperandUses(op4);

buildUses = false;
break;
}

case NI_AVXVNNI_MultiplyWideningAndAdd:
case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
case NI_AVXVNNIINT_MultiplyWideningAndAdd:
Expand Down
Loading