From b172f73e04027fe74ab682ae23b2efbee3904191 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 26 Oct 2022 12:05:38 -0700 Subject: [PATCH 1/4] Cleaning up TODO-XARCH-CLEANUP on EA_8BYTE size. --- src/coreclr/jit/codegenxarch.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 9cbb8fc3de37e9..60bb1440123cc4 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -10714,8 +10714,7 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) { // ABI requires us to preserve lower 128-bits of YMM register. GetEmitter()->emitIns_AR_R(copyIns, - EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be - // EA_16BYTE + EA_16BYTE, reg, REG_SPBASE, offset); compiler->unwindSaveReg(reg, offset); regMask &= ~regBit; @@ -10781,8 +10780,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) { // ABI requires us to restore lower 128-bits of YMM register. GetEmitter()->emitIns_R_AR(copyIns, - EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be - // EA_16BYTE + EA_16BYTE, reg, regBase, offset); regMask &= ~regBit; offset -= XMM_REGSIZE_BYTES; From d6e5fd2225d6c3785e47d5d53d9185db59d9efa1 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 26 Oct 2022 12:06:13 -0700 Subject: [PATCH 2/4] Adding EVEX encoding pathways for emitOutputRR(). 
--- src/coreclr/jit/emitxarch.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 71b2d2ef42ad42..2eb52104e83f32 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -3051,7 +3051,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val } else { - assert(!IsSSEOrAVXInstruction(ins)); + assert(!IsAvx512OrPriorInstruction(ins)); } return valSize + emitInsSizeRR(id, code); @@ -3066,7 +3066,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re // This would probably be better expressed as a different format or something? code_t code = insCodeRM(ins); - UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, size, insCodeRM(ins)); + UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, size, insCodeRM(ins)); bool includeRexPrefixSize = true; // REX prefix @@ -3082,7 +3082,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re if ((code & 0xFF00) != 0) { - sz += IsSSEOrAVXInstruction(ins) ? emitInsSize(code, includeRexPrefixSize) : 5; + sz += IsAvx512OrPriorInstruction(ins) ? emitInsSize(code, includeRexPrefixSize) : 5; } else { @@ -13274,7 +13274,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) regNumber reg2 = id->idReg2(); emitAttr size = id->idOpSize(); - if (IsSSEOrAVXInstruction(ins)) + if (IsAvx512OrPriorInstruction(ins)) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); @@ -13286,10 +13286,12 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code = insCodeMR(ins); } - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); code = insEncodeRMreg(ins, code); - if (TakesRexWPrefix(ins, size)) + // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. 
+ if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) { code = AddRexWPrefix(ins, code); } @@ -13298,7 +13300,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); code = insEncodeRMreg(ins, code) | (int)(size == EA_2BYTE); #ifdef TARGET_AMD64 @@ -13312,7 +13314,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); code = insEncodeRMreg(ins, code); #endif // TARGET_AMD64 @@ -13343,7 +13345,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) #endif // FEATURE_HW_INTRINSICS else { - assert(!TakesVexPrefix(ins)); + assert(!TakesSimdPrefix(ins)); code = insCodeMR(ins); code = insEncodeMRreg(ins, code); @@ -13415,7 +13417,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) unsigned regCode = insEncodeReg345(ins, regFor345Bits, size, &code); regCode |= insEncodeReg012(ins, regFor012Bits, size, &code); - if (TakesVexPrefix(ins)) + if (TakesSimdPrefix(ins)) { // In case of AVX instructions that take 3 operands, we generally want to encode reg1 // as first source. In this case, reg1 is both a source and a destination. @@ -13437,7 +13439,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); if (code & 0xFF000000) { From 257037422b8a7c8e139b3c5fc75d5826ded5e16b Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 26 Oct 2022 17:10:06 -0700 Subject: [PATCH 3/4] Adding EVEX encoding support for AM paths. 
--- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegenxarch.cpp | 14 +- src/coreclr/jit/emit.cpp | 1 + src/coreclr/jit/emitxarch.cpp | 540 ++++++++--- src/coreclr/jit/emitxarch.h | 67 +- src/coreclr/jit/hwintrinsiclistxarch.h | 8 +- src/coreclr/jit/instr.cpp | 40 +- src/coreclr/jit/instr.h | 110 ++- src/coreclr/jit/instrsxarch.h | 1202 ++++++++++++------------ src/coreclr/jit/simdcodegenxarch.cpp | 3 +- 10 files changed, 1214 insertions(+), 773 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 0df8859f4ad4ed..cc544e5bb7e532 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1700,7 +1700,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX instruction ins_Copy(var_types dstType); instruction ins_Copy(regNumber srcReg, var_types dstType); - instruction ins_FloatConv(var_types to, var_types from); + instruction ins_FloatConv(var_types to, var_types from, emitAttr attr); instruction ins_MathOp(genTreeOps oper, var_types type); void instGen_Return(unsigned stkArgSize); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 60bb1440123cc4..3f33ee2ac044ca 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -6911,7 +6911,7 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode) } else { - instruction ins = ins_FloatConv(dstType, srcType); + instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType)); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); } @@ -7005,7 +7005,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // Note that here we need to specify srcType that will determine // the size of source reg/mem operand and rex.w prefix. 
- instruction ins = ins_FloatConv(dstType, TYP_INT); + instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType)); GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction @@ -7110,7 +7110,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // Note that we need to specify dstType here so that it will determine // the size of destination integer register and also the rex.w prefix. genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(TYP_INT, srcType); + instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(dstType)); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); genProduceReg(treeNode); } @@ -10713,9 +10713,7 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize) if ((regBit & regMask) != 0) { // ABI requires us to preserve lower 128-bits of YMM register. - GetEmitter()->emitIns_AR_R(copyIns, - EA_16BYTE, - reg, REG_SPBASE, offset); + GetEmitter()->emitIns_AR_R(copyIns, EA_16BYTE, reg, REG_SPBASE, offset); compiler->unwindSaveReg(reg, offset); regMask &= ~regBit; offset -= XMM_REGSIZE_BYTES; @@ -10779,9 +10777,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) if ((regBit & regMask) != 0) { // ABI requires us to restore lower 128-bits of YMM register. 
- GetEmitter()->emitIns_R_AR(copyIns, - EA_16BYTE, - reg, regBase, offset); + GetEmitter()->emitIns_R_AR(copyIns, EA_16BYTE, reg, regBase, offset); regMask &= ~regBit; offset -= XMM_REGSIZE_BYTES; } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index a7644c3bcbd0f1..a54bacb0558ef6 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -3999,6 +3999,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr); unsigned estimatedSize = id->idCodeSize(); + if (actualSize != estimatedSize) { // It is fatal to under-estimate the instruction size, except for alignment instructions diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 2eb52104e83f32..f73cb54d0361ce 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -746,6 +746,20 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins); } +//------------------------------------------------------------------------ +// TakesSimdPrefix: Checks if the instruction should be VEX or EVEX encoded. +// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// true if this instruction requires a VEX or EVEX prefix. +// +bool emitter::TakesSimdPrefix(instruction ins) const +{ + return TakesEvexPrefix(ins) || TakesVexPrefix(ins); +} + //------------------------------------------------------------------------ // TakesEvexPrefix: Checks if the instruction should be EVEX encoded. // TODO-XArch-AVX512: This check needs to be updated once AVX512 instructions are added. 
@@ -1000,8 +1014,8 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_cvttss2si: case INS_cvtsd2si: case INS_cvtss2si: - case INS_cvtsi2sd: - case INS_cvtsi2ss: + case INS_cvtsi2sd64: + case INS_cvtsi2ss64: case INS_movnti: case INS_mulx: case INS_pdep: @@ -1176,6 +1190,16 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) { + if (UseEvexEncoding() && IsAvx512Instruction(ins)) + { + if (TakesEvexPrefix(ins)) + { + // X-bit is available in 4-byte EVEX prefix that starts with byte 62. + assert(hasEvexPrefix(code)); + // X-bit is added in bit-inverted form. + return code & 0xFFBFFFFFFFFFFFFFULL; + } + } if (UseVEXEncoding() && IsAVXInstruction(ins)) { if (TakesVexPrefix(ins)) @@ -2216,12 +2240,12 @@ inline ssize_t emitter::emitGetInsCIdisp(instrDesc* id) // clang-format off const insFlags CodeGenInterface::instInfo[] = { - #define INST0(id, nm, um, mr, flags) static_cast(flags), - #define INST1(id, nm, um, mr, flags) static_cast(flags), - #define INST2(id, nm, um, mr, mi, flags) static_cast(flags), - #define INST3(id, nm, um, mr, mi, rm, flags) static_cast(flags), - #define INST4(id, nm, um, mr, mi, rm, a4, flags) static_cast(flags), - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) static_cast(flags), + #define INST0(id, nm, um, mr, tt, flags) static_cast(flags), + #define INST1(id, nm, um, mr, tt, flags) static_cast(flags), + #define INST2(id, nm, um, mr, mi, tt, flags) static_cast(flags), + #define INST3(id, nm, um, mr, mi, rm, tt, flags) static_cast(flags), + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) static_cast(flags), + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) static_cast(flags), #include "instrs.h" #undef INST0 #undef INST1 @@ -2240,12 +2264,12 @@ const insFlags CodeGenInterface::instInfo[] = // clang-format off const BYTE emitter::emitInsModeFmtTab[] = { - #define INST0(id, nm, um, mr, flags) um, - #define INST1(id, 
nm, um, mr, flags) um, - #define INST2(id, nm, um, mr, mi, flags) um, - #define INST3(id, nm, um, mr, mi, rm, flags) um, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) um, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) um, + #define INST0(id, nm, um, mr, tt, flags) um, + #define INST1(id, nm, um, mr, tt, flags) um, + #define INST2(id, nm, um, mr, mi, tt, flags) um, + #define INST3(id, nm, um, mr, mi, rm, tt, flags) um, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) um, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) um, #include "instrs.h" #undef INST0 #undef INST1 @@ -2340,12 +2364,12 @@ inline size_t insCode(instruction ins) const static size_t insCodes[] = { - #define INST0(id, nm, um, mr, flags) mr, - #define INST1(id, nm, um, mr, flags) mr, - #define INST2(id, nm, um, mr, mi, flags) mr, - #define INST3(id, nm, um, mr, mi, rm, flags) mr, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) mr, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr, + #define INST0(id, nm, um, mr, tt, flags) mr, + #define INST1(id, nm, um, mr, tt, flags) mr, + #define INST2(id, nm, um, mr, mi, tt, flags) mr, + #define INST3(id, nm, um, mr, mi, rm, tt, flags) mr, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) mr, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) mr, #include "instrs.h" #undef INST0 #undef INST1 @@ -2373,12 +2397,12 @@ inline size_t insCodeACC(instruction ins) const static size_t insCodesACC[] = { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) - #define INST3(id, nm, um, mr, mi, rm, flags) - #define INST4(id, nm, um, mr, mi, rm, a4, flags) a4, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) a4, + #define INST0(id, nm, um, mr, tt, flags) + #define INST1(id, nm, um, mr, tt, flags) + #define INST2(id, nm, um, mr, mi, tt, flags) + #define INST3(id, nm, um, mr, mi, rm, tt, flags) + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) a4, + 
#define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) a4, #include "instrs.h" #undef INST0 #undef INST1 @@ -2406,12 +2430,12 @@ inline size_t insCodeRR(instruction ins) const static size_t insCodesRR[] = { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) - #define INST3(id, nm, um, mr, mi, rm, flags) - #define INST4(id, nm, um, mr, mi, rm, a4, flags) - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rr, + #define INST0(id, nm, um, mr, tt, flags) + #define INST1(id, nm, um, mr, tt, flags) + #define INST2(id, nm, um, mr, mi, tt, flags) + #define INST3(id, nm, um, mr, mi, rm, tt, flags) + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) rr, #include "instrs.h" #undef INST0 #undef INST1 @@ -2432,12 +2456,12 @@ inline size_t insCodeRR(instruction ins) const static size_t insCodesRM[] = { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) - #define INST3(id, nm, um, mr, mi, rm, flags) rm, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) rm, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rm, + #define INST0(id, nm, um, mr, tt, flags) + #define INST1(id, nm, um, mr, tt, flags) + #define INST2(id, nm, um, mr, mi, tt, flags) + #define INST3(id, nm, um, mr, mi, rm, tt, flags) rm, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) rm, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) rm, #include "instrs.h" #undef INST0 #undef INST1 @@ -2472,12 +2496,12 @@ inline size_t insCodeRM(instruction ins) const static size_t insCodesMI[] = { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) mi, - #define INST3(id, nm, um, mr, mi, rm, flags) mi, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) mi, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mi, + #define INST0(id, nm, um, 
mr, tt, flags) + #define INST1(id, nm, um, mr, tt, flags) + #define INST2(id, nm, um, mr, mi, tt, flags) mi, + #define INST3(id, nm, um, mr, mi, rm, tt, flags) mi, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) mi, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) mi, #include "instrs.h" #undef INST0 #undef INST1 @@ -2512,12 +2536,12 @@ inline size_t insCodeMI(instruction ins) const static size_t insCodesMR[] = { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) mr, - #define INST2(id, nm, um, mr, mi, flags) mr, - #define INST3(id, nm, um, mr, mi, rm, flags) mr, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) mr, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr, + #define INST0(id, nm, um, mr, tt, flags) + #define INST1(id, nm, um, mr, tt, flags) mr, + #define INST2(id, nm, um, mr, mi, tt, flags) mr, + #define INST3(id, nm, um, mr, mi, rm, tt, flags) mr, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) mr, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) mr, #include "instrs.h" #undef INST0 #undef INST1 @@ -2548,6 +2572,56 @@ inline size_t insCodeMR(instruction ins) return insCodesMR[ins]; } +// clang-format off +const static +insTupleType insTupleTypeInfos[] = +{ + #define INST0(id, nm, um, mr, tt, flags) static_cast(tt), + #define INST1(id, nm, um, mr, tt, flags) static_cast(tt), + #define INST2(id, nm, um, mr, mi, tt, flags) static_cast(tt), + #define INST3(id, nm, um, mr, mi, rm, tt, flags) static_cast(tt), + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) static_cast(tt), + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) static_cast(tt), + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 +}; +// clang-format on +//------------------------------------------------------------------------ +// hasTupleTypeInfo: Checks if the instruction has tuple type info. 
+// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// true if this instruction has tuple type info. +// +inline bool hasTupleTypeInfo(instruction ins) +{ + assert((unsigned)ins < ArrLen(insTupleTypeInfos)); + return ((insTupleTypeInfos[ins] != INS_TT_NONE)); +} + +//------------------------------------------------------------------------ +// insTupleTypeInfo: Returns the tuple type info for a given CPU instruction. +// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// the tuple type info for a given CPU instruction. +// +inline insTupleType insTupleTypeInfo(instruction ins) +{ + assert((unsigned)ins < ArrLen(insTupleTypeInfos)); + assert((insTupleTypeInfos[ins] != INS_TT_NONE)); + return insTupleTypeInfos[ins]; +} + // Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h. bool emitter::EncodedBySSE38orSSE3A(instruction ins) { @@ -2557,7 +2631,7 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) size_t insCode = 0; - if (!IsSSEOrAVXInstruction(ins)) + if (!IsAvx512OrPriorInstruction(ins)) { return false; } @@ -3092,9 +3166,19 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re return sz; } -/*****************************************************************************/ - -inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp) +//------------------------------------------------------------------------ +// emitInsSizeSVCalcDisp: Calculate instruction size. +// +// Arguments: +// id -- Instruction descriptor. +// code -- The current opcode and any known prefixes +// var -- Stack variable +// dsp -- Displacement. +// +// Return Value: +// Estimated instruction size. 
+// +inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp) { UNATIVE_OFFSET size = emitInsSize(code, /* includeRexPrefixSize */ true); UNATIVE_OFFSET offs; @@ -3214,6 +3298,14 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp) offs -= emitMaxTmpSize; } + // Check whether we can use compressed displacement if EVEX. + if (TakesEvexPrefix(id->idIns())) + { + bool compressedFitsInByte = false; + TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); + return size + (compressedFitsInByte ? sizeof(char) : sizeof(int)); + } + if ((int)offs < 0) { // offset is negative @@ -3255,14 +3347,22 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp) #endif // !FEATURE_FIXED_OUT_ARGS -// printf("lcl = %04X, tmp = %04X, stk = %04X, offs = %04X\n", -// emitLclSize, emitMaxTmpSize, emitCurStackLvl, offs); + // printf("lcl = %04X, tmp = %04X, stk = %04X, offs = %04X\n", + // emitLclSize, emitMaxTmpSize, emitCurStackLvl, offs); + bool useSmallEncoding = false; + if (TakesEvexPrefix(id->idIns())) + { + TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); + } + else + { #ifdef TARGET_AMD64 - bool useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX); + useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX); #else - bool useSmallEncoding = (offs <= size_t(SCHAR_MAX)); + useSmallEncoding = (offs <= size_t(SCHAR_MAX)); #endif + } // If it is ESP based, and the offset is zero, we will not encode the disp part. 
if (!EBPbased && offs == 0) @@ -3280,7 +3380,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var assert(id->idIns() != INS_invalid); instruction ins = id->idIns(); emitAttr attrSize = id->idOpSize(); - UNATIVE_OFFSET prefix = emitGetAdjustedSize(ins, attrSize, code); + UNATIVE_OFFSET prefix = emitGetAdjustedSizeEvexAware(ins, attrSize, code); // REX prefix if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || @@ -3289,7 +3389,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var prefix += emitGetRexPrefixSize(ins); } - return prefix + emitInsSizeSV(code, var, dsp); + return prefix + emitInsSizeSVCalcDisp(id, code, var, dsp); } inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val) @@ -3298,7 +3398,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var instruction ins = id->idIns(); emitAttr attrSize = id->idOpSize(); UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(attrSize); - UNATIVE_OFFSET prefix = emitGetAdjustedSize(ins, attrSize, code); + UNATIVE_OFFSET prefix = emitGetAdjustedSizeEvexAware(ins, attrSize, code); bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); #ifdef TARGET_AMD64 @@ -3334,7 +3434,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var prefix += emitGetRexPrefixSize(ins); } - return prefix + valSize + emitInsSizeSV(code, var, dsp); + return prefix + valSize + emitInsSizeSVCalcDisp(id, code, var, dsp); } /*****************************************************************************/ @@ -3402,6 +3502,13 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) dspInByte = false; // relocs can't be placed in a byte dspIsZero = false; // relocs won't always be zero } + else + { + if (TakesEvexPrefix(ins)) + { + dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); + } + } if (code & 0xFF000000) { @@ -3424,7 +3531,7 
@@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) size = 2; } - size += emitGetAdjustedSize(ins, attrSize, code); + size += emitGetAdjustedSizeEvexAware(ins, attrSize, code); if (hasRexPrefix(code)) { @@ -10488,7 +10595,8 @@ void emitter::emitDispIns( { printf("%s, %s", emitRegName(id->idReg1(), EA_4BYTE), emitRegName(id->idReg2(), attr)); } - else if ((ins == INS_cvtsi2ss) || (ins == INS_cvtsi2sd)) + else if ((ins == INS_cvtsi2ss32) || (ins == INS_cvtsi2sd32) || (ins == INS_cvtsi2ss64) || + (ins == INS_cvtsi2sd64)) { printf(" %s, %s", emitRegName(id->idReg1(), EA_16BYTE), emitRegName(id->idReg2(), attr)); } @@ -11236,7 +11344,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (id->idIsCallRegPtr()) { code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code); - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code); goto DONE; } @@ -11262,7 +11370,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // And emit the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); #endif // TARGET_AMD64 @@ -11281,7 +11389,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // SSE/AVX do not need to modify opcode if ((signed char)cval == cval && addc->cnsReloc == false && ins != INS_mov && ins != INS_test) { - if (id->idInsFmt() != IF_ARW_SHF && !IsSSEOrAVXInstruction(ins)) + if (id->idInsFmt() != IF_ARW_SHF && !IsAvx512OrPriorInstruction(ins)) { code |= 2; } @@ -11325,13 +11433,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } #endif // TARGET_X86 - // Emit VEX prefix if required - // There are some callers who already add VEX prefix and call this routine. - // Therefore, add VEX prefix is one is not already present. 
- code = AddVexPrefixIfNeededAndNotPresent(ins, code, size); + // Emit SIMD prefix if required + // There are some callers who already add SIMD prefix and call this routine. + // Therefore, add SIMD prefix if one is not already present. + code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); // For this format, moves do not support a third operand, so we only need to handle the binary ops. - if (TakesVexPrefix(ins)) + if (TakesSimdPrefix(ins)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -11365,7 +11473,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Emit the REX prefix if required - if (TakesRexWPrefix(ins, size)) + // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently + // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) { code = AddRexWPrefix(ins, code); } @@ -11421,9 +11532,9 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } unsigned regcode = insEncodeReg345(ins, reg345, size, &code); - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); - if (UseVEXEncoding() && (ins != INS_crc32)) + if (UseSimdEncoding() && (ins != INS_crc32)) { // Emit last opcode byte // TODO-XArch-CQ: Right now support 4-byte opcode instructions only @@ -11449,7 +11560,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Output the highest word of the opcode // We need to check again as in case of AVX instructions leading opcode bytes are stripped off @@ -11466,7 +11577,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, 
instrDesc* id, code_t code, CnsVal* addc) assert(ins != INS_bt); // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Output the highest byte of the opcode if (code & 0x00FF0000) @@ -11534,20 +11645,30 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Get the displacement value dsp = emitGetInsAmdAny(id); GOT_DSP: - dspInByte = ((signed char)dsp == (ssize_t)dsp); dspIsZero = (dsp == 0); if (id->idIsDspReloc()) { dspInByte = false; // relocs can't be placed in a byte } + else + { + if (TakesEvexPrefix(ins)) + { + dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } + } if (isMoffset) { @@ -12160,11 +12281,11 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) ssize_t cval = addc->cnsVal; // Does the constant fit in a byte? - // SSE/AVX do not need to modify opcode + // SSE/AVX/AVX512 do not need to modify opcode if ((signed char)cval == cval && addc->cnsReloc == false && ins != INS_mov && ins != INS_test) { if ((id->idInsFmt() != IF_SRW_SHF) && (id->idInsFmt() != IF_RRW_SRD_CNS) && - (id->idInsFmt() != IF_RWR_RRD_SRD_CNS) && !IsSSEOrAVXInstruction(ins)) + (id->idInsFmt() != IF_RWR_RRD_SRD_CNS) && !IsAvx512OrPriorInstruction(ins)) { code |= 2; } @@ -12173,13 +12294,17 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } - // Add VEX prefix if required. - // There are some callers who already add VEX prefix and call this routine. - // Therefore, add VEX prefix is one is not already present. - code = AddVexPrefixIfNeededAndNotPresent(ins, code, size); + // Add VEX or EVEX prefix if required. + // There are some callers who already add prefix and call this routine. 
+ // Therefore, add VEX or EVEX prefix if one is not already present. + code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); // Compute the REX prefix - if (TakesRexWPrefix(ins, size)) + // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). + // Not doing so currently since we cannot differentiate EVEX vs VEX without + // 'code' until all paths have EVEX support. + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) { code = AddRexWPrefix(ins, code); } @@ -12212,9 +12337,9 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } unsigned regcode = insEncodeReg345(ins, reg345, size, &code); - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); - if (UseVEXEncoding() && (ins != INS_crc32)) + if (UseSimdEncoding() && (ins != INS_crc32)) { // Emit last opcode byte // TODO-XArch-CQ: Right now support 4-byte opcode instructions only @@ -12240,7 +12365,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Output the highest word of the opcode // We need to check again because in case of AVX instructions the leading @@ -12257,7 +12382,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) assert(ins != INS_bt); // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Output the highest byte of the opcode. 
// We need to check again because in case of AVX instructions the leading @@ -12283,7 +12408,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) code += 4; } } - else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins)) + else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins) && !IsAvx512Instruction(ins)) { // Is the operand size larger than a byte? switch (size) @@ -12329,7 +12454,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Figure out the variable's frame position int varNum = id->idAddr()->iiaLclVar.lvaVarNum(); @@ -12337,7 +12462,18 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) adr = emitComp->lvaFrameAddress(varNum, &EBPbased); dsp = adr + id->idAddr()->iiaLclVar.lvaOffset(); - dspInByte = ((signed char)dsp == (int)dsp); + // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following + // function, to which the remainder of the emitter logic should handle properly. 
+ // TODO-XARCH-AVX512 : embedded broadcast might change this + int dspAsByte = dsp; + if (TakesEvexPrefix(ins)) + { + dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } dspIsZero = (dsp == 0); // for stack variables the dsp should never be a reloc @@ -12351,7 +12487,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (dspInByte) { dst += emitOutputByte(dst, code | 0x45); - dst += emitOutputByte(dst, dsp); + dst += emitOutputByte(dst, dspAsByte); } else { @@ -12364,7 +12500,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (dspInByte) { dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dsp); + dst += emitOutputByte(dst, dspAsByte); } else { @@ -12381,7 +12517,21 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) dsp += emitCurStackLvl; #endif - dspInByte = ((signed char)dsp == (int)dsp); + // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following + // function, to which the remainder of the emitter logic should handle properly. + // TODO-XARCH-AVX512 : embedded broadcast might change this + if (TakesEvexPrefix(ins)) + { + dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); + } + else + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + if (dspInByte) + { + dspAsByte = dsp; + } + } dspIsZero = (dsp == 0); // Does the offset fit in a byte? 
@@ -12398,7 +12548,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { dst += emitOutputByte(dst, code | 0x44); dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dsp); + dst += emitOutputByte(dst, dspAsByte); } } else @@ -12421,7 +12571,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { dst += emitOutputWord(dst, code | 0x4400); dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dsp); + dst += emitOutputByte(dst, dspAsByte); } } else @@ -14529,6 +14679,168 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) return dst; } +//------------------------------------------------------------------------ +// GetInputSizeInBytes: Get size of input for instruction in bytes. +// +// Arguments: +// id -- Instruction descriptor. +// +// Return Value: +// size in bytes. +// +ssize_t emitter::GetInputSizeInBytes(instrDesc* id) +{ + insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[id->idIns()] & Input_Mask)); + switch (inputSize) + { + case 0: + return EA_SIZE_IN_BYTES(id->idOpSize()); + case Input_8Bit: + return 1; + case Input_16Bit: + return 2; + case Input_32Bit: + return 4; + case Input_64Bit: + return 8; + default: + unreached(); + } +} + +//------------------------------------------------------------------------ +// TryEvexCompressDisp8Byte: Do we do compressed displacement encoding for EVEX. +// +// Arguments: +// id -- Instruction descriptor. +// dsp -- Displacement. +// dspInByte[out] - TRUE if compressed displacement +// +// Return Value: +// compressed displacement value if dspInByte == TRUE. +// Original dsp otherwise. 
+// +ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) +{ + assert(TakesEvexPrefix(id->idIns())); + insTupleType tt = insTupleTypeInfo(id->idIns()); + assert(hasTupleTypeInfo(id->idIns())); + + // if dsp is 0, no need for all of this + if (dsp == 0) + { + *dspInByte = true; + return dsp; + } + + // Only handling non-broadcast forms right now + ssize_t vectorLength = EA_SIZE_IN_BYTES(id->idOpSize()); + + ssize_t inputSize = GetInputSizeInBytes(id); + + ssize_t disp8Compression = 1; + + switch (tt) + { + case INS_TT_FULL: + assert(inputSize == 4 || inputSize == 8); + if (HasEmbeddedBroadcast(id)) + { + // N = input size in bytes + disp8Compression = inputSize; + } + else + { + // N = vector length in bytes + disp8Compression = vectorLength; + } + break; + case INS_TT_HALF: + assert(inputSize == 4); + if (HasEmbeddedBroadcast(id)) + { + // N = input size in bytes + disp8Compression = inputSize; + } + else + { + // N = vector length in bytes / 2 + disp8Compression = vectorLength / 2; + } + break; + case INS_TT_FULL_MEM: + // N = vector length in bytes + disp8Compression = vectorLength; + break; + case INS_TT_TUPLE1_SCALAR: + { + disp8Compression = inputSize; + break; + } + case INS_TT_TUPLE1_FIXED: + // N = input size in bytes, 32bit and 64bit only + assert(inputSize == 4 || inputSize == 8); + disp8Compression = inputSize; + break; + case INS_TT_TUPLE2: + // N = input size in bytes * 2, 32bit and 64bit for 256 bit and 512 bit only + assert((inputSize == 4) || (inputSize == 8 && vectorLength >= 32)); + disp8Compression = inputSize * 2; + break; + case INS_TT_TUPLE4: + // N = input size in bytes * 4, 32bit for 256 bit and 512 bit, 64bit for 512 bit + assert((inputSize == 4 && vectorLength >= 32) || (inputSize == 8 && vectorLength >= 64)); + disp8Compression = inputSize * 4; + break; + case INS_TT_TUPLE8: + // N = input size in bytes * 8, 32bit for 512 only + assert((inputSize == 4 && vectorLength >= 64)); + disp8Compression = inputSize * 
8; + break; + case INS_TT_HALF_MEM: + // N = vector length in bytes / 2 + disp8Compression = vectorLength / 2; + break; + case INS_TT_QUARTER_MEM: + // N = vector length in bytes / 4 + disp8Compression = vectorLength / 4; + break; + case INS_TT_EIGHTH_MEM: + // N = vector length in bytes / 8 + disp8Compression = vectorLength / 8; + break; + case INS_TT_MEM128: + // N = 16 + disp8Compression = 16; + break; + case INS_TT_MOVDDUP: + // N = 8 for 128-bit vectors, vector length in bytes otherwise + disp8Compression = (vectorLength == 16) ? (vectorLength / 2) : vectorLength; + break; + default: + unreached(); + } + + // If we can evenly divide dsp by the disp8Compression, we can attempt to use it in a disp8 byte form + if (dsp % disp8Compression != 0) + { + *dspInByte = false; + return dsp; + } + + ssize_t compressedDsp = dsp / disp8Compression; + + *dspInByte = ((signed char)compressedDsp == (ssize_t)compressedDsp); + if (*dspInByte) + { + return compressedDsp; + } + else + { + return dsp; + } +} + /***************************************************************************** * * Append the machine code corresponding to the given instruction descriptor @@ -15138,7 +15450,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -15166,7 +15478,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } @@ -15195,7 +15507,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); regcode = 
(insEncodeReg345(ins, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -15207,7 +15519,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD: case IF_ARW_RRD: code = insCodeMR(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); sz = emitSizeOfInsDsc(id); @@ -15216,7 +15528,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD_RRD: { code = insCodeMR(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); dst = emitOutputAM(dst, id, code); sz = emitSizeOfInsDsc(id); break; @@ -15285,7 +15597,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; case IF_SWR_RRD_CNS: - assert(IsSSEOrAVXInstruction(ins)); + assert(ins == INS_vextracti128 || ins == INS_vextractf128); + assert(UseSimdEncoding()); emitGetInsAmdCns(id, &cnsVal); code = insCodeMR(ins); dst = emitOutputSV(dst, id, insCodeMR(ins), &cnsVal); @@ -15305,7 +15618,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
@@ -15340,7 +15653,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); if (IsDstDstSrcAVXInstruction(ins)) { @@ -15358,11 +15671,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_RRD_SRD: { - // This should only be called on AVX instructions - assert(IsAVXInstruction(ins)); + assert(IsSimdInstruction(ins)); code = insCodeRM(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); code = insEncodeReg3456(ins, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -15388,7 +15700,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); code = insEncodeReg3456(ins, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -15412,7 +15724,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SWR_RRD: case IF_SRW_RRD: code = insCodeMR(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
@@ -16810,8 +17122,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cvttsd2si: case INS_cvtsd2si: - case INS_cvtsi2sd: - case INS_cvtsi2ss: + case INS_cvtsi2sd32: + case INS_cvtsi2ss32: + case INS_cvtsi2sd64: + case INS_cvtsi2ss64: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_7C; break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 644fed77da8241..e2910b3b34192f 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -39,7 +39,7 @@ struct CnsVal }; UNATIVE_OFFSET emitInsSize(code_t code, bool includeRexPrefixSize); -UNATIVE_OFFSET emitInsSizeSV(code_t code, int var, int dsp); +UNATIVE_OFFSET emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp); UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp); UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val); UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code); @@ -430,6 +430,14 @@ bool HasKMaskRegisterDest(instruction ins) const case INS_cmpss: case INS_cmppd: case INS_cmpsd: + case INS_vpgatherdd: + case INS_vpgatherqd: + case INS_vpgatherdq: + case INS_vpgatherqq: + case INS_vgatherdps: + case INS_vgatherqps: + case INS_vgatherdpd: + case INS_vgatherqpd: { return true; } @@ -461,6 +469,17 @@ void SetUseEvexEncoding(bool value) useEvexEncodings = value; } +//------------------------------------------------------------------------ +// UseSimdEncoding: Returns true if either VEX or EVEX encoding +// is supported. +// +// Returns: +// TRUE if target supports either. 
+bool UseSimdEncoding() const +{ + return UseVEXEncoding() || UseEvexEncoding(); +} + // 4-byte EVEX prefix starts with byte 0x62 #define EVEX_PREFIX_MASK 0xFF00000000000000ULL #define EVEX_PREFIX_CODE 0x6200000000000000ULL @@ -491,7 +510,7 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); // size - operand size // // Returns: -// TRUE if code has an Evex prefix. +// code with prefix added. code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) { if (TakesEvexPrefix(ins)) @@ -505,6 +524,32 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) return code; } +//------------------------------------------------------------------------ +// AddSimdPrefixIfNeededAndNotPresent: Add the correct SIMD prefix. +// Check if the prefix already exists before adding. +// +// Arguments: +// ins - the instruction being encoded. +// code - opcode + prefixes bits at some stage of encoding. +// size - operand size +// +// Returns: +// code with the prefix added if it was needed and not already present. +code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) +{ + if (TakesEvexPrefix(ins)) + { + code = !hasEvexPrefix(code) ? AddEvexPrefix(ins, code, size) : code; + } + else if (TakesVexPrefix(ins)) + { + code = !hasVexPrefix(code) ? AddVexPrefix(ins, code, size) : code; + } + return code; +} + +bool TakesSimdPrefix(instruction ins) const; + //------------------------------------------------------------------------ // hasVexOrEvexPrefix: Returns true if the instruction encoding already // contains a Vex or Evex prefix. 
@@ -519,6 +564,8 @@ bool hasVexOrEvexPrefix(code_t code) return (hasVexPrefix(code) || hasEvexPrefix(code)); } +ssize_t TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte); + //------------------------------------------------------------------------ // codeEvexMigrationCheck: Temporary check to use when adding EVEX codepaths // TODO-XArch-AVX512: Remove implementation and uses once all Evex paths are @@ -534,6 +581,8 @@ bool codeEvexMigrationCheck(code_t code) return hasEvexPrefix(code); } +ssize_t GetInputSizeInBytes(instrDesc* id); + bool containsAVXInstruction = false; bool ContainsAVX() { @@ -954,4 +1003,18 @@ inline bool emitIsUncondJump(instrDesc* jmp) return (ins == INS_jmp); } +//------------------------------------------------------------------------ +// HasEmbeddedBroadcast: Do we consider embedded broadcast while encoding. +// TODO-XArch-AVX512: Add eventual check on the instrDesc +// +// Arguments: +// id - Instruction descriptor. +// +// Returns: +// TRUE if the instruction does embedded broadcast. 
+inline bool HasEmbeddedBroadcast(instrDesc* id) +{ + return false; +} + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index a060cc3053d993..37738201789abe 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -289,7 +289,7 @@ HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, Divide, 16, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE, DivideScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -340,7 +340,7 @@ HARDWARE_INTRINSIC(SSE, Xor, // SSE 64-bit-only Intrinsics HARDWARE_INTRINSIC(SSE_X64, ConvertToInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(SSE_X64, ConvertToInt64WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(SSE_X64, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(SSE_X64, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -393,7 +393,7 @@ HARDWARE_INTRINSIC(SSE2, ConvertToInt32, HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToVector128Double, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2sd, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg) +HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2sd, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2, ConvertToVector128Int32, 16, 1, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Int32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToVector128Int32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_cvttpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -459,7 +459,7 @@ HARDWARE_INTRINSIC(SSE2, Xor, HARDWARE_INTRINSIC(SSE2_X64, ConvertToInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_X64, ConvertToInt64WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_X64, ConvertToUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_X64, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) +HARDWARE_INTRINSIC(SSE2_X64, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2_X64, ConvertScalarToVector128Int64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(SSE2_X64, ConvertScalarToVector128UInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(SSE2_X64, StoreNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d8bb92d89fdf7c..cc552a964ed21c 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -36,12 +36,12 @@ const char* CodeGen::genInsName(instruction ins) const char * const insNames[] = { #if defined(TARGET_XARCH) - #define INST0(id, nm, um, mr, flags) nm, - #define INST1(id, nm, um, mr, flags) nm, - #define INST2(id, nm, um, mr, mi, flags) nm, - #define INST3(id, nm, um, mr, mi, rm, flags) nm, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) nm, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) nm, + #define INST0(id, nm, um, mr, tt, flags) nm, + #define INST1(id, nm, um, mr, tt, flags) nm, + #define INST2(id, nm, um, mr, mi, tt, flags) nm, + #define INST3(id, nm, um, mr, mi, rm, tt, flags) nm, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) nm, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) nm, #include "instrs.h" #elif defined(TARGET_ARM) @@ -1789,7 +1789,7 @@ 
instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) } // Conversions to or from floating point values -instruction CodeGen::ins_FloatConv(var_types to, var_types from) +instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) { // AVX: For now we support only conversion from Int/Long -> float @@ -1801,9 +1801,29 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from) switch (to) { case TYP_FLOAT: - return INS_cvtsi2ss; + { + if (EA_SIZE(attr) == EA_4BYTE) + { + return INS_cvtsi2ss32; + } + else if (EA_SIZE(attr) == EA_8BYTE) + { + return INS_cvtsi2ss64; + } + unreached(); + } case TYP_DOUBLE: - return INS_cvtsi2sd; + { + if (EA_SIZE(attr) == EA_4BYTE) + { + return INS_cvtsi2sd32; + } + else if (EA_SIZE(attr) == EA_8BYTE) + { + return INS_cvtsi2sd64; + } + unreached(); + } default: unreached(); } @@ -1867,7 +1887,7 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) } } -instruction CodeGen::ins_FloatConv(var_types to, var_types from) +instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) { switch (from) { diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 27152cac97b1e5..103ec6140c2d53 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -18,12 +18,12 @@ enum instruction : unsigned { #if defined(TARGET_XARCH) - #define INST0(id, nm, um, mr, flags) INS_##id, - #define INST1(id, nm, um, mr, flags) INS_##id, - #define INST2(id, nm, um, mr, mi, flags) INS_##id, - #define INST3(id, nm, um, mr, mi, rm, flags) INS_##id, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) INS_##id, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) INS_##id, + #define INST0(id, nm, um, mr, tt, flags) INS_##id, + #define INST1(id, nm, um, mr, tt, flags) INS_##id, + #define INST2(id, nm, um, mr, mi, tt, flags) INS_##id, + #define INST3(id, nm, um, mr, mi, rm, tt, flags) INS_##id, + #define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) INS_##id, + #define INST5(id, 
nm, um, mr, mi, rm, a4, rr, tt, flags) INS_##id, #include "instrs.h" #elif defined(TARGET_ARM) @@ -96,57 +96,66 @@ enum GCtype : unsigned #if defined(TARGET_XARCH) -enum insFlags : uint32_t +enum insFlags : uint64_t { - INS_FLAGS_None = 0, + INS_FLAGS_None = 0ULL, // Reads - Reads_OF = 1 << 0, - Reads_SF = 1 << 1, - Reads_ZF = 1 << 2, - Reads_PF = 1 << 3, - Reads_CF = 1 << 4, - Reads_DF = 1 << 5, + Reads_OF = 1ULL << 0, + Reads_SF = 1ULL << 1, + Reads_ZF = 1ULL << 2, + Reads_PF = 1ULL << 3, + Reads_CF = 1ULL << 4, + Reads_DF = 1ULL << 5, // Writes - Writes_OF = 1 << 6, - Writes_SF = 1 << 7, - Writes_ZF = 1 << 8, - Writes_AF = 1 << 9, - Writes_PF = 1 << 10, - Writes_CF = 1 << 11, + Writes_OF = 1ULL << 6, + Writes_SF = 1ULL << 7, + Writes_ZF = 1ULL << 8, + Writes_AF = 1ULL << 9, + Writes_PF = 1ULL << 10, + Writes_CF = 1ULL << 11, // Resets - Resets_OF = 1 << 12, - Resets_SF = 1 << 13, - Resets_AF = 1 << 14, - Resets_PF = 1 << 15, - Resets_CF = 1 << 16, + Resets_OF = 1ULL << 12, + Resets_SF = 1ULL << 13, + Resets_AF = 1ULL << 14, + Resets_PF = 1ULL << 15, + Resets_CF = 1ULL << 16, // Undefined - Undefined_OF = 1 << 17, - Undefined_SF = 1 << 18, - Undefined_ZF = 1 << 19, - Undefined_AF = 1 << 20, - Undefined_PF = 1 << 21, - Undefined_CF = 1 << 22, + Undefined_OF = 1ULL << 17, + Undefined_SF = 1ULL << 18, + Undefined_ZF = 1ULL << 19, + Undefined_AF = 1ULL << 20, + Undefined_PF = 1ULL << 21, + Undefined_CF = 1ULL << 22, // Restore - Restore_SF_ZF_AF_PF_CF = 1 << 23, + Restore_SF_ZF_AF_PF_CF = 1ULL << 23, // x87 instruction - INS_FLAGS_x87Instr = 1 << 24, + INS_FLAGS_x87Instr = 1ULL << 24, // Avx - INS_Flags_IsDstDstSrcAVXInstruction = 1 << 25, - INS_Flags_IsDstSrcSrcAVXInstruction = 1 << 26, + INS_Flags_IsDstDstSrcAVXInstruction = 1ULL << 25, + INS_Flags_IsDstSrcSrcAVXInstruction = 1ULL << 26, // w and s bits - INS_FLAGS_Has_Wbit = 1 << 27, - INS_FLAGS_Has_Sbit = 1 << 28, + INS_FLAGS_Has_Wbit = 1ULL << 27, + INS_FLAGS_Has_Sbit = 1ULL << 28, + + // instruction input size 
+ // if no input size is set, instruction defaults to using + // the emitAttr for size + Input_8Bit = 1ULL << 29, + Input_16Bit = 1ULL << 30, + Input_32Bit = 1ULL << 31, + Input_64Bit = 1ULL << 32, + Input_Mask = (0xFULL) << 29, // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH - INS_FLAGS_DONT_CARE = 0x00, + INS_FLAGS_DONT_CARE = 0x00ULL, }; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) @@ -330,6 +339,31 @@ enum insBarrier : unsigned }; #endif +#if defined(TARGET_XARCH) +// Represents tupletype attribute of instruction. +// This is used in determining factor N while calculating compressed displacement in EVEX encoding +// Reference: Section 2.6.5 in Intel 64 and ia-32 architectures software developer's manual volume 2. +enum insTupleType : uint32_t +{ + INS_TT_NONE = 0x00000, + INS_TT_FULL = 0x00001, + INS_TT_HALF = 0x00002, + INS_TT_IS_BROADCAST = 0x00003, + INS_TT_FULL_MEM = 0x00010, + INS_TT_TUPLE1_SCALAR = 0x00020, + INS_TT_TUPLE1_FIXED = 0x00040, + INS_TT_TUPLE2 = 0x00080, + INS_TT_TUPLE4 = 0x01000, + INS_TT_TUPLE8 = 0x02000, + INS_TT_HALF_MEM = 0x04000, + INS_TT_QUARTER_MEM = 0x08000, + INS_TT_EIGHTH_MEM = 0x10000, + INS_TT_MEM128 = 0x20000, + INS_TT_MOVDDUP = 0x40000, + INS_TT_IS_NON_BROADCAST = 0x7FFFC +}; +#endif + #undef EA_UNKNOWN enum emitAttr : unsigned { diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 21b6278db0a556..55a39b52bffbe8 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -29,120 +29,120 @@ #endif /*****************************************************************************/ #ifndef INST0 -#define INST0(id, nm, um, mr, flags) +#define INST0(id, nm, um, mr, tt, flags) #endif #ifndef INST2 -#define INST2(id, nm, um, mr, mi, flags) +#define INST2(id, nm, um, mr, mi, tt, flags) #endif #ifndef INST3 -#define INST3(id, nm, um, mr, mi, rm, flags) +#define INST3(id, nm, um, mr, mi, rm, tt, flags) #endif #ifndef INST4 -#define 
INST4(id, nm, um, mr, mi, rm, a4, flags) +#define INST4(id, nm, um, mr, mi, rm, a4, tt, flags) #endif #ifndef INST5 -#define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) +#define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) #endif /*****************************************************************************/ /* The following is x86-specific */ /*****************************************************************************/ -// id nm um mr mi rm a4 rr flags -INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +// id nm um mr mi rm a4 rr tt flags +INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_FLAGS_None ) -INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_FLAGS_None ) +INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None ) +INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None ) // Does not affect the stack tracking in the emitter -INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_FLAGS_None ) -INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_FLAGS_None ) +INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None ) +INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None ) -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit ) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF ) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, Writes_OF | Writes_SF | 
Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit ) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF ) +INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit ) +INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF ) +INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit ) +INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF ) // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. -INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None ) +INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None ) -// id nm um mr mi rm a4 flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | 
Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +// id nm um mr mi rm a4 tt flags +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) // Does not affect the stack tracking in the emitter -INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, Resets_OF | Writes_SF | Writes_ZF | 
Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) -INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit ) -INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_FLAGS_Has_Wbit ) +INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) +INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit ) +INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_FLAGS_None ) +INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) -// id nm um mr mi rm flags +// id nm um mr mi rm tt flags // Note that emitter has only partial support for BT. It can only emit the reg,reg form // and the registers need to be reversed to get the correct encoding. 
-INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF ) +INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF ) -INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF ) -INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF ) +INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF ) +INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF ) -INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_FLAGS_Has_Wbit ) +INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_TT_NONE, INS_FLAGS_Has_Wbit ) #ifdef TARGET_AMD64 -INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_FLAGS_Has_Wbit ) +INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_TT_NONE, INS_FLAGS_Has_Wbit ) #endif -INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_FLAGS_Has_Wbit ) - -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, Reads_OF ) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, Reads_OF ) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, Reads_CF ) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, Reads_CF ) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, Reads_ZF ) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, Reads_ZF ) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, Reads_ZF | Reads_CF ) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, Reads_ZF | Reads_CF ) -INST3(cmovs, 
"cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, Reads_SF ) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, Reads_SF ) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, Reads_PF ) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, Reads_PF ) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, Reads_OF | Reads_SF ) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, Reads_OF | Reads_SF ) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, Reads_OF | Reads_SF | Reads_ZF ) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, Reads_OF | Reads_SF | Reads_ZF ) - -INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_FLAGS_Has_Wbit ) -INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) - -// id nm um mr mi rm flags +INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit ) + +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF ) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF ) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF ) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF ) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF ) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF ) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF ) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF ) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF ) +INST3(cmovnp, 
"cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF ) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF ) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF ) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) + +INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) + +// id nm um mr mi rm tt flags // Instead of encoding these as 3-operand instructions, we encode them // as 2-operand instructions with the target register being implicit // implicit_reg = op1*op2_icon #define INSTMUL INST3 -INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) 
-INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) #ifdef TARGET_AMD64 -INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) 
-INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 
0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) #endif // TARGET_AMD64 @@ -183,603 +183,615 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, #define VEX3INT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) #define VEX3FLT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) -INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // These are the SSE instructions used on x86 -INST3(pmovmskb, "pmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), INS_FLAGS_None) // Move the MSB bits of all bytes in a xmm reg to an int reg -INST3(movmskpd, "movmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_FLAGS_None) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. 
-INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_FLAGS_None) // Move Double/Quadword between mm regs <-> memory/r32/r64 regs, cleanup https://github.com/dotnet/runtime/issues/47943 -INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_FLAGS_None) // Move Quadword between memory/mm <-> regs, cleanup https://github.com/dotnet/runtime/issues/47943 -INST3(movsdsse2, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_Flags_IsDstSrcSrcAVXInstruction) - -INST3(punpckldq, "punpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), INS_Flags_IsDstDstSrcAVXInstruction) - -INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles - -INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_FLAGS_None) // cvt with trunc scalar double to signed DWORDs - -INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_FLAGS_None) -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_FLAGS_None) -INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movhps, "movhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movss, "movss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), 
INS_FLAGS_None) -INST3(movaps, "movaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), INS_FLAGS_None) -INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_FLAGS_None) -INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), INS_FLAGS_None) -INST3(movhlps, "movhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(movlhps, "movlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(movmskps, "movmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), INS_FLAGS_None) -INST3(unpckhps, "unpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(maskmovdqu, "maskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), INS_FLAGS_None) - -INST3(shufps, "shufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(shufpd, "shufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), INS_Flags_IsDstDstSrcAVXInstruction) - -INST3(punpckhdq, "punpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), INS_Flags_IsDstDstSrcAVXInstruction) - -INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(pmovmskb, "pmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), INS_TT_NONE, INS_FLAGS_None) // Move the MSB bits of all bytes in a xmm reg to an int reg +INST3(movmskpd, "movmskpd", 
IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_TT_NONE, INS_FLAGS_None) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. +INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_TT_TUPLE1_SCALAR, INS_FLAGS_None) // Move Double/Quadword between mm regs <-> memory/r32/r64 regs, cleanup https://github.com/dotnet/runtime/issues/47943 +INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Move Quadword between memory/mm <-> regs, cleanup https://github.com/dotnet/runtime/issues/47943 +INST3(movsdsse2, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) + +INST3(punpckldq, "punpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) + +INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles + +INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_TT_TUPLE1_FIXED, Input_64Bit) // cvt with trunc scalar double to signed DWORDs + +INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit) +INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit) +INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_NONE, INS_FLAGS_None) +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_NONE, INS_FLAGS_None) +INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movlps, "movlps", IUM_WR, 
PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE2, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movhps, "movhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), INS_TT_TUPLE2, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movss, "movss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), INS_TT_FULL_MEM, Input_64Bit) +INST3(movaps, "movaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), INS_TT_FULL_MEM, Input_32Bit) +INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_TT_FULL_MEM, Input_64Bit) +INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), INS_TT_FULL_MEM, Input_32Bit) +INST3(movhlps, "movhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(movlhps, "movlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(movmskps, "movmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), INS_TT_NONE, INS_FLAGS_None) +INST3(unpckhps, "unpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(maskmovdqu, "maskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), INS_TT_NONE, INS_FLAGS_None) + +INST3(shufps, "shufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(shufpd, "shufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) + +INST3(punpckhdq, "punpckhdq", IUM_WR, BAD_CODE, 
BAD_CODE, PCKDBL(0x6A), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) + +INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // SSE 2 arith -INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles -INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles -INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles -INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles -INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles -INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single -INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles -INST3(mulsd, "mulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles -INST3(subps, "subps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles -INST3(subss, "subss", IUM_WR, BAD_CODE, BAD_CODE, 
SSEFLT(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar singles -INST3(subpd, "subpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles -INST3(subsd, "subsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar doubles -INST3(minps, "minps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles -INST3(minss, "minss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single -INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed doubles -INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double -INST3(divps, "divps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles -INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles -INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles -INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles -INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed singles -INST3(maxss, "maxss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar single -INST3(maxpd, "maxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles -INST3(maxsd, "maxsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar double -INST3(xorpd, "xorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), 
INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles -INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles -INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles -INST3(sqrtps, "sqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), INS_FLAGS_None) // Sqrt of packed singles -INST3(sqrtss, "sqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single -INST3(sqrtpd, "sqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), INS_FLAGS_None) // Sqrt of packed doubles -INST3(sqrtsd, "sqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double -INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles -INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles -INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles -INST3(orpd, "orpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles -INST3(haddpd, "haddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles -INST3(haddps, "haddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats -INST3(hsubpd, "hsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles -INST3(hsubps, "hsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats -INST3(addsubps, "addsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles -INST3(addsubpd, 
"addsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles +INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles +INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles +INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles +INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles +INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles +INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single +INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles +INST3(mulsd, "mulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles +INST3(subps, "subps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles +INST3(subss, "subss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5C), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar singles +INST3(subpd, "subpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles +INST3(subsd, "subsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5C), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) 
// Subtract scalar doubles +INST3(minps, "minps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5D), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles +INST3(minss, "minss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single +INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed doubles +INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double +INST3(divps, "divps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles +INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles +INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles +INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles +INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed singles +INST3(maxss, "maxss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5F), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar single +INST3(maxpd, "maxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles +INST3(maxsd, "maxsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5F), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar double 
+INST3(xorpd, "xorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles +INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles +INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles +INST3(sqrtps, "sqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), INS_TT_FULL, Input_32Bit | INS_FLAGS_None) // Sqrt of packed singles +INST3(sqrtss, "sqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single +INST3(sqrtpd, "sqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Sqrt of packed doubles +INST3(sqrtsd, "sqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double +INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles +INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles +INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles +INST3(orpd, "orpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles +INST3(haddpd, "haddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles +INST3(haddps, "haddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats 
+INST3(hsubpd, "hsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles +INST3(hsubps, "hsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats +INST3(addsubps, "addsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles +INST3(addsubpd, "addsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles // SSE 2 approx arith -INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_FLAGS_None) // Reciprocal of packed singles -INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single -INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_FLAGS_None) // Reciprocal Sqrt of packed singles -INST3(rsqrtss, "rsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single +INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Reciprocal of packed singles +INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single +INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Reciprocal Sqrt of packed singles +INST3(rsqrtss, "rsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single // SSE2 conversions -INST3(cvtpi2ps, "cvtpi2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2A), INS_FLAGS_None) // cvt packed DWORDs to singles -INST3(cvtsi2ss, "cvtsi2ss", IUM_WR, 
BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single -INST3(cvtpi2pd, "cvtpi2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2A), INS_FLAGS_None) // cvt packed DWORDs to doubles -INST3(cvtsi2sd, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar double -INST3(cvttps2pi, "cvttps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2C), INS_FLAGS_None) // cvt with trunc packed singles to DWORDs -INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_FLAGS_None) // cvt with trunc scalar single to DWORD -INST3(cvttpd2pi, "cvttpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2C), INS_FLAGS_None) // cvt with trunc packed doubles to DWORDs -INST3(cvtps2pi, "cvtps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2D), INS_FLAGS_None) // cvt packed singles to DWORDs -INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_FLAGS_None) // cvt scalar single to DWORD -INST3(cvtpd2pi, "cvtpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2D), INS_FLAGS_None) // cvt packed doubles to DWORDs -INST3(cvtsd2si, "cvtsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2D), INS_FLAGS_None) // cvt scalar double to DWORD -INST3(cvtps2pd, "cvtps2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5A), INS_FLAGS_None) // cvt packed singles to doubles -INST3(cvtpd2ps, "cvtpd2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5A), INS_FLAGS_None) // cvt packed doubles to singles -INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles -INST3(cvtsd2ss, "cvtsd2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar double to scalar singles -INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_FLAGS_None) // cvt packed DWORDs to singles -INST3(cvtps2dq, "cvtps2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5B), INS_FLAGS_None) // cvt packed singles to DWORDs 
-INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_FLAGS_None) // cvt with trunc packed singles to DWORDs -INST3(cvtpd2dq, "cvtpd2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xE6), INS_FLAGS_None) // cvt packed doubles to DWORDs -INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_FLAGS_None) // cvt with trunc packed doubles to DWORDs -INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_FLAGS_None) // cvt packed DWORDs to doubles +INST3(cvtpi2ps, "cvtpi2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2A), INS_TT_NONE, Input_32Bit) // cvt packed DWORDs to singles +INST3(cvtsi2ss32, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single +INST3(cvtsi2ss64, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar single +INST3(cvtpi2pd, "cvtpi2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2A), INS_TT_NONE, Input_32Bit) // cvt packed DWORDs to doubles +INST3(cvtsi2sd32, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar double +INST3(cvtsi2sd64, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar double +INST3(cvttps2pi, "cvttps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2C), INS_TT_NONE, Input_32Bit) // cvt with trunc packed singles to DWORDs +INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_TT_TUPLE1_FIXED, Input_32Bit) // cvt with trunc scalar single to DWORD +INST3(cvttpd2pi, "cvttpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2C), INS_TT_NONE, Input_64Bit) // cvt with trunc packed doubles to DWORDs +INST3(cvtps2pi, "cvtps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2D), INS_TT_NONE, Input_32Bit) // cvt 
packed singles to DWORDs +INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_TT_TUPLE1_FIXED, Input_32Bit) // cvt scalar single to DWORD +INST3(cvtpd2pi, "cvtpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2D), INS_TT_NONE, Input_64Bit) // cvt packed doubles to DWORDs +INST3(cvtsd2si, "cvtsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2D), INS_TT_TUPLE1_FIXED, Input_64Bit) // cvt scalar double to DWORD +INST3(cvtps2pd, "cvtps2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5A), INS_TT_HALF, Input_32Bit) // cvt packed singles to doubles +INST3(cvtpd2ps, "cvtpd2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5A), INS_TT_FULL, Input_64Bit) // cvt packed doubles to singles +INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles +INST3(cvtsd2ss, "cvtsd2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5A), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar double to scalar singles +INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_TT_FULL, Input_32Bit) // cvt packed DWORDs to singles +INST3(cvtps2dq, "cvtps2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5B), INS_TT_FULL, Input_32Bit) // cvt packed singles to DWORDs +INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_TT_FULL, Input_32Bit) // cvt with trunc packed singles to DWORDs +INST3(cvtpd2dq, "cvtpd2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xE6), INS_TT_FULL, Input_64Bit) // cvt packed doubles to DWORDs +INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_TT_FULL, Input_64Bit) // cvt with trunc packed doubles to DWORDs +INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_TT_HALF, Input_32Bit) // cvt packed DWORDs to doubles // SSE2 comparison instructions -INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | 
Writes_PF | Writes_CF ) // ordered compare singles -INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // ordered compare doubles -INST3(ucomiss, "ucomiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2E), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // unordered compare singles -INST3(ucomisd, "ucomisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2E), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // unordered compare doubles +INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), INS_TT_TUPLE1_SCALAR, Input_32Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // ordered compare singles +INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), INS_TT_TUPLE1_SCALAR, Input_64Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // ordered compare doubles +INST3(ucomiss, "ucomiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2E), INS_TT_TUPLE1_SCALAR, Input_32Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // unordered compare singles +INST3(ucomisd, "ucomisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2E), INS_TT_TUPLE1_SCALAR, Input_64Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // unordered compare doubles // SSE2 packed single/double comparison operations. // Note that these instructions not only compare but also overwrite the first source. 
-INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles -INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles -INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles -INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles +INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles +INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles +INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles +INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles //SSE2 packed integer operations -INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers -INST3(paddw, "paddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers -INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers -INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers -INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results -INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), 
INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results -INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results -INST3(paddusw, "paddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results -INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers -INST3(pavgw, "pavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers -INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers -INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers -INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers -INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers -INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst -INST3(pmulhw, "pmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers -INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers -INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result -INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result -INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs -INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs -INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers -INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation -INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation -INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), 
INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation -INST3(psubusw, "psubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation +INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers +INST3(paddw, "paddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers +INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers +INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers +INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results +INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results +INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results +INST3(paddusw, "paddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results +INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers +INST3(pavgw, "pavgw", IUM_WR, 
BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers +INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed byte (8-bit) integers +INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers +INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers +INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers +INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst +INST3(pmulhw, "pmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers +INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers +INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result +INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result +// TODO-XArch-AVX512: pand, pandn, por, and pxor have AVX512 instructions under different names, pandd, pandq etc +INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs +INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs +INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs +INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers +INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in 
b from packed 8-bit integers in a using saturation +INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation +INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation +INST3(psubusw, "psubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation // Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode, // which is handled in emitxarch.cpp. -INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes -INST3(pslldq, "pslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes -INST3(psllw, "psllw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 16-bit integers -INST3(pslld, "pslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers -INST3(psllq, "psllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers -INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers -INST3(psrld, "psrld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right 
logical of 32-bit integers -INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers -INST3(psraw, "psraw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 16-bit integers -INST3(psrad, "psrad", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 32-bit integers - -INST3(pmaxub, "pmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes -INST3(pminub, "pminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes -INST3(pmaxsw, "pmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words -INST3(pminsw, "pminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words -INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality -INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than -INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality -INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than -INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality -INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than - 
-INST3(pshufd, "pshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), INS_FLAGS_None) // Packed shuffle of 32-bit integers -INST3(pshufhw, "pshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), INS_FLAGS_None) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. -INST3(pshuflw, "pshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), INS_FLAGS_None) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. -INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_FLAGS_None) // Extract 16-bit value into a r32 with zero extended to 32-bits -INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index - -INST3(punpckhbw, "punpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) -INST3(punpcklbw, "punpcklbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x60), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (lo) -INST3(punpckhqdq, "punpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi) -INST3(punpcklqdq, "punpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo) -INST3(punpckhwd, "punpckhwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x69), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (hi) -INST3(punpcklwd, "punpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo) -INST3(unpckhpd, "unpckhpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x15), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) -INST3(unpcklpd, "unpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), 
INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) - -INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation -INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation -INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation +// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value. +// This hopefully means we do not need psrldq128/psrldq256/psrldq512 etc +INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes +INST3(pslldq, "pslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes +INST3(psllw, "psllw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 16-bit integers +INST3(pslld, "pslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers +INST3(psllq, "psllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers +INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers +INST3(psrld, "psrld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2), INS_TT_FULL | INS_TT_MEM128, 
Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 32-bit integers +INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers +INST3(psraw, "psraw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 16-bit integers +INST3(psrad, "psrad", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 32-bit integers +INST3(pmaxub, "pmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes +INST3(pminub, "pminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes +INST3(pmaxsw, "pmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words +INST3(pminsw, "pminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words +INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality +INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than +INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality +INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), 
INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than +INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality +INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than + +INST3(pshufd, "pshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), INS_TT_FULL, Input_32Bit) // Packed shuffle of 32-bit integers +INST3(pshufhw, "pshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), INS_TT_FULL_MEM, Input_16Bit) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. +INST3(pshuflw, "pshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), INS_TT_FULL_MEM, Input_16Bit) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. 
+INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_TT_TUPLE1_SCALAR, Input_16Bit) // Extract 16-bit value into a r32 with zero extended to 32-bits +INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index + +INST3(punpckhbw, "punpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) +INST3(punpcklbw, "punpcklbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x60), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (lo) +INST3(punpckhqdq, "punpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi) +INST3(punpcklqdq, "punpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo) +INST3(punpckhwd, "punpckhwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x69), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (hi) +INST3(punpcklwd, "punpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo) +INST3(unpckhpd, "unpckhpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x15), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Unpack and interleave high packed double-precision floating-point values +INST3(unpcklpd, "unpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Unpack and interleave low packed double-precision floating-point values + +INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), INS_TT_FULL, Input_32Bit | 
INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation +INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation +INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation // id nm um mr mi rm flags -INST3(dpps, "dpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs -INST3(dppd, "dppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs -INST3(insertps, "insertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value -INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result -INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_FLAGS_None) // Packed logical compare -INST3(phaddd, "phaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add -INST3(pabsb, "pabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), INS_FLAGS_None) // Packed absolute value of bytes -INST3(pabsw, "pabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), INS_FLAGS_None) // Packed absolute value of 16-bit integers -INST3(pabsd, "pabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), INS_FLAGS_None) // Packed absolute 
value of 32-bit integers -INST3(palignr, "palignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right -INST3(pmaddubsw, "pmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes -INST3(pmulhrsw, "pmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale -INST3(pshufb, "pshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes -INST3(psignb, "psignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignw, "psignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignd, "psignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(pminsb, "pminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes -INST3(pminsd, "pminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers -INST3(pminuw, "pminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers -INST3(pminud, "pminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers -INST3(pmaxsb, "pmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes -INST3(pmaxsd, "pmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers -INST3(pmaxuw, "pmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers -INST3(pmaxud, "pmaxud", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x3F), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers -INST3(pmovsxbw, "pmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), INS_FLAGS_None) // Packed sign extend byte to short -INST3(pmovsxbd, "pmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), INS_FLAGS_None) // Packed sign extend byte to int -INST3(pmovsxbq, "pmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), INS_FLAGS_None) // Packed sign extend byte to long -INST3(pmovsxwd, "pmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), INS_FLAGS_None) // Packed sign extend short to int -INST3(pmovsxwq, "pmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), INS_FLAGS_None) // Packed sign extend short to long -INST3(pmovsxdq, "pmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), INS_FLAGS_None) // Packed sign extend int to long -INST3(pmovzxbw, "pmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), INS_FLAGS_None) // Packed zero extend byte to short -INST3(pmovzxbd, "pmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), INS_FLAGS_None) // Packed zero extend byte to intg -INST3(pmovzxbq, "pmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), INS_FLAGS_None) // Packed zero extend byte to lon -INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), INS_FLAGS_None) // Packed zero extend short to int -INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_FLAGS_None) // Packed zero extend short to long -INST3(pmovzxdq, "pmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), INS_FLAGS_None) // Packed zero extend int to long -INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation -INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_FLAGS_None) // Round packed single precision floating-point values -INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point 
values -INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_FLAGS_None) // Round packed double precision floating-point values -INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values -INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result -INST3(blendps, "blendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values -INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), INS_FLAGS_None) // Variable Blend Packed Singles -INST3(blendpd, "blendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values -INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), INS_FLAGS_None) // Variable Blend Packed Doubles -INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words -INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_FLAGS_None) // Variable Blend Packed Bytes -INST3(phaddw, "phaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers -INST3(phsubw, "phsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers -INST3(phsubd, "phsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers -INST3(phaddsw, "phaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation -INST3(phsubsw, "phsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), INS_Flags_IsDstDstSrcAVXInstruction) 
// Packed horizontal subtract of 16-bit integers with saturation -INST3(lddqu, "lddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), INS_FLAGS_None) // Load Unaligned integer -INST3(movntdqa, "movntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), INS_FLAGS_None) // Load Double Quadword Non-Temporal Aligned Hint -INST3(movddup, "movddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), INS_FLAGS_None) // Replicate Double FP Values -INST3(movsldup, "movsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), INS_FLAGS_None) // Replicate even-indexed Single FP Values -INST3(movshdup, "movshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), INS_FLAGS_None) // Replicate odd-indexed Single FP Values -INST3(phminposuw, "phminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), INS_FLAGS_None) // Packed Horizontal Word Minimum -INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference -INST3(pinsrb, "pinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte -INST3(pinsrd, "pinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword -INST3(pinsrq, "pinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword -INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Byte -INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Dword -INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Qword -INST3(pextrw_sse41, "pextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Word -INST3(extractps, "extractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Packed Floating-Point Values +INST3(dpps, "dpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product 
of two float vector regs +INST3(dppd, "dppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs +INST3(insertps, "insertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value +INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality +INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit signed integers for greater than +INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result +INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_TT_NONE, INS_FLAGS_None) // Packed logical compare +INST3(phaddd, "phaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add +INST3(pabsb, "pabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), INS_TT_FULL_MEM, Input_8Bit | INS_FLAGS_None) // Packed absolute value of bytes +INST3(pabsw, "pabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), INS_TT_FULL_MEM, Input_16Bit | INS_FLAGS_None) // Packed absolute value of 16-bit integers +INST3(pabsd, "pabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), INS_TT_FULL, Input_32Bit | INS_FLAGS_None) // Packed absolute value of 32-bit integers +// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value. +// This hopefully means we do not need palignr128/palignr256/palignr512 etc. 
+INST3(palignr, "palignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right +INST3(pmaddubsw, "pmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes +INST3(pmulhrsw, "pmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale +INST3(pshufb, "pshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes +INST3(psignb, "psignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), INS_TT_NONE, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(psignw, "psignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(psignd, "psignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(pminsb, "pminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes +INST3(pminsd, "pminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers +INST3(pminuw, "pminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers +INST3(pminud, "pminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers +INST3(pmaxsb, "pmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes +INST3(pmaxsd, "pmaxsd", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x3D), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers +INST3(pmaxuw, "pmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers +INST3(pmaxud, "pmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers +INST3(pmovsxbw, "pmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), INS_TT_HALF_MEM, Input_8Bit) // Packed sign extend byte to short +INST3(pmovsxbd, "pmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), INS_TT_QUARTER_MEM, Input_8Bit) // Packed sign extend byte to int +INST3(pmovsxbq, "pmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), INS_TT_EIGHTH_MEM, Input_8Bit) // Packed sign extend byte to long +INST3(pmovsxwd, "pmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), INS_TT_HALF_MEM, Input_16Bit) // Packed sign extend short to int +INST3(pmovsxwq, "pmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), INS_TT_QUARTER_MEM, Input_16Bit) // Packed sign extend short to long +INST3(pmovsxdq, "pmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), INS_TT_HALF_MEM, Input_32Bit) // Packed sign extend int to long +INST3(pmovzxbw, "pmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), INS_TT_HALF_MEM, Input_8Bit) // Packed zero extend byte to short +INST3(pmovzxbd, "pmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), INS_TT_QUARTER_MEM, Input_8Bit) // Packed zero extend byte to int +INST3(pmovzxbq, "pmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), INS_TT_EIGHTH_MEM, Input_8Bit) // Packed zero extend byte to long +INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), INS_TT_HALF_MEM, Input_16Bit) // Packed zero extend short to int +INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_TT_QUARTER_MEM, Input_16Bit) // Packed zero extend short to long +INST3(pmovzxdq, "pmovzxdq", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), INS_TT_HALF_MEM, Input_32Bit) // Packed zero extend int to long +INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation +INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Round packed single precision floating-point values +INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values +INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_TT_NONE, Input_64Bit | INS_FLAGS_None) // Round packed double precision floating-point values +INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values +INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result +INST3(blendps, "blendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values +INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Variable Blend Packed Singles +INST3(blendpd, "blendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values +INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), INS_TT_NONE, Input_64Bit | INS_FLAGS_None) // Variable Blend Packed Doubles +INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // 
Blend Packed Words +INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_NONE, Input_8Bit | INS_FLAGS_None) // Variable Blend Packed Bytes +INST3(phaddw, "phaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers +INST3(phsubw, "phsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers +INST3(phsubd, "phsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers +INST3(phaddsw, "phaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation +INST3(phsubsw, "phsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation +// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value. +// This hopefully means we do not need lddqu128/lddqu256/lddqu512 etc. 
+INST3(lddqu, "lddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), INS_TT_NONE, INS_FLAGS_None) // Load Unaligned integer +INST3(movntdqa, "movntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), INS_TT_FULL_MEM, INS_FLAGS_None) // Load Double Quadword Non-Temporal Aligned Hint +INST3(movddup, "movddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), INS_TT_MOVDDUP, Input_64Bit | INS_FLAGS_None) // Replicate Double FP Values +INST3(movsldup, "movsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None) // Replicate even-indexed Single FP Values +INST3(movshdup, "movshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None) // Replicate odd-indexed Single FP Values +INST3(phminposuw, "phminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), INS_TT_NONE, Input_16Bit | INS_FLAGS_None) // Packed Horizontal Word Minimum +INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference +INST3(pinsrb, "pinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), INS_TT_TUPLE1_SCALAR, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte +INST3(pinsrd, "pinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword +INST3(pinsrq, "pinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword +INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_8Bit | INS_FLAGS_None) // Extract Byte +INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Extract Dword +INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Extract Qword +INST3(pextrw_sse41, "pextrw", IUM_WR, SSE3A(0x15), BAD_CODE, 
BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Extract Word +INST3(extractps, "extractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Extract Packed Floating-Point Values //PCLMULQDQ instructions -INST3(pclmulqdq, "pclmulqdq" , IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords +INST3(pclmulqdq, "pclmulqdq" , IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), INS_TT_FULL_MEM, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords //AES instructions -INST3(aesdec, "aesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow -INST3(aesdeclast, "aesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow -INST3(aesenc, "aesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow -INST3(aesenclast, "aesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow -INST3(aesimc, "aesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), INS_FLAGS_None) // Perform the AES InvMixColumn Transformation -INST3(aeskeygenassist, "aeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), INS_FLAGS_None) // AES Round Key Generation Assist -INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value. +// This hopefully means we do not need aesdec128/aesdec256/aesdec512 etc.
+INST3(aesdec, "aesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow +INST3(aesdeclast, "aesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow +INST3(aesenc, "aesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow +INST3(aesenclast, "aesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow +INST3(aesimc, "aesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), INS_TT_NONE, INS_FLAGS_None) // Perform the AES InvMixColumn Transformation +INST3(aeskeygenassist, "aeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), INS_TT_NONE, INS_FLAGS_None) // AES Round Key Generation Assist +INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + +INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // AVX only instructions -INST3(vbroadcastss, "broadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register -INST3(vbroadcastsd, "broadcastsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register -INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), INS_FLAGS_None) // Broadcast int8 value from reg/memory to entire ymm register -INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register -INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_FLAGS_None) // Broadcast int32 value from 
reg/memory to entire ymm register -INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register -INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract 128-bit packed floating point values -INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract 128-bit packed integer values -INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values -INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values -INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) -INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register -INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_FLAGS_None) // Permute 64-bit of input register -INST3(vpblendd, "pblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs -INST3(vblendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4A), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Singles -INST3(vblendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4B), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Doubles -INST3(vpblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes -INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_FLAGS_None) // Packed Bit Test -INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_FLAGS_None) // Packed Bit Test -INST3(vpsrlvd, "psrlvd", IUM_WR, 
BAD_CODE, BAD_CODE, SSE38(0x45), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical -INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical -INST3(vpsravd, "psravd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic -INST3(vpsllvd, "psllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical -INST3(vpsllvq, "psllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical -INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_FLAGS_None) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), INS_FLAGS_None) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values -INST3(vpermilpsvar, "permilpsvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vpermilpdvar, "permilpdvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values -INST3(vperm2f128, "perm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values -INST3(vpermpd, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x01), INS_FLAGS_None) // Permute Double-Precision Floating-Point Values -INST3(vpermd, "permd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Packed Doublewords Elements -INST3(vpermps, "permps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Single-Precision Floating-Point Elements -INST3(vbroadcastf128, "broadcastf128", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0x1A), INS_FLAGS_None) // Broadcast packed float values read from memory to entire ymm register -INST3(vbroadcasti128, "broadcasti128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_FLAGS_None) // Broadcast packed integer values read from memory to entire ymm register -INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores -INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores -INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores -INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores -INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword -INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword -INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices -INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword with Signed Dword Indices -INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Dword Indices -INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices -INST3(vgatherdpd, 
"gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices -INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices - -INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(vbroadcastss, "broadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register +INST3(vbroadcastsd, "broadcastsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register +INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), INS_TT_TUPLE1_SCALAR, Input_8Bit | INS_FLAGS_None) // Broadcast int8 value from reg/memory to entire ymm register +INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register +INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register +INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed floating point values +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed integer values +INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_NONE, 
INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values +INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) +INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register +INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute 64-bit of input register +INST3(vpblendd, "pblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs +INST3(vblendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4A), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Singles +INST3(vblendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4B), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Doubles +INST3(vpblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes +INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_TT_NONE, INS_FLAGS_None) // Packed Bit Test +INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_TT_NONE, INS_FLAGS_None) // Packed Bit Test +INST3(vpsrlvd, "psrlvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vpsravd, "psravd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_TT_FULL, Input_32Bit | 
INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic +INST3(vpsllvd, "psllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpsllvq, "psllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_TT_FULL, Input_32Bit | INS_FLAGS_None) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values +INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute In-Lane of Pairs of Double-Precision Floating-Point Values +INST3(vpermilpsvar, "permilpsvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values +INST3(vpermilpdvar, "permilpdvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Pairs of Double-Precision Floating-Point Values +INST3(vperm2f128, "perm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values +INST3(vpermpd, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x01), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute Double-Precision Floating-Point Values +INST3(vpermd, "permd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Packed Doublewords Elements (32-bit elements, hence Input_32Bit) +INST3(vpermps, "permps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Single-Precision Floating-Point Elements +INST3(vbroadcastf128, "broadcastf128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_TT_NONE, INS_FLAGS_None) // 
Broadcast packed float values read from memory to entire ymm register +INST3(vbroadcasti128, "broadcasti128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_TT_NONE, INS_FLAGS_None) // Broadcast packed integer values read from memory to entire ymm register +INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores +INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores +INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores +INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores +INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword Indices +INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword Indices +INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword Values Using Signed Dword Indices +INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword Values Using Signed Qword Indices +INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed 
Dword Indices +INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices +INST3(vgatherdpd, "gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices +INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices + +INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // id nm um mr mi rm flags -INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), 
INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ss, "fmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values -INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision 
Floating-Point Values -INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231pd, "fnmsub231pd", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -INST3(FIRST_AVXVNNI_INSTRUCTION, "FIRST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(vpdpbusd, "pdpbusd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x50), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes -INST3(vpdpwssd, "pdpwssd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x52), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers -INST3(vpdpbusds, "pdpbusds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x51), 
INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation -INST3(vpdpwssds, "pdpwssds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x53), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation -INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ss, "fmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_32Bit | 
INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values +INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_64Bit | 
INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values +INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_64Bit | 
INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values 
+INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + +INST3(FIRST_AVXVNNI_INSTRUCTION, "FIRST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(vpdpbusd, "pdpbusd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x50), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes +INST3(vpdpwssd, "pdpwssd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x52), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers +INST3(vpdpbusds, "pdpbusds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x51), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation +INST3(vpdpwssds, "pdpwssds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x53), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation +INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // BMI1 
-INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND NOT -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Reset Lowest Set Bit -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract +INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND NOT +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit +INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | 
INS_Flags_IsDstDstSrcAVXInstruction) // Reset Lowest Set Bit +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract // BMI2 -INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_FLAGS_None) -INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit -INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract -INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Zero High Bits Starting with Specified Bit Position -INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags +INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, INS_FLAGS_None) +INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit +INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract +INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Zero High Bits Starting with Specified Bit Position +INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags #ifdef TARGET_AMD64 -INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags -INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting 
Flags -INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags +INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags +INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags +INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags #endif -INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + +INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + +INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Scalar instructions in SSE4.2 -INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_FLAGS_None) +INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_TT_NONE, INS_FLAGS_None) // BMI1 -INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), Undefined_OF | Undefined_SF | Writes_ZF | 
Undefined_AF | Undefined_PF | Writes_CF ) // Count the Number of Trailing Zero Bits +INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF ) // Count the Number of Trailing Zero Bits // LZCNT -INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF ) +INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF ) // MOVBE -INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_FLAGS_None ) +INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_TT_NONE, INS_FLAGS_None ) // POPCNT -INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF ) +INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF ) // id nm um mr mi flags -INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_FLAGS_None ) -INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_FLAGS_None ) -INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_FLAGS_None ) - -INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) - -INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, Undefined_OF | Writes_CF | Reads_CF | 
INS_FLAGS_Has_Wbit ) -INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) -INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) -INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) -INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) -INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) -INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None ) +INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, 
INS_TT_NONE, INS_FLAGS_None ) +INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, INS_FLAGS_None ) + +INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit ) + +INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) +INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) +INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) +INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) +INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) +INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit ) +INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(shr, "shr", IUM_RW, 0x0028D2, 
BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) // id nm um mr flags -INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, Reads_DF | INS_FLAGS_Has_Wbit ) -INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) #if defined(TARGET_AMD64) -INST1(r_movsq, "rep movsq", IUM_RD, 0xF3A548, Reads_DF ) +INST1(r_movsq, "rep movsq", IUM_RD, 0xF3A548, INS_TT_NONE, Reads_DF ) #endif // defined(TARGET_AMD64) -INST1(movsb, "movsb", IUM_RD, 0x0000A4, Reads_DF | INS_FLAGS_Has_Wbit ) -INST1(movsd, "movsd", IUM_RD, 0x0000A5, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(movsb, "movsb", IUM_RD, 0x0000A4, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(movsd, "movsd", IUM_RD, 0x0000A5, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) #if defined(TARGET_AMD64) -INST1(movsq, "movsq", IUM_RD, 0x00A548, Reads_DF ) +INST1(movsq, "movsq", IUM_RD, 0x00A548, INS_TT_NONE, Reads_DF ) #endif // defined(TARGET_AMD64) -INST1(r_stosb, "rep stosb", IUM_RD, 
0x00AAF3, Reads_DF | INS_FLAGS_Has_Wbit ) -INST1(r_stosd, "rep stosd", IUM_RD, 0x00ABF3, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(r_stosb, "rep stosb", IUM_RD, 0x00AAF3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(r_stosd, "rep stosd", IUM_RD, 0x00ABF3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) #if defined(TARGET_AMD64) -INST1(r_stosq, "rep stosq", IUM_RD, 0xF3AB48, Reads_DF ) +INST1(r_stosq, "rep stosq", IUM_RD, 0xF3AB48, INS_TT_NONE, Reads_DF ) #endif // defined(TARGET_AMD64) -INST1(stosb, "stosb", IUM_RD, 0x0000AA, Reads_DF | INS_FLAGS_Has_Wbit ) -INST1(stosd, "stosd", IUM_RD, 0x0000AB, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(stosb, "stosb", IUM_RD, 0x0000AA, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) +INST1(stosd, "stosd", IUM_RD, 0x0000AB, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit ) #if defined(TARGET_AMD64) -INST1(stosq, "stosq", IUM_RD, 0x00AB48, Reads_DF ) +INST1(stosq, "stosq", IUM_RD, 0x00AB48, INS_TT_NONE, Reads_DF ) #endif // defined(TARGET_AMD64) -INST1(int3, "int3", IUM_RD, 0x0000CC, INS_FLAGS_None ) -INST1(nop, "nop", IUM_RD, 0x000090, INS_FLAGS_None ) -INST1(pause, "pause", IUM_RD, 0x0090F3, INS_FLAGS_None ) -INST1(lock, "lock", IUM_RD, 0x0000F0, INS_FLAGS_None ) -INST1(leave, "leave", IUM_RD, 0x0000C9, INS_FLAGS_None ) +INST1(int3, "int3", IUM_RD, 0x0000CC, INS_TT_NONE, INS_FLAGS_None ) +INST1(nop, "nop", IUM_RD, 0x000090, INS_TT_NONE, INS_FLAGS_None ) +INST1(pause, "pause", IUM_RD, 0x0090F3, INS_TT_NONE, INS_FLAGS_None ) +INST1(lock, "lock", IUM_RD, 0x0000F0, INS_TT_NONE, INS_FLAGS_None ) +INST1(leave, "leave", IUM_RD, 0x0000C9, INS_TT_NONE, INS_FLAGS_None ) -INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_FLAGS_None ) +INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None ) -INST1(neg, "neg", IUM_RW, 0x0018F6, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST1(not, "not", IUM_RW, 0x0010F6, INS_FLAGS_None | INS_FLAGS_Has_Wbit ) +INST1(neg, "neg", IUM_RW, 
0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit ) -INST1(cwde, "cwde", IUM_RD, 0x000098, INS_FLAGS_None ) -INST1(cdq, "cdq", IUM_RD, 0x000099, INS_FLAGS_None ) -INST1(idiv, "idiv", IUM_RD, 0x0038F6, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit ) -INST1(imulEAX, "imul", IUM_RD, 0x0028F6, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST1(div, "div", IUM_RD, 0x0030F6, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit ) -INST1(mulEAX, "mul", IUM_RD, 0x0020F6, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None ) +INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None ) +INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit ) +INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit ) +INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST1(sahf, "sahf", IUM_RD, 0x00009E, Restore_SF_ZF_AF_PF_CF ) +INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF ) -INST1(xadd, "xadd", IUM_RW, 0x0F00C0, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, 
Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) +INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit ) -INST1(shld, "shld", IUM_RW, 0x0F00A4, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF ) -INST1(shrd, "shrd", IUM_RW, 0x0F00AC, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF ) +INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF ) +INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF ) // For RyuJIT/x86, we follow the x86 calling convention that requires // us to return floating point value on the x87 FP stack, so we need // these instructions regardless of whether we're using full stack fp. 
#ifdef TARGET_X86 -INST1(fld, "fld", IUM_WR, 0x0000D9, INS_FLAGS_x87Instr) -INST1(fstp, "fstp", IUM_WR, 0x0018D9, INS_FLAGS_x87Instr) +INST1(fld, "fld", IUM_WR, 0x0000D9, INS_TT_NONE, INS_FLAGS_x87Instr) +INST1(fstp, "fstp", IUM_WR, 0x0018D9, INS_TT_NONE, INS_FLAGS_x87Instr) #endif // TARGET_X86 -INST1(seto, "seto", IUM_WR, 0x0F0090, Reads_OF ) -INST1(setno, "setno", IUM_WR, 0x0F0091, Reads_OF ) -INST1(setb, "setb", IUM_WR, 0x0F0092, Reads_CF ) -INST1(setae, "setae", IUM_WR, 0x0F0093, Reads_CF ) -INST1(sete, "sete", IUM_WR, 0x0F0094, Reads_ZF ) -INST1(setne, "setne", IUM_WR, 0x0F0095, Reads_ZF ) -INST1(setbe, "setbe", IUM_WR, 0x0F0096, Reads_ZF | Reads_CF ) -INST1(seta, "seta", IUM_WR, 0x0F0097, Reads_ZF | Reads_CF ) -INST1(sets, "sets", IUM_WR, 0x0F0098, Reads_SF ) -INST1(setns, "setns", IUM_WR, 0x0F0099, Reads_SF ) -INST1(setp, "setp", IUM_WR, 0x0F009A, Reads_PF ) -INST1(setnp, "setnp", IUM_WR, 0x0F009B, Reads_PF ) -INST1(setl, "setl", IUM_WR, 0x0F009C, Reads_OF | Reads_SF ) -INST1(setge, "setge", IUM_WR, 0x0F009D, Reads_OF | Reads_SF ) -INST1(setle, "setle", IUM_WR, 0x0F009E, Reads_OF | Reads_SF | Reads_ZF ) -INST1(setg, "setg", IUM_WR, 0x0F009F, Reads_OF | Reads_SF | Reads_ZF ) +INST1(seto, "seto", IUM_WR, 0x0F0090, INS_TT_NONE, Reads_OF ) +INST1(setno, "setno", IUM_WR, 0x0F0091, INS_TT_NONE, Reads_OF ) +INST1(setb, "setb", IUM_WR, 0x0F0092, INS_TT_NONE, Reads_CF ) +INST1(setae, "setae", IUM_WR, 0x0F0093, INS_TT_NONE, Reads_CF ) +INST1(sete, "sete", IUM_WR, 0x0F0094, INS_TT_NONE, Reads_ZF ) +INST1(setne, "setne", IUM_WR, 0x0F0095, INS_TT_NONE, Reads_ZF ) +INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST1(seta, "seta", IUM_WR, 0x0F0097, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST1(sets, "sets", IUM_WR, 0x0F0098, INS_TT_NONE, Reads_SF ) +INST1(setns, "setns", IUM_WR, 0x0F0099, INS_TT_NONE, Reads_SF ) +INST1(setp, "setp", IUM_WR, 0x0F009A, INS_TT_NONE, Reads_PF ) +INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_TT_NONE, Reads_PF ) 
+INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF ) +INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF ) +INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) +INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) // Indirect jump used for tailcalls. We differentiate between func-internal // indirect jump (e.g. used for switch) and tailcall indirect jumps because the // x64 unwinder might require the latter to be rex.w prefixed. -INST1(tail_i_jmp, "tail.jmp", IUM_RD, 0x0020FF, INS_FLAGS_None ) -INST1(i_jmp, "jmp", IUM_RD, 0x0020FF, INS_FLAGS_None ) -INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_FLAGS_None ) -INST0(jo, "jo", IUM_RD, 0x000070, Reads_OF ) -INST0(jno, "jno", IUM_RD, 0x000071, Reads_OF ) -INST0(jb, "jb", IUM_RD, 0x000072, Reads_CF ) -INST0(jae, "jae", IUM_RD, 0x000073, Reads_CF ) -INST0(je, "je", IUM_RD, 0x000074, Reads_ZF ) -INST0(jne, "jne", IUM_RD, 0x000075, Reads_ZF ) -INST0(jbe, "jbe", IUM_RD, 0x000076, Reads_ZF | Reads_CF ) -INST0(ja, "ja", IUM_RD, 0x000077, Reads_ZF | Reads_CF ) -INST0(js, "js", IUM_RD, 0x000078, Reads_SF ) -INST0(jns, "jns", IUM_RD, 0x000079, Reads_SF ) -INST0(jp, "jp", IUM_RD, 0x00007A, Reads_PF ) -INST0(jnp, "jnp", IUM_RD, 0x00007B, Reads_PF ) -INST0(jl, "jl", IUM_RD, 0x00007C, Reads_OF | Reads_SF ) -INST0(jge, "jge", IUM_RD, 0x00007D, Reads_OF | Reads_SF ) -INST0(jle, "jle", IUM_RD, 0x00007E, Reads_OF | Reads_SF | Reads_ZF ) -INST0(jg, "jg", IUM_RD, 0x00007F, Reads_OF | Reads_SF | Reads_ZF ) - -INST0(l_jmp, "jmp", IUM_RD, 0x0000E9, INS_FLAGS_None ) -INST0(l_jo, "jo", IUM_RD, 0x00800F, Reads_OF ) -INST0(l_jno, "jno", IUM_RD, 0x00810F, Reads_OF ) -INST0(l_jb, "jb", IUM_RD, 0x00820F, Reads_CF ) -INST0(l_jae, "jae", IUM_RD, 0x00830F, Reads_CF ) -INST0(l_je, "je", IUM_RD, 0x00840F, Reads_ZF ) -INST0(l_jne, "jne", IUM_RD, 0x00850F, Reads_ZF ) -INST0(l_jbe, "jbe", IUM_RD, 0x00860F, Reads_ZF | Reads_CF ) -INST0(l_ja, "ja", IUM_RD, 
0x00870F, Reads_ZF | Reads_CF ) -INST0(l_js, "js", IUM_RD, 0x00880F, Reads_SF ) -INST0(l_jns, "jns", IUM_RD, 0x00890F, Reads_SF ) -INST0(l_jp, "jp", IUM_RD, 0x008A0F, Reads_PF ) -INST0(l_jnp, "jnp", IUM_RD, 0x008B0F, Reads_PF ) -INST0(l_jl, "jl", IUM_RD, 0x008C0F, Reads_OF | Reads_SF ) -INST0(l_jge, "jge", IUM_RD, 0x008D0F, Reads_OF | Reads_SF ) -INST0(l_jle, "jle", IUM_RD, 0x008E0F, Reads_OF | Reads_SF | Reads_ZF ) -INST0(l_jg, "jg", IUM_RD, 0x008F0F, Reads_OF | Reads_SF | Reads_ZF ) - -INST0(align, "align", IUM_RD, BAD_CODE, INS_FLAGS_None) +INST1(tail_i_jmp, "tail.jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None ) +INST1(i_jmp, "jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None ) +INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_TT_NONE, INS_FLAGS_None ) +INST0(jo, "jo", IUM_RD, 0x000070, INS_TT_NONE, Reads_OF ) +INST0(jno, "jno", IUM_RD, 0x000071, INS_TT_NONE, Reads_OF ) +INST0(jb, "jb", IUM_RD, 0x000072, INS_TT_NONE, Reads_CF ) +INST0(jae, "jae", IUM_RD, 0x000073, INS_TT_NONE, Reads_CF ) +INST0(je, "je", IUM_RD, 0x000074, INS_TT_NONE, Reads_ZF ) +INST0(jne, "jne", IUM_RD, 0x000075, INS_TT_NONE, Reads_ZF ) +INST0(jbe, "jbe", IUM_RD, 0x000076, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST0(ja, "ja", IUM_RD, 0x000077, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST0(js, "js", IUM_RD, 0x000078, INS_TT_NONE, Reads_SF ) +INST0(jns, "jns", IUM_RD, 0x000079, INS_TT_NONE, Reads_SF ) +INST0(jp, "jp", IUM_RD, 0x00007A, INS_TT_NONE, Reads_PF ) +INST0(jnp, "jnp", IUM_RD, 0x00007B, INS_TT_NONE, Reads_PF ) +INST0(jl, "jl", IUM_RD, 0x00007C, INS_TT_NONE, Reads_OF | Reads_SF ) +INST0(jge, "jge", IUM_RD, 0x00007D, INS_TT_NONE, Reads_OF | Reads_SF ) +INST0(jle, "jle", IUM_RD, 0x00007E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) +INST0(jg, "jg", IUM_RD, 0x00007F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) + +INST0(l_jmp, "jmp", IUM_RD, 0x0000E9, INS_TT_NONE, INS_FLAGS_None ) +INST0(l_jo, "jo", IUM_RD, 0x00800F, INS_TT_NONE, Reads_OF ) +INST0(l_jno, "jno", IUM_RD, 0x00810F, INS_TT_NONE, 
Reads_OF ) +INST0(l_jb, "jb", IUM_RD, 0x00820F, INS_TT_NONE, Reads_CF ) +INST0(l_jae, "jae", IUM_RD, 0x00830F, INS_TT_NONE, Reads_CF ) +INST0(l_je, "je", IUM_RD, 0x00840F, INS_TT_NONE, Reads_ZF ) +INST0(l_jne, "jne", IUM_RD, 0x00850F, INS_TT_NONE, Reads_ZF ) +INST0(l_jbe, "jbe", IUM_RD, 0x00860F, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST0(l_ja, "ja", IUM_RD, 0x00870F, INS_TT_NONE, Reads_ZF | Reads_CF ) +INST0(l_js, "js", IUM_RD, 0x00880F, INS_TT_NONE, Reads_SF ) +INST0(l_jns, "jns", IUM_RD, 0x00890F, INS_TT_NONE, Reads_SF ) +INST0(l_jp, "jp", IUM_RD, 0x008A0F, INS_TT_NONE, Reads_PF ) +INST0(l_jnp, "jnp", IUM_RD, 0x008B0F, INS_TT_NONE, Reads_PF ) +INST0(l_jl, "jl", IUM_RD, 0x008C0F, INS_TT_NONE, Reads_OF | Reads_SF ) +INST0(l_jge, "jge", IUM_RD, 0x008D0F, INS_TT_NONE, Reads_OF | Reads_SF ) +INST0(l_jle, "jle", IUM_RD, 0x008E0F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) +INST0(l_jg, "jg", IUM_RD, 0x008F0F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF ) + +INST0(align, "align", IUM_RD, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) /*****************************************************************************/ #undef INST0 diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index d91f7104d1871f..469abe498f42a5 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -709,7 +709,8 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) // Currently AVX doesn't support integer. // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX. 
if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && - !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && GetEmitter()->IsThreeOperandAVXInstruction(ins)) + !(ins == INS_cvtsi2ss32 || ins == INS_cvtsi2sd32 || ins == INS_cvtsi2ss64 || ins == INS_cvtsi2sd64) && + GetEmitter()->IsThreeOperandAVXInstruction(ins)) { inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); } From 0a1d2b9d26aea0c49a68a92e40706018ebe54189 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Thu, 27 Oct 2022 16:03:34 -0700 Subject: [PATCH 4/4] Moving 'JitStressEvexEncoding' under Debug flag and other review cleanup. --- src/coreclr/jit/codegen.h | 4 + src/coreclr/jit/codegenxarch.cpp | 2 +- src/coreclr/jit/compiler.h | 18 ++-- src/coreclr/jit/emit.cpp | 1 - src/coreclr/jit/emitxarch.cpp | 165 ++++++++++++++++-------------- src/coreclr/jit/emitxarch.h | 28 +++-- src/coreclr/jit/instr.cpp | 16 ++- src/coreclr/jit/instr.h | 16 +-- src/coreclr/jit/instrsxarch.h | 3 +- src/coreclr/jit/jitconfigvalues.h | 8 +- 10 files changed, 148 insertions(+), 113 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index cc544e5bb7e532..2a27feeaf81f3c 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1700,7 +1700,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX instruction ins_Copy(var_types dstType); instruction ins_Copy(regNumber srcReg, var_types dstType); +#if defined(TARGET_XARCH) instruction ins_FloatConv(var_types to, var_types from, emitAttr attr); +#elif defined(TARGET_ARM) + instruction ins_FloatConv(var_types to, var_types from); +#endif instruction ins_MathOp(genTreeOps oper, var_types type); void instGen_Return(unsigned stkArgSize); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3f33ee2ac044ca..8f76d0a9c40ff6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -7110,7 +7110,7 
@@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // Note that we need to specify dstType here so that it will determine // the size of destination integer register and also the rex.w prefix. genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(dstType)); + instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(srcType)); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); genProduceReg(treeNode); } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 8461c90aeacb3f..3b6cdf4b9cbf19 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8993,7 +8993,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // canUseEvexEncoding - Answer the question: Is Evex encoding supported on this target. // // Returns: - // TRUE if Evex encoding is supported, FALSE if not. + // `true` if Evex encoding is supported, `false` if not. + // bool canUseEvexEncoding() const { #ifdef TARGET_XARCH @@ -9007,18 +9008,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding. // // Returns: - // TRUE if user requests EVEX encoding and it's safe, FALSE if not. + // `true` if user requests EVEX encoding and it's safe, `false` if not. + // bool DoJitStressEvexEncoding() const { #ifdef TARGET_XARCH - // Using JitStressEVEXEncoding flag will force instructions which would - // otherwise use VEX encoding but can be EVEX encoded to use EVEX encoding - // This requires AVX512VL support. - if (JitConfig.JitStressEVEXEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) +// Using JitStressEvexEncoding flag will force instructions which would +// otherwise use VEX encoding but can be EVEX encoded to use EVEX encoding +// This requires AVX512VL support. 
+#ifdef DEBUG + if (JitConfig.JitStressEvexEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { return true; } -#endif +#endif // DEBUG +#endif // TARGET_XARCH return false; } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index a54bacb0558ef6..a7644c3bcbd0f1 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -3999,7 +3999,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr); unsigned estimatedSize = id->idCodeSize(); - if (actualSize != estimatedSize) { // It is fatal to under-estimate the instruction size, except for alignment instructions diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index f73cb54d0361ce..db6294890e7e3f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -41,7 +41,8 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) // ins - The instruction to check. // // Returns: -// TRUE if it is a sse or avx or avx512 instruction.. +// `true` if it is a sse or avx or avx512 instruction. +// bool emitter::IsAvx512OrPriorInstruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. @@ -60,7 +61,8 @@ bool emitter::IsAVXOnlyInstruction(instruction ins) // ins - The instruction to check. // // Returns: -// TRUE if it is a avx512f+ instruction. +// `true` if it is a avx512f+ instruction. 
+// bool emitter::IsAvx512OnlyInstruction(instruction ins) { return (ins >= INS_FIRST_AVX512_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); @@ -150,20 +152,21 @@ regNumber emitter::getSseShiftRegNumber(instruction ins) } } -bool emitter::IsAVXInstruction(instruction ins) const +bool emitter::IsVexEncodedInstruction(instruction ins) const { return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); } //------------------------------------------------------------------------ -// IsAvx512Instruction: Answer the question- Can this instruction be Evex encoded. +// IsEvexEncodedInstruction: Answer the question- Can this instruction be Evex encoded. // // Arguments: // ins - The instruction to check. // // Returns: -// TRUE if ins can be Evex encoded. -bool emitter::IsAvx512Instruction(instruction ins) const +// `true` if ins can be Evex encoded. +// +bool emitter::IsEvexEncodedInstruction(instruction ins) const { if (!UseEvexEncoding()) { @@ -288,11 +291,11 @@ bool emitter::IsAvx512Instruction(instruction ins) const // ins - The instruction to check. // // Returns: -// TRUE if ins is a SIMD instruction. +// `true` if ins is a SIMD instruction. // -bool emitter::IsSimdInstruction(instruction ins) const +bool emitter::IsVexOrEvexEncodedInstruction(instruction ins) const { - return IsAvx512Instruction(ins) || IsAVXInstruction(ins); + return IsEvexEncodedInstruction(ins) || IsVexEncodedInstruction(ins); } // Returns true if the AVX instruction is a binary operator that requires 3 operands. @@ -303,7 +306,8 @@ bool emitter::IsSimdInstruction(instruction ins) const // to indicate whether a 3-operand instruction. 
bool emitter::IsDstDstSrcAVXInstruction(instruction ins) { - return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsSimdInstruction(ins); + return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && + IsVexOrEvexEncodedInstruction(ins); } // Returns true if the AVX instruction requires 3 operands that duplicate the source @@ -313,7 +317,8 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) // to indicate whether a 3-operand instruction. bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) { - return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsSimdInstruction(ins); + return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && + IsVexOrEvexEncodedInstruction(ins); } //------------------------------------------------------------------------ @@ -763,8 +768,8 @@ bool emitter::TakesSimdPrefix(instruction ins) const //------------------------------------------------------------------------ // TakesEvexPrefix: Checks if the instruction should be EVEX encoded. // TODO-XArch-AVX512: This check needs to be updated once AVX512 instructions are added. -// Eventually, this should evolve to return 'TRUE' for the following cases: -// - JitConfig.JitStressEVEXEncoding flag is set. +// Eventually, this should evolve to return `true` for the following cases: +// - JitConfig.JitStressEvexEncoding flag is set. // - Is an new AVX512 instruction. // - Uses ZMM vector registers. // - Uses upper 128-bit or 256-bit registers for an AVX512VL ins. @@ -787,9 +792,11 @@ bool emitter::TakesEvexPrefix(instruction ins) const } // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added. 
- return IsAvx512Instruction(ins) && !HasKMaskRegisterDest(ins); + return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins); } +// Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section +// 2.6. // Add base EVEX prefix without setting W, R, X, or B bits // L'L bits will be set based on emitter attr. // @@ -813,6 +820,7 @@ bool emitter::TakesEvexPrefix(instruction ins) const // - V'- bit to extend vvvv // - aaa - specifies mask register // Rest - reserved for future use and usage of them will uresult in Undefined instruction exception. +// #define DEFAULT_BYTE_EVEX_PREFIX 0x62F07C0800000000ULL #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL @@ -833,7 +841,7 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at { // Only AVX512 instructions require EVEX prefix - assert(IsAvx512Instruction(ins)); + assert(IsEvexEncodedInstruction(ins)); // Shouldn't have already added EVEX prefix assert(!hasEvexPrefix(code)); @@ -873,7 +881,7 @@ bool emitter::TakesVexPrefix(instruction ins) const break; } - return IsAVXInstruction(ins); + return IsVexEncodedInstruction(ins); } // Add base VEX prefix without setting W, R, X, or B bits @@ -910,7 +918,7 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att // emitted, by simply checking that all the requirements were met. // Only AVX instructions require VEX prefix - assert(IsAVXInstruction(ins)); + assert(IsVexEncodedInstruction(ins)); // Shouldn't have already added VEX prefix assert(!hasVexPrefix(code)); @@ -1128,7 +1136,7 @@ unsigned RegEncoding(regNumber reg) // AVX: specific bits within VEX prefix need to be set in bit-inverted form. 
emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) { - if (UseEvexEncoding() && IsAvx512Instruction(ins)) + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { @@ -1139,7 +1147,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) return emitter::code_t(code | 0x0000800000000000ULL); } } - if (UseVEXEncoding() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { if (TakesVexPrefix(ins)) { @@ -1162,7 +1170,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) { - if (UseEvexEncoding() && IsAvx512Instruction(ins)) + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { @@ -1173,7 +1181,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) return code & 0xFF7FFFFFFFFFFFFFULL; } } - if (UseVEXEncoding() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { if (TakesVexPrefix(ins)) { @@ -1190,7 +1198,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) { - if (UseEvexEncoding() && IsAvx512Instruction(ins)) + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { if (TakesEvexPrefix(ins)) { @@ -1200,7 +1208,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) return code & 0xFFBFFFFFFFFFFFFFULL; } } - if (UseVEXEncoding() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { if (TakesVexPrefix(ins)) { @@ -1217,7 +1225,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) { - if (UseEvexEncoding() && 
IsAvx512Instruction(ins)) + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { @@ -1228,7 +1236,7 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) return code & 0xFFDFFFFFFFFFFFFFULL; } } - if (UseVEXEncoding() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { if (TakesVexPrefix(ins)) { @@ -1246,8 +1254,8 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) // Adds REX prefix (0x40) without W, R, X or B bits set emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) { - assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); - assert(!UseEvexEncoding() || !IsAvx512Instruction(ins)); + assert(!UseVEXEncoding() || !IsVexEncodedInstruction(ins)); + assert(!UseEvexEncoding() || !IsEvexEncodedInstruction(ins)); return code | 0x4000000000ULL; } @@ -1289,7 +1297,7 @@ unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_ if (hasEvexPrefix(code)) { // Only AVX512 instructions should have an EVEX prefix - assert(IsAvx512Instruction(ins)); + assert(IsEvexEncodedInstruction(ins)); code_t evexPrefix = (code >> 32) & 0xFFFFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -1391,7 +1399,7 @@ unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_ else if (hasVexPrefix(code)) { // Only AVX instructions should have a VEX prefix - assert(UseVEXEncoding() && IsAVXInstruction(ins)); + assert(UseVEXEncoding() && IsVexEncodedInstruction(ins)); code_t vexPrefix = (code >> 32) & 0x00FFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -1631,7 +1639,7 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c if (hasVexPrefix(code)) { // Only AVX instructions should have a VEX prefix - assert(UseVEXEncoding() && IsAVXInstruction(ins)); + assert(UseVEXEncoding() && IsVexEncodedInstruction(ins)); code_t vexPrefix = (code >> 32) & 
0x00FFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -1898,7 +1906,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins) { // In case of AVX instructions, REX prefixes are part of VEX prefix. // And hence requires no additional byte to encode REX prefixes. - if (IsAVXInstruction(ins)) + if (IsVexOrEvexEncodedInstruction(ins)) { return 0; } @@ -1919,7 +1927,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins) // unsigned emitter::emitGetEvexPrefixSize(instruction ins) { - if (IsAvx512Instruction(ins)) + if (IsEvexEncodedInstruction(ins)) { return 4; } @@ -1931,7 +1939,7 @@ unsigned emitter::emitGetEvexPrefixSize(instruction ins) // Size of vex prefix in bytes unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr) { - if (IsAVXInstruction(ins)) + if (IsVexEncodedInstruction(ins)) { return 3; } @@ -1958,9 +1966,10 @@ unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, c unsigned adjustedSize = 0; // TODO-XArch-AVX512: Remove redundant code and possiblly collapse EVEX and VEX into a single pathway - // IsAvx512Instruction(ins) is TRUE for AVX/SSE instructions also which needs to be VEX encoded unless explicitly + // IsEvexEncodedInstruction(ins) is `true` for AVX/SSE instructions also which needs to be VEX encoded unless + // explicitly // asked for EVEX. - if (IsAvx512Instruction(ins) && TakesEvexPrefix(ins)) + if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(ins)) { // EVEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. // Therefore, to estimate the size adding EVEX prefix size and size of instruction opcode bytes will always @@ -2007,7 +2016,7 @@ unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, c adjustedSize = evexPrefixAdjustedSize; } - else if (IsAVXInstruction(ins)) + else if (IsVexEncodedInstruction(ins)) { // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. 
// Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always @@ -2089,7 +2098,7 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod { unsigned adjustedSize = 0; - if (IsAVXInstruction(ins)) + if (IsVexEncodedInstruction(ins)) { // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. // Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always @@ -2603,7 +2612,7 @@ insTupleType insTupleTypeInfos[] = inline bool hasTupleTypeInfo(instruction ins) { assert((unsigned)ins < ArrLen(insTupleTypeInfos)); - return ((insTupleTypeInfos[ins] != INS_TT_NONE)); + return (insTupleTypeInfos[ins] != INS_TT_NONE); } //------------------------------------------------------------------------ @@ -2618,7 +2627,7 @@ inline bool hasTupleTypeInfo(instruction ins) inline insTupleType insTupleTypeInfo(instruction ins) { assert((unsigned)ins < ArrLen(insTupleTypeInfos)); - assert((insTupleTypeInfos[ins] != INS_TT_NONE)); + assert(insTupleTypeInfos[ins] != INS_TT_NONE); return insTupleTypeInfos[ins]; } @@ -2729,7 +2738,7 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) { assert(reg < REG_STK); - assert(IsSimdInstruction(ins)); + assert(IsVexOrEvexEncodedInstruction(ins)); assert(hasVexOrEvexPrefix(code)); // Get 4-bit register encoding @@ -2743,7 +2752,7 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, // Both prefix encodes register operand in 1's complement form assert(regBits <= 0xF); - if (UseEvexEncoding() && IsAvx512Instruction(ins)) + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) { @@ -2754,7 +2763,7 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, 
regNumber reg, return code ^ regBits; } } - if (UseVEXEncoding() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { if (TakesVexPrefix(ins)) { @@ -3080,7 +3089,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) (!id->idIsSmallDsc() && (IsExtendedReg(id->idReg3(), attr) || IsExtendedReg(id->idReg4(), attr)))) { sz += emitGetRexPrefixSize(ins); - includeRexPrefixSize = !IsAVXInstruction(ins); + includeRexPrefixSize = !IsVexEncodedInstruction(ins); } sz += emitInsSize(code, includeRexPrefixSize); @@ -3733,7 +3742,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code, int val } else { - assert(!IsSSEOrAVXInstruction(ins)); + assert(!IsAvx512OrPriorInstruction(ins)); } return valSize + emitInsSizeAM(id, code); @@ -6006,7 +6015,7 @@ void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int off void emitter::emitIns_AR_R_R( instruction ins, emitAttr attr, regNumber op2Reg, regNumber op3Reg, regNumber base, int offs) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); instrDesc* id = emitNewInstrAmd(attr, offs); @@ -6046,7 +6055,7 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival) { noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); ssize_t offs = indir->Offset(); instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); @@ -6066,7 +6075,7 @@ void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenT void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival) { noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); 
instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); @@ -6113,7 +6122,7 @@ void emitter::emitIns_R_C_I( void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival) { noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); instrDesc* id = emitNewInstrCns(attr, ival); @@ -6135,7 +6144,7 @@ void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); ssize_t offs = indir->Offset(); @@ -6156,7 +6165,7 @@ void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regN void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); instrDesc* id = emitNewInstrAmd(attr, offs); @@ -6322,7 +6331,7 @@ void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regN void emitter::emitIns_R_R_A_I( instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); ssize_t offs = indir->Offset(); @@ -6344,7 +6353,7 @@ void emitter::emitIns_R_R_A_I( void emitter::emitIns_R_R_AR_I( instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); @@ -6456,7 +6465,7 @@ void emitter::emitIns_R_R_R_I( void 
emitter::emitIns_R_R_S_I( instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); instrDesc* id = emitNewInstrCns(attr, ival); @@ -6516,7 +6525,7 @@ void emitter::emitIns_R_R_A_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir) { assert(isAvxBlendv(ins)); - assert(UseVEXEncoding()); + assert(UseSimdEncoding()); int ival = encodeXmmRegAsIval(op3Reg); ssize_t offs = indir->Offset(); @@ -6555,7 +6564,7 @@ void emitter::emitIns_R_R_AR_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base, int offs) { assert(isAvxBlendv(ins)); - assert(UseVEXEncoding()); + assert(UseSimdEncoding()); int ival = encodeXmmRegAsIval(op3Reg); instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); @@ -7229,7 +7238,7 @@ void emitter::emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs // void emitter::emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, regNumber reg, int imm) { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); assert(reg != REG_NA); instrDesc* id = emitNewInstrAmdCns(attr, indir->Offset(), imm); @@ -10641,7 +10650,7 @@ void emitter::emitDispIns( case IF_RWR_RRD_RRD: { - assert(IsAVXInstruction(ins)); + assert(IsVexEncodedInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); regNumber reg2 = id->idReg2(); regNumber reg3 = id->idReg3(); @@ -10664,7 +10673,7 @@ void emitter::emitDispIns( } case IF_RWR_RRD_RRD_CNS: - assert(IsAVXInstruction(ins)); + assert(IsVexEncodedInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); printf("%s, ", emitRegName(id->idReg1(), attr)); printf("%s, ", emitRegName(id->idReg2(), attr)); @@ -11474,8 +11483,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Emit the 
REX prefix if required // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doind so currently - // since we cannot differentiate EVEX vs VEX without 'code' untill all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently + // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) { code = AddRexWPrefix(ins, code); @@ -11601,7 +11610,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) code += 4; } } - else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins)) + else if (!IsSSEInstruction(ins) && !IsVexEncodedInstruction(ins)) { /* Is the operand size larger than a byte? */ @@ -12408,7 +12417,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) code += 4; } } - else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins) && !IsAvx512Instruction(ins)) + else if (!IsSSEInstruction(ins) && !IsVexEncodedInstruction(ins) && !IsEvexEncodedInstruction(ins)) { // Is the operand size larger than a byte? 
switch (size) @@ -13186,7 +13195,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // We would to update GC info correctly assert(!IsSSEInstruction(ins)); - assert(!IsAVXInstruction(ins)); + assert(!IsVexEncodedInstruction(ins)); // Get the 'base' opcode switch (ins) @@ -13618,7 +13627,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) else if ((code & 0xFF) == 0x00) { // This case happens for some SSE/AVX instructions only - assert(IsAVXInstruction(ins) || Is4ByteSSEInstruction(ins)); + assert(IsVexEncodedInstruction(ins) || Is4ByteSSEInstruction(ins)); dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); @@ -13818,7 +13827,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code_t code; instruction ins = id->idIns(); - assert(IsSimdInstruction(ins)); + assert(IsVexOrEvexEncodedInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); @@ -13866,7 +13875,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) else if ((code & 0xFF) == 0x00) { // This case happens for AVX instructions only - assert(IsSimdInstruction(ins)); + assert(IsVexOrEvexEncodedInstruction(ins)); dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); @@ -14264,7 +14273,7 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) // We would to update GC info correctly assert(!IsSSEInstruction(ins)); - assert(!IsAVXInstruction(ins)); + assert(!IsVexEncodedInstruction(ins)); #ifdef TARGET_AMD64 // all these opcodes take a sign-extended 4-byte immediate, max @@ -14358,7 +14367,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // SSE/AVX doesnt make any sense here assert(!IsSSEInstruction(ins)); - assert(!IsAVXInstruction(ins)); + assert(!IsVexEncodedInstruction(ins)); size_t ssz; size_t lsz; @@ -14714,7 +14723,7 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) // Arguments: // 
id -- Instruction descriptor. // dsp -- Displacemnt. -// dspInByte[out] - TRUE if compressed displacement +// dspInByte[out] - `true` if compressed displacement // // Return Value: // compressed displacement value if dspInByte === TRUE. @@ -15367,7 +15376,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) else if ((code & 0xFF) == 0x00) { // This case happens for some SSE/AVX instructions only - assert(IsAVXInstruction(ins) || Is4ByteSSEInstruction(ins)); + assert(IsVexEncodedInstruction(ins) || Is4ByteSSEInstruction(ins)); dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regcode)); @@ -15439,7 +15448,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_ARD_CNS: case IF_RWR_ARD_CNS: - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); code = insCodeRM(ins); @@ -15459,7 +15468,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; case IF_AWR_RRD_CNS: - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); code = insCodeMR(ins); dst = emitOutputAM(dst, id, code, &cnsVal); @@ -15498,7 +15507,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_RRD_ARD_CNS: case IF_RWR_RRD_ARD_RRD: { - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); code = insCodeRM(ins); if (EncodedBySSE38orSSE3A(ins)) @@ -15607,7 +15616,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_SRD_CNS: case IF_RWR_SRD_CNS: - assert(IsSSEOrAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins)); emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); @@ -15671,7 +15680,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_RRD_SRD: { - assert(IsSimdInstruction(ins)); + 
assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(ins, code, size); @@ -15696,7 +15705,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_RRD_SRD_RRD: { // This should only be called on AVX instructions - assert(IsAVXInstruction(ins)); + assert(IsVexOrEvexEncodedInstruction(ins)); emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); @@ -15836,7 +15845,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_RRD_MRD: { // This should only be called on AVX instructions - assert(IsAVXInstruction(ins)); + assert(IsVexEncodedInstruction(ins)); code = insCodeRM(ins); code = AddVexPrefixIfNeeded(ins, code, size); @@ -15861,7 +15870,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_RRD_MRD_RRD: { // This should only be called on AVX instructions - assert(IsAVXInstruction(ins)); + assert(IsVexEncodedInstruction(ins)); emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index e2910b3b34192f..13f79bc3833cca 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -98,9 +98,9 @@ static bool IsBMIInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); -bool IsAVXInstruction(instruction ins) const; -bool IsAvx512Instruction(instruction ins) const; -bool IsSimdInstruction(instruction ins) const; +bool IsVexEncodedInstruction(instruction ins) const; +bool IsEvexEncodedInstruction(instruction ins) const; +bool IsVexOrEvexEncodedInstruction(instruction ins) const; code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); @@ -178,7 +178,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr // ins - The instruction to check. // // Returns: -// TRUE if W bit needs to be set to 1. 
+// `true` if W bit needs to be set to 1. // bool IsWEvexOpcodeExtension(instruction ins) { @@ -410,7 +410,7 @@ bool IsWEvexOpcodeExtension(instruction ins) // ins - The instruction to check. // // Returns: -// TRUE if Evex encoding requires KMAsk support. +// `true` if Evex encoding requires KMAsk support. // bool HasKMaskRegisterDest(instruction ins) const { @@ -474,7 +474,8 @@ void SetUseEvexEncoding(bool value) // contains Evex prefix. // // Returns: -// TRUE if target supports either. +// `true` if target supports either. +// bool UseSimdEncoding() const { return UseVEXEncoding() || UseEvexEncoding(); @@ -494,7 +495,8 @@ bool TakesEvexPrefix(instruction ins) const; // code - opcode + prefixes bits at some stage of encoding. // // Returns: -// TRUE if code has an Evex prefix. +// `true` if code has an Evex prefix. +// bool hasEvexPrefix(code_t code) { return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; @@ -534,7 +536,8 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) // size - operand size // // Returns: -// TRUE if code has an Evex prefix. +// `true` if code has an Evex prefix. +// code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) { if (TakesEvexPrefix(ins)) @@ -558,7 +561,8 @@ bool TakesSimdPrefix(instruction ins) const; // code - opcode + prefixes bits at some stage of encoding. // // Returns: -// TRUE if code has a SIMD prefix. +// `true` if code has a SIMD prefix. +// bool hasVexOrEvexPrefix(code_t code) { return (hasVexPrefix(code) || hasEvexPrefix(code)); @@ -575,7 +579,8 @@ ssize_t TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte); // code - opcode + prefixes bits at some stage of encoding. // // Returns: -// TRUE if code has an Evex prefix. +// `true` if code has an Evex prefix. +// bool codeEvexMigrationCheck(code_t code) { return hasEvexPrefix(code); @@ -1011,7 +1016,8 @@ inline bool emitIsUncondJump(instrDesc* jmp) // id - Instruction descriptor. 
// // Returns: -// TRUE if the instruction does embeddded broadcast. +// `true` if the instruction does embedded broadcast. +// inline bool HasEmbeddedBroadcast(instrDesc* id) { return false; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index cc552a964ed21c..406becd21fb853 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -101,7 +101,7 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; - if (GetEmitter()->IsAVXInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) + if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) { sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); retbuf = buf[curBuf]; @@ -1788,7 +1788,17 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) } } -// Conversions to or from floating point values +//------------------------------------------------------------------------ +// ins_FloatConv: Conversions to or from floating point values. +// +// Arguments: +// to - Destination type. +// from - Source type. +// attr - Input size. +// +// Returns: +// The correct conversion instruction to use based on src and dst types. +// instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) { // AVX: For now we support only conversion from Int/Long -> float @@ -1887,7 +1897,7 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) } } -instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) +instruction CodeGen::ins_FloatConv(var_types to, var_types from) { switch (from) { diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 103ec6140c2d53..180ad19ad3a96e 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -341,7 +341,7 @@ enum insBarrier : unsigned #if defined(TARGET_XARCH) // Represents tupletype attribute of instruction. 
-// This is used in determining factor N while calculated compressed displacement in EVEX encoding +// This is used in determining factor N while calculating compressed displacement in EVEX encoding // Reference: Section 2.6.5 in Intel 64 and ia-32 architectures software developer's manual volume 2. enum insTupleType : uint32_t { @@ -353,13 +353,13 @@ enum insTupleType : uint32_t INS_TT_TUPLE1_SCALAR = 0x00020, INS_TT_TUPLE1_FIXED = 0x00040, INS_TT_TUPLE2 = 0x00080, - INS_TT_TUPLE4 = 0x01000, - INS_TT_TUPLE8 = 0x02000, - INS_TT_HALF_MEM = 0x04000, - INS_TT_QUARTER_MEM = 0x08000, - INS_TT_EIGHTH_MEM = 0x10000, - INS_TT_MEM128 = 0x20000, - INS_TT_MOVDDUP = 0x40000, + INS_TT_TUPLE4 = 0x00100, + INS_TT_TUPLE8 = 0x00200, + INS_TT_HALF_MEM = 0x00400, + INS_TT_QUARTER_MEM = 0x00800, + INS_TT_EIGHTH_MEM = 0x01000, + INS_TT_MEM128 = 0x02000, + INS_TT_MOVDDUP = 0x04000, INS_TT_IS_NON_BROADCAST = 0x7FFFC }; #endif diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 55a39b52bffbe8..59c6dc1221f4d0 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -15,6 +15,7 @@ * rm -- base encoding for reg,R/M addressing mode * a4 -- base encoding for eax,i32 addressing mode * rr -- base encoding for register addressing mode + * tt -- the tupletype for the instruction * flags -- flags, see INS_FLAGS_* enum * ******************************************************************************/ @@ -64,7 +65,7 @@ INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. 
-INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None ) +INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None ) // id nm um mr mi rm a4 tt flags INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index eea4ae64e9b64c..2f337078a71de7 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -297,9 +297,11 @@ CONFIG_INTEGER(AltJitAssertOnNYI, W("AltJitAssertOnNYI"), 1) // Controls the Alt CONFIG_INTEGER(EnableEHWriteThru, W("EnableEHWriteThru"), 1) // Enable the register allocator to support EH-write thru: // partial enregistration of vars exposed on EH boundaries -CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are - // defined or used in a multireg context. -CONFIG_INTEGER(JitStressEVEXEncoding, W("JitStressEVEXEncoding"), 0) // Enable EVEX encoding for SIMD instructions. +CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are + // defined or used in a multireg context. +#if defined(DEBUG) +CONFIG_INTEGER(JitStressEvexEncoding, W("JitStressEvexEncoding"), 0) // Enable EVEX encoding for SIMD instructions. +#endif // DEBUG // DEBUG // clang-format off