From b172f73e04027fe74ab682ae23b2efbee3904191 Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
Date: Wed, 26 Oct 2022 12:05:38 -0700
Subject: [PATCH 1/4] Cleaning up TODO-XARCH-CLEANUP on EA_8BYTE size.

---
 src/coreclr/jit/codegenxarch.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 9cbb8fc3de37e9..60bb1440123cc4 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -10714,8 +10714,7 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
         {
             // ABI requires us to preserve lower 128-bits of YMM register.
             GetEmitter()->emitIns_AR_R(copyIns,
-                                       EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
-                                                 // EA_16BYTE
+                                       EA_16BYTE,
                                        reg, REG_SPBASE, offset);
             compiler->unwindSaveReg(reg, offset);
             regMask &= ~regBit;
@@ -10781,8 +10780,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
         {
             // ABI requires us to restore lower 128-bits of YMM register.
             GetEmitter()->emitIns_R_AR(copyIns,
-                                       EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
-                                                 // EA_16BYTE
+                                       EA_16BYTE,
                                        reg, regBase, offset);
             regMask &= ~regBit;
             offset -= XMM_REGSIZE_BYTES;

From d6e5fd2225d6c3785e47d5d53d9185db59d9efa1 Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
Date: Wed, 26 Oct 2022 12:06:13 -0700
Subject: [PATCH 2/4] Adding EVEX encoding pathways for emitOutputRR().

---
 src/coreclr/jit/emitxarch.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 71b2d2ef42ad42..2eb52104e83f32 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -3051,7 +3051,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val
     }
     else
     {
-        assert(!IsSSEOrAVXInstruction(ins));
+        assert(!IsAvx512OrPriorInstruction(ins));
     }
 
     return valSize + emitInsSizeRR(id, code);
@@ -3066,7 +3066,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re
     // This would probably be better expressed as a different format or something?
     code_t code = insCodeRM(ins);
 
-    UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, size, insCodeRM(ins));
+    UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, size, insCodeRM(ins));
 
     bool includeRexPrefixSize = true;
     // REX prefix
@@ -3082,7 +3082,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re
 
     if ((code & 0xFF00) != 0)
     {
-        sz += IsSSEOrAVXInstruction(ins) ? emitInsSize(code, includeRexPrefixSize) : 5;
+        sz += IsAvx512OrPriorInstruction(ins) ? emitInsSize(code, includeRexPrefixSize) : 5;
     }
     else
     {
@@ -13274,7 +13274,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     regNumber   reg2 = id->idReg2();
     emitAttr    size = id->idOpSize();
 
-    if (IsSSEOrAVXInstruction(ins))
+    if (IsAvx512OrPriorInstruction(ins))
     {
         assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2)));
 
@@ -13286,10 +13286,12 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
         {
             code = insCodeMR(ins);
         }
-        code = AddVexPrefixIfNeeded(ins, code, size);
+        code = AddSimdPrefixIfNeeded(ins, code, size);
         code = insEncodeRMreg(ins, code);
 
-        if (TakesRexWPrefix(ins, size))
+        // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support.
+        // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag.
+        if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
         {
             code = AddRexWPrefix(ins, code);
         }
@@ -13298,7 +13300,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     {
         assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins));
         code = insCodeRM(ins);
-        code = AddVexPrefixIfNeeded(ins, code, size);
+        code = AddSimdPrefixIfNeeded(ins, code, size);
         code = insEncodeRMreg(ins, code) | (int)(size == EA_2BYTE);
 #ifdef TARGET_AMD64
 
@@ -13312,7 +13314,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     {
         assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins));
         code = insCodeRM(ins);
-        code = AddVexPrefixIfNeeded(ins, code, size);
+        code = AddSimdPrefixIfNeeded(ins, code, size);
         code = insEncodeRMreg(ins, code);
 
 #endif // TARGET_AMD64
@@ -13343,7 +13345,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
 #endif // FEATURE_HW_INTRINSICS
     else
     {
-        assert(!TakesVexPrefix(ins));
+        assert(!TakesSimdPrefix(ins));
         code = insCodeMR(ins);
         code = insEncodeMRreg(ins, code);
 
@@ -13415,7 +13417,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     unsigned regCode = insEncodeReg345(ins, regFor345Bits, size, &code);
     regCode |= insEncodeReg012(ins, regFor012Bits, size, &code);
 
-    if (TakesVexPrefix(ins))
+    if (TakesSimdPrefix(ins))
     {
         // In case of AVX instructions that take 3 operands, we generally want to encode reg1
         // as first source.  In this case, reg1 is both a source and a destination.
@@ -13437,7 +13439,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     }
 
     // Output the REX prefix
-    dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
     if (code & 0xFF000000)
     {

From 257037422b8a7c8e139b3c5fc75d5826ded5e16b Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
Date: Wed, 26 Oct 2022 17:10:06 -0700
Subject: [PATCH 3/4] Adding EVEX encoding support for AM paths.

---
 src/coreclr/jit/codegen.h              |    2 +-
 src/coreclr/jit/codegenxarch.cpp       |   14 +-
 src/coreclr/jit/emit.cpp               |    1 +
 src/coreclr/jit/emitxarch.cpp          |  540 ++++++++---
 src/coreclr/jit/emitxarch.h            |   67 +-
 src/coreclr/jit/hwintrinsiclistxarch.h |    8 +-
 src/coreclr/jit/instr.cpp              |   40 +-
 src/coreclr/jit/instr.h                |  110 ++-
 src/coreclr/jit/instrsxarch.h          | 1202 ++++++++++++------------
 src/coreclr/jit/simdcodegenxarch.cpp   |    3 +-
 10 files changed, 1214 insertions(+), 773 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 0df8859f4ad4ed..cc544e5bb7e532 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1700,7 +1700,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 
     instruction ins_Copy(var_types dstType);
     instruction ins_Copy(regNumber srcReg, var_types dstType);
-    instruction ins_FloatConv(var_types to, var_types from);
+    instruction ins_FloatConv(var_types to, var_types from, emitAttr attr);
     instruction ins_MathOp(genTreeOps oper, var_types type);
 
     void instGen_Return(unsigned stkArgSize);
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 60bb1440123cc4..3f33ee2ac044ca 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -6911,7 +6911,7 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
     }
     else
     {
-        instruction ins = ins_FloatConv(dstType, srcType);
+        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType));
         GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
     }
 
@@ -7005,7 +7005,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 
     // Note that here we need to specify srcType that will determine
     // the size of source reg/mem operand and rex.w prefix.
-    instruction ins = ins_FloatConv(dstType, TYP_INT);
+    instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType));
     GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
 
     // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
@@ -7110,7 +7110,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
     // Note that we need to specify dstType here so that it will determine
     // the size of destination integer register and also the rex.w prefix.
     genConsumeOperands(treeNode->AsOp());
-    instruction ins = ins_FloatConv(TYP_INT, srcType);
+    instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(dstType));
     GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
     genProduceReg(treeNode);
 }
@@ -10713,9 +10713,7 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
         if ((regBit & regMask) != 0)
         {
             // ABI requires us to preserve lower 128-bits of YMM register.
-            GetEmitter()->emitIns_AR_R(copyIns,
-                                       EA_16BYTE,
-                                       reg, REG_SPBASE, offset);
+            GetEmitter()->emitIns_AR_R(copyIns, EA_16BYTE, reg, REG_SPBASE, offset);
             compiler->unwindSaveReg(reg, offset);
             regMask &= ~regBit;
             offset -= XMM_REGSIZE_BYTES;
@@ -10779,9 +10777,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
         if ((regBit & regMask) != 0)
         {
             // ABI requires us to restore lower 128-bits of YMM register.
-            GetEmitter()->emitIns_R_AR(copyIns,
-                                       EA_16BYTE,
-                                       reg, regBase, offset);
+            GetEmitter()->emitIns_R_AR(copyIns, EA_16BYTE, reg, regBase, offset);
             regMask &= ~regBit;
             offset -= XMM_REGSIZE_BYTES;
         }
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index a7644c3bcbd0f1..a54bacb0558ef6 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3999,6 +3999,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
     UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr);
 
     unsigned estimatedSize = id->idCodeSize();
+
     if (actualSize != estimatedSize)
     {
         // It is fatal to under-estimate the instruction size, except for alignment instructions
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 2eb52104e83f32..f73cb54d0361ce 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -746,6 +746,20 @@ bool emitter::Is4ByteSSEInstruction(instruction ins)
     return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins);
 }
 
+//------------------------------------------------------------------------
+// TakesSimdPrefix: Checks if the instruction should be VEX or EVEX encoded.
+//
+// Arguments:
+//    instruction -- processor instruction to check
+//
+// Return Value:
+//    true if this instruction requires a VEX or EVEX prefix.
+//
+bool emitter::TakesSimdPrefix(instruction ins) const
+{
+    return TakesEvexPrefix(ins) || TakesVexPrefix(ins);
+}
+
 //------------------------------------------------------------------------
 // TakesEvexPrefix: Checks if the instruction should be EVEX encoded.
 // TODO-XArch-AVX512: This check needs to be updated once AVX512 instructions are added.
@@ -1000,8 +1014,8 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr)
             case INS_cvttss2si:
             case INS_cvtsd2si:
             case INS_cvtss2si:
-            case INS_cvtsi2sd:
-            case INS_cvtsi2ss:
+            case INS_cvtsi2sd64:
+            case INS_cvtsi2ss64:
             case INS_movnti:
             case INS_mulx:
             case INS_pdep:
@@ -1176,6 +1190,16 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 {
+    if (UseEvexEncoding() && IsAvx512Instruction(ins))
+    {
+        if (TakesEvexPrefix(ins))
+        {
+            // X-bit is available in 4-byte EVEX prefix that starts with byte 62.
+            assert(hasEvexPrefix(code));
+            // X-bit is added in bit-inverted form.
+            return code & 0xFFBFFFFFFFFFFFFFULL;
+        }
+    }
     if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         if (TakesVexPrefix(ins))
@@ -2216,12 +2240,12 @@ inline ssize_t emitter::emitGetInsCIdisp(instrDesc* id)
 // clang-format off
 const insFlags      CodeGenInterface::instInfo[] =
 {
-    #define INST0(id, nm, um, mr,                 flags) static_cast<insFlags>(flags),
-    #define INST1(id, nm, um, mr,                 flags) static_cast<insFlags>(flags),
-    #define INST2(id, nm, um, mr, mi,             flags) static_cast<insFlags>(flags),
-    #define INST3(id, nm, um, mr, mi, rm,         flags) static_cast<insFlags>(flags),
-    #define INST4(id, nm, um, mr, mi, rm, a4,     flags) static_cast<insFlags>(flags),
-    #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) static_cast<insFlags>(flags),
+    #define INST0(id, nm, um, mr,                 tt, flags) static_cast<insFlags>(flags),
+    #define INST1(id, nm, um, mr,                 tt, flags) static_cast<insFlags>(flags),
+    #define INST2(id, nm, um, mr, mi,             tt, flags) static_cast<insFlags>(flags),
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) static_cast<insFlags>(flags),
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) static_cast<insFlags>(flags),
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) static_cast<insFlags>(flags),
     #include "instrs.h"
     #undef  INST0
     #undef  INST1
@@ -2240,12 +2264,12 @@ const insFlags      CodeGenInterface::instInfo[] =
 // clang-format off
 const BYTE          emitter::emitInsModeFmtTab[] =
 {
-    #define INST0(id, nm, um, mr,                 flags) um,
-    #define INST1(id, nm, um, mr,                 flags) um,
-    #define INST2(id, nm, um, mr, mi,             flags) um,
-    #define INST3(id, nm, um, mr, mi, rm,         flags) um,
-    #define INST4(id, nm, um, mr, mi, rm, a4,     flags) um,
-    #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) um,
+    #define INST0(id, nm, um, mr,                 tt, flags) um,
+    #define INST1(id, nm, um, mr,                 tt, flags) um,
+    #define INST2(id, nm, um, mr, mi,             tt, flags) um,
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) um,
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) um,
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) um,
     #include "instrs.h"
     #undef  INST0
     #undef  INST1
@@ -2340,12 +2364,12 @@ inline size_t insCode(instruction ins)
     const static
     size_t          insCodes[] =
     {
-        #define INST0(id, nm, um, mr,                 flags) mr,
-        #define INST1(id, nm, um, mr,                 flags) mr,
-        #define INST2(id, nm, um, mr, mi,             flags) mr,
-        #define INST3(id, nm, um, mr, mi, rm,         flags) mr,
-        #define INST4(id, nm, um, mr, mi, rm, a4,     flags) mr,
-        #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr,
+        #define INST0(id, nm, um, mr,                 tt, flags) mr,
+        #define INST1(id, nm, um, mr,                 tt, flags) mr,
+        #define INST2(id, nm, um, mr, mi,             tt, flags) mr,
+        #define INST3(id, nm, um, mr, mi, rm,         tt, flags) mr,
+        #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) mr,
+        #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) mr,
         #include "instrs.h"
         #undef  INST0
         #undef  INST1
@@ -2373,12 +2397,12 @@ inline size_t insCodeACC(instruction ins)
     const static
     size_t          insCodesACC[] =
     {
-        #define INST0(id, nm, um, mr,                 flags)
-        #define INST1(id, nm, um, mr,                 flags)
-        #define INST2(id, nm, um, mr, mi,             flags)
-        #define INST3(id, nm, um, mr, mi, rm,         flags)
-        #define INST4(id, nm, um, mr, mi, rm, a4,     flags) a4,
-        #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) a4,
+        #define INST0(id, nm, um, mr,                 tt, flags)
+        #define INST1(id, nm, um, mr,                 tt, flags)
+        #define INST2(id, nm, um, mr, mi,             tt, flags)
+        #define INST3(id, nm, um, mr, mi, rm,         tt, flags)
+        #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) a4,
+        #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) a4,
         #include "instrs.h"
         #undef  INST0
         #undef  INST1
@@ -2406,12 +2430,12 @@ inline size_t insCodeRR(instruction ins)
     const static
     size_t          insCodesRR[] =
     {
-        #define INST0(id, nm, um, mr,                 flags)
-        #define INST1(id, nm, um, mr,                 flags)
-        #define INST2(id, nm, um, mr, mi,             flags)
-        #define INST3(id, nm, um, mr, mi, rm,         flags)
-        #define INST4(id, nm, um, mr, mi, rm, a4,     flags)
-        #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rr,
+        #define INST0(id, nm, um, mr,                 tt, flags)
+        #define INST1(id, nm, um, mr,                 tt, flags)
+        #define INST2(id, nm, um, mr, mi,             tt, flags)
+        #define INST3(id, nm, um, mr, mi, rm,         tt, flags)
+        #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags)
+        #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) rr,
         #include "instrs.h"
         #undef  INST0
         #undef  INST1
@@ -2432,12 +2456,12 @@ inline size_t insCodeRR(instruction ins)
 const static
 size_t          insCodesRM[] =
 {
-    #define INST0(id, nm, um, mr,                 flags)
-    #define INST1(id, nm, um, mr,                 flags)
-    #define INST2(id, nm, um, mr, mi,             flags)
-    #define INST3(id, nm, um, mr, mi, rm,         flags) rm,
-    #define INST4(id, nm, um, mr, mi, rm, a4,     flags) rm,
-    #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rm,
+    #define INST0(id, nm, um, mr,                 tt, flags)
+    #define INST1(id, nm, um, mr,                 tt, flags)
+    #define INST2(id, nm, um, mr, mi,             tt, flags)
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) rm,
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) rm,
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) rm,
     #include "instrs.h"
     #undef  INST0
     #undef  INST1
@@ -2472,12 +2496,12 @@ inline size_t insCodeRM(instruction ins)
 const static
 size_t          insCodesMI[] =
 {
-    #define INST0(id, nm, um, mr,                 flags)
-    #define INST1(id, nm, um, mr,                 flags)
-    #define INST2(id, nm, um, mr, mi,             flags) mi,
-    #define INST3(id, nm, um, mr, mi, rm,         flags) mi,
-    #define INST4(id, nm, um, mr, mi, rm, a4,     flags) mi,
-    #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mi,
+    #define INST0(id, nm, um, mr,                 tt, flags)
+    #define INST1(id, nm, um, mr,                 tt, flags)
+    #define INST2(id, nm, um, mr, mi,             tt, flags) mi,
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) mi,
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) mi,
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) mi,
     #include "instrs.h"
     #undef  INST0
     #undef  INST1
@@ -2512,12 +2536,12 @@ inline size_t insCodeMI(instruction ins)
 const static
 size_t          insCodesMR[] =
 {
-    #define INST0(id, nm, um, mr,                 flags)
-    #define INST1(id, nm, um, mr,                 flags) mr,
-    #define INST2(id, nm, um, mr, mi,             flags) mr,
-    #define INST3(id, nm, um, mr, mi, rm,         flags) mr,
-    #define INST4(id, nm, um, mr, mi, rm, a4,     flags) mr,
-    #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr,
+    #define INST0(id, nm, um, mr,                 tt, flags)
+    #define INST1(id, nm, um, mr,                 tt, flags) mr,
+    #define INST2(id, nm, um, mr, mi,             tt, flags) mr,
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) mr,
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) mr,
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) mr,
     #include "instrs.h"
     #undef  INST0
     #undef  INST1
@@ -2548,6 +2572,56 @@ inline size_t insCodeMR(instruction ins)
     return insCodesMR[ins];
 }
 
+// clang-format off
+const static
+insTupleType insTupleTypeInfos[] =
+{
+    #define INST0(id, nm, um, mr,                 tt, flags) static_cast<insTupleType>(tt),
+    #define INST1(id, nm, um, mr,                 tt, flags) static_cast<insTupleType>(tt),
+    #define INST2(id, nm, um, mr, mi,             tt, flags) static_cast<insTupleType>(tt),
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) static_cast<insTupleType>(tt),
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) static_cast<insTupleType>(tt),
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) static_cast<insTupleType>(tt),
+    #include "instrs.h"
+    #undef  INST0
+    #undef  INST1
+    #undef  INST2
+    #undef  INST3
+    #undef  INST4
+    #undef  INST5
+};
+// clang-format on
+//------------------------------------------------------------------------
+// hasTupleTypeInfo: Checks if the instruction has tuple type info.
+//
+// Arguments:
+//    instruction -- processor instruction to check
+//
+// Return Value:
+//    true if this instruction has tuple type info.
+//
+inline bool hasTupleTypeInfo(instruction ins)
+{
+    assert((unsigned)ins < ArrLen(insTupleTypeInfos));
+    return ((insTupleTypeInfos[ins] != INS_TT_NONE));
+}
+
+//------------------------------------------------------------------------
+// insTupleTypeInfo: Returns the tuple type info for a given CPU instruction.
+//
+// Arguments:
+//    instruction -- processor instruction to check
+//
+// Return Value:
+//    the tuple type info for a given CPU instruction.
+//
+inline insTupleType insTupleTypeInfo(instruction ins)
+{
+    assert((unsigned)ins < ArrLen(insTupleTypeInfos));
+    assert((insTupleTypeInfos[ins] != INS_TT_NONE));
+    return insTupleTypeInfos[ins];
+}
+
 // Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
 bool emitter::EncodedBySSE38orSSE3A(instruction ins)
 {
@@ -2557,7 +2631,7 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins)
 
     size_t insCode = 0;
 
-    if (!IsSSEOrAVXInstruction(ins))
+    if (!IsAvx512OrPriorInstruction(ins))
     {
         return false;
     }
@@ -3092,9 +3166,19 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re
     return sz;
 }
 
-/*****************************************************************************/
-
-inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp)
+//------------------------------------------------------------------------
+// emitInsSizeSVCalcDisp: Calculate instruction size.
+//
+// Arguments:
+//    id -- Instruction descriptor.
+//    code -- The current opcode and any known prefixes
+//    var -- Stack variable.
+//    dsp -- Displacement.
+//
+// Return Value:
+//    Estimated instruction size.
+//
+inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp)
 {
     UNATIVE_OFFSET size = emitInsSize(code, /* includeRexPrefixSize */ true);
     UNATIVE_OFFSET offs;
@@ -3214,6 +3298,14 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp)
                     offs -= emitMaxTmpSize;
                 }
 
+                // Check whether we can use compressed displacement if EVEX.
+                if (TakesEvexPrefix(id->idIns()))
+                {
+                    bool compressedFitsInByte = false;
+                    TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte);
+                    return size + (compressedFitsInByte ? sizeof(char) : sizeof(int));
+                }
+
                 if ((int)offs < 0)
                 {
                     // offset is negative
@@ -3255,14 +3347,22 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp)
 
 #endif // !FEATURE_FIXED_OUT_ARGS
 
-//  printf("lcl = %04X, tmp = %04X, stk = %04X, offs = %04X\n",
-//         emitLclSize, emitMaxTmpSize, emitCurStackLvl, offs);
+    //  printf("lcl = %04X, tmp = %04X, stk = %04X, offs = %04X\n",
+    //         emitLclSize, emitMaxTmpSize, emitCurStackLvl, offs);
 
+    bool useSmallEncoding = false;
+    if (TakesEvexPrefix(id->idIns()))
+    {
+        TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding);
+    }
+    else
+    {
 #ifdef TARGET_AMD64
-    bool useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX);
+        useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX);
 #else
-    bool useSmallEncoding = (offs <= size_t(SCHAR_MAX));
+        useSmallEncoding = (offs <= size_t(SCHAR_MAX));
 #endif
+    }
 
     // If it is ESP based, and the offset is zero, we will not encode the disp part.
     if (!EBPbased && offs == 0)
@@ -3280,7 +3380,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
     assert(id->idIns() != INS_invalid);
     instruction    ins      = id->idIns();
     emitAttr       attrSize = id->idOpSize();
-    UNATIVE_OFFSET prefix   = emitGetAdjustedSize(ins, attrSize, code);
+    UNATIVE_OFFSET prefix   = emitGetAdjustedSizeEvexAware(ins, attrSize, code);
 
     // REX prefix
     if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) ||
@@ -3289,7 +3389,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
         prefix += emitGetRexPrefixSize(ins);
     }
 
-    return prefix + emitInsSizeSV(code, var, dsp);
+    return prefix + emitInsSizeSVCalcDisp(id, code, var, dsp);
 }
 
 inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val)
@@ -3298,7 +3398,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
     instruction    ins       = id->idIns();
     emitAttr       attrSize  = id->idOpSize();
     UNATIVE_OFFSET valSize   = EA_SIZE_IN_BYTES(attrSize);
-    UNATIVE_OFFSET prefix    = emitGetAdjustedSize(ins, attrSize, code);
+    UNATIVE_OFFSET prefix    = emitGetAdjustedSizeEvexAware(ins, attrSize, code);
     bool           valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test);
 
 #ifdef TARGET_AMD64
@@ -3334,7 +3434,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var
         prefix += emitGetRexPrefixSize(ins);
     }
 
-    return prefix + valSize + emitInsSizeSV(code, var, dsp);
+    return prefix + valSize + emitInsSizeSVCalcDisp(id, code, var, dsp);
 }
 
 /*****************************************************************************/
@@ -3402,6 +3502,13 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         dspInByte = false; // relocs can't be placed in a byte
         dspIsZero = false; // relocs won't always be zero
     }
+    else
+    {
+        if (TakesEvexPrefix(ins))
+        {
+            dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte);
+        }
+    }
 
     if (code & 0xFF000000)
     {
@@ -3424,7 +3531,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         size = 2;
     }
 
-    size += emitGetAdjustedSize(ins, attrSize, code);
+    size += emitGetAdjustedSizeEvexAware(ins, attrSize, code);
 
     if (hasRexPrefix(code))
     {
@@ -10488,7 +10595,8 @@ void emitter::emitDispIns(
             {
                 printf("%s, %s", emitRegName(id->idReg1(), EA_4BYTE), emitRegName(id->idReg2(), attr));
             }
-            else if ((ins == INS_cvtsi2ss) || (ins == INS_cvtsi2sd))
+            else if ((ins == INS_cvtsi2ss32) || (ins == INS_cvtsi2sd32) || (ins == INS_cvtsi2ss64) ||
+                     (ins == INS_cvtsi2sd64))
             {
                 printf(" %s, %s", emitRegName(id->idReg1(), EA_16BYTE), emitRegName(id->idReg2(), attr));
             }
@@ -11236,7 +11344,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         if (id->idIsCallRegPtr())
         {
             code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code);
-            dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+            dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
             dst += emitOutputWord(dst, code);
             goto DONE;
         }
@@ -11262,7 +11370,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // And emit the REX prefix
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
 #endif // TARGET_AMD64
 
@@ -11281,7 +11389,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         // SSE/AVX do not need to modify opcode
         if ((signed char)cval == cval && addc->cnsReloc == false && ins != INS_mov && ins != INS_test)
         {
-            if (id->idInsFmt() != IF_ARW_SHF && !IsSSEOrAVXInstruction(ins))
+            if (id->idInsFmt() != IF_ARW_SHF && !IsAvx512OrPriorInstruction(ins))
             {
                 code |= 2;
             }
@@ -11325,13 +11433,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 #endif // TARGET_X86
 
-    // Emit VEX prefix if required
-    // There are some callers who already add VEX prefix and call this routine.
-    // Therefore, add VEX prefix is one is not already present.
-    code = AddVexPrefixIfNeededAndNotPresent(ins, code, size);
+    // Emit SIMD prefix if required
+    // There are some callers who already add SIMD prefix and call this routine.
+    // Therefore, add SIMD prefix if one is not already present.
+    code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size);
 
     // For this format, moves do not support a third operand, so we only need to handle the binary ops.
-    if (TakesVexPrefix(ins))
+    if (TakesSimdPrefix(ins))
     {
         if (IsDstDstSrcAVXInstruction(ins))
         {
@@ -11365,7 +11473,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Emit the REX prefix if required
-    if (TakesRexWPrefix(ins, size))
+    // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support.
+    // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently
+    // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support.
+    if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
     {
         code = AddRexWPrefix(ins, code);
     }
@@ -11421,9 +11532,9 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
         unsigned regcode = insEncodeReg345(ins, reg345, size, &code);
 
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
-        if (UseVEXEncoding() && (ins != INS_crc32))
+        if (UseSimdEncoding() && (ins != INS_crc32))
         {
             // Emit last opcode byte
             // TODO-XArch-CQ: Right now support 4-byte opcode instructions only
@@ -11449,7 +11560,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // Output the REX prefix
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest word of the opcode
         // We need to check again as in case of AVX instructions leading opcode bytes are stripped off
@@ -11466,7 +11577,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         assert(ins != INS_bt);
 
         // Output the REX prefix
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest byte of the opcode
         if (code & 0x00FF0000)
@@ -11534,20 +11645,30 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Output the REX prefix
-    dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
     // Get the displacement value
     dsp = emitGetInsAmdAny(id);
 
 GOT_DSP:
 
-    dspInByte = ((signed char)dsp == (ssize_t)dsp);
     dspIsZero = (dsp == 0);
 
     if (id->idIsDspReloc())
     {
         dspInByte = false; // relocs can't be placed in a byte
     }
+    else
+    {
+        if (TakesEvexPrefix(ins))
+        {
+            dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte);
+        }
+        else
+        {
+            dspInByte = ((signed char)dsp == (ssize_t)dsp);
+        }
+    }
 
     if (isMoffset)
     {
@@ -12160,11 +12281,11 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         ssize_t cval = addc->cnsVal;
 
         // Does the constant fit in a byte?
-        // SSE/AVX do not need to modify opcode
+        // SSE/AVX/AVX512 do not need to modify opcode
         if ((signed char)cval == cval && addc->cnsReloc == false && ins != INS_mov && ins != INS_test)
         {
             if ((id->idInsFmt() != IF_SRW_SHF) && (id->idInsFmt() != IF_RRW_SRD_CNS) &&
-                (id->idInsFmt() != IF_RWR_RRD_SRD_CNS) && !IsSSEOrAVXInstruction(ins))
+                (id->idInsFmt() != IF_RWR_RRD_SRD_CNS) && !IsAvx512OrPriorInstruction(ins))
             {
                 code |= 2;
             }
@@ -12173,13 +12294,17 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
     }
 
-    // Add VEX prefix if required.
-    // There are some callers who already add VEX prefix and call this routine.
-    // Therefore, add VEX prefix is one is not already present.
-    code = AddVexPrefixIfNeededAndNotPresent(ins, code, size);
+    // Add VEX or EVEX prefix if required.
+    // There are some callers who already add prefix and call this routine.
+    // Therefore, add VEX or EVEX prefix if one is not already present.
+    code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size);
 
     // Compute the REX prefix
-    if (TakesRexWPrefix(ins, size))
+    // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support.
+    // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix().
+    // Not doing so currently since we cannot differentiate EVEX vs VEX without
+    // 'code' until all paths have EVEX support.
+    if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
     {
         code = AddRexWPrefix(ins, code);
     }
@@ -12212,9 +12337,9 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
         unsigned regcode = insEncodeReg345(ins, reg345, size, &code);
 
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
-        if (UseVEXEncoding() && (ins != INS_crc32))
+        if (UseSimdEncoding() && (ins != INS_crc32))
         {
             // Emit last opcode byte
             // TODO-XArch-CQ: Right now support 4-byte opcode instructions only
@@ -12240,7 +12365,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         }
 
         // Output the REX prefix
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest word of the opcode
         // We need to check again because in case of AVX instructions the leading
@@ -12257,7 +12382,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         assert(ins != INS_bt);
 
         // Output the REX prefix
-        dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+        dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
         // Output the highest byte of the opcode.
         // We need to check again because in case of AVX instructions the leading
@@ -12283,7 +12408,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             code += 4;
         }
     }
-    else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins))
+    else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins) && !IsAvx512Instruction(ins))
     {
         // Is the operand size larger than a byte?
         switch (size)
@@ -12329,7 +12454,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Output the REX prefix
-    dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
+    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);
 
     // Figure out the variable's frame position
     int varNum = id->idAddr()->iiaLclVar.lvaVarNum();
@@ -12337,7 +12462,18 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     adr = emitComp->lvaFrameAddress(varNum, &EBPbased);
     dsp = adr + id->idAddr()->iiaLclVar.lvaOffset();
 
-    dspInByte = ((signed char)dsp == (int)dsp);
+    // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following
+    // function, to which the remainder of the emitter logic should handle properly.
+    // TODO-XARCH-AVX512 : embedded broadcast might change this
+    int dspAsByte = dsp;
+    if (TakesEvexPrefix(ins))
+    {
+        dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte));
+    }
+    else
+    {
+        dspInByte = ((signed char)dsp == (ssize_t)dsp);
+    }
     dspIsZero = (dsp == 0);
 
     // for stack variables the dsp should never be a reloc
@@ -12351,7 +12487,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             if (dspInByte)
             {
                 dst += emitOutputByte(dst, code | 0x45);
-                dst += emitOutputByte(dst, dsp);
+                dst += emitOutputByte(dst, dspAsByte);
             }
             else
             {
@@ -12364,7 +12500,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             if (dspInByte)
             {
                 dst += emitOutputWord(dst, code | 0x4500);
-                dst += emitOutputByte(dst, dsp);
+                dst += emitOutputByte(dst, dspAsByte);
             }
             else
             {
@@ -12381,7 +12517,21 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         dsp += emitCurStackLvl;
 #endif
 
-        dspInByte = ((signed char)dsp == (int)dsp);
+        // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following
+        // function, to which the remainder of the emitter logic should handle properly.
+        // TODO-XARCH-AVX512 : embedded broadcast might change this
+        if (TakesEvexPrefix(ins))
+        {
+            dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte));
+        }
+        else
+        {
+            dspInByte = ((signed char)dsp == (ssize_t)dsp);
+            if (dspInByte)
+            {
+                dspAsByte = dsp;
+            }
+        }
         dspIsZero = (dsp == 0);
 
         // Does the offset fit in a byte?
@@ -12398,7 +12548,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
                 {
                     dst += emitOutputByte(dst, code | 0x44);
                     dst += emitOutputByte(dst, 0x24);
-                    dst += emitOutputByte(dst, dsp);
+                    dst += emitOutputByte(dst, dspAsByte);
                 }
             }
             else
@@ -12421,7 +12571,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
                 {
                     dst += emitOutputWord(dst, code | 0x4400);
                     dst += emitOutputByte(dst, 0x24);
-                    dst += emitOutputByte(dst, dsp);
+                    dst += emitOutputByte(dst, dspAsByte);
                 }
             }
             else
@@ -14529,6 +14679,168 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i)
     return dst;
 }
 
+//------------------------------------------------------------------------
+// GetInputSizeInBytes: Get size of input for instruction in bytes.
+//
+// Arguments:
+//    id -- Instruction descriptor.
+//
+// Return Value:
+//    1, 2, 4 or 8 per the Input_8Bit/16Bit/32Bit/64Bit flag in instInfo; else the operand size of 'id'.
+//
+ssize_t emitter::GetInputSizeInBytes(instrDesc* id)
+{
+    insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[id->idIns()] & Input_Mask));
+    switch (inputSize)
+    {
+        case 0: // No Input_* flag recorded for this instruction: fall back to the operand size.
+            return EA_SIZE_IN_BYTES(id->idOpSize());
+        case Input_8Bit:
+            return 1;
+        case Input_16Bit:
+            return 2;
+        case Input_32Bit:
+            return 4;
+        case Input_64Bit:
+            return 8;
+        default:
+            unreached();
+    }
+}
+
+//------------------------------------------------------------------------
+// TryEvexCompressDisp8Byte: Try to encode a displacement in EVEX compressed disp8 (disp8*N) form.
+//
+// Arguments:
+//    id -- Instruction descriptor.
+//    dsp -- Displacement.
+//    dspInByte[out] - TRUE if dsp is 0 or is a multiple of N whose quotient fits in a signed byte
+//
+// Return Value:
+//    compressed displacement value (dsp / N) if dspInByte == TRUE.
+//    Original dsp otherwise.
+//
+ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte)
+{
+    assert(TakesEvexPrefix(id->idIns()));
+    insTupleType tt = insTupleTypeInfo(id->idIns());
+    assert(hasTupleTypeInfo(id->idIns()));
+
+    // if dsp is 0, no need for all of this
+    if (dsp == 0)
+    {
+        *dspInByte = true;
+        return dsp;
+    }
+
+    // Only handling non-broadcast forms right now
+    ssize_t vectorLength = EA_SIZE_IN_BYTES(id->idOpSize());
+
+    ssize_t inputSize = GetInputSizeInBytes(id);
+
+    ssize_t disp8Compression = 1;
+
+    switch (tt)
+    {
+        case INS_TT_FULL:
+            assert(inputSize == 4 || inputSize == 8);
+            if (HasEmbeddedBroadcast(id))
+            {
+                // N = input size in bytes
+                disp8Compression = inputSize;
+            }
+            else
+            {
+                // N = vector length in bytes
+                disp8Compression = vectorLength;
+            }
+            break;
+        case INS_TT_HALF:
+            assert(inputSize == 4);
+            if (HasEmbeddedBroadcast(id))
+            {
+                // N = input size in bytes
+                disp8Compression = inputSize;
+            }
+            else
+            {
+                // N = vector length in bytes / 2
+                disp8Compression = vectorLength / 2;
+            }
+            break;
+        case INS_TT_FULL_MEM:
+            // N = vector length in bytes
+            disp8Compression = vectorLength;
+            break;
+        case INS_TT_TUPLE1_SCALAR:
+        {
+            disp8Compression = inputSize;
+            break;
+        }
+        case INS_TT_TUPLE1_FIXED:
+            // N = input size in bytes, 32bit and 64bit only
+            assert(inputSize == 4 || inputSize == 8);
+            disp8Compression = inputSize;
+            break;
+        case INS_TT_TUPLE2:
+            // N = input size in bytes * 2, 32bit and 64bit for 256 bit and 512 bit only
+            assert((inputSize == 4) || (inputSize == 8 && vectorLength >= 32));
+            disp8Compression = inputSize * 2;
+            break;
+        case INS_TT_TUPLE4:
+            // N = input size in bytes * 4, 32bit for 256 bit and 512 bit, 64bit for 512 bit
+            assert((inputSize == 4 && vectorLength >= 32) || (inputSize == 8 && vectorLength >= 64));
+            disp8Compression = inputSize * 4;
+            break;
+        case INS_TT_TUPLE8:
+            // N = input size in bytes * 8 (an 8-element tuple), 32bit for 512 only
+            assert((inputSize == 4 && vectorLength >= 64));
+            disp8Compression = inputSize * 8;
+            break;
+        case INS_TT_HALF_MEM:
+            // N = vector length in bytes / 2
+            disp8Compression = vectorLength / 2;
+            break;
+        case INS_TT_QUARTER_MEM:
+            // N = vector length in bytes / 4
+            disp8Compression = vectorLength / 4;
+            break;
+        case INS_TT_EIGHTH_MEM:
+            // N = vector length in bytes / 8
+            disp8Compression = vectorLength / 8;
+            break;
+        case INS_TT_MEM128:
+            // N = 16
+            disp8Compression = 16;
+            break;
+        case INS_TT_MOVDDUP:
+            // N = vector length in bytes / 2 for 128-bit vectors, full vector length otherwise
+            disp8Compression = (vectorLength == 16) ? (vectorLength / 2) : vectorLength;
+            break;
+        default:
+            unreached();
+    }
+
+    // If we can evenly divide dsp by the disp8Compression, we can attempt to use it in a disp8 byte form
+    if (dsp % disp8Compression != 0)
+    {
+        *dspInByte = false;
+        return dsp;
+    }
+
+    ssize_t compressedDsp = dsp / disp8Compression;
+
+    *dspInByte = ((signed char)compressedDsp == (ssize_t)compressedDsp);
+    if (*dspInByte)
+    {
+        return compressedDsp;
+    }
+    else
+    {
+        return dsp;
+    }
+}
+
 /*****************************************************************************
  *
  *  Append the machine code corresponding to the given instruction descriptor
@@ -15138,7 +15450,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             }
             else
             {
-                code    = AddVexPrefixIfNeeded(ins, code, size);
+                code    = AddSimdPrefixIfNeeded(ins, code, size);
                 regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8);
                 dst     = emitOutputAM(dst, id, code | regcode, &cnsVal);
             }
@@ -15166,7 +15478,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             }
             else
             {
-                code    = AddVexPrefixIfNeeded(ins, code, size);
+                code    = AddSimdPrefixIfNeeded(ins, code, size);
                 regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8);
                 dst     = emitOutputAM(dst, id, code | regcode);
             }
@@ -15195,7 +15507,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             }
             else
             {
-                code    = AddVexPrefixIfNeeded(ins, code, size);
+                code    = AddSimdPrefixIfNeeded(ins, code, size);
                 regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8);
                 dst     = emitOutputAM(dst, id, code | regcode, &cnsVal);
             }
@@ -15207,7 +15519,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_AWR_RRD:
         case IF_ARW_RRD:
             code    = insCodeMR(ins);
-            code    = AddVexPrefixIfNeeded(ins, code, size);
+            code    = AddSimdPrefixIfNeeded(ins, code, size);
             regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8);
             dst     = emitOutputAM(dst, id, code | regcode);
             sz      = emitSizeOfInsDsc(id);
@@ -15216,7 +15528,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_AWR_RRD_RRD:
         {
             code = insCodeMR(ins);
-            code = AddVexPrefixIfNeeded(ins, code, size);
+            code = AddSimdPrefixIfNeeded(ins, code, size);
             dst  = emitOutputAM(dst, id, code);
             sz   = emitSizeOfInsDsc(id);
             break;
@@ -15285,7 +15597,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             break;
 
         case IF_SWR_RRD_CNS:
-            assert(IsSSEOrAVXInstruction(ins));
+            assert(ins == INS_vextracti128 || ins == INS_vextractf128);
+            assert(UseSimdEncoding());
             emitGetInsAmdCns(id, &cnsVal);
             code = insCodeMR(ins);
             dst  = emitOutputSV(dst, id, insCodeMR(ins), &cnsVal);
@@ -15305,7 +15618,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             }
             else
             {
-                code = AddVexPrefixIfNeeded(ins, code, size);
+                code = AddSimdPrefixIfNeeded(ins, code, size);
 
                 // In case of AVX instructions that take 3 operands, encode reg1 as first source.
                 // Note that reg1 is both a source and a destination.
@@ -15340,7 +15653,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             }
             else
             {
-                code = AddVexPrefixIfNeeded(ins, code, size);
+                code = AddSimdPrefixIfNeeded(ins, code, size);
 
                 if (IsDstDstSrcAVXInstruction(ins))
                 {
@@ -15358,11 +15671,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
         case IF_RWR_RRD_SRD:
         {
-            // This should only be called on AVX instructions
-            assert(IsAVXInstruction(ins));
+            assert(IsSimdInstruction(ins));
 
             code = insCodeRM(ins);
-            code = AddVexPrefixIfNeeded(ins, code, size);
+            code = AddSimdPrefixIfNeeded(ins, code, size);
             code = insEncodeReg3456(ins, id->idReg2(), size,
                                     code); // encode source operand reg in 'vvvv' bits in 1's complement form
 
@@ -15388,7 +15700,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             emitGetInsCns(id, &cnsVal);
 
             code = insCodeRM(ins);
-            code = AddVexPrefixIfNeeded(ins, code, size);
+            code = AddSimdPrefixIfNeeded(ins, code, size);
             code = insEncodeReg3456(ins, id->idReg2(), size,
                                     code); // encode source operand reg in 'vvvv' bits in 1's complement form
 
@@ -15412,7 +15724,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_SWR_RRD:
         case IF_SRW_RRD:
             code = insCodeMR(ins);
-            code = AddVexPrefixIfNeeded(ins, code, size);
+            code = AddSimdPrefixIfNeeded(ins, code, size);
 
             // In case of AVX instructions that take 3 operands, encode reg1 as first source.
             // Note that reg1 is both a source and a destination.
@@ -16810,8 +17122,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
 
         case INS_cvttsd2si:
         case INS_cvtsd2si:
-        case INS_cvtsi2sd:
-        case INS_cvtsi2ss:
+        case INS_cvtsi2sd32:
+        case INS_cvtsi2ss32:
+        case INS_cvtsi2sd64:
+        case INS_cvtsi2ss64:
             result.insThroughput = PERFSCORE_THROUGHPUT_1C;
             result.insLatency += PERFSCORE_LATENCY_7C;
             break;
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 644fed77da8241..e2910b3b34192f 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -39,7 +39,7 @@ struct CnsVal
 };
 
 UNATIVE_OFFSET emitInsSize(code_t code, bool includeRexPrefixSize);
-UNATIVE_OFFSET emitInsSizeSV(code_t code, int var, int dsp);
+UNATIVE_OFFSET emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp);
 UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp);
 UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val);
 UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code);
@@ -430,6 +430,14 @@ bool HasKMaskRegisterDest(instruction ins) const
         case INS_cmpss:
         case INS_cmppd:
         case INS_cmpsd:
+        case INS_vpgatherdd:
+        case INS_vpgatherqd:
+        case INS_vpgatherdq:
+        case INS_vpgatherqq:
+        case INS_vgatherdps:
+        case INS_vgatherqps:
+        case INS_vgatherdpd:
+        case INS_vgatherqpd:
         {
             return true;
         }
@@ -461,6 +469,17 @@ void SetUseEvexEncoding(bool value)
     useEvexEncodings = value;
 }
 
+//------------------------------------------------------------------------
+// UseSimdEncoding: Returns true if either VEX or EVEX encoding is
+// in use (see UseVEXEncoding / UseEvexEncoding).
+//
+// Returns:
+//    TRUE if UseVEXEncoding() or UseEvexEncoding() returns true.
+bool UseSimdEncoding() const
+{
+    return UseVEXEncoding() || UseEvexEncoding();
+}
+
 // 4-byte EVEX prefix starts with byte 0x62
 #define EVEX_PREFIX_MASK 0xFF00000000000000ULL
 #define EVEX_PREFIX_CODE 0x6200000000000000ULL
@@ -491,7 +510,7 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr);
 //    size - operand size
 //
 // Returns:
-//    TRUE if code has an Evex prefix.
+//    code with prefix added.
 code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size)
 {
     if (TakesEvexPrefix(ins))
@@ -505,6 +524,32 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size)
     return code;
 }
 
+//------------------------------------------------------------------------
+// AddSimdPrefixIfNeededAndNotPresent: Add the correct SIMD prefix.
+// Check if the prefix already exists before adding.
+//
+// Arguments:
+//    ins - the instruction being encoded.
+//    code - opcode + prefixes bits at some stage of encoding.
+//    size - operand size
+//
+// Returns:
+//    code with the EVEX or VEX prefix added; unchanged if already present or if 'ins' takes neither.
+code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size)
+{
+    if (TakesEvexPrefix(ins))
+    {
+        code = !hasEvexPrefix(code) ? AddEvexPrefix(ins, code, size) : code;
+    }
+    else if (TakesVexPrefix(ins))
+    {
+        code = !hasVexPrefix(code) ? AddVexPrefix(ins, code, size) : code;
+    }
+    return code;
+}
+
+bool TakesSimdPrefix(instruction ins) const;
+
 //------------------------------------------------------------------------
 // hasVexOrEvexPrefix: Returns true if the instruction encoding already
 // contains a Vex or Evex prefix.
@@ -519,6 +564,8 @@ bool hasVexOrEvexPrefix(code_t code)
     return (hasVexPrefix(code) || hasEvexPrefix(code));
 }
 
+ssize_t TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte);
+
 //------------------------------------------------------------------------
 // codeEvexMigrationCheck: Temporary check to use when adding EVEX codepaths
 // TODO-XArch-AVX512: Remove implementation and uses once all Evex paths are
@@ -534,6 +581,8 @@ bool codeEvexMigrationCheck(code_t code)
     return hasEvexPrefix(code);
 }
 
+ssize_t GetInputSizeInBytes(instrDesc* id);
+
 bool containsAVXInstruction = false;
 bool ContainsAVX()
 {
@@ -954,4 +1003,18 @@ inline bool emitIsUncondJump(instrDesc* jmp)
     return (ins == INS_jmp);
 }
 
+//------------------------------------------------------------------------
+// HasEmbeddedBroadcast: Do we consider embedded broadcast while encoding.
+// TODO-XArch-AVX512: Add eventual check on the instrDesc
+//
+// Arguments:
+//    id - Instruction descriptor.
+//
+// Returns:
+//    TRUE if the instruction does embedded broadcast. Currently always FALSE; 'id' is not inspected.
+inline bool HasEmbeddedBroadcast(instrDesc* id)
+{
+    return false;
+}
+
 #endif // TARGET_XARCH
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index a060cc3053d993..37738201789abe 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -289,7 +289,7 @@ HARDWARE_INTRINSIC(SSE,             CompareScalarOrdered,
 HARDWARE_INTRINSIC(SSE,             CompareUnordered,                           16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cmpps,              INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(SSE,             CompareScalarUnordered,                     16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cmpss,              INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE,             ConvertToInt32,                             16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtss2si,           INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE,             ConvertScalarToVector128Single,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2ss,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE,             ConvertScalarToVector128Single,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2ss32,         INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE,             ConvertToInt32WithTruncation,               16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvttss2si,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE,             Divide,                                     16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_divps,              INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE,             DivideScalar,                               16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_divss,              INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
@@ -340,7 +340,7 @@ HARDWARE_INTRINSIC(SSE,             Xor,
 //  SSE 64-bit-only Intrinsics
 HARDWARE_INTRINSIC(SSE_X64,         ConvertToInt64,                             16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtss2si,           INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(SSE_X64,         ConvertToInt64WithTruncation,               16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvttss2si,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(SSE_X64,         ConvertScalarToVector128Single,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2ss,           INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(SSE_X64,         ConvertScalarToVector128Single,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2ss64,         INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen)
 
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 ISA              Function name                               SIMD size       NumArg                                                                                                         Instructions                                                                                                                             Category                            Flags
@@ -393,7 +393,7 @@ HARDWARE_INTRINSIC(SSE2,            ConvertToInt32,
 HARDWARE_INTRINSIC(SSE2,            ConvertToInt32WithTruncation,               16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvttsd2si},         HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2,            ConvertToUInt32,                            16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2,            ConvertToVector128Double,                   16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtdq2pd,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtps2pd,           INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2,            ConvertScalarToVector128Double,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2sd,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtss2sd,           INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(SSE2,            ConvertScalarToVector128Double,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2sd32,         INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtss2sd,           INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(SSE2,            ConvertToVector128Int32,                    16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtps2dq,           INS_cvtpd2dq},          HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2,            ConvertScalarToVector128Int32,              16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2,            ConvertToVector128Int32WithTruncation,      16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvttps2dq,          INS_cvttpd2dq},         HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
@@ -459,7 +459,7 @@ HARDWARE_INTRINSIC(SSE2,            Xor,
 HARDWARE_INTRINSIC(SSE2_X64,        ConvertToInt64,                             16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid,            INS_cvtsd2si},          HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_X64,        ConvertToInt64WithTruncation,               16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvttsd2si},         HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_X64,        ConvertToUInt64,                            16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_X64,        ConvertScalarToVector128Double,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2sd,           INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(SSE2_X64,        ConvertScalarToVector128Double,             16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtsi2sd64,         INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(SSE2_X64,        ConvertScalarToVector128Int64,              16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(SSE2_X64,        ConvertScalarToVector128UInt64,             16,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(SSE2_X64,        StoreNonTemporal,                           16,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_movnti,             INS_movnti,             INS_invalid,            INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index d8bb92d89fdf7c..cc552a964ed21c 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -36,12 +36,12 @@ const char* CodeGen::genInsName(instruction ins)
     const char * const insNames[] =
     {
 #if defined(TARGET_XARCH)
-        #define INST0(id, nm, um, mr,                 flags) nm,
-        #define INST1(id, nm, um, mr,                 flags) nm,
-        #define INST2(id, nm, um, mr, mi,             flags) nm,
-        #define INST3(id, nm, um, mr, mi, rm,         flags) nm,
-        #define INST4(id, nm, um, mr, mi, rm, a4,     flags) nm,
-        #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) nm,
+        #define INST0(id, nm, um, mr,                 tt, flags) nm,
+        #define INST1(id, nm, um, mr,                 tt, flags) nm,
+        #define INST2(id, nm, um, mr, mi,             tt, flags) nm,
+        #define INST3(id, nm, um, mr, mi, rm,         tt, flags) nm,
+        #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) nm,
+        #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) nm,
         #include "instrs.h"
 
 #elif defined(TARGET_ARM)
@@ -1789,7 +1789,7 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
 }
 
 // Conversions to or from floating point values
-instruction CodeGen::ins_FloatConv(var_types to, var_types from)
+instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
 {
     // AVX: For now we support only conversion from Int/Long -> float
 
@@ -1801,9 +1801,29 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from)
             switch (to)
             {
                 case TYP_FLOAT:
-                    return INS_cvtsi2ss;
+                {
+                    if (EA_SIZE(attr) == EA_4BYTE)
+                    {
+                        return INS_cvtsi2ss32;
+                    }
+                    else if (EA_SIZE(attr) == EA_8BYTE)
+                    {
+                        return INS_cvtsi2ss64;
+                    }
+                    unreached();
+                }
                 case TYP_DOUBLE:
-                    return INS_cvtsi2sd;
+                {
+                    if (EA_SIZE(attr) == EA_4BYTE)
+                    {
+                        return INS_cvtsi2sd32;
+                    }
+                    else if (EA_SIZE(attr) == EA_8BYTE)
+                    {
+                        return INS_cvtsi2sd64;
+                    }
+                    unreached();
+                }
                 default:
                     unreached();
             }
@@ -1867,7 +1887,7 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
     }
 }
 
-instruction CodeGen::ins_FloatConv(var_types to, var_types from)
+instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
 {
     switch (from)
     {
diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
index 27152cac97b1e5..103ec6140c2d53 100644
--- a/src/coreclr/jit/instr.h
+++ b/src/coreclr/jit/instr.h
@@ -18,12 +18,12 @@
 enum instruction : unsigned
 {
 #if defined(TARGET_XARCH)
-    #define INST0(id, nm, um, mr,                 flags) INS_##id,
-    #define INST1(id, nm, um, mr,                 flags) INS_##id,
-    #define INST2(id, nm, um, mr, mi,             flags) INS_##id,
-    #define INST3(id, nm, um, mr, mi, rm,         flags) INS_##id,
-    #define INST4(id, nm, um, mr, mi, rm, a4,     flags) INS_##id,
-    #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) INS_##id,
+    #define INST0(id, nm, um, mr,                 tt, flags) INS_##id,
+    #define INST1(id, nm, um, mr,                 tt, flags) INS_##id,
+    #define INST2(id, nm, um, mr, mi,             tt, flags) INS_##id,
+    #define INST3(id, nm, um, mr, mi, rm,         tt, flags) INS_##id,
+    #define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags) INS_##id,
+    #define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags) INS_##id,
     #include "instrs.h"
 
 #elif defined(TARGET_ARM)
@@ -96,57 +96,66 @@ enum GCtype : unsigned
 
 #if defined(TARGET_XARCH)
 
-enum insFlags : uint32_t
+enum insFlags : uint64_t
 {
-    INS_FLAGS_None = 0,
+    INS_FLAGS_None = 0ULL,
 
     // Reads
-    Reads_OF = 1 << 0,
-    Reads_SF = 1 << 1,
-    Reads_ZF = 1 << 2,
-    Reads_PF = 1 << 3,
-    Reads_CF = 1 << 4,
-    Reads_DF = 1 << 5,
+    Reads_OF = 1ULL << 0,
+    Reads_SF = 1ULL << 1,
+    Reads_ZF = 1ULL << 2,
+    Reads_PF = 1ULL << 3,
+    Reads_CF = 1ULL << 4,
+    Reads_DF = 1ULL << 5,
 
     // Writes
-    Writes_OF = 1 << 6,
-    Writes_SF = 1 << 7,
-    Writes_ZF = 1 << 8,
-    Writes_AF = 1 << 9,
-    Writes_PF = 1 << 10,
-    Writes_CF = 1 << 11,
+    Writes_OF = 1ULL << 6,
+    Writes_SF = 1ULL << 7,
+    Writes_ZF = 1ULL << 8,
+    Writes_AF = 1ULL << 9,
+    Writes_PF = 1ULL << 10,
+    Writes_CF = 1ULL << 11,
 
     // Resets
-    Resets_OF = 1 << 12,
-    Resets_SF = 1 << 13,
-    Resets_AF = 1 << 14,
-    Resets_PF = 1 << 15,
-    Resets_CF = 1 << 16,
+    Resets_OF = 1ULL << 12,
+    Resets_SF = 1ULL << 13,
+    Resets_AF = 1ULL << 14,
+    Resets_PF = 1ULL << 15,
+    Resets_CF = 1ULL << 16,
 
     // Undefined
-    Undefined_OF = 1 << 17,
-    Undefined_SF = 1 << 18,
-    Undefined_ZF = 1 << 19,
-    Undefined_AF = 1 << 20,
-    Undefined_PF = 1 << 21,
-    Undefined_CF = 1 << 22,
+    Undefined_OF = 1ULL << 17,
+    Undefined_SF = 1ULL << 18,
+    Undefined_ZF = 1ULL << 19,
+    Undefined_AF = 1ULL << 20,
+    Undefined_PF = 1ULL << 21,
+    Undefined_CF = 1ULL << 22,
 
     // Restore
-    Restore_SF_ZF_AF_PF_CF = 1 << 23,
+    Restore_SF_ZF_AF_PF_CF = 1ULL << 23,
 
     // x87 instruction
-    INS_FLAGS_x87Instr = 1 << 24,
+    INS_FLAGS_x87Instr = 1ULL << 24,
 
     // Avx
-    INS_Flags_IsDstDstSrcAVXInstruction = 1 << 25,
-    INS_Flags_IsDstSrcSrcAVXInstruction = 1 << 26,
+    INS_Flags_IsDstDstSrcAVXInstruction = 1ULL << 25,
+    INS_Flags_IsDstSrcSrcAVXInstruction = 1ULL << 26,
 
     // w and s bits
-    INS_FLAGS_Has_Wbit = 1 << 27,
-    INS_FLAGS_Has_Sbit = 1 << 28,
+    INS_FLAGS_Has_Wbit = 1ULL << 27,
+    INS_FLAGS_Has_Sbit = 1ULL << 28,
+
+    // instruction input size
+    // if no input size is set, the instruction defaults to using
+    // the emitAttr for size
+    Input_8Bit  = 1ULL << 29,
+    Input_16Bit = 1ULL << 30,
+    Input_32Bit = 1ULL << 31,
+    Input_64Bit = 1ULL << 32,
+    Input_Mask = (0xFULL) << 29,
 
     //  TODO-Cleanup:  Remove this flag and its usage from TARGET_XARCH
-    INS_FLAGS_DONT_CARE = 0x00,
+    INS_FLAGS_DONT_CARE = 0x00ULL,
 };
 
 #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
@@ -330,6 +339,31 @@ enum insBarrier : unsigned
 };
 #endif
 
+#if defined(TARGET_XARCH)
+// Represents the tuple type attribute of an instruction.
+// It is used to determine the factor N when calculating the compressed displacement (disp8*N) in EVEX encoding.
+// Reference: Section 2.6.5 in the Intel 64 and IA-32 Architectures Software Developer's Manual, Volume 2.
+enum insTupleType : uint32_t
+{
+    INS_TT_NONE             = 0x00000,
+    INS_TT_FULL             = 0x00001,
+    INS_TT_HALF             = 0x00002,
+    INS_TT_IS_BROADCAST     = 0x00003, // mask: INS_TT_FULL | INS_TT_HALF
+    INS_TT_FULL_MEM         = 0x00010,
+    INS_TT_TUPLE1_SCALAR    = 0x00020,
+    INS_TT_TUPLE1_FIXED     = 0x00040,
+    INS_TT_TUPLE2           = 0x00080,
+    INS_TT_TUPLE4           = 0x01000,
+    INS_TT_TUPLE8           = 0x02000,
+    INS_TT_HALF_MEM         = 0x04000,
+    INS_TT_QUARTER_MEM      = 0x08000,
+    INS_TT_EIGHTH_MEM       = 0x10000,
+    INS_TT_MEM128           = 0x20000,
+    INS_TT_MOVDDUP          = 0x40000,
+    INS_TT_IS_NON_BROADCAST = 0x7FFFC  // mask: bits 2-18, covering every flag above other than INS_TT_FULL and INS_TT_HALF
+};
+#endif
+
 #undef EA_UNKNOWN
 enum emitAttr : unsigned
 {
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index 21b6278db0a556..55a39b52bffbe8 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -29,120 +29,120 @@
 #endif
 /*****************************************************************************/
 #ifndef INST0
-#define INST0(id, nm, um, mr,                 flags)
+#define INST0(id, nm, um, mr,                 tt, flags)
 #endif
 #ifndef INST2
-#define INST2(id, nm, um, mr, mi,             flags)
+#define INST2(id, nm, um, mr, mi,             tt, flags)
 #endif
 #ifndef INST3
-#define INST3(id, nm, um, mr, mi, rm,         flags)
+#define INST3(id, nm, um, mr, mi, rm,         tt, flags)
 #endif
 #ifndef INST4
-#define INST4(id, nm, um, mr, mi, rm, a4,     flags)
+#define INST4(id, nm, um, mr, mi, rm, a4,     tt, flags)
 #endif
 #ifndef INST5
-#define INST5(id, nm, um, mr, mi, rm, a4, rr, flags)
+#define INST5(id, nm, um, mr, mi, rm, a4, rr, tt, flags)
 #endif
 
 /*****************************************************************************/
 /*               The following is x86-specific                               */
 /*****************************************************************************/
 
-//    id                nm                  um      mr            mi            rm            a4            rr           flags
-INST5(invalid,          "INVALID",          IUM_RD, BAD_CODE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     BAD_CODE,    INS_FLAGS_None)
+//    id                nm                  um      mr            mi            rm            a4            rr           tt              flags
+INST5(invalid,          "INVALID",          IUM_RD, BAD_CODE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     BAD_CODE,    INS_TT_NONE,    INS_FLAGS_None)
 
-INST5(push,             "push",             IUM_RD, 0x0030FE,     0x000068,     BAD_CODE,     BAD_CODE,     0x000050,    INS_FLAGS_None )
-INST5(pop,              "pop",              IUM_WR, 0x00008E,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000058,    INS_FLAGS_None )
+INST5(push,             "push",             IUM_RD, 0x0030FE,     0x000068,     BAD_CODE,     BAD_CODE,     0x000050,    INS_TT_NONE,    INS_FLAGS_None )
+INST5(pop,              "pop",              IUM_WR, 0x00008E,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000058,    INS_TT_NONE,    INS_FLAGS_None )
 // Does not affect the stack tracking in the emitter
-INST5(push_hide,        "push",             IUM_RD, 0x0030FE,     0x000068,     BAD_CODE,     BAD_CODE,     0x000050,    INS_FLAGS_None )
-INST5(pop_hide,         "pop",              IUM_WR, 0x00008E,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000058,    INS_FLAGS_None )
+INST5(push_hide,        "push",             IUM_RD, 0x0030FE,     0x000068,     BAD_CODE,     BAD_CODE,     0x000050,    INS_TT_NONE,    INS_FLAGS_None )
+INST5(pop_hide,         "pop",              IUM_WR, 0x00008E,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000058,    INS_TT_NONE,    INS_FLAGS_None )
 
-INST5(inc,              "inc",              IUM_RW, 0x0000FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000040,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF | INS_FLAGS_Has_Wbit )
-INST5(inc_l,            "inc",              IUM_RW, 0x0000FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C0FE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF )
-INST5(dec,              "dec",              IUM_RW, 0x0008FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000048,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF | INS_FLAGS_Has_Wbit )
-INST5(dec_l,            "dec",              IUM_RW, 0x0008FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C8FE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF )
+INST5(inc,              "inc",              IUM_RW, 0x0000FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000040,    INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF | INS_FLAGS_Has_Wbit )
+INST5(inc_l,            "inc",              IUM_RW, 0x0000FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C0FE,    INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF )
+INST5(dec,              "dec",              IUM_RW, 0x0008FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x000048,    INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF | INS_FLAGS_Has_Wbit )
+INST5(dec_l,            "dec",              IUM_RW, 0x0008FE,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C8FE,    INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF )
 
 // Multi-byte opcodes without modrm are represented in mixed endian fashion.
 // See comment around quarter way through this file for more information.
 INST5(bswap,            "bswap",            IUM_RW, 0x0F00C8,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C80F,    INS_FLAGS_None )
 
-//    id                nm                  um      mr            mi            rm            a4                         flags
-INST4(add,              "add",              IUM_RW, 0x000000,     0x000080,     0x000002,     0x000004,                  Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(or,               "or",               IUM_RW, 0x000008,     0x000880,     0x00000A,     0x00000C,                  Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(adc,              "adc",              IUM_RW, 0x000010,     0x001080,     0x000012,     0x000014,                  Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | Reads_CF   | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(sbb,              "sbb",              IUM_RW, 0x000018,     0x001880,     0x00001A,     0x00001C,                  Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | Reads_CF   | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(and,              "and",              IUM_RW, 0x000020,     0x002080,     0x000022,     0x000024,                  Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(sub,              "sub",              IUM_RW, 0x000028,     0x002880,     0x00002A,     0x00002C,                  Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+//    id                nm                  um      mr            mi            rm            a4                         tt              flags
+INST4(add,              "add",              IUM_RW, 0x000000,     0x000080,     0x000002,     0x000004,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(or,               "or",               IUM_RW, 0x000008,     0x000880,     0x00000A,     0x00000C,                  INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(adc,              "adc",              IUM_RW, 0x000010,     0x001080,     0x000012,     0x000014,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | Reads_CF   | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(sbb,              "sbb",              IUM_RW, 0x000018,     0x001880,     0x00001A,     0x00001C,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | Reads_CF   | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(and,              "and",              IUM_RW, 0x000020,     0x002080,     0x000022,     0x000024,                  INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(sub,              "sub",              IUM_RW, 0x000028,     0x002880,     0x00002A,     0x00002C,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
 // Does not affect the stack tracking in the emitter
-INST4(sub_hide,         "sub",              IUM_RW, 0x000028,     0x002880,     0x00002A,     0x00002C,                  Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(sub_hide,         "sub",              IUM_RW, 0x000028,     0x002880,     0x00002A,     0x00002C,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
 
-INST4(xor,              "xor",              IUM_RW, 0x000030,     0x003080,     0x000032,     0x000034,                  Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(cmp,              "cmp",              IUM_RD, 0x000038,     0x003880,     0x00003A,     0x00003C,                  Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(test,             "test",             IUM_RD, 0x000084,     0x0000F6,     0x000084,     0x0000A8,                  Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                                       | INS_FLAGS_Has_Wbit )
-INST4(mov,              "mov",              IUM_WR, 0x000088,     0x0000C6,     0x00008A,     0x0000B0,                  INS_FLAGS_Has_Wbit )
+INST4(xor,              "xor",              IUM_RW, 0x000030,     0x003080,     0x000032,     0x000034,                  INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(cmp,              "cmp",              IUM_RD, 0x000038,     0x003880,     0x00003A,     0x00003C,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(test,             "test",             IUM_RD, 0x000084,     0x0000F6,     0x000084,     0x0000A8,                  INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Resets_CF                                       | INS_FLAGS_Has_Wbit )
+INST4(mov,              "mov",              IUM_WR, 0x000088,     0x0000C6,     0x00008A,     0x0000B0,                  INS_TT_NONE,    INS_FLAGS_Has_Wbit )
 
-INST4(lea,              "lea",              IUM_WR, BAD_CODE,     BAD_CODE,     0x00008D,     BAD_CODE,                  INS_FLAGS_None )
+INST4(lea,              "lea",              IUM_WR, BAD_CODE,     BAD_CODE,     0x00008D,     BAD_CODE,                  INS_TT_NONE,    INS_FLAGS_None )
 
-//    id                nm                  um      mr            mi            rm                                       flags
+//    id                nm                  um      mr            mi            rm                                       tt              flags
 
 // Note that emitter has only partial support for BT. It can only emit the reg,reg form
 // and the registers need to be reversed to get the correct encoding.
-INST3(bt,               "bt",               IUM_RD, 0x0F00A3,     BAD_CODE,     0x0F00A3,                                Undefined_OF   | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     )
+INST3(bt,               "bt",               IUM_RD, 0x0F00A3,     BAD_CODE,     0x0F00A3,                                INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     )
 
-INST3(bsf,              "bsf",              IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00BC,                                Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Undefined_CF  )
-INST3(bsr,              "bsr",              IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00BD,                                Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Undefined_CF  )
+INST3(bsf,              "bsf",              IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00BC,                                INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Undefined_CF  )
+INST3(bsr,              "bsr",              IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00BD,                                INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Undefined_CF  )
 
-INST3(movsx,            "movsx",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00BE,                                INS_FLAGS_Has_Wbit )
+INST3(movsx,            "movsx",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00BE,                                INS_TT_NONE,    INS_FLAGS_Has_Wbit )
 #ifdef TARGET_AMD64
-INST3(movsxd,           "movsxd",           IUM_WR, BAD_CODE,     BAD_CODE,     0x4800000063,                            INS_FLAGS_Has_Wbit )
+INST3(movsxd,           "movsxd",           IUM_WR, BAD_CODE,     BAD_CODE,     0x4800000063,                            INS_TT_NONE,    INS_FLAGS_Has_Wbit )
 #endif
-INST3(movzx,            "movzx",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00B6,                                INS_FLAGS_Has_Wbit )
-
-INST3(cmovo,            "cmovo",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0040,                                Reads_OF )
-INST3(cmovno,           "cmovno",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0041,                                Reads_OF )
-INST3(cmovb,            "cmovb",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0042,                                Reads_CF )
-INST3(cmovae,           "cmovae",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0043,                                Reads_CF )
-INST3(cmove,            "cmove",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0044,                                Reads_ZF )
-INST3(cmovne,           "cmovne",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0045,                                Reads_ZF )
-INST3(cmovbe,           "cmovbe",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0046,                                Reads_ZF | Reads_CF )
-INST3(cmova,            "cmova",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0047,                                Reads_ZF | Reads_CF )
-INST3(cmovs,            "cmovs",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0048,                                Reads_SF )
-INST3(cmovns,           "cmovns",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0049,                                Reads_SF )
-INST3(cmovp,            "cmovp",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004A,                                Reads_PF )
-INST3(cmovnp,           "cmovnp",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004B,                                Reads_PF )
-INST3(cmovl,            "cmovl",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004C,                                Reads_OF       | Reads_SF                                                                      )
-INST3(cmovge,           "cmovge",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004D,                                Reads_OF       | Reads_SF                                                                      )
-INST3(cmovle,           "cmovle",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004E,                                Reads_OF       | Reads_SF      | Reads_ZF                                                      )
-INST3(cmovg,            "cmovg",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004F,                                Reads_OF       | Reads_SF      | Reads_ZF                                                      )
-
-INST3(xchg,             "xchg",             IUM_RW, 0x000086,     BAD_CODE,     0x000086,                                INS_FLAGS_Has_Wbit )
-INST3(imul,             "imul",             IUM_RW, 0x0F00AC,     BAD_CODE,     0x0F00AF,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-
-//    id                nm                  um      mr            mi            rm                                       flags
+INST3(movzx,            "movzx",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F00B6,                                INS_TT_NONE,    INS_FLAGS_Has_Wbit )
+
+INST3(cmovo,            "cmovo",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0040,                                INS_TT_NONE,    Reads_OF )
+INST3(cmovno,           "cmovno",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0041,                                INS_TT_NONE,    Reads_OF )
+INST3(cmovb,            "cmovb",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0042,                                INS_TT_NONE,    Reads_CF )
+INST3(cmovae,           "cmovae",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0043,                                INS_TT_NONE,    Reads_CF )
+INST3(cmove,            "cmove",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0044,                                INS_TT_NONE,    Reads_ZF )
+INST3(cmovne,           "cmovne",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0045,                                INS_TT_NONE,    Reads_ZF )
+INST3(cmovbe,           "cmovbe",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0046,                                INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST3(cmova,            "cmova",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0047,                                INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST3(cmovs,            "cmovs",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0048,                                INS_TT_NONE,    Reads_SF )
+INST3(cmovns,           "cmovns",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F0049,                                INS_TT_NONE,    Reads_SF )
+INST3(cmovp,            "cmovp",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004A,                                INS_TT_NONE,    Reads_PF )
+INST3(cmovnp,           "cmovnp",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004B,                                INS_TT_NONE,    Reads_PF )
+INST3(cmovl,            "cmovl",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004C,                                INS_TT_NONE,    Reads_OF       | Reads_SF                                                                      )
+INST3(cmovge,           "cmovge",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004D,                                INS_TT_NONE,    Reads_OF       | Reads_SF                                                                      )
+INST3(cmovle,           "cmovle",           IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004E,                                INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF                                                      )
+INST3(cmovg,            "cmovg",            IUM_WR, BAD_CODE,     BAD_CODE,     0x0F004F,                                INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF                                                      )
+
+INST3(xchg,             "xchg",             IUM_RW, 0x000086,     BAD_CODE,     0x000086,                                INS_TT_NONE,    INS_FLAGS_Has_Wbit )
+INST3(imul,             "imul",             IUM_RW, 0x0F00AC,     BAD_CODE,     0x0F00AF,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+
+//    id                nm                  um      mr            mi            rm                                       tt              flags
 
 // Instead of encoding these as 3-operand instructions, we encode them
 // as 2-operand instructions with the target register being implicit
 // implicit_reg = op1*op2_icon
 #define INSTMUL INST3
-INSTMUL(imul_AX,        "imul",             IUM_RD, BAD_CODE,     0x000068,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_CX,        "imul",             IUM_RD, BAD_CODE,     0x000868,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_DX,        "imul",             IUM_RD, BAD_CODE,     0x001068,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_BX,        "imul",             IUM_RD, BAD_CODE,     0x001868,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_SP,        "imul",             IUM_RD, BAD_CODE,     BAD_CODE,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_BP,        "imul",             IUM_RD, BAD_CODE,     0x002868,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_SI,        "imul",             IUM_RD, BAD_CODE,     0x003068,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_DI,        "imul",             IUM_RD, BAD_CODE,     0x003868,     BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_AX,        "imul",             IUM_RD, BAD_CODE,     0x000068,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_CX,        "imul",             IUM_RD, BAD_CODE,     0x000868,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_DX,        "imul",             IUM_RD, BAD_CODE,     0x001068,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_BX,        "imul",             IUM_RD, BAD_CODE,     0x001868,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_SP,        "imul",             IUM_RD, BAD_CODE,     BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )    // NOTE(review): all three encodings are BAD_CODE - presumably a placeholder that keeps the imul_XX entries in register order; confirm
+INSTMUL(imul_BP,        "imul",             IUM_RD, BAD_CODE,     0x002868,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_SI,        "imul",             IUM_RD, BAD_CODE,     0x003068,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_DI,        "imul",             IUM_RD, BAD_CODE,     0x003868,     BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
 
 #ifdef TARGET_AMD64
 
-INSTMUL(imul_08,        "imul",             IUM_RD, BAD_CODE,     0x4400000068, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_09,        "imul",             IUM_RD, BAD_CODE,     0x4400000868, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_10,        "imul",             IUM_RD, BAD_CODE,     0x4400001068, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_11,        "imul",             IUM_RD, BAD_CODE,     0x4400001868, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_12,        "imul",             IUM_RD, BAD_CODE,     0x4400002068, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_13,        "imul",             IUM_RD, BAD_CODE,     0x4400002868, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_14,        "imul",             IUM_RD, BAD_CODE,     0x4400003068, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_15,        "imul",             IUM_RD, BAD_CODE,     0x4400003868, BAD_CODE,                                Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_08,        "imul",             IUM_RD, BAD_CODE,     0x4400000068, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )    // NOTE(review): the leading 0x44 byte looks like a REX prefix with REX.R set (extends the reg field to reach r8-r15) - confirm against the emitter
+INSTMUL(imul_09,        "imul",             IUM_RD, BAD_CODE,     0x4400000868, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_10,        "imul",             IUM_RD, BAD_CODE,     0x4400001068, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_11,        "imul",             IUM_RD, BAD_CODE,     0x4400001868, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_12,        "imul",             IUM_RD, BAD_CODE,     0x4400002068, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_13,        "imul",             IUM_RD, BAD_CODE,     0x4400002868, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_14,        "imul",             IUM_RD, BAD_CODE,     0x4400003068, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_15,        "imul",             IUM_RD, BAD_CODE,     0x4400003868, BAD_CODE,                                INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Sbit )
 
 #endif // TARGET_AMD64
 
@@ -183,603 +183,615 @@ INSTMUL(imul_15,        "imul",             IUM_RD, BAD_CODE,     0x4400003868,
 #define VEX3INT(c1,c2)   PACK4(c1, 0xc5, 0x02, c2)
 #define VEX3FLT(c1,c2)   PACK4(c1, 0xc5, 0x02, c2)
 
-INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)    // No encodings (all BAD_CODE); presumably a marker for the start of the SSE instruction range - confirm
 // These are the SSE instructions used on x86
-INST3(pmovmskb,         "pmovmskb",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD7),                            INS_FLAGS_None)    // Move the MSB bits of all bytes in a xmm reg to an int reg
-INST3(movmskpd,         "movmskpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x50),                            INS_FLAGS_None)    // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros.
-INST3(movd,             "movd",             IUM_WR, PCKDBL(0x7E), BAD_CODE,     PCKDBL(0x6E),                            INS_FLAGS_None)    // Move Double/Quadword between mm regs <-> memory/r32/r64 regs, cleanup https://github.com/dotnet/runtime/issues/47943
-INST3(movq,             "movq",             IUM_WR, PCKDBL(0xD6), BAD_CODE,     SSEFLT(0x7E),                            INS_FLAGS_None)    // Move Quadword between memory/mm <-> regs, cleanup https://github.com/dotnet/runtime/issues/47943
-INST3(movsdsse2,        "movsd",            IUM_WR, SSEDBL(0x11), BAD_CODE,     SSEDBL(0x10),                            INS_Flags_IsDstSrcSrcAVXInstruction)
-
-INST3(punpckldq,        "punpckldq",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x62),                            INS_Flags_IsDstDstSrcAVXInstruction)
-
-INST3(xorps,            "xorps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x57),                            INS_Flags_IsDstDstSrcAVXInstruction)    // XOR packed singles
-
-INST3(cvttsd2si,        "cvttsd2si",        IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2C),                            INS_FLAGS_None)    // cvt with trunc scalar double to signed DWORDs
-
-INST3(movntdq,          "movntdq",          IUM_WR, PCKDBL(0xE7), BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(movnti,           "movnti",           IUM_WR, PCKFLT(0xC3), BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(movntpd,          "movntpd",          IUM_WR, PCKDBL(0x2B), BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(movntps,          "movntps",          IUM_WR, PCKFLT(0x2B), BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(movdqu,           "movdqu",           IUM_WR, SSEFLT(0x7F), BAD_CODE,     SSEFLT(0x6F),                            INS_FLAGS_None)
-INST3(movdqa,           "movdqa",           IUM_WR, PCKDBL(0x7F), BAD_CODE,     PCKDBL(0x6F),                            INS_FLAGS_None)
-INST3(movlpd,           "movlpd",           IUM_WR, PCKDBL(0x13), BAD_CODE,     PCKDBL(0x12),                            INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movlps,           "movlps",           IUM_WR, PCKFLT(0x13), BAD_CODE,     PCKFLT(0x12),                            INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movhpd,           "movhpd",           IUM_WR, PCKDBL(0x17), BAD_CODE,     PCKDBL(0x16),                            INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movhps,           "movhps",           IUM_WR, PCKFLT(0x17), BAD_CODE,     PCKFLT(0x16),                            INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movss,            "movss",            IUM_WR, SSEFLT(0x11), BAD_CODE,     SSEFLT(0x10),                            INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movapd,           "movapd",           IUM_WR, PCKDBL(0x29), BAD_CODE,     PCKDBL(0x28),                            INS_FLAGS_None)
-INST3(movaps,           "movaps",           IUM_WR, PCKFLT(0x29), BAD_CODE,     PCKFLT(0x28),                            INS_FLAGS_None)
-INST3(movupd,           "movupd",           IUM_WR, PCKDBL(0x11), BAD_CODE,     PCKDBL(0x10),                            INS_FLAGS_None)
-INST3(movups,           "movups",           IUM_WR, PCKFLT(0x11), BAD_CODE,     PCKFLT(0x10),                            INS_FLAGS_None)
-INST3(movhlps,          "movhlps",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x12),                            INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(movlhps,          "movlhps",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x16),                            INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(movmskps,         "movmskps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x50),                            INS_FLAGS_None)
-INST3(unpckhps,         "unpckhps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x15),                            INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(unpcklps,         "unpcklps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x14),                            INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(maskmovdqu,       "maskmovdqu",       IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF7),                            INS_FLAGS_None)
-
-INST3(shufps,           "shufps",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0xC6),                            INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(shufpd,           "shufpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC6),                            INS_Flags_IsDstDstSrcAVXInstruction)
-
-INST3(punpckhdq,        "punpckhdq",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6A),                            INS_Flags_IsDstDstSrcAVXInstruction)
-
-INST3(lfence,           "lfence",           IUM_RD, 0x000FE8AE,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(mfence,           "mfence",           IUM_RD, 0x000FF0AE,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(prefetchnta,      "prefetchnta",      IUM_RD, 0x000F0018,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(prefetcht0,       "prefetcht0",       IUM_RD, 0x000F0818,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(prefetcht1,       "prefetcht1",       IUM_RD, 0x000F1018,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(prefetcht2,       "prefetcht2",       IUM_RD, 0x000F1818,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
-INST3(sfence,           "sfence",           IUM_RD, 0x000FF8AE,   BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)
+INST3(pmovmskb,         "pmovmskb",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD7),                            INS_TT_NONE,    INS_FLAGS_None)    // Move the most significant bit of each byte in an xmm reg to an int reg
+INST3(movmskpd,         "movmskpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x50),                            INS_TT_NONE,    INS_FLAGS_None)    // Extract the 2-bit sign mask from an xmm reg and store it in an int reg. The upper bits of r32 or r64 are filled with zeros.
+INST3(movd,             "movd",             IUM_WR, PCKDBL(0x7E), BAD_CODE,     PCKDBL(0x6E),                            INS_TT_TUPLE1_SCALAR,    INS_FLAGS_None)    // Move Double/Quadword between mm regs <-> memory/r32/r64 regs, cleanup https://github.com/dotnet/runtime/issues/47943
+INST3(movq,             "movq",             IUM_WR, PCKDBL(0xD6), BAD_CODE,     SSEFLT(0x7E),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_FLAGS_None)    // Move Quadword between memory/mm <-> regs, cleanup https://github.com/dotnet/runtime/issues/47943
+INST3(movsdsse2,        "movsd",            IUM_WR, SSEDBL(0x11), BAD_CODE,     SSEDBL(0x10),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Move scalar double (SSE2 "movsd"; the id differs from the mnemonic)
+
+INST3(punpckldq,        "punpckldq",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x62),                            INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave low-order doublewords
+
+INST3(xorps,            "xorps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x57),                            INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // XOR packed singles
+
+INST3(cvttsd2si,        "cvttsd2si",        IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2C),                            INS_TT_TUPLE1_FIXED,    Input_64Bit)    // Convert (with truncation) scalar double to a signed integer
+
+INST3(movntdq,          "movntdq",          IUM_WR, PCKDBL(0xE7), BAD_CODE,     BAD_CODE,                                INS_TT_FULL_MEM,    Input_32Bit)    // Store packed integers using a non-temporal hint
+INST3(movnti,           "movnti",           IUM_WR, PCKFLT(0xC3), BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Store an int reg to memory using a non-temporal hint
+INST3(movntpd,          "movntpd",          IUM_WR, PCKDBL(0x2B), BAD_CODE,     BAD_CODE,                                INS_TT_FULL_MEM,    Input_64Bit)    // Store packed doubles using a non-temporal hint
+INST3(movntps,          "movntps",          IUM_WR, PCKFLT(0x2B), BAD_CODE,     BAD_CODE,                                INS_TT_FULL_MEM,    Input_32Bit)    // Store packed singles using a non-temporal hint
+INST3(movdqu,           "movdqu",           IUM_WR, SSEFLT(0x7F), BAD_CODE,     SSEFLT(0x6F),                            INS_TT_NONE,    INS_FLAGS_None)    // Move unaligned packed integers
+INST3(movdqa,           "movdqa",           IUM_WR, PCKDBL(0x7F), BAD_CODE,     PCKDBL(0x6F),                            INS_TT_NONE,    INS_FLAGS_None)    // Move aligned packed integers
+INST3(movlpd,           "movlpd",           IUM_WR, PCKDBL(0x13), BAD_CODE,     PCKDBL(0x12),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Move low packed double (low 64 bits <-> m64)
+INST3(movlps,           "movlps",           IUM_WR, PCKFLT(0x13), BAD_CODE,     PCKFLT(0x12),                            INS_TT_TUPLE2,           Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Move low packed singles (two 32-bit elements <-> m64); EVEX tuple type is Tuple2, same as movhps
+INST3(movhpd,           "movhpd",           IUM_WR, PCKDBL(0x17), BAD_CODE,     PCKDBL(0x16),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Move high packed double (high 64 bits <-> m64)
+INST3(movhps,           "movhps",           IUM_WR, PCKFLT(0x17), BAD_CODE,     PCKFLT(0x16),                            INS_TT_TUPLE2,           Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Move high packed singles (two 32-bit elements <-> m64)
+INST3(movss,            "movss",            IUM_WR, SSEFLT(0x11), BAD_CODE,     SSEFLT(0x10),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Move scalar single
+INST3(movapd,           "movapd",           IUM_WR, PCKDBL(0x29), BAD_CODE,     PCKDBL(0x28),                            INS_TT_FULL_MEM,         Input_64Bit)    // Move aligned packed doubles
+INST3(movaps,           "movaps",           IUM_WR, PCKFLT(0x29), BAD_CODE,     PCKFLT(0x28),                            INS_TT_FULL_MEM,         Input_32Bit)    // Move aligned packed singles
+INST3(movupd,           "movupd",           IUM_WR, PCKDBL(0x11), BAD_CODE,     PCKDBL(0x10),                            INS_TT_FULL_MEM,         Input_64Bit)    // Move unaligned packed doubles
+INST3(movups,           "movups",           IUM_WR, PCKFLT(0x11), BAD_CODE,     PCKFLT(0x10),                            INS_TT_FULL_MEM,         Input_32Bit)    // Move unaligned packed singles
+INST3(movhlps,          "movhlps",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x12),                            INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Move packed singles from the high half to the low half (register form only)
+INST3(movlhps,          "movlhps",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x16),                            INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Move packed singles from the low half to the high half (register form only)
+INST3(movmskps,         "movmskps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x50),                            INS_TT_NONE,    INS_FLAGS_None)    // Extract the sign mask of packed singles from an xmm reg into an int reg
+INST3(unpckhps,         "unpckhps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x15),                            INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave high packed singles
+INST3(unpcklps,         "unpcklps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x14),                            INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave low packed singles
+INST3(maskmovdqu,       "maskmovdqu",       IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF7),                            INS_TT_NONE,    INS_FLAGS_None)    // Byte-masked store of an xmm reg using a non-temporal hint
+
+INST3(shufps,           "shufps",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0xC6),                            INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Shuffle packed singles
+INST3(shufpd,           "shufpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC6),                            INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Shuffle packed doubles
+
+INST3(punpckhdq,        "punpckhdq",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6A),                            INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave high-order doublewords
+
+INST3(lfence,           "lfence",           IUM_RD, 0x000FE8AE,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Load fence
+INST3(mfence,           "mfence",           IUM_RD, 0x000FF0AE,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Memory (load + store) fence
+INST3(prefetchnta,      "prefetchnta",      IUM_RD, 0x000F0018,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Prefetch with a non-temporal hint
+INST3(prefetcht0,       "prefetcht0",       IUM_RD, 0x000F0818,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Prefetch with the T0 locality hint
+INST3(prefetcht1,       "prefetcht1",       IUM_RD, 0x000F1018,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Prefetch with the T1 locality hint
+INST3(prefetcht2,       "prefetcht2",       IUM_RD, 0x000F1818,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Prefetch with the T2 locality hint
+INST3(sfence,           "sfence",           IUM_RD, 0x000FF8AE,   BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Store fence
 
 // SSE 2 arith
-INST3(addps,            "addps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x58),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed singles
-INST3(addss,            "addss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x58),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add scalar singles
-INST3(addpd,            "addpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x58),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed doubles
-INST3(addsd,            "addsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x58),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add scalar doubles
-INST3(mulps,            "mulps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x59),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply packed singles
-INST3(mulss,            "mulss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x59),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply scalar single
-INST3(mulpd,            "mulpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x59),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply packed doubles
-INST3(mulsd,            "mulsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x59),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply scalar doubles
-INST3(subps,            "subps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed singles
-INST3(subss,            "subss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract scalar singles
-INST3(subpd,            "subpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed doubles
-INST3(subsd,            "subsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract scalar doubles
-INST3(minps,            "minps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum packed singles
-INST3(minss,            "minss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum scalar single
-INST3(minpd,            "minpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum packed doubles
-INST3(minsd,            "minsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum scalar double
-INST3(divps,            "divps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5E),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Divide packed singles
-INST3(divss,            "divss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5E),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Divide scalar singles
-INST3(divpd,            "divpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5E),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Divide packed doubles
-INST3(divsd,            "divsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5E),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Divide scalar doubles
-INST3(maxps,            "maxps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5F),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum packed singles
-INST3(maxss,            "maxss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5F),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum scalar single
-INST3(maxpd,            "maxpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5F),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum packed doubles
-INST3(maxsd,            "maxsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5F),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum scalar double
-INST3(xorpd,            "xorpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x57),                            INS_Flags_IsDstDstSrcAVXInstruction)    // XOR packed doubles
-INST3(andps,            "andps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x54),                            INS_Flags_IsDstDstSrcAVXInstruction)    // AND packed singles
-INST3(andpd,            "andpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x54),                            INS_Flags_IsDstDstSrcAVXInstruction)    // AND packed doubles
-INST3(sqrtps,           "sqrtps",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x51),                            INS_FLAGS_None)    // Sqrt of packed singles
-INST3(sqrtss,           "sqrtss",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x51),                            INS_Flags_IsDstSrcSrcAVXInstruction)    // Sqrt of scalar single
-INST3(sqrtpd,           "sqrtpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x51),                            INS_FLAGS_None)    // Sqrt of packed doubles
-INST3(sqrtsd,           "sqrtsd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x51),                            INS_Flags_IsDstSrcSrcAVXInstruction)    // Sqrt of scalar double
-INST3(andnps,           "andnps",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x55),                            INS_Flags_IsDstDstSrcAVXInstruction)    // And-Not packed singles
-INST3(andnpd,           "andnpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x55),                            INS_Flags_IsDstDstSrcAVXInstruction)    // And-Not packed doubles
-INST3(orps,             "orps",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x56),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Or packed singles
-INST3(orpd,             "orpd",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x56),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Or packed doubles
-INST3(haddpd,           "haddpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x7C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal add packed doubles
-INST3(haddps,           "haddps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x7C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal add packed floats
-INST3(hsubpd,           "hsubpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x7D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal subtract packed doubles
-INST3(hsubps,           "hsubps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x7D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal subtract packed floats
-INST3(addsubps,         "addsubps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xD0),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add/Subtract packed singles
-INST3(addsubpd,         "addsubpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD0),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add/Subtract packed doubles
+INST3(addps,            "addps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x58),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed singles
+INST3(addss,            "addss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x58),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add scalar singles
+INST3(addpd,            "addpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x58),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed doubles
+INST3(addsd,            "addsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x58),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add scalar doubles
+INST3(mulps,            "mulps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x59),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply packed singles
+INST3(mulss,            "mulss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x59),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply scalar single
+INST3(mulpd,            "mulpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x59),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply packed doubles
+INST3(mulsd,            "mulsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x59),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply scalar doubles
+INST3(subps,            "subps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5C),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed singles
+INST3(subss,            "subss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5C),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract scalar singles
+INST3(subpd,            "subpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5C),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed doubles
+INST3(subsd,            "subsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5C),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract scalar doubles
+INST3(minps,            "minps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5D),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum packed singles
+INST3(minss,            "minss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5D),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum scalar single
+INST3(minpd,            "minpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5D),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum packed doubles
+INST3(minsd,            "minsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5D),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Minimum scalar double
+INST3(divps,            "divps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5E),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Divide packed singles
+INST3(divss,            "divss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5E),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Divide scalar singles
+INST3(divpd,            "divpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5E),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Divide packed doubles
+INST3(divsd,            "divsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5E),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Divide scalar doubles
+INST3(maxps,            "maxps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5F),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum packed singles
+INST3(maxss,            "maxss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5F),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum scalar single
+INST3(maxpd,            "maxpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5F),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum packed doubles
+INST3(maxsd,            "maxsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5F),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Return Maximum scalar double
+INST3(xorpd,            "xorpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x57),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // XOR packed doubles
+INST3(andps,            "andps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x54),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // AND packed singles
+INST3(andpd,            "andpd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x54),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // AND packed doubles
+INST3(sqrtps,           "sqrtps",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x51),                            INS_TT_FULL,             Input_32Bit | INS_FLAGS_None)    // Sqrt of packed singles
+INST3(sqrtss,           "sqrtss",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x51),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Sqrt of scalar single
+INST3(sqrtpd,           "sqrtpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x51),                            INS_TT_FULL,             Input_64Bit | INS_FLAGS_None)    // Sqrt of packed doubles
+INST3(sqrtsd,           "sqrtsd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x51),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Sqrt of scalar double
+INST3(andnps,           "andnps",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x55),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // And-Not packed singles
+INST3(andnpd,           "andnpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x55),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // And-Not packed doubles
+INST3(orps,             "orps",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x56),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Or packed singles
+INST3(orpd,             "orpd",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x56),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Or packed doubles
+INST3(haddpd,           "haddpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x7C),                            INS_TT_NONE,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal add packed doubles
+INST3(haddps,           "haddps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x7C),                            INS_TT_NONE,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal add packed floats
+INST3(hsubpd,           "hsubpd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x7D),                            INS_TT_NONE,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal subtract packed doubles
+INST3(hsubps,           "hsubps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x7D),                            INS_TT_NONE,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Horizontal subtract packed floats
+INST3(addsubps,         "addsubps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xD0),                            INS_TT_NONE,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add/Subtract packed singles
+INST3(addsubpd,         "addsubpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD0),                            INS_TT_NONE,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add/Subtract packed doubles
 
 // SSE 2 approx arith
-INST3(rcpps,            "rcpps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x53),                            INS_FLAGS_None)    // Reciprocal of packed singles
-INST3(rcpss,            "rcpss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x53),                            INS_Flags_IsDstSrcSrcAVXInstruction)    // Reciprocal of scalar single
-INST3(rsqrtps,          "rsqrtps",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x52),                            INS_FLAGS_None)    // Reciprocal Sqrt of packed singles
-INST3(rsqrtss,          "rsqrtss",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x52),                            INS_Flags_IsDstSrcSrcAVXInstruction)    // Reciprocal Sqrt of scalar single
+INST3(rcpps,            "rcpps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x53),                            INS_TT_NONE,    Input_32Bit | INS_FLAGS_None)    // Reciprocal of packed singles
+INST3(rcpss,            "rcpss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x53),                            INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Reciprocal of scalar single
+INST3(rsqrtps,          "rsqrtps",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x52),                            INS_TT_NONE,    Input_32Bit | INS_FLAGS_None)    // Reciprocal Sqrt of packed singles
+INST3(rsqrtss,          "rsqrtss",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x52),                            INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Reciprocal Sqrt of scalar single
 
 // SSE2 conversions
-INST3(cvtpi2ps,         "cvtpi2ps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x2A),                            INS_FLAGS_None)    // cvt packed DWORDs to singles
-INST3(cvtsi2ss,         "cvtsi2ss",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2A),                            INS_Flags_IsDstDstSrcAVXInstruction)    // cvt DWORD to scalar single
-INST3(cvtpi2pd,         "cvtpi2pd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x2A),                            INS_FLAGS_None)    // cvt packed DWORDs to doubles
-INST3(cvtsi2sd,         "cvtsi2sd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2A),                            INS_Flags_IsDstDstSrcAVXInstruction)    // cvt DWORD to scalar double
-INST3(cvttps2pi,        "cvttps2pi",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x2C),                            INS_FLAGS_None)    // cvt with trunc packed singles to DWORDs
-INST3(cvttss2si,        "cvttss2si",        IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2C),                            INS_FLAGS_None)    // cvt with trunc scalar single to DWORD
-INST3(cvttpd2pi,        "cvttpd2pi",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x2C),                            INS_FLAGS_None)    // cvt with trunc packed doubles to DWORDs
-INST3(cvtps2pi,         "cvtps2pi",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x2D),                            INS_FLAGS_None)    // cvt packed singles to DWORDs
-INST3(cvtss2si,         "cvtss2si",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2D),                            INS_FLAGS_None)    // cvt scalar single to DWORD
-INST3(cvtpd2pi,         "cvtpd2pi",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x2D),                            INS_FLAGS_None)    // cvt packed doubles to DWORDs
-INST3(cvtsd2si,         "cvtsd2si",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2D),                            INS_FLAGS_None)    // cvt scalar double to DWORD
-INST3(cvtps2pd,         "cvtps2pd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5A),                            INS_FLAGS_None)    // cvt packed singles to doubles
-INST3(cvtpd2ps,         "cvtpd2ps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5A),                            INS_FLAGS_None)    // cvt packed doubles to singles
-INST3(cvtss2sd,         "cvtss2sd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5A),                            INS_Flags_IsDstDstSrcAVXInstruction)    // cvt scalar single to scalar doubles
-INST3(cvtsd2ss,         "cvtsd2ss",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5A),                            INS_Flags_IsDstDstSrcAVXInstruction)    // cvt scalar double to scalar singles
-INST3(cvtdq2ps,         "cvtdq2ps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5B),                            INS_FLAGS_None)    // cvt packed DWORDs to singles
-INST3(cvtps2dq,         "cvtps2dq",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5B),                            INS_FLAGS_None)    // cvt packed singles to DWORDs
-INST3(cvttps2dq,        "cvttps2dq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5B),                            INS_FLAGS_None)    // cvt with trunc packed singles to DWORDs
-INST3(cvtpd2dq,         "cvtpd2dq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xE6),                            INS_FLAGS_None)    // cvt packed doubles to DWORDs
-INST3(cvttpd2dq,        "cvttpd2dq",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE6),                            INS_FLAGS_None)    // cvt with trunc packed doubles to DWORDs
-INST3(cvtdq2pd,         "cvtdq2pd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xE6),                            INS_FLAGS_None)    // cvt packed DWORDs to doubles
+INST3(cvtpi2ps,         "cvtpi2ps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x2A),                            INS_TT_NONE,             Input_32Bit)    // cvt packed DWORDs to singles
+INST3(cvtsi2ss32,       "cvtsi2ss",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2A),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // cvt DWORD to scalar single
+INST3(cvtsi2ss64,       "cvtsi2ss",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2A),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // cvt QWORD to scalar single
+INST3(cvtpi2pd,         "cvtpi2pd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x2A),                            INS_TT_NONE,             Input_32Bit)    // cvt packed DWORDs to doubles
+INST3(cvtsi2sd32,       "cvtsi2sd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2A),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // cvt DWORD to scalar double
+INST3(cvtsi2sd64,       "cvtsi2sd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2A),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // cvt QWORD to scalar double
+INST3(cvttps2pi,        "cvttps2pi",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x2C),                            INS_TT_NONE,             Input_32Bit)    // cvt with trunc packed singles to DWORDs
+INST3(cvttss2si,        "cvttss2si",        IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2C),                            INS_TT_TUPLE1_FIXED,     Input_32Bit)    // cvt with trunc scalar single to DWORD
+INST3(cvttpd2pi,        "cvttpd2pi",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x2C),                            INS_TT_NONE,             Input_64Bit)    // cvt with trunc packed doubles to DWORDs
+INST3(cvtps2pi,         "cvtps2pi",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x2D),                            INS_TT_NONE,             Input_32Bit)    // cvt packed singles to DWORDs
+INST3(cvtss2si,         "cvtss2si",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x2D),                            INS_TT_TUPLE1_FIXED,     Input_32Bit)    // cvt scalar single to DWORD
+INST3(cvtpd2pi,         "cvtpd2pi",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x2D),                            INS_TT_NONE,             Input_64Bit)    // cvt packed doubles to DWORDs
+INST3(cvtsd2si,         "cvtsd2si",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x2D),                            INS_TT_TUPLE1_FIXED,     Input_64Bit)    // cvt scalar double to DWORD
+INST3(cvtps2pd,         "cvtps2pd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5A),                            INS_TT_HALF,             Input_32Bit)    // cvt packed singles to doubles
+INST3(cvtpd2ps,         "cvtpd2ps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5A),                            INS_TT_FULL,             Input_64Bit)    // cvt packed doubles to singles
+INST3(cvtss2sd,         "cvtss2sd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5A),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // cvt scalar single to scalar doubles
+INST3(cvtsd2ss,         "cvtsd2ss",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x5A),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // cvt scalar double to scalar singles
+INST3(cvtdq2ps,         "cvtdq2ps",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0x5B),                            INS_TT_FULL,             Input_32Bit)    // cvt packed DWORDs to singles
+INST3(cvtps2dq,         "cvtps2dq",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x5B),                            INS_TT_FULL,             Input_32Bit)    // cvt packed singles to DWORDs
+INST3(cvttps2dq,        "cvttps2dq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x5B),                            INS_TT_FULL,             Input_32Bit)    // cvt with trunc packed singles to DWORDs
+INST3(cvtpd2dq,         "cvtpd2dq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xE6),                            INS_TT_FULL,             Input_64Bit)    // cvt packed doubles to DWORDs
+INST3(cvttpd2dq,        "cvttpd2dq",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE6),                            INS_TT_FULL,             Input_64Bit)    // cvt with trunc packed doubles to DWORDs
+INST3(cvtdq2pd,         "cvtdq2pd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xE6),                            INS_TT_HALF,             Input_32Bit)    // cvt packed DWORDs to doubles
 
 // SSE2 comparison instructions
-INST3(comiss,           "comiss",           IUM_RD, BAD_CODE,     BAD_CODE,     PCKFLT(0x2F),                            Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // ordered compare singles
-INST3(comisd,           "comisd",           IUM_RD, BAD_CODE,     BAD_CODE,     PCKDBL(0x2F),                            Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // ordered compare doubles
-INST3(ucomiss,          "ucomiss",          IUM_RD, BAD_CODE,     BAD_CODE,     PCKFLT(0x2E),                            Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // unordered compare singles
-INST3(ucomisd,          "ucomisd",          IUM_RD, BAD_CODE,     BAD_CODE,     PCKDBL(0x2E),                            Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // unordered compare doubles
+INST3(comiss,           "comiss",           IUM_RD, BAD_CODE,     BAD_CODE,     PCKFLT(0x2F),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // ordered compare singles
+INST3(comisd,           "comisd",           IUM_RD, BAD_CODE,     BAD_CODE,     PCKDBL(0x2F),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // ordered compare doubles
+INST3(ucomiss,          "ucomiss",          IUM_RD, BAD_CODE,     BAD_CODE,     PCKFLT(0x2E),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // unordered compare singles
+INST3(ucomisd,          "ucomisd",          IUM_RD, BAD_CODE,     BAD_CODE,     PCKDBL(0x2E),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | Resets_OF  | Resets_SF     | Writes_ZF     | Resets_AF     | Writes_PF     | Writes_CF                     )  // unordered compare doubles
 
 // SSE2 packed single/double comparison operations.
 // Note that these instructions not only compare but also overwrite the first source.
-INST3(cmpps,            "cmpps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0xC2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // compare packed singles
-INST3(cmppd,            "cmppd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // compare packed doubles
-INST3(cmpss,            "cmpss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xC2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // compare scalar singles
-INST3(cmpsd,            "cmpsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xC2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // compare scalar doubles
+INST3(cmpps,            "cmpps",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKFLT(0xC2),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // compare packed singles
+INST3(cmppd,            "cmppd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC2),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // compare packed doubles
+INST3(cmpss,            "cmpss",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xC2),                            INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // compare scalar singles
+INST3(cmpsd,            "cmpsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xC2),                            INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // compare scalar doubles
 
 //SSE2 packed integer operations
-INST3(paddb,            "paddb",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFC),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed byte integers
-INST3(paddw,            "paddw",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFD),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed word (16-bit) integers
-INST3(paddd,            "paddd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFE),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed double-word (32-bit) integers
-INST3(paddq,            "paddq",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD4),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed quad-word (64-bit) integers
-INST3(paddsb,           "paddsb",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEC),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed signed byte integers and saturate the results
-INST3(paddsw,           "paddsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xED),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed signed word integers and saturate the results
-INST3(paddusb,          "paddusb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDC),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed unsigned byte integers and saturate the results
-INST3(paddusw,          "paddusw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDD),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed unsigned word integers and saturate the results
-INST3(pavgb,            "pavgb",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE0),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Average of packed byte integers
-INST3(pavgw,            "pavgw",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE3),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Average of packed word integers
-INST3(psubb,            "psubb",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF8),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed word (16-bit) integers
-INST3(psubw,            "psubw",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF9),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed word (16-bit) integers
-INST3(psubd,            "psubd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFA),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed double-word (32-bit) integers
-INST3(psubq,            "psubq",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFB),                            INS_Flags_IsDstDstSrcAVXInstruction)    // subtract packed quad-word (64-bit) integers
-INST3(pmaddwd,          "pmaddwd",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF5),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst
-INST3(pmulhw,           "pmulhw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE5),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply high the packed 16-bit signed integers
-INST3(pmulhuw,          "pmulhuw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE4),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply high the packed 16-bit unsigned integers
-INST3(pmuludq,          "pmuludq",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF4),                            INS_Flags_IsDstDstSrcAVXInstruction)    // packed multiply 32-bit unsigned integers and store 64-bit result
-INST3(pmullw,           "pmullw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD5),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result
-INST3(pand,             "pand",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDB),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise AND of two xmm regs
-INST3(pandn,            "pandn",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDF),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise AND NOT of two xmm regs
-INST3(por,              "por",              IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEB),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise OR of two xmm regs
-INST3(pxor,             "pxor",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEF),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise XOR of two xmm regs
-INST3(psadbw,           "psadbw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF6),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Compute the sum of absolute differences of packed unsigned 8-bit integers
-INST3(psubsb,           "psubsb",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE8),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation
-INST3(psubusb,          "psubusb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD8),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation
-INST3(psubsw,           "psubsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE9),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation
-INST3(psubusw,          "psubusw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD9),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation
+INST3(paddb,            "paddb",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFC),                            INS_TT_FULL_MEM,         Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed byte integers
+INST3(paddw,            "paddw",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFD),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed word (16-bit) integers
+INST3(paddd,            "paddd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFE),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed double-word (32-bit) integers
+INST3(paddq,            "paddq",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD4),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed quad-word (64-bit) integers
+INST3(paddsb,           "paddsb",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEC),                            INS_TT_FULL_MEM,         Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed signed byte integers and saturate the results
+INST3(paddsw,           "paddsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xED),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed signed word integers and saturate the results
+INST3(paddusb,          "paddusb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDC),                            INS_TT_FULL_MEM,         Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed unsigned byte integers and saturate the results
+INST3(paddusw,          "paddusw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDD),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Add packed unsigned word integers and saturate the results
+INST3(pavgb,            "pavgb",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE0),                            INS_TT_FULL_MEM,         Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Average of packed byte integers
+INST3(pavgw,            "pavgw",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE3),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Average of packed word integers
+INST3(psubb,            "psubb",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF8),                            INS_TT_FULL_MEM,         Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed byte (8-bit) integers
+INST3(psubw,            "psubw",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF9),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed word (16-bit) integers
+INST3(psubd,            "psubd",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFA),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed double-word (32-bit) integers
+INST3(psubq,            "psubq",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xFB),                            INS_TT_FULL,             Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed quad-word (64-bit) integers
+INST3(pmaddwd,          "pmaddwd",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF5),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst
+INST3(pmulhw,           "pmulhw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE5),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply high the packed 16-bit signed integers
+INST3(pmulhuw,          "pmulhuw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE4),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply high the packed 16-bit unsigned integers
+INST3(pmuludq,          "pmuludq",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF4),                            INS_TT_FULL,             Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed multiply 32-bit unsigned integers and store 64-bit result
+INST3(pmullw,           "pmullw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD5),                            INS_TT_FULL_MEM,         Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result
+// TODO-XArch-AVX512: pand, pandn, por, and pxor have AVX512 instructions under different names, pandd, pandq etc
+INST3(pand,             "pand",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDB),                            INS_TT_NONE,         INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise AND of two xmm regs
+INST3(pandn,            "pandn",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDF),                            INS_TT_NONE,         INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise AND NOT of two xmm regs
+INST3(por,              "por",              IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEB),                            INS_TT_NONE,         INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise OR of two xmm regs
+INST3(pxor,             "pxor",             IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEF),                            INS_TT_NONE,         INS_Flags_IsDstDstSrcAVXInstruction)    // Packed bit-wise XOR of two xmm regs
+INST3(psadbw,           "psadbw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xF6),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Compute the sum of absolute differences of packed unsigned 8-bit integers
+INST3(psubsb,           "psubsb",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE8),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation
+INST3(psubusb,          "psubusb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD8),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation
+INST3(psubsw,           "psubsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xE9),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation
+INST3(psubusw,          "psubusw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xD9),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation
 
 // Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode,
 // which is handled in emitxarch.cpp.
-INST3(psrldq,           "psrldq",           IUM_WR, BAD_CODE,     PCKDBL(0x73), BAD_CODE,                                INS_Flags_IsDstDstSrcAVXInstruction)    // Shift right logical of xmm reg by given number of bytes
-INST3(pslldq,           "pslldq",           IUM_WR, BAD_CODE,     PCKDBL(0x73), BAD_CODE,                                INS_Flags_IsDstDstSrcAVXInstruction)    // Shift left logical of xmm reg by given number of bytes
-INST3(psllw,            "psllw",            IUM_WR, BAD_CODE,     PCKDBL(0x71), PCKDBL(0xF1),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift left logical of 16-bit integers
-INST3(pslld,            "pslld",            IUM_WR, BAD_CODE,     PCKDBL(0x72), PCKDBL(0xF2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift left logical of 32-bit integers
-INST3(psllq,            "psllq",            IUM_WR, BAD_CODE,     PCKDBL(0x73), PCKDBL(0xF3),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift left logical of 64-bit integers
-INST3(psrlw,            "psrlw",            IUM_WR, BAD_CODE,     PCKDBL(0x71), PCKDBL(0xD1),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right logical of 16-bit integers
-INST3(psrld,            "psrld",            IUM_WR, BAD_CODE,     PCKDBL(0x72), PCKDBL(0xD2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right logical of 32-bit integers
-INST3(psrlq,            "psrlq",            IUM_WR, BAD_CODE,     PCKDBL(0x73), PCKDBL(0xD3),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right logical of 64-bit integers
-INST3(psraw,            "psraw",            IUM_WR, BAD_CODE,     PCKDBL(0x71), PCKDBL(0xE1),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right arithmetic of 16-bit integers
-INST3(psrad,            "psrad",            IUM_WR, BAD_CODE,     PCKDBL(0x72), PCKDBL(0xE2),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right arithmetic of 32-bit integers
-
-INST3(pmaxub,           "pmaxub",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDE),                            INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum unsigned bytes
-INST3(pminub,           "pminub",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDA),                            INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum unsigned bytes
-INST3(pmaxsw,           "pmaxsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEE),                            INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum signed words
-INST3(pminsw,           "pminsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEA),                            INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum signed words
-INST3(pcmpeqd,          "pcmpeqd",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x76),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 32-bit integers for equality
-INST3(pcmpgtd,          "pcmpgtd",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x66),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 32-bit signed integers for greater than
-INST3(pcmpeqw,          "pcmpeqw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x75),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 16-bit integers for equality
-INST3(pcmpgtw,          "pcmpgtw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x65),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 16-bit signed integers for greater than
-INST3(pcmpeqb,          "pcmpeqb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x74),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 8-bit integers for equality
-INST3(pcmpgtb,          "pcmpgtb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x64),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 8-bit signed integers for greater than
-
-INST3(pshufd,           "pshufd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x70),                            INS_FLAGS_None)    // Packed shuffle of 32-bit integers
-INST3(pshufhw,          "pshufhw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x70),                            INS_FLAGS_None)    // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
-INST3(pshuflw,          "pshuflw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x70),                            INS_FLAGS_None)    // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
-INST3(pextrw,           "pextrw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC5),                            INS_FLAGS_None)    // Extract 16-bit value into a r32 with zero extended to 32-bits
-INST3(pinsrw,           "pinsrw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC4),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Insert word at index
-
-INST3(punpckhbw,        "punpckhbw",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x68),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ubyte to ushort (hi)
-INST3(punpcklbw,        "punpcklbw",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x60),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ubyte to ushort (lo)
-INST3(punpckhqdq,       "punpckhqdq",       IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6D),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen uint to ulong (hi)
-INST3(punpcklqdq,       "punpcklqdq",       IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6C),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen uint to ulong (lo)
-INST3(punpckhwd,        "punpckhwd",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x69),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ushort to uint (hi)
-INST3(punpcklwd,        "punpcklwd",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x61),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ushort to uint (lo)
-INST3(unpckhpd,         "unpckhpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x15),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ubyte to ushort (hi)
-INST3(unpcklpd,         "unpcklpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x14),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ubyte to ushort (hi)
-
-INST3(packssdw,         "packssdw",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6B),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) int to short with saturation
-INST3(packsswb,         "packsswb",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x63),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) short to byte with saturation
-INST3(packuswb,         "packuswb",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x67),                            INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) short to unsigned byte with saturation
+// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value.
+// This hopefully means we do not need psrldq128/psrldq256/psrldq512 etc
+INST3(psrldq,           "psrldq",           IUM_WR, BAD_CODE,     PCKDBL(0x73), BAD_CODE,                                INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Shift right logical of xmm reg by given number of bytes
+INST3(pslldq,           "pslldq",           IUM_WR, BAD_CODE,     PCKDBL(0x73), BAD_CODE,                                INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Shift left logical of xmm reg by given number of bytes
+INST3(psllw,            "psllw",            IUM_WR, BAD_CODE,     PCKDBL(0x71), PCKDBL(0xF1),                            INS_TT_FULL_MEM | INS_TT_MEM128,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift left logical of 16-bit integers
+INST3(pslld,            "pslld",            IUM_WR, BAD_CODE,     PCKDBL(0x72), PCKDBL(0xF2),                            INS_TT_FULL | INS_TT_MEM128,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift left logical of 32-bit integers
+INST3(psllq,            "psllq",            IUM_WR, BAD_CODE,     PCKDBL(0x73), PCKDBL(0xF3),                            INS_TT_FULL | INS_TT_MEM128,         Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift left logical of 64-bit integers
+INST3(psrlw,            "psrlw",            IUM_WR, BAD_CODE,     PCKDBL(0x71), PCKDBL(0xD1),                            INS_TT_FULL_MEM | INS_TT_MEM128,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right logical of 16-bit integers
+INST3(psrld,            "psrld",            IUM_WR, BAD_CODE,     PCKDBL(0x72), PCKDBL(0xD2),                            INS_TT_FULL | INS_TT_MEM128,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right logical of 32-bit integers
+INST3(psrlq,            "psrlq",            IUM_WR, BAD_CODE,     PCKDBL(0x73), PCKDBL(0xD3),                            INS_TT_FULL | INS_TT_MEM128,         Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right logical of 64-bit integers
+INST3(psraw,            "psraw",            IUM_WR, BAD_CODE,     PCKDBL(0x71), PCKDBL(0xE1),                            INS_TT_FULL_MEM | INS_TT_MEM128,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right arithmetic of 16-bit integers
+INST3(psrad,            "psrad",            IUM_WR, BAD_CODE,     PCKDBL(0x72), PCKDBL(0xE2),                            INS_TT_FULL | INS_TT_MEM128,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed shift right arithmetic of 32-bit integers
+INST3(pmaxub,           "pmaxub",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDE),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum unsigned bytes
+INST3(pminub,           "pminub",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDA),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum unsigned bytes
+INST3(pmaxsw,           "pmaxsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEE),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum signed words
+INST3(pminsw,           "pminsw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEA),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum signed words
+INST3(pcmpeqd,          "pcmpeqd",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x76),                            INS_TT_FULL,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 32-bit integers for equality
+INST3(pcmpgtd,          "pcmpgtd",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x66),                            INS_TT_FULL,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 32-bit signed integers for greater than
+INST3(pcmpeqw,          "pcmpeqw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x75),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 16-bit integers for equality
+INST3(pcmpgtw,          "pcmpgtw",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x65),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 16-bit signed integers for greater than
+INST3(pcmpeqb,          "pcmpeqb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x74),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 8-bit integers for equality
+INST3(pcmpgtb,          "pcmpgtb",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x64),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 8-bit signed integers for greater than
+
+INST3(pshufd,           "pshufd",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x70),                            INS_TT_FULL,             Input_32Bit)    // Packed shuffle of 32-bit integers
+INST3(pshufhw,          "pshufhw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x70),                            INS_TT_FULL_MEM,         Input_16Bit)    // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
+INST3(pshuflw,          "pshuflw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x70),                            INS_TT_FULL_MEM,         Input_16Bit)    // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
+INST3(pextrw,           "pextrw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC5),                            INS_TT_TUPLE1_SCALAR,    Input_16Bit)    // Extract 16-bit value into a r32 with zero extended to 32-bits
+INST3(pinsrw,           "pinsrw",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xC4),                            INS_TT_TUPLE1_SCALAR,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Insert word at index
+
+INST3(punpckhbw,        "punpckhbw",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x68),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ubyte to ushort (hi)
+INST3(punpcklbw,        "punpcklbw",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x60),                            INS_TT_FULL_MEM,     Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ubyte to ushort (lo)
+INST3(punpckhqdq,       "punpckhqdq",       IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6D),                            INS_TT_FULL,         Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave the high-order quadwords (64-bit) of the two sources
+INST3(punpcklqdq,       "punpcklqdq",       IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6C),                            INS_TT_FULL,         Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave the low-order quadwords (64-bit) of the two sources
+INST3(punpckhwd,        "punpckhwd",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x69),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ushort to uint (hi)
+INST3(punpcklwd,        "punpcklwd",        IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x61),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed logical (unsigned) widen ushort to uint (lo)
+INST3(unpckhpd,         "unpckhpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x15),                            INS_TT_FULL,         Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave the high packed double-precision floating-point values
+INST3(unpcklpd,         "unpcklpd",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x14),                            INS_TT_FULL,         Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Unpack and interleave the low packed double-precision floating-point values
+
+INST3(packssdw,         "packssdw",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x6B),                            INS_TT_FULL,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) int to short with saturation
+INST3(packsswb,         "packsswb",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x63),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) short to byte with saturation
+INST3(packuswb,         "packuswb",         IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0x67),                            INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) short to unsigned byte with saturation
 
 //    id                nm                  um      mr            mi            rm                                       flags
-INST3(dpps,             "dpps",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x40),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed dot product of two float vector regs
-INST3(dppd,             "dppd",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x41),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed dot product of two double vector regs
-INST3(insertps,         "insertps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x21),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Insert packed single precision float value
-INST3(pcmpeqq,          "pcmpeqq",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x29),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 64-bit integers for equality
-INST3(pcmpgtq,          "pcmpgtq",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x37),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 64-bit integers for equality
-INST3(pmulld,           "pmulld",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x40),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
-INST3(ptest,            "ptest",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x17),                             INS_FLAGS_None)    // Packed logical compare
-INST3(phaddd,           "phaddd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x02),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal add
-INST3(pabsb,            "pabsb",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1C),                             INS_FLAGS_None)    // Packed absolute value of bytes
-INST3(pabsw,            "pabsw",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1D),                             INS_FLAGS_None)    // Packed absolute value of 16-bit integers
-INST3(pabsd,            "pabsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1E),                             INS_FLAGS_None)    // Packed absolute value of 32-bit integers
-INST3(palignr,          "palignr",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0F),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed Align Right
-INST3(pmaddubsw,        "pmaddubsw",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x04),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Packed Signed and Unsigned Bytes
-INST3(pmulhrsw,         "pmulhrsw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0B),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed Multiply High with Round and Scale
-INST3(pshufb,           "pshufb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x00),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed Shuffle Bytes
-INST3(psignb,           "psignb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x08),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed SIGN
-INST3(psignw,           "psignw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x09),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed SIGN
-INST3(psignd,           "psignd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0A),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed SIGN
-INST3(pminsb,           "pminsb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x38),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum signed bytes
-INST3(pminsd,           "pminsd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x39),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum 32-bit signed integers
-INST3(pminuw,           "pminuw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3A),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum 16-bit unsigned integers
-INST3(pminud,           "pminud",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3B),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum 32-bit unsigned integers
-INST3(pmaxsb,           "pmaxsb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum signed bytes
-INST3(pmaxsd,           "pmaxsd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum 32-bit signed integers
-INST3(pmaxuw,           "pmaxuw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3E),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum 16-bit unsigned integers
-INST3(pmaxud,           "pmaxud",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3F),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum 32-bit unsigned integers
-INST3(pmovsxbw,         "pmovsxbw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x20),                             INS_FLAGS_None)    // Packed sign extend byte to short
-INST3(pmovsxbd,         "pmovsxbd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x21),                             INS_FLAGS_None)    // Packed sign extend byte to int
-INST3(pmovsxbq,         "pmovsxbq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x22),                             INS_FLAGS_None)    // Packed sign extend byte to long
-INST3(pmovsxwd,         "pmovsxwd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x23),                             INS_FLAGS_None)    // Packed sign extend short to int
-INST3(pmovsxwq,         "pmovsxwq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x24),                             INS_FLAGS_None)    // Packed sign extend short to long
-INST3(pmovsxdq,         "pmovsxdq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x25),                             INS_FLAGS_None)    // Packed sign extend int to long
-INST3(pmovzxbw,         "pmovzxbw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x30),                             INS_FLAGS_None)    // Packed zero extend byte to short
-INST3(pmovzxbd,         "pmovzxbd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x31),                             INS_FLAGS_None)    // Packed zero extend byte to intg
-INST3(pmovzxbq,         "pmovzxbq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x32),                             INS_FLAGS_None)    // Packed zero extend byte to lon
-INST3(pmovzxwd,         "pmovzxwd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x33),                             INS_FLAGS_None)    // Packed zero extend short to int
-INST3(pmovzxwq,         "pmovzxwq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x34),                             INS_FLAGS_None)    // Packed zero extend short to long
-INST3(pmovzxdq,         "pmovzxdq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x35),                             INS_FLAGS_None)    // Packed zero extend int to long
-INST3(packusdw,         "packusdw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x2B),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) int to unsigned short with saturation
-INST3(roundps,          "roundps",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x08),                             INS_FLAGS_None)    // Round packed single precision floating-point values
-INST3(roundss,          "roundss",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0A),                             INS_Flags_IsDstSrcSrcAVXInstruction)    // Round scalar single precision floating-point values
-INST3(roundpd,          "roundpd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x09),                             INS_FLAGS_None)    // Round packed double precision floating-point values
-INST3(roundsd,          "roundsd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0B),                             INS_Flags_IsDstSrcSrcAVXInstruction)    // Round scalar double precision floating-point values
-INST3(pmuldq,           "pmuldq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x28),                             INS_Flags_IsDstDstSrcAVXInstruction)    // packed multiply 32-bit signed integers and store 64-bit result
-INST3(blendps,          "blendps",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed Single Precision Floating-Point Values
-INST3(blendvps,         "blendvps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x14),                             INS_FLAGS_None)    // Variable Blend Packed Singles
-INST3(blendpd,          "blendpd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed Double Precision Floating-Point Values
-INST3(blendvpd,         "blendvpd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x15),                             INS_FLAGS_None)    // Variable Blend Packed Doubles
-INST3(pblendw,          "pblendw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0E),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed Words
-INST3(pblendvb,         "pblendvb",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x10),                             INS_FLAGS_None)    // Variable Blend Packed Bytes
-INST3(phaddw,           "phaddw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x01),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal add of 16-bit integers
-INST3(phsubw,           "phsubw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x05),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal subtract of 16-bit integers
-INST3(phsubd,           "phsubd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x06),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal subtract of 32-bit integers
-INST3(phaddsw,          "phaddsw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x03),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal add of 16-bit integers with saturation
-INST3(phsubsw,          "phsubsw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x07),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal subtract of 16-bit integers with saturation
-INST3(lddqu,            "lddqu",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xF0),                            INS_FLAGS_None)    // Load Unaligned integer
-INST3(movntdqa,         "movntdqa",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x2A),                             INS_FLAGS_None)    // Load Double Quadword Non-Temporal Aligned Hint
-INST3(movddup,          "movddup",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x12),                            INS_FLAGS_None)    // Replicate Double FP Values
-INST3(movsldup,         "movsldup",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x12),                            INS_FLAGS_None)    // Replicate even-indexed Single FP Values
-INST3(movshdup,         "movshdup",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x16),                            INS_FLAGS_None)    // Replicate odd-indexed Single FP Values
-INST3(phminposuw,       "phminposuw",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x41),                             INS_FLAGS_None)    // Packed Horizontal Word Minimum
-INST3(mpsadbw,          "mpsadbw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x42),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Compute Multiple Packed Sums of Absolute Difference
-INST3(pinsrb,           "pinsrb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x20),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Insert Byte
-INST3(pinsrd,           "pinsrd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x22),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Insert Dword
-INST3(pinsrq,           "pinsrq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x22),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Insert Qword
-INST3(pextrb,           "pextrb",           IUM_WR, SSE3A(0x14),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract Byte
-INST3(pextrd,           "pextrd",           IUM_WR, SSE3A(0x16),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract Dword
-INST3(pextrq,           "pextrq",           IUM_WR, SSE3A(0x16),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract Qword
-INST3(pextrw_sse41,     "pextrw",           IUM_WR, SSE3A(0x15),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract Word
-INST3(extractps,        "extractps",        IUM_WR, SSE3A(0x17),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract Packed Floating-Point Values
+INST3(dpps,             "dpps",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x40),                             INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed dot product of two float vector regs
+INST3(dppd,             "dppd",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x41),                             INS_TT_NONE,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed dot product of two double vector regs
+INST3(insertps,         "insertps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x21),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Insert packed single precision float value
+INST3(pcmpeqq,          "pcmpeqq",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x29),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 64-bit integers for equality
+INST3(pcmpgtq,          "pcmpgtq",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x37),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed compare 64-bit signed integers for greater than
+INST3(pmulld,           "pmulld",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x40),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed multiply 32-bit signed integers and store lower 32 bits of each result
+INST3(ptest,            "ptest",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x17),                             INS_TT_NONE,    INS_FLAGS_None)    // Packed logical compare
+INST3(phaddd,           "phaddd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x02),                             INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal add
+INST3(pabsb,            "pabsb",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1C),                             INS_TT_FULL_MEM,    Input_8Bit | INS_FLAGS_None)    // Packed absolute value of bytes
+INST3(pabsw,            "pabsw",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1D),                             INS_TT_FULL_MEM,    Input_16Bit | INS_FLAGS_None)    // Packed absolute value of 16-bit integers
+INST3(pabsd,            "pabsd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1E),                             INS_TT_FULL,    Input_32Bit | INS_FLAGS_None)    // Packed absolute value of 32-bit integers
+// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value.
+// This hopefully means we do not need palignr128/palignr256/palignr512 etc.
+INST3(palignr,          "palignr",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0F),                             INS_TT_FULL_MEM,    INS_Flags_IsDstDstSrcAVXInstruction)    // Packed Align Right
+INST3(pmaddubsw,        "pmaddubsw",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x04),                             INS_TT_FULL_MEM,    INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Packed Signed and Unsigned Bytes
+INST3(pmulhrsw,         "pmulhrsw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0B),                             INS_TT_FULL_MEM,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed Multiply High with Round and Scale
+INST3(pshufb,           "pshufb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x00),                             INS_TT_FULL_MEM,    Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed Shuffle Bytes
+INST3(psignb,           "psignb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x08),                             INS_TT_NONE,    Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed SIGN
+INST3(psignw,           "psignw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x09),                             INS_TT_NONE,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed SIGN
+INST3(psignd,           "psignd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0A),                             INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed SIGN
+INST3(pminsb,           "pminsb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x38),                             INS_TT_FULL_MEM,    Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum signed bytes
+INST3(pminsd,           "pminsd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x39),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum 32-bit signed integers
+INST3(pminuw,           "pminuw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3A),                             INS_TT_FULL_MEM,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum 16-bit unsigned integers
+INST3(pminud,           "pminud",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3B),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed minimum 32-bit unsigned integers
+INST3(pmaxsb,           "pmaxsb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3C),                             INS_TT_FULL_MEM,    Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum signed bytes
+INST3(pmaxsd,           "pmaxsd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3D),                             INS_TT_FULL,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum 32-bit signed integers
+INST3(pmaxuw,           "pmaxuw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3E),                             INS_TT_FULL_MEM,     Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum 16-bit unsigned integers
+INST3(pmaxud,           "pmaxud",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x3F),                             INS_TT_FULL,         Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed maximum 32-bit unsigned integers
+INST3(pmovsxbw,         "pmovsxbw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x20),                             INS_TT_HALF_MEM,     Input_8Bit)    // Packed sign extend byte to short
+INST3(pmovsxbd,         "pmovsxbd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x21),                             INS_TT_QUARTER_MEM,  Input_8Bit)    // Packed sign extend byte to int
+INST3(pmovsxbq,         "pmovsxbq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x22),                             INS_TT_EIGHTH_MEM,   Input_8Bit)    // Packed sign extend byte to long
+INST3(pmovsxwd,         "pmovsxwd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x23),                             INS_TT_HALF_MEM,     Input_16Bit)    // Packed sign extend short to int
+INST3(pmovsxwq,         "pmovsxwq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x24),                             INS_TT_QUARTER_MEM,  Input_16Bit)    // Packed sign extend short to long
+INST3(pmovsxdq,         "pmovsxdq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x25),                             INS_TT_HALF_MEM,     Input_32Bit)    // Packed sign extend int to long
+INST3(pmovzxbw,         "pmovzxbw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x30),                             INS_TT_HALF_MEM,     Input_8Bit)    // Packed zero extend byte to short
+INST3(pmovzxbd,         "pmovzxbd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x31),                             INS_TT_QUARTER_MEM,  Input_8Bit)    // Packed zero extend byte to int
+INST3(pmovzxbq,         "pmovzxbq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x32),                             INS_TT_EIGHTH_MEM,   Input_8Bit)    // Packed zero extend byte to long
+INST3(pmovzxwd,         "pmovzxwd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x33),                             INS_TT_HALF_MEM,     Input_16Bit)    // Packed zero extend short to int
+INST3(pmovzxwq,         "pmovzxwq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x34),                             INS_TT_QUARTER_MEM,  Input_16Bit)    // Packed zero extend short to long
+INST3(pmovzxdq,         "pmovzxdq",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x35),                             INS_TT_HALF_MEM,     Input_32Bit)    // Packed zero extend int to long
+INST3(packusdw,         "packusdw",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x2B),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Pack (narrow) int to unsigned short with saturation
+INST3(roundps,          "roundps",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x08),                             INS_TT_NONE,    Input_32Bit | INS_FLAGS_None)    // Round packed single precision floating-point values
+INST3(roundss,          "roundss",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0A),                             INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Round scalar single precision floating-point values
+INST3(roundpd,          "roundpd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x09),                             INS_TT_NONE,    Input_64Bit | INS_FLAGS_None)    // Round packed double precision floating-point values
+INST3(roundsd,          "roundsd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0B),                             INS_TT_NONE,    Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)    // Round scalar double precision floating-point values
+INST3(pmuldq,           "pmuldq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x28),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // packed multiply 32-bit signed integers and store 64-bit result
+INST3(blendps,          "blendps",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0C),                             INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed Single Precision Floating-Point Values
+INST3(blendvps,         "blendvps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x14),                             INS_TT_NONE,    Input_32Bit | INS_FLAGS_None)    // Variable Blend Packed Singles
+INST3(blendpd,          "blendpd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0D),                             INS_TT_NONE,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed Double Precision Floating-Point Values
+INST3(blendvpd,         "blendvpd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x15),                             INS_TT_NONE,    Input_64Bit | INS_FLAGS_None)    // Variable Blend Packed Doubles
+INST3(pblendw,          "pblendw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x0E),                             INS_TT_NONE,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed Words
+INST3(pblendvb,         "pblendvb",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x10),                             INS_TT_NONE,    Input_8Bit | INS_FLAGS_None)    // Variable Blend Packed Bytes
+INST3(phaddw,           "phaddw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x01),                             INS_TT_NONE,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal add of 16-bit integers
+INST3(phsubw,           "phsubw",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x05),                             INS_TT_NONE,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal subtract of 16-bit integers
+INST3(phsubd,           "phsubd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x06),                             INS_TT_NONE,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal subtract of 32-bit integers
+INST3(phaddsw,          "phaddsw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x03),                             INS_TT_NONE,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal add of 16-bit integers with saturation
+INST3(phsubsw,          "phsubsw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x07),                             INS_TT_NONE,    Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Packed horizontal subtract of 16-bit integers with saturation
+// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value.
+// This hopefully means we do not need lddqu128/lddqu256/lddqu512 etc.
+INST3(lddqu,            "lddqu",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0xF0),                            INS_TT_NONE,    INS_FLAGS_None)    // Load Unaligned integer
+INST3(movntdqa,         "movntdqa",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x2A),                             INS_TT_FULL_MEM,    INS_FLAGS_None)    // Load Double Quadword Non-Temporal Aligned Hint
+INST3(movddup,          "movddup",          IUM_WR, BAD_CODE,     BAD_CODE,     SSEDBL(0x12),                            INS_TT_MOVDDUP,    Input_64Bit | INS_FLAGS_None)    // Replicate Double FP Values
+INST3(movsldup,         "movsldup",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x12),                            INS_TT_FULL_MEM,    Input_32Bit | INS_FLAGS_None)    // Replicate even-indexed Single FP Values
+INST3(movshdup,         "movshdup",         IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0x16),                            INS_TT_FULL_MEM,    Input_32Bit | INS_FLAGS_None)    // Replicate odd-indexed Single FP Values
+INST3(phminposuw,       "phminposuw",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x41),                             INS_TT_NONE,    Input_16Bit | INS_FLAGS_None)    // Packed Horizontal Word Minimum
+INST3(mpsadbw,          "mpsadbw",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x42),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Compute Multiple Packed Sums of Absolute Difference
+INST3(pinsrb,           "pinsrb",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x20),                             INS_TT_TUPLE1_SCALAR,    Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Insert Byte
+INST3(pinsrd,           "pinsrd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x22),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Insert Dword
+INST3(pinsrq,           "pinsrq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x22),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Insert Qword
+INST3(pextrb,           "pextrb",           IUM_WR, SSE3A(0x14),  BAD_CODE,     BAD_CODE,                                INS_TT_TUPLE1_SCALAR,    Input_8Bit | INS_FLAGS_None)    // Extract Byte
+INST3(pextrd,           "pextrd",           IUM_WR, SSE3A(0x16),  BAD_CODE,     BAD_CODE,                                INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_FLAGS_None)    // Extract Dword
+INST3(pextrq,           "pextrq",           IUM_WR, SSE3A(0x16),  BAD_CODE,     BAD_CODE,                                INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_FLAGS_None)    // Extract Qword
+INST3(pextrw_sse41,     "pextrw",           IUM_WR, SSE3A(0x15),  BAD_CODE,     BAD_CODE,                                INS_TT_TUPLE1_SCALAR,    Input_16Bit | INS_FLAGS_None)    // Extract Word
+INST3(extractps,        "extractps",        IUM_WR, SSE3A(0x17),  BAD_CODE,     BAD_CODE,                                INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_FLAGS_None)    // Extract Packed Floating-Point Values (one 32-bit single-precision element)
 
 //PCLMULQDQ instructions
-INST3(pclmulqdq,        "pclmulqdq" ,       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x44),                             INS_Flags_IsDstDstSrcAVXInstruction)   // Perform a carry-less multiplication of two quadwords
+INST3(pclmulqdq,        "pclmulqdq" ,       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x44),                             INS_TT_FULL_MEM,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)   // Perform a carry-less multiplication of two quadwords
 
 //AES instructions
-INST3(aesdec,           "aesdec",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDE),                             INS_Flags_IsDstDstSrcAVXInstruction)   // Perform one round of an AES decryption flow
-INST3(aesdeclast,       "aesdeclast",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDF),                             INS_Flags_IsDstDstSrcAVXInstruction)   // Perform last round of an AES decryption flow
-INST3(aesenc,           "aesenc",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDC),                             INS_Flags_IsDstDstSrcAVXInstruction)   // Perform one round of an AES encryption flow
-INST3(aesenclast,       "aesenclast",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDD),                             INS_Flags_IsDstDstSrcAVXInstruction)   // Perform last round of an AES encryption flow
-INST3(aesimc,           "aesimc",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDB),                             INS_FLAGS_None)   // Perform the AES InvMixColumn Transformation
-INST3(aeskeygenassist,  "aeskeygenassist",  IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0xDF),                             INS_FLAGS_None)   // AES Round Key Generation Assist
-INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
-
-INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value.
+// This hopefully means we do not need aesdec128/aesdec256/aesdec512 etc.
+INST3(aesdec,           "aesdec",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDE),                             INS_TT_FULL_MEM,    INS_Flags_IsDstDstSrcAVXInstruction)   // Perform one round of an AES decryption flow
+INST3(aesdeclast,       "aesdeclast",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDF),                             INS_TT_FULL_MEM,    INS_Flags_IsDstDstSrcAVXInstruction)   // Perform last round of an AES decryption flow
+INST3(aesenc,           "aesenc",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDC),                             INS_TT_FULL_MEM,    INS_Flags_IsDstDstSrcAVXInstruction)   // Perform one round of an AES encryption flow
+INST3(aesenclast,       "aesenclast",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDD),                             INS_TT_FULL_MEM,    INS_Flags_IsDstDstSrcAVXInstruction)   // Perform last round of an AES encryption flow
+INST3(aesimc,           "aesimc",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xDB),                             INS_TT_NONE,    INS_FLAGS_None)   // Perform the AES InvMixColumn Transformation
+INST3(aeskeygenassist,  "aeskeygenassist",  IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0xDF),                             INS_TT_NONE,    INS_FLAGS_None)   // AES Round Key Generation Assist
+INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 // AVX only instructions
-INST3(vbroadcastss,     "broadcastss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x18),                             INS_FLAGS_None)    // Broadcast float value read from memory to entire ymm register
-INST3(vbroadcastsd,     "broadcastsd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x19),                             INS_FLAGS_None)    // Broadcast float value read from memory to entire ymm register
-INST3(vpbroadcastb,     "pbroadcastb",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x78),                             INS_FLAGS_None)    // Broadcast int8 value from reg/memory to entire ymm register
-INST3(vpbroadcastw,     "pbroadcastw",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x79),                             INS_FLAGS_None)    // Broadcast int16 value from reg/memory to entire ymm register
-INST3(vpbroadcastd,     "pbroadcastd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x58),                             INS_FLAGS_None)    // Broadcast int32 value from reg/memory to entire ymm register
-INST3(vpbroadcastq,     "pbroadcastq",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x59),                             INS_FLAGS_None)    // Broadcast int64 value from reg/memory to entire ymm register
-INST3(vextractf128,     "extractf128",      IUM_WR, SSE3A(0x19),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract 128-bit packed floating point values
-INST3(vextracti128,     "extracti128",      IUM_WR, SSE3A(0x39),  BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Extract 128-bit packed integer values
-INST3(vinsertf128,      "insertf128",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x18),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Insert 128-bit packed floating point values
-INST3(vinserti128,      "inserti128",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x38),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Insert 128-bit packed integer values
-INST3(vzeroupper,       "zeroupper",        IUM_WR, 0xC577F8,     BAD_CODE,     BAD_CODE,                                INS_FLAGS_None)    // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
-INST3(vperm2i128,       "perm2i128",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x46),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Permute 128-bit halves of input register
-INST3(vpermq,           "permq",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x00),                             INS_FLAGS_None)    // Permute 64-bit of input register
-INST3(vpblendd,         "pblendd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x02),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed DWORDs
-INST3(vblendvps,        "blendvps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x4A),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Blend Packed Singles
-INST3(vblendvpd,        "blendvpd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x4B),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Blend Packed Doubles
-INST3(vpblendvb,        "pblendvb",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x4C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Blend Packed Bytes
-INST3(vtestps,          "testps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0E),                             INS_FLAGS_None)    // Packed Bit Test
-INST3(vtestpd,          "testpd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0F),                             INS_FLAGS_None)    // Packed Bit Test
-INST3(vpsrlvd,          "psrlvd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x45),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Right Logical
-INST3(vpsrlvq,          "psrlvq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x45),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Right Logical
-INST3(vpsravd,          "psravd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x46),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Right Arithmetic
-INST3(vpsllvd,          "psllvd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x47),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Left Logical
-INST3(vpsllvq,          "psllvq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x47),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Left Logical
-INST3(vpermilps,        "permilps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x04),                             INS_FLAGS_None)    // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values
-INST3(vpermilpd,        "permilpd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x05),                             INS_FLAGS_None)    // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values
-INST3(vpermilpsvar,     "permilpsvar",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values
-INST3(vpermilpdvar,     "permilpdvar",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values
-INST3(vperm2f128,       "perm2f128",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x06),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Permute Floating-Point Values
-INST3(vpermpd,          "permpd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x01),                             INS_FLAGS_None)    // Permute Double-Precision Floating-Point Values
-INST3(vpermd,           "permd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x36),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Permute Packed Doublewords Elements
-INST3(vpermps,          "permps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x16),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Permute Single-Precision Floating-Point Elements
-INST3(vbroadcastf128,   "broadcastf128",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1A),                             INS_FLAGS_None)    // Broadcast packed float values read from memory to entire ymm register
-INST3(vbroadcasti128,   "broadcasti128",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x5A),                             INS_FLAGS_None)    // Broadcast packed integer values read from memory to entire ymm register
-INST3(vmaskmovps,       "maskmovps",        IUM_WR, SSE38(0x2E),  BAD_CODE,     SSE38(0x2C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores
-INST3(vmaskmovpd,       "maskmovpd",        IUM_WR, SSE38(0x2F),  BAD_CODE,     SSE38(0x2D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores
-INST3(vpmaskmovd,       "pmaskmovd",        IUM_WR, SSE38(0x8E),  BAD_CODE,     SSE38(0x8C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Integer Packed Dword Loads and Stores
-INST3(vpmaskmovq,       "pmaskmovq",        IUM_WR, SSE38(0x8E),  BAD_CODE,     SSE38(0x8C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Integer Packed Qword Loads and Stores
-INST3(vpgatherdd,       "pgatherdd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x90),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword Values Using Signed Dword
-INST3(vpgatherqd,       "pgatherqd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x91),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword Values Using Signed Qword
-INST3(vpgatherdq,       "pgatherdq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x90),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword with Signed Dword Indices
-INST3(vpgatherqq,       "pgatherqq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x91),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Qword with Signed Dword Indices
-INST3(vgatherdps,       "gatherdps",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x92),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed SP FP values Using Signed Dword Indices
-INST3(vgatherqps,       "gatherqps",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x93),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed SP FP values Using Signed Qword Indices
-INST3(vgatherdpd,       "gatherdpd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x92),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed DP FP Values Using Signed Dword Indices
-INST3(vgatherqpd,       "gatherqpd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x93),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed DP FP Values Using Signed Qword Indices
-
-INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+INST3(vbroadcastss,     "broadcastss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x18),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_FLAGS_None)    // Broadcast float value read from memory to entire ymm register
+INST3(vbroadcastsd,     "broadcastsd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x19),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_FLAGS_None)    // Broadcast double value read from memory to entire ymm register
+INST3(vpbroadcastb,     "pbroadcastb",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x78),                             INS_TT_TUPLE1_SCALAR,    Input_8Bit | INS_FLAGS_None)    // Broadcast int8 value from reg/memory to entire ymm register
+INST3(vpbroadcastw,     "pbroadcastw",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x79),                             INS_TT_TUPLE1_SCALAR,    Input_16Bit | INS_FLAGS_None)    // Broadcast int16 value from reg/memory to entire ymm register
+INST3(vpbroadcastd,     "pbroadcastd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x58),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_FLAGS_None)    // Broadcast int32 value from reg/memory to entire ymm register
+INST3(vpbroadcastq,     "pbroadcastq",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x59),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_FLAGS_None)    // Broadcast int64 value from reg/memory to entire ymm register
+INST3(vextractf128,     "extractf128",      IUM_WR, SSE3A(0x19),  BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Extract 128-bit packed floating point values
+INST3(vextracti128,     "extracti128",      IUM_WR, SSE3A(0x39),  BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Extract 128-bit packed integer values
+INST3(vinsertf128,      "insertf128",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x18),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Insert 128-bit packed floating point values
+INST3(vinserti128,      "inserti128",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x38),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Insert 128-bit packed integer values
+INST3(vzeroupper,       "zeroupper",        IUM_WR, 0xC577F8,     BAD_CODE,     BAD_CODE,                                INS_TT_NONE,    INS_FLAGS_None)    // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
+INST3(vperm2i128,       "perm2i128",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x46),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Permute 128-bit halves of input register
+INST3(vpermq,           "permq",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x00),                             INS_TT_FULL,    Input_64Bit | INS_FLAGS_None)    // Permute 64-bit elements of input register
+INST3(vpblendd,         "pblendd",          IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x02),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Blend Packed DWORDs
+INST3(vblendvps,        "blendvps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x4A),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Blend Packed Singles
+INST3(vblendvpd,        "blendvpd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x4B),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Blend Packed Doubles
+INST3(vpblendvb,        "pblendvb",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x4C),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Blend Packed Bytes
+INST3(vtestps,          "testps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0E),                             INS_TT_NONE,    INS_FLAGS_None)    // Packed Bit Test
+INST3(vtestpd,          "testpd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0F),                             INS_TT_NONE,    INS_FLAGS_None)    // Packed Bit Test
+INST3(vpsrlvd,          "psrlvd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x45),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Right Logical (32-bit elements)
+INST3(vpsrlvq,          "psrlvq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x45),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Right Logical (64-bit elements)
+INST3(vpsravd,          "psravd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x46),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Right Arithmetic (32-bit elements)
+INST3(vpsllvd,          "psllvd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x47),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Left Logical (32-bit elements)
+INST3(vpsllvq,          "psllvq",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x47),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Variable Bit Shift Left Logical (64-bit elements)
+INST3(vpermilps,        "permilps",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x04),                             INS_TT_FULL,    Input_32Bit | INS_FLAGS_None)    // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values
+INST3(vpermilpd,        "permilpd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x05),                             INS_TT_FULL,    Input_64Bit | INS_FLAGS_None)    // Permute In-Lane of Pairs of Double-Precision Floating-Point Values
+INST3(vpermilpsvar,     "permilpsvar",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0C),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values
+INST3(vpermilpdvar,     "permilpdvar",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x0D),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Permute In-Lane of Pairs of Double-Precision Floating-Point Values
+INST3(vperm2f128,       "perm2f128",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x06),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Permute Floating-Point Values
+INST3(vpermpd,          "permpd",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x01),                             INS_TT_FULL,    Input_64Bit | INS_FLAGS_None)    // Permute Double-Precision Floating-Point Values
+INST3(vpermd,           "permd",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x36),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Permute Packed Doublewords Elements (32-bit elements, same input size as vpermps)
+INST3(vpermps,          "permps",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x16),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Permute Single-Precision Floating-Point Elements
+INST3(vbroadcastf128,   "broadcastf128",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x1A),                             INS_TT_NONE,    INS_FLAGS_None)    // Broadcast packed float values read from memory to entire ymm register
+INST3(vbroadcasti128,   "broadcasti128",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x5A),                             INS_TT_NONE,    INS_FLAGS_None)    // Broadcast packed integer values read from memory to entire ymm register
+INST3(vmaskmovps,       "maskmovps",        IUM_WR, SSE38(0x2E),  BAD_CODE,     SSE38(0x2C),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores
+INST3(vmaskmovpd,       "maskmovpd",        IUM_WR, SSE38(0x2F),  BAD_CODE,     SSE38(0x2D),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores
+INST3(vpmaskmovd,       "pmaskmovd",        IUM_WR, SSE38(0x8E),  BAD_CODE,     SSE38(0x8C),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Integer Packed Dword Loads and Stores
+INST3(vpmaskmovq,       "pmaskmovq",        IUM_WR, SSE38(0x8E),  BAD_CODE,     SSE38(0x8C),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Integer Packed Qword Loads and Stores
+INST3(vpgatherdd,       "pgatherdd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x90),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword Values Using Signed Dword Indices
+INST3(vpgatherqd,       "pgatherqd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x91),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword Values Using Signed Qword Indices
+INST3(vpgatherdq,       "pgatherdq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x90),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Qword Values Using Signed Dword Indices
+INST3(vpgatherqq,       "pgatherqq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x91),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Qword Values Using Signed Qword Indices
+INST3(vgatherdps,       "gatherdps",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x92),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed SP FP values Using Signed Dword Indices
+INST3(vgatherqps,       "gatherqps",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x93),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed SP FP values Using Signed Qword Indices
+INST3(vgatherdpd,       "gatherdpd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x92),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed DP FP Values Using Signed Dword Indices
+INST3(vgatherqpd,       "gatherqpd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x93),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed DP FP Values Using Signed Qword Indices
+
+INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 //    id                nm                  um      mr            mi            rm                                       flags
-INST3(vfmadd132pd,      "fmadd132pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x98),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Packed Double-Precision Floating-Point Values
-INST3(vfmadd213pd,      "fmadd213pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA8),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd231pd,      "fmadd231pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB8),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd132ps,      "fmadd132ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x98),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Packed Single-Precision Floating-Point Values
-INST3(vfmadd213ps,      "fmadd213ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA8),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd231ps,      "fmadd231ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB8),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd132sd,      "fmadd132sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x99),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values
-INST3(vfmadd213sd,      "fmadd213sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA9),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd231sd,      "fmadd231sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB9),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd132ss,      "fmadd132ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x99),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values
-INST3(vfmadd213ss,      "fmadd213ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA9),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmadd231ss,      "fmadd231ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB9),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmaddsub132pd,   "fmaddsub132pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x96),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values
-INST3(vfmaddsub213pd,   "fmaddsub213pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA6),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmaddsub231pd,   "fmaddsub231pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB6),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmaddsub132ps,   "fmaddsub132ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x96),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values
-INST3(vfmaddsub213ps,   "fmaddsub213ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA6),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmaddsub231ps,   "fmaddsub231ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB6),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsubadd132pd,   "fmsubadd132pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x97),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values
-INST3(vfmsubadd213pd,   "fmsubadd213pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA7),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsubadd231pd,   "fmsubadd231pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB7),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsubadd132ps,   "fmsubadd132ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x97),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values
-INST3(vfmsubadd213ps,   "fmsubadd213ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA7),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsubadd231ps,   "fmsubadd231ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB7),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub132pd,      "fmsub132pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9A),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values
-INST3(vfmsub213pd,      "fmsub213pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAA),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub231pd,      "fmsub231pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBA),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub132ps,      "fmsub132ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9A),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values
-INST3(vfmsub213ps,      "fmsub213ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAA),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub231ps,      "fmsub231ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBA),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub132sd,      "fmsub132sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9B),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values
-INST3(vfmsub213sd,      "fmsub213sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAB),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub231sd,      "fmsub231sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBB),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub132ss,      "fmsub132ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9B),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values
-INST3(vfmsub213ss,      "fmsub213ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAB),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfmsub231ss,      "fmsub231ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBB),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd132pd,     "fnmadd132pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values
-INST3(vfnmadd213pd,     "fnmadd213pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAC),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd231pd,     "fnmadd231pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBC),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd132ps,     "fnmadd132ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values
-INST3(vfnmadd213ps,     "fnmadd213ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAC),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd231ps,     "fnmadd231ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBC),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd132sd,     "fnmadd132sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values
-INST3(vfnmadd213sd,     "fnmadd213sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAD),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd231sd,     "fnmadd231sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBD),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd132ss,     "fnmadd132ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values
-INST3(vfnmadd213ss,     "fnmadd213ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAD),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmadd231ss,     "fnmadd231ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBD),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub132pd,     "fnmsub132pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9E),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values
-INST3(vfnmsub213pd,     "fnmsub213pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAE),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub231pd,     "fnmsub231pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBE),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub132ps,     "fnmsub132ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9E),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values
-INST3(vfnmsub213ps,     "fnmsub213ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAE),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub231ps,     "fnmsub231ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBE),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub132sd,     "fnmsub132sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9F),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values
-INST3(vfnmsub213sd,     "fnmsub213sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAF),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub231sd,     "fnmsub231sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBF),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub132ss,     "fnmsub132ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9F),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values
-INST3(vfnmsub213ss,     "fnmsub213ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAF),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(vfnmsub231ss,     "fnmsub231ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBF),                             INS_Flags_IsDstDstSrcAVXInstruction)    //
-INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
-
-INST3(FIRST_AVXVNNI_INSTRUCTION, "FIRST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
-INST3(vpdpbusd,          "pdpbusd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x50),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Unsigned and Signed Bytes
-INST3(vpdpwssd,          "pdpwssd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x52),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Signed Word Integers
-INST3(vpdpbusds,         "pdpbusds",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x51),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Unsigned and Signed Bytes with Saturation
-INST3(vpdpwssds,         "pdpwssds",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x53),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Signed Word Integers with Saturation
-INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+INST3(vfmadd132pd,      "fmadd132pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x98),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfmadd213pd,      "fmadd213pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA8),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd231pd,      "fmadd231pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB8),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd132ps,      "fmadd132ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x98),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfmadd213ps,      "fmadd213ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA8),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd231ps,      "fmadd231ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB8),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd132sd,      "fmadd132sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x99),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfmadd213sd,      "fmadd213sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA9),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd231sd,      "fmadd231sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB9),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd132ss,      "fmadd132ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x99),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfmadd213ss,      "fmadd213ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA9),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmadd231ss,      "fmadd231ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB9),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmaddsub132pd,   "fmaddsub132pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x96),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfmaddsub213pd,   "fmaddsub213pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA6),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmaddsub231pd,   "fmaddsub231pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB6),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmaddsub132ps,   "fmaddsub132ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x96),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfmaddsub213ps,   "fmaddsub213ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA6),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmaddsub231ps,   "fmaddsub231ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB6),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsubadd132pd,   "fmsubadd132pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x97),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values
+INST3(vfmsubadd213pd,   "fmsubadd213pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA7),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsubadd231pd,   "fmsubadd231pd",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB7),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsubadd132ps,   "fmsubadd132ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x97),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values
+INST3(vfmsubadd213ps,   "fmsubadd213ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xA7),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsubadd231ps,   "fmsubadd231ps",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xB7),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub132pd,      "fmsub132pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9A),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfmsub213pd,      "fmsub213pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAA),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub231pd,      "fmsub231pd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBA),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub132ps,      "fmsub132ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9A),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfmsub213ps,      "fmsub213ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAA),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub231ps,      "fmsub231ps",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBA),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub132sd,      "fmsub132sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9B),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfmsub213sd,      "fmsub213sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAB),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub231sd,      "fmsub231sd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBB),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub132ss,      "fmsub132ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9B),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfmsub213ss,      "fmsub213ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAB),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfmsub231ss,      "fmsub231ss",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBB),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd132pd,     "fnmadd132pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9C),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfnmadd213pd,     "fnmadd213pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAC),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd231pd,     "fnmadd231pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBC),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd132ps,     "fnmadd132ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9C),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfnmadd213ps,     "fnmadd213ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAC),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd231ps,     "fnmadd231ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBC),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd132sd,     "fnmadd132sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9D),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfnmadd213sd,     "fnmadd213sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAD),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd231sd,     "fnmadd231sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBD),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd132ss,     "fnmadd132ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9D),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfnmadd213ss,     "fnmadd213ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAD),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmadd231ss,     "fnmadd231ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBD),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub132pd,     "fnmsub132pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9E),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfnmsub213pd,     "fnmsub213pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAE),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub231pd,     "fnmsub231pd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBE),                             INS_TT_FULL,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub132ps,     "fnmsub132ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9E),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfnmsub213ps,     "fnmsub213ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAE),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub231ps,     "fnmsub231ps",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBE),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub132sd,     "fnmsub132sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9F),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfnmsub213sd,     "fnmsub213sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAF),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub231sd,     "fnmsub231sd",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBF),                             INS_TT_TUPLE1_SCALAR,    Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub132ss,     "fnmsub132ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x9F),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfnmsub213ss,     "fnmsub213ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xAF),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(vfnmsub231ss,     "fnmsub231ss",      IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xBF),                             INS_TT_TUPLE1_SCALAR,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    //
+INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(FIRST_AVXVNNI_INSTRUCTION, "FIRST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+INST3(vpdpbusd,          "pdpbusd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x50),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Unsigned and Signed Bytes
+INST3(vpdpwssd,          "pdpwssd",         IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x52),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Signed Word Integers
+INST3(vpdpbusds,         "pdpbusds",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x51),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Unsigned and Signed Bytes with Saturation
+INST3(vpdpwssds,         "pdpwssds",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x53),                             INS_TT_FULL,    Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)    // Multiply and Add Signed Word Integers with Saturation
+INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 // BMI1
-INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
-INST3(andn,             "andn",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF2),                             Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Resets_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Logical AND NOT
-INST3(blsi,				"blsi",				IUM_WR, BAD_CODE,	  BAD_CODE,		SSE38(0xF3),							 Resets_OF		| Writes_SF		| Writes_ZF		| Undefined_AF	| Undefined_PF	| Writes_CF		| INS_Flags_IsDstDstSrcAVXInstruction)    // Extract Lowest Set Isolated Bit
-INST3(blsmsk,			"blsmsk",			IUM_WR, BAD_CODE,	  BAD_CODE,		SSE38(0xF3),							 Resets_OF		| Writes_SF		| Writes_ZF		| Undefined_AF	| Undefined_PF	| Resets_CF		| INS_Flags_IsDstDstSrcAVXInstruction)    // Get Mask Up to Lowest Set Bit
-INST3(blsr,             "blsr",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF3),                             Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Reset Lowest Set Bit
-INST3(bextr,            "bextr",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF7),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Bit Field Extract
+INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+INST3(andn,             "andn",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF2),                             INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Resets_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Logical AND NOT
+INST3(blsi,             "blsi",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF3),                             INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Extract Lowest Set Isolated Bit
+INST3(blsmsk,           "blsmsk",           IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF3),                             INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Resets_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Get Mask Up to Lowest Set Bit
+INST3(blsr,             "blsr",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF3),                             INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Reset Lowest Set Bit
+INST3(bextr,            "bextr",            IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF7),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)    // Bit Field Extract
 
 // BMI2
-INST3(rorx,             "rorx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0xF0),                             INS_FLAGS_None)
-INST3(pdep,             "pdep",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             INS_Flags_IsDstDstSrcAVXInstruction)                               // Parallel Bits Deposit
-INST3(pext,             "pext",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             INS_Flags_IsDstDstSrcAVXInstruction)                               // Parallel Bits Extract
-INST3(bzhi,             "bzhi",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Zero High Bits Starting with Specified Bit Position
-INST3(mulx,             "mulx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF6),                             INS_Flags_IsDstDstSrcAVXInstruction)                               // Unsigned Multiply Without Affecting Flags
+INST3(rorx,             "rorx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0xF0),                             INS_TT_NONE,    INS_FLAGS_None)                                                    // Rotate Right Logical Without Affecting Flags
+INST3(pdep,             "pdep",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)                               // Parallel Bits Deposit
+INST3(pext,             "pext",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)                               // Parallel Bits Extract
+INST3(bzhi,             "bzhi",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             INS_TT_NONE,    Resets_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_Flags_IsDstDstSrcAVXInstruction)    // Zero High Bits Starting with Specified Bit Position
+INST3(mulx,             "mulx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF6),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)                               // Unsigned Multiply Without Affecting Flags
 #ifdef TARGET_AMD64
-INST3(shlx,             "shlx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF7),                             INS_Flags_IsDstDstSrcAVXInstruction)   //  Shift Logical Left Without Affecting Flags
-INST3(sarx,             "sarx",             IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0xF7),           INS_Flags_IsDstDstSrcAVXInstruction)   //  Shift Arithmetic Right Without Affecting Flags
-INST3(shrx,             "shrx",             IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF2, 0x0F, 0x38, 0xF7),           INS_Flags_IsDstDstSrcAVXInstruction)   //  Shift Logical Right Without Affecting Flags
+INST3(shlx,             "shlx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF7),                             INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)   //  Shift Logical Left Without Affecting Flags
+INST3(sarx,             "sarx",             IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0xF7),           INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)   //  Shift Arithmetic Right Without Affecting Flags
+INST3(shrx,             "shrx",             IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF2, 0x0F, 0x38, 0xF7),           INS_TT_NONE,    INS_Flags_IsDstDstSrcAVXInstruction)   //  Shift Logical Right Without Affecting Flags
 #endif
 
-INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
-INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
-INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
-INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 // Scalar instructions in SSE4.2
-INST3(crc32,            "crc32",            IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF2, 0x0F, 0x38, 0xF0),           INS_FLAGS_None)
+INST3(crc32,            "crc32",            IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF2, 0x0F, 0x38, 0xF0),           INS_TT_NONE,    INS_FLAGS_None)    // Accumulate CRC32 Value
 
 // BMI1
-INST3(tzcnt,            "tzcnt",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xBC),                            Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF )    // Count the Number of Trailing Zero Bits
+INST3(tzcnt,            "tzcnt",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xBC),                            INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF )    // Count the Number of Trailing Zero Bits
 
 // LZCNT
-INST3(lzcnt,            "lzcnt",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xBD),                            Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF )
+INST3(lzcnt,            "lzcnt",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xBD),                            INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF )    // Count the Number of Leading Zero Bits
 
 // MOVBE
-INST3(movbe,            "movbe",            IUM_WR, PCKMVB(0xF1), BAD_CODE,     PCKMVB(0xF0),                            INS_FLAGS_None )
+INST3(movbe,            "movbe",            IUM_WR, PCKMVB(0xF1), BAD_CODE,     PCKMVB(0xF0),                            INS_TT_NONE,    INS_FLAGS_None )    // Move Data After Swapping Bytes
 
 // POPCNT
-INST3(popcnt,           "popcnt",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xB8),                            Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Resets_CF )
+INST3(popcnt,           "popcnt",           IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xB8),                            INS_TT_NONE,    Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Resets_CF )    // Return the Count of Number of Bits Set to 1
 
 //    id                nm                  um      mr            mi                                                     flags
-INST2(ret,              "ret",              IUM_RD, 0x0000C3,     0x0000C2,                                              INS_FLAGS_None )
-INST2(loop,             "loop",             IUM_RD, BAD_CODE,     0x0000E2,                                              INS_FLAGS_None )
-INST2(call,             "call",             IUM_RD, 0x0010FF,     0x0000E8,                                              INS_FLAGS_None )
-
-INST2(rol,              "rol",              IUM_RW, 0x0000D2,     BAD_CODE,                                              Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(rol_1,            "rol",              IUM_RW, 0x0000D0,     0x0000D0,                                              Writes_OF                                                                      | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(rol_N,            "rol",              IUM_RW, 0x0000C0,     0x0000C0,                                              Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(ror,              "ror",              IUM_RW, 0x0008D2,     BAD_CODE,                                              Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(ror_1,            "ror",              IUM_RW, 0x0008D0,     0x0008D0,                                              Writes_OF                                                                      | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(ror_N,            "ror",              IUM_RW, 0x0008C0,     0x0008C0,                                              Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
-
-INST2(rcl,              "rcl",              IUM_RW, 0x0010D2,     BAD_CODE,                                              Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
-INST2(rcl_1,            "rcl",              IUM_RW, 0x0010D0,     0x0010D0,                                              Writes_OF                                                                      | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
-INST2(rcl_N,            "rcl",              IUM_RW, 0x0010C0,     0x0010C0,                                              Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit ) 
-INST2(rcr,              "rcr",              IUM_RW, 0x0018D2,     BAD_CODE,                                              Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
-INST2(rcr_1,            "rcr",              IUM_RW, 0x0018D0,     0x0018D0,                                              Writes_OF                                                                      | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )   
-INST2(rcr_N,            "rcr",              IUM_RW, 0x0018C0,     0x0018C0,                                              Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
-INST2(shl,              "shl",              IUM_RW, 0x0020D2,     BAD_CODE,                                              Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(shl_1,            "shl",              IUM_RW, 0x0020D0,     0x0020D0,                                              Writes_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(shl_N,            "shl",              IUM_RW, 0x0020C0,     0x0020C0,                                              Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(shr,              "shr",              IUM_RW, 0x0028D2,     BAD_CODE,                                              Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(shr_1,            "shr",              IUM_RW, 0x0028D0,     0x0028D0,                                              Writes_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(shr_N,            "shr",              IUM_RW, 0x0028C0,     0x0028C0,                                              Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(sar,              "sar",              IUM_RW, 0x0038D2,     BAD_CODE,                                              Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(sar_1,            "sar",              IUM_RW, 0x0038D0,     0x0038D0,                                              Writes_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
-INST2(sar_N,            "sar",              IUM_RW, 0x0038C0,     0x0038C0,                                              Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(ret,              "ret",              IUM_RD, 0x0000C3,     0x0000C2,                                              INS_TT_NONE,    INS_FLAGS_None )
+INST2(loop,             "loop",             IUM_RD, BAD_CODE,     0x0000E2,                                              INS_TT_NONE,    INS_FLAGS_None )
+INST2(call,             "call",             IUM_RD, 0x0010FF,     0x0000E8,                                              INS_TT_NONE,    INS_FLAGS_None )
+
+INST2(rol,              "rol",              IUM_RW, 0x0000D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(rol_1,            "rol",              IUM_RW, 0x0000D0,     0x0000D0,                                              INS_TT_NONE,    Writes_OF                                                                      | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(rol_N,            "rol",              IUM_RW, 0x0000C0,     0x0000C0,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(ror,              "ror",              IUM_RW, 0x0008D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(ror_1,            "ror",              IUM_RW, 0x0008D0,     0x0008D0,                                              INS_TT_NONE,    Writes_OF                                                                      | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(ror_N,            "ror",              IUM_RW, 0x0008C0,     0x0008C0,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF             | INS_FLAGS_Has_Wbit )
+
+INST2(rcl,              "rcl",              IUM_RW, 0x0010D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
+INST2(rcl_1,            "rcl",              IUM_RW, 0x0010D0,     0x0010D0,                                              INS_TT_NONE,    Writes_OF                                                                      | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
+INST2(rcl_N,            "rcl",              IUM_RW, 0x0010C0,     0x0010C0,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit ) 
+INST2(rcr,              "rcr",              IUM_RW, 0x0018D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
+INST2(rcr_1,            "rcr",              IUM_RW, 0x0018D0,     0x0018D0,                                              INS_TT_NONE,    Writes_OF                                                                      | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )   
+INST2(rcr_N,            "rcr",              IUM_RW, 0x0018C0,     0x0018C0,                                              INS_TT_NONE,    Undefined_OF                                                                   | Writes_CF | Reads_CF  | INS_FLAGS_Has_Wbit )
+INST2(shl,              "shl",              IUM_RW, 0x0020D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(shl_1,            "shl",              IUM_RW, 0x0020D0,     0x0020D0,                                              INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(shl_N,            "shl",              IUM_RW, 0x0020C0,     0x0020C0,                                              INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(shr,              "shr",              IUM_RW, 0x0028D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(shr_1,            "shr",              IUM_RW, 0x0028D0,     0x0028D0,                                              INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(shr_N,            "shr",              IUM_RW, 0x0028C0,     0x0028C0,                                              INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(sar,              "sar",              IUM_RW, 0x0038D2,     BAD_CODE,                                              INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(sar_1,            "sar",              IUM_RW, 0x0038D0,     0x0038D0,                                              INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
+INST2(sar_N,            "sar",              IUM_RW, 0x0038C0,     0x0038C0,                                              INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF             | INS_FLAGS_Has_Wbit )
 
 
 //    id                nm                  um      mr                                                                   flags
-INST1(r_movsb,          "rep movsb",        IUM_RD, 0x00A4F3,                                                            Reads_DF | INS_FLAGS_Has_Wbit )
-INST1(r_movsd,          "rep movsd",        IUM_RD, 0x00A5F3,                                                            Reads_DF | INS_FLAGS_Has_Wbit )
+INST1(r_movsb,          "rep movsb",        IUM_RD, 0x00A4F3,                                                            INS_TT_NONE,    Reads_DF | INS_FLAGS_Has_Wbit )
+INST1(r_movsd,          "rep movsd",        IUM_RD, 0x00A5F3,                                                            INS_TT_NONE,    Reads_DF | INS_FLAGS_Has_Wbit )
 #if defined(TARGET_AMD64)
-INST1(r_movsq,          "rep movsq",        IUM_RD, 0xF3A548,                                                            Reads_DF )
+INST1(r_movsq,          "rep movsq",        IUM_RD, 0xF3A548,                                                            INS_TT_NONE,    Reads_DF )
 #endif // defined(TARGET_AMD64)
-INST1(movsb,            "movsb",            IUM_RD, 0x0000A4,                                                            Reads_DF   | INS_FLAGS_Has_Wbit )
-INST1(movsd,            "movsd",            IUM_RD, 0x0000A5,                                                            Reads_DF   | INS_FLAGS_Has_Wbit )
+INST1(movsb,            "movsb",            IUM_RD, 0x0000A4,                                                            INS_TT_NONE,    Reads_DF   | INS_FLAGS_Has_Wbit )
+INST1(movsd,            "movsd",            IUM_RD, 0x0000A5,                                                            INS_TT_NONE,    Reads_DF   | INS_FLAGS_Has_Wbit )
 #if defined(TARGET_AMD64)
-INST1(movsq,            "movsq",            IUM_RD, 0x00A548,                                                            Reads_DF   )
+INST1(movsq,            "movsq",            IUM_RD, 0x00A548,                                                            INS_TT_NONE,    Reads_DF   )
 #endif // defined(TARGET_AMD64)
 
-INST1(r_stosb,          "rep stosb",        IUM_RD, 0x00AAF3,                                                            Reads_DF   | INS_FLAGS_Has_Wbit )
-INST1(r_stosd,          "rep stosd",        IUM_RD, 0x00ABF3,                                                            Reads_DF   | INS_FLAGS_Has_Wbit )
+INST1(r_stosb,          "rep stosb",        IUM_RD, 0x00AAF3,                                                            INS_TT_NONE,    Reads_DF   | INS_FLAGS_Has_Wbit )
+INST1(r_stosd,          "rep stosd",        IUM_RD, 0x00ABF3,                                                            INS_TT_NONE,    Reads_DF   | INS_FLAGS_Has_Wbit )
 #if defined(TARGET_AMD64)
-INST1(r_stosq,          "rep stosq",        IUM_RD, 0xF3AB48,                                                            Reads_DF   )
+INST1(r_stosq,          "rep stosq",        IUM_RD, 0xF3AB48,                                                            INS_TT_NONE,    Reads_DF   )
 #endif // defined(TARGET_AMD64)
-INST1(stosb,            "stosb",            IUM_RD, 0x0000AA,                                                            Reads_DF   | INS_FLAGS_Has_Wbit )
-INST1(stosd,            "stosd",            IUM_RD, 0x0000AB,                                                            Reads_DF   | INS_FLAGS_Has_Wbit )
+INST1(stosb,            "stosb",            IUM_RD, 0x0000AA,                                                            INS_TT_NONE,    Reads_DF   | INS_FLAGS_Has_Wbit )
+INST1(stosd,            "stosd",            IUM_RD, 0x0000AB,                                                            INS_TT_NONE,    Reads_DF   | INS_FLAGS_Has_Wbit )
 #if defined(TARGET_AMD64)
-INST1(stosq,            "stosq",            IUM_RD, 0x00AB48,                                                            Reads_DF   )
+INST1(stosq,            "stosq",            IUM_RD, 0x00AB48,                                                            INS_TT_NONE,    Reads_DF   )
 #endif // defined(TARGET_AMD64)
 
-INST1(int3,             "int3",             IUM_RD, 0x0000CC,                                                            INS_FLAGS_None )
-INST1(nop,              "nop",              IUM_RD, 0x000090,                                                            INS_FLAGS_None )
-INST1(pause,            "pause",            IUM_RD, 0x0090F3,                                                            INS_FLAGS_None )
-INST1(lock,             "lock",             IUM_RD, 0x0000F0,                                                            INS_FLAGS_None )
-INST1(leave,            "leave",            IUM_RD, 0x0000C9,                                                            INS_FLAGS_None )
+INST1(int3,             "int3",             IUM_RD, 0x0000CC,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(nop,              "nop",              IUM_RD, 0x000090,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(pause,            "pause",            IUM_RD, 0x0090F3,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(lock,             "lock",             IUM_RD, 0x0000F0,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(leave,            "leave",            IUM_RD, 0x0000C9,                                                            INS_TT_NONE,    INS_FLAGS_None )
 
-INST1(serialize,        "serialize",        IUM_RD, 0x0fe801,                                                            INS_FLAGS_None )
+INST1(serialize,        "serialize",        IUM_RD, 0x0fe801,                                                            INS_TT_NONE,    INS_FLAGS_None )
 
-INST1(neg,              "neg",              IUM_RW, 0x0018F6,                                                            Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | INS_FLAGS_Has_Wbit )
-INST1(not,              "not",              IUM_RW, 0x0010F6,                                                            INS_FLAGS_None | INS_FLAGS_Has_Wbit )
+INST1(neg,              "neg",              IUM_RW, 0x0018F6,                                                            INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | INS_FLAGS_Has_Wbit )
+INST1(not,              "not",              IUM_RW, 0x0010F6,                                                            INS_TT_NONE,    INS_FLAGS_None | INS_FLAGS_Has_Wbit )
 
-INST1(cwde,             "cwde",             IUM_RD, 0x000098,                                                            INS_FLAGS_None )
-INST1(cdq,              "cdq",              IUM_RD, 0x000099,                                                            INS_FLAGS_None )
-INST1(idiv,             "idiv",             IUM_RD, 0x0038F6,                                                            Undefined_OF   | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Undefined_CF  | INS_FLAGS_Has_Wbit )
-INST1(imulEAX,          "imul",             IUM_RD, 0x0028F6,                                                            Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Wbit )
-INST1(div,              "div",              IUM_RD, 0x0030F6,                                                            Undefined_OF   | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Undefined_CF  | INS_FLAGS_Has_Wbit )
-INST1(mulEAX,           "mul",              IUM_RD, 0x0020F6,                                                            Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Wbit )
+INST1(cwde,             "cwde",             IUM_RD, 0x000098,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(cdq,              "cdq",              IUM_RD, 0x000099,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(idiv,             "idiv",             IUM_RD, 0x0038F6,                                                            INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Undefined_CF  | INS_FLAGS_Has_Wbit )
+INST1(imulEAX,          "imul",             IUM_RD, 0x0028F6,                                                            INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Wbit )
+INST1(div,              "div",              IUM_RD, 0x0030F6,                                                            INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Undefined_CF  | INS_FLAGS_Has_Wbit )
+INST1(mulEAX,           "mul",              IUM_RD, 0x0020F6,                                                            INS_TT_NONE,    Writes_OF      | Undefined_SF  | Undefined_ZF  | Undefined_AF  | Undefined_PF  | Writes_CF     | INS_FLAGS_Has_Wbit )
 
-INST1(sahf,             "sahf",             IUM_RD, 0x00009E,                                                            Restore_SF_ZF_AF_PF_CF )
+INST1(sahf,             "sahf",             IUM_RD, 0x00009E,                                                            INS_TT_NONE,    Restore_SF_ZF_AF_PF_CF )
 
-INST1(xadd,             "xadd",             IUM_RW, 0x0F00C0,                                                            Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | INS_FLAGS_Has_Wbit )
-INST1(cmpxchg,          "cmpxchg",          IUM_RW, 0x0F00B0,                                                            Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | INS_FLAGS_Has_Wbit )
+INST1(xadd,             "xadd",             IUM_RW, 0x0F00C0,                                                            INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | INS_FLAGS_Has_Wbit )
+INST1(cmpxchg,          "cmpxchg",          IUM_RW, 0x0F00B0,                                                            INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF     | INS_FLAGS_Has_Wbit )
 
-INST1(shld,             "shld",             IUM_RW, 0x0F00A4,                                                            Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF     )
-INST1(shrd,             "shrd",             IUM_RW, 0x0F00AC,                                                            Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF     )
+INST1(shld,             "shld",             IUM_RW, 0x0F00A4,                                                            INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF     )
+INST1(shrd,             "shrd",             IUM_RW, 0x0F00AC,                                                            INS_TT_NONE,    Undefined_OF   | Writes_SF     | Writes_ZF     | Undefined_AF  | Writes_PF     | Writes_CF     )
 
 // For RyuJIT/x86, we follow the x86 calling convention that requires
 // us to return floating point value on the x87 FP stack, so we need
 // these instructions regardless of whether we're using full stack fp.
 #ifdef TARGET_X86
-INST1(fld,              "fld",              IUM_WR, 0x0000D9,                                                            INS_FLAGS_x87Instr)
-INST1(fstp,             "fstp",             IUM_WR, 0x0018D9,                                                            INS_FLAGS_x87Instr)
+INST1(fld,              "fld",              IUM_WR, 0x0000D9,                                                            INS_TT_NONE,    INS_FLAGS_x87Instr)
+INST1(fstp,             "fstp",             IUM_WR, 0x0018D9,                                                            INS_TT_NONE,    INS_FLAGS_x87Instr)
 #endif // TARGET_X86
 
-INST1(seto,             "seto",             IUM_WR, 0x0F0090,                                                            Reads_OF )
-INST1(setno,            "setno",            IUM_WR, 0x0F0091,                                                            Reads_OF )
-INST1(setb,             "setb",             IUM_WR, 0x0F0092,                                                            Reads_CF )
-INST1(setae,            "setae",            IUM_WR, 0x0F0093,                                                            Reads_CF )
-INST1(sete,             "sete",             IUM_WR, 0x0F0094,                                                            Reads_ZF )
-INST1(setne,            "setne",            IUM_WR, 0x0F0095,                                                            Reads_ZF )
-INST1(setbe,            "setbe",            IUM_WR, 0x0F0096,                                                            Reads_ZF | Reads_CF )
-INST1(seta,             "seta",             IUM_WR, 0x0F0097,                                                            Reads_ZF | Reads_CF )
-INST1(sets,             "sets",             IUM_WR, 0x0F0098,                                                            Reads_SF )
-INST1(setns,            "setns",            IUM_WR, 0x0F0099,                                                            Reads_SF )
-INST1(setp,             "setp",             IUM_WR, 0x0F009A,                                                            Reads_PF )
-INST1(setnp,            "setnp",            IUM_WR, 0x0F009B,                                                            Reads_PF )
-INST1(setl,             "setl",             IUM_WR, 0x0F009C,                                                            Reads_OF       | Reads_SF  )
-INST1(setge,            "setge",            IUM_WR, 0x0F009D,                                                            Reads_OF       | Reads_SF  )
-INST1(setle,            "setle",            IUM_WR, 0x0F009E,                                                            Reads_OF       | Reads_SF      | Reads_ZF  )
-INST1(setg,             "setg",             IUM_WR, 0x0F009F,                                                            Reads_OF       | Reads_SF      | Reads_ZF  )
+INST1(seto,             "seto",             IUM_WR, 0x0F0090,                                                            INS_TT_NONE,    Reads_OF )
+INST1(setno,            "setno",            IUM_WR, 0x0F0091,                                                            INS_TT_NONE,    Reads_OF )
+INST1(setb,             "setb",             IUM_WR, 0x0F0092,                                                            INS_TT_NONE,    Reads_CF )
+INST1(setae,            "setae",            IUM_WR, 0x0F0093,                                                            INS_TT_NONE,    Reads_CF )
+INST1(sete,             "sete",             IUM_WR, 0x0F0094,                                                            INS_TT_NONE,    Reads_ZF )
+INST1(setne,            "setne",            IUM_WR, 0x0F0095,                                                            INS_TT_NONE,    Reads_ZF )
+INST1(setbe,            "setbe",            IUM_WR, 0x0F0096,                                                            INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST1(seta,             "seta",             IUM_WR, 0x0F0097,                                                            INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST1(sets,             "sets",             IUM_WR, 0x0F0098,                                                            INS_TT_NONE,    Reads_SF )
+INST1(setns,            "setns",            IUM_WR, 0x0F0099,                                                            INS_TT_NONE,    Reads_SF )
+INST1(setp,             "setp",             IUM_WR, 0x0F009A,                                                            INS_TT_NONE,    Reads_PF )
+INST1(setnp,            "setnp",            IUM_WR, 0x0F009B,                                                            INS_TT_NONE,    Reads_PF )
+INST1(setl,             "setl",             IUM_WR, 0x0F009C,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF  )
+INST1(setge,            "setge",            IUM_WR, 0x0F009D,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF  )
+INST1(setle,            "setle",            IUM_WR, 0x0F009E,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF  )
+INST1(setg,             "setg",             IUM_WR, 0x0F009F,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF  )
 
 // Indirect jump used for tailcalls. We differentiate between func-internal
 // indirect jump (e.g. used for switch) and tailcall indirect jumps because the
 // x64 unwinder might require the latter to be rex.w prefixed.
-INST1(tail_i_jmp,       "tail.jmp",         IUM_RD, 0x0020FF,                                                            INS_FLAGS_None )
-INST1(i_jmp,            "jmp",              IUM_RD, 0x0020FF,                                                            INS_FLAGS_None )
-INST0(jmp,              "jmp",              IUM_RD, 0x0000EB,                                                            INS_FLAGS_None )
-INST0(jo,               "jo",               IUM_RD, 0x000070,                                                            Reads_OF )
-INST0(jno,              "jno",              IUM_RD, 0x000071,                                                            Reads_OF )
-INST0(jb,               "jb",               IUM_RD, 0x000072,                                                            Reads_CF )
-INST0(jae,              "jae",              IUM_RD, 0x000073,                                                            Reads_CF )
-INST0(je,               "je",               IUM_RD, 0x000074,                                                            Reads_ZF )
-INST0(jne,              "jne",              IUM_RD, 0x000075,                                                            Reads_ZF )
-INST0(jbe,              "jbe",              IUM_RD, 0x000076,                                                            Reads_ZF | Reads_CF )
-INST0(ja,               "ja",               IUM_RD, 0x000077,                                                            Reads_ZF | Reads_CF )
-INST0(js,               "js",               IUM_RD, 0x000078,                                                            Reads_SF )
-INST0(jns,              "jns",              IUM_RD, 0x000079,                                                            Reads_SF )
-INST0(jp,               "jp",               IUM_RD, 0x00007A,                                                            Reads_PF )
-INST0(jnp,              "jnp",              IUM_RD, 0x00007B,                                                            Reads_PF )
-INST0(jl,               "jl",               IUM_RD, 0x00007C,                                                            Reads_OF       | Reads_SF  )
-INST0(jge,              "jge",              IUM_RD, 0x00007D,                                                            Reads_OF       | Reads_SF  )
-INST0(jle,              "jle",              IUM_RD, 0x00007E,                                                            Reads_OF       | Reads_SF      | Reads_ZF  )
-INST0(jg,               "jg",               IUM_RD, 0x00007F,                                                            Reads_OF       | Reads_SF      | Reads_ZF  )
-
-INST0(l_jmp,            "jmp",              IUM_RD, 0x0000E9,                                                            INS_FLAGS_None )
-INST0(l_jo,             "jo",               IUM_RD, 0x00800F,                                                            Reads_OF )
-INST0(l_jno,            "jno",              IUM_RD, 0x00810F,                                                            Reads_OF )
-INST0(l_jb,             "jb",               IUM_RD, 0x00820F,                                                            Reads_CF )
-INST0(l_jae,            "jae",              IUM_RD, 0x00830F,                                                            Reads_CF )
-INST0(l_je,             "je",               IUM_RD, 0x00840F,                                                            Reads_ZF )
-INST0(l_jne,            "jne",              IUM_RD, 0x00850F,                                                            Reads_ZF )
-INST0(l_jbe,            "jbe",              IUM_RD, 0x00860F,                                                            Reads_ZF | Reads_CF )
-INST0(l_ja,             "ja",               IUM_RD, 0x00870F,                                                            Reads_ZF | Reads_CF )
-INST0(l_js,             "js",               IUM_RD, 0x00880F,                                                            Reads_SF )
-INST0(l_jns,            "jns",              IUM_RD, 0x00890F,                                                            Reads_SF )
-INST0(l_jp,             "jp",               IUM_RD, 0x008A0F,                                                            Reads_PF )
-INST0(l_jnp,            "jnp",              IUM_RD, 0x008B0F,                                                            Reads_PF )
-INST0(l_jl,             "jl",               IUM_RD, 0x008C0F,                                                            Reads_OF       | Reads_SF  )
-INST0(l_jge,            "jge",              IUM_RD, 0x008D0F,                                                            Reads_OF       | Reads_SF  )
-INST0(l_jle,            "jle",              IUM_RD, 0x008E0F,                                                            Reads_OF       | Reads_SF      | Reads_ZF  )
-INST0(l_jg,             "jg",               IUM_RD, 0x008F0F,                                                            Reads_OF       | Reads_SF      | Reads_ZF  )
-
-INST0(align,            "align",            IUM_RD, BAD_CODE,                                                            INS_FLAGS_None)
+INST1(tail_i_jmp,       "tail.jmp",         IUM_RD, 0x0020FF,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST1(i_jmp,            "jmp",              IUM_RD, 0x0020FF,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST0(jmp,              "jmp",              IUM_RD, 0x0000EB,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST0(jo,               "jo",               IUM_RD, 0x000070,                                                            INS_TT_NONE,    Reads_OF )
+INST0(jno,              "jno",              IUM_RD, 0x000071,                                                            INS_TT_NONE,    Reads_OF )
+INST0(jb,               "jb",               IUM_RD, 0x000072,                                                            INS_TT_NONE,    Reads_CF )
+INST0(jae,              "jae",              IUM_RD, 0x000073,                                                            INS_TT_NONE,    Reads_CF )
+INST0(je,               "je",               IUM_RD, 0x000074,                                                            INS_TT_NONE,    Reads_ZF )
+INST0(jne,              "jne",              IUM_RD, 0x000075,                                                            INS_TT_NONE,    Reads_ZF )
+INST0(jbe,              "jbe",              IUM_RD, 0x000076,                                                            INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST0(ja,               "ja",               IUM_RD, 0x000077,                                                            INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST0(js,               "js",               IUM_RD, 0x000078,                                                            INS_TT_NONE,    Reads_SF )
+INST0(jns,              "jns",              IUM_RD, 0x000079,                                                            INS_TT_NONE,    Reads_SF )
+INST0(jp,               "jp",               IUM_RD, 0x00007A,                                                            INS_TT_NONE,    Reads_PF )
+INST0(jnp,              "jnp",              IUM_RD, 0x00007B,                                                            INS_TT_NONE,    Reads_PF )
+INST0(jl,               "jl",               IUM_RD, 0x00007C,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF  )
+INST0(jge,              "jge",              IUM_RD, 0x00007D,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF  )
+INST0(jle,              "jle",              IUM_RD, 0x00007E,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF  )
+INST0(jg,               "jg",               IUM_RD, 0x00007F,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF  )
+
+INST0(l_jmp,            "jmp",              IUM_RD, 0x0000E9,                                                            INS_TT_NONE,    INS_FLAGS_None )
+INST0(l_jo,             "jo",               IUM_RD, 0x00800F,                                                            INS_TT_NONE,    Reads_OF )
+INST0(l_jno,            "jno",              IUM_RD, 0x00810F,                                                            INS_TT_NONE,    Reads_OF )
+INST0(l_jb,             "jb",               IUM_RD, 0x00820F,                                                            INS_TT_NONE,    Reads_CF )
+INST0(l_jae,            "jae",              IUM_RD, 0x00830F,                                                            INS_TT_NONE,    Reads_CF )
+INST0(l_je,             "je",               IUM_RD, 0x00840F,                                                            INS_TT_NONE,    Reads_ZF )
+INST0(l_jne,            "jne",              IUM_RD, 0x00850F,                                                            INS_TT_NONE,    Reads_ZF )
+INST0(l_jbe,            "jbe",              IUM_RD, 0x00860F,                                                            INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST0(l_ja,             "ja",               IUM_RD, 0x00870F,                                                            INS_TT_NONE,    Reads_ZF | Reads_CF )
+INST0(l_js,             "js",               IUM_RD, 0x00880F,                                                            INS_TT_NONE,    Reads_SF )
+INST0(l_jns,            "jns",              IUM_RD, 0x00890F,                                                            INS_TT_NONE,    Reads_SF )
+INST0(l_jp,             "jp",               IUM_RD, 0x008A0F,                                                            INS_TT_NONE,    Reads_PF )
+INST0(l_jnp,            "jnp",              IUM_RD, 0x008B0F,                                                            INS_TT_NONE,    Reads_PF )
+INST0(l_jl,             "jl",               IUM_RD, 0x008C0F,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF  )
+INST0(l_jge,            "jge",              IUM_RD, 0x008D0F,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF  )
+INST0(l_jle,            "jle",              IUM_RD, 0x008E0F,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF  )
+INST0(l_jg,             "jg",               IUM_RD, 0x008F0F,                                                            INS_TT_NONE,    Reads_OF       | Reads_SF      | Reads_ZF  )
+
+INST0(align,            "align",            IUM_RD, BAD_CODE,                                                            INS_TT_NONE,    INS_FLAGS_None)
 
 /*****************************************************************************/
 #undef  INST0
diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp
index d91f7104d1871f..469abe498f42a5 100644
--- a/src/coreclr/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/jit/simdcodegenxarch.cpp
@@ -709,7 +709,8 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
     // Currently AVX doesn't support integer.
     // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
     if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
-        !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && GetEmitter()->IsThreeOperandAVXInstruction(ins))
+        !(ins == INS_cvtsi2ss32 || ins == INS_cvtsi2sd32 || ins == INS_cvtsi2ss64 || ins == INS_cvtsi2sd64) &&
+        GetEmitter()->IsThreeOperandAVXInstruction(ins))
     {
         inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
     }

From 0a1d2b9d26aea0c49a68a92e40706018ebe54189 Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
Date: Thu, 27 Oct 2022 16:03:34 -0700
Subject: [PATCH 4/4] Moving 'JitStressEvexEncoding' under Debug flag and other
 review cleanup.

---
 src/coreclr/jit/codegen.h         |   4 +
 src/coreclr/jit/codegenxarch.cpp  |   2 +-
 src/coreclr/jit/compiler.h        |  18 ++--
 src/coreclr/jit/emit.cpp          |   1 -
 src/coreclr/jit/emitxarch.cpp     | 165 ++++++++++++++++--------------
 src/coreclr/jit/emitxarch.h       |  28 +++--
 src/coreclr/jit/instr.cpp         |  16 ++-
 src/coreclr/jit/instr.h           |  16 +--
 src/coreclr/jit/instrsxarch.h     |   3 +-
 src/coreclr/jit/jitconfigvalues.h |   8 +-
 10 files changed, 148 insertions(+), 113 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index cc544e5bb7e532..2a27feeaf81f3c 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1700,7 +1700,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 
     instruction ins_Copy(var_types dstType);
     instruction ins_Copy(regNumber srcReg, var_types dstType);
+#if defined(TARGET_XARCH)
     instruction ins_FloatConv(var_types to, var_types from, emitAttr attr);
+#elif defined(TARGET_ARM)
+    instruction ins_FloatConv(var_types to, var_types from);
+#endif
     instruction ins_MathOp(genTreeOps oper, var_types type);
 
     void instGen_Return(unsigned stkArgSize);
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 3f33ee2ac044ca..8f76d0a9c40ff6 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7110,7 +7110,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
     // Note that we need to specify dstType here so that it will determine
     // the size of destination integer register and also the rex.w prefix.
     genConsumeOperands(treeNode->AsOp());
-    instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(dstType));
+    instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(srcType));
     GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
     genProduceReg(treeNode);
 }
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 8461c90aeacb3f..3b6cdf4b9cbf19 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8993,7 +8993,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     // canUseEvexEncoding - Answer the question: Is Evex encoding supported on this target.
     //
     // Returns:
-    //    TRUE if Evex encoding is supported, FALSE if not.
+    //    `true` if Evex encoding is supported, `false` if not.
+    //
     bool canUseEvexEncoding() const
     {
 #ifdef TARGET_XARCH
@@ -9007,18 +9008,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding.
     //
     // Returns:
-    //    TRUE if user requests EVEX encoding and it's safe, FALSE if not.
+    //    `true` if user requests EVEX encoding and it's safe, `false` if not.
+    //
     bool DoJitStressEvexEncoding() const
     {
 #ifdef TARGET_XARCH
-        // Using JitStressEVEXEncoding flag will force instructions which would
-        // otherwise use VEX encoding but can be EVEX encoded to use EVEX encoding
-        // This requires AVX512VL support.
-        if (JitConfig.JitStressEVEXEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
+// Using the JitStressEvexEncoding flag will force instructions which would
+// otherwise use VEX encoding, but can be EVEX encoded, to use EVEX encoding.
+// This requires AVX512VL support.
+#ifdef DEBUG
+        if (JitConfig.JitStressEvexEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
         {
             return true;
         }
-#endif
+#endif // DEBUG
+#endif // TARGET_XARCH
         return false;
     }
 
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index a54bacb0558ef6..a7644c3bcbd0f1 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3999,7 +3999,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
     UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr);
 
     unsigned estimatedSize = id->idCodeSize();
-
     if (actualSize != estimatedSize)
     {
         // It is fatal to under-estimate the instruction size, except for alignment instructions
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index f73cb54d0361ce..db6294890e7e3f 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -41,7 +41,8 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins)
 //    ins - The instruction to check.
 //
 // Returns:
-//    TRUE if it is a sse or avx or avx512 instruction..
+//    `true` if it is an SSE, AVX, or AVX512 instruction.
+//
 bool emitter::IsAvx512OrPriorInstruction(instruction ins)
 {
     // TODO-XArch-AVX512: Fix check once AVX512 instructions are added.
@@ -60,7 +61,8 @@ bool emitter::IsAVXOnlyInstruction(instruction ins)
 //    ins - The instruction to check.
 //
 // Returns:
-//    TRUE if it is a avx512f+ instruction.
+//    `true` if it is an AVX512F+ instruction.
+//
 bool emitter::IsAvx512OnlyInstruction(instruction ins)
 {
     return (ins >= INS_FIRST_AVX512_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION);
@@ -150,20 +152,21 @@ regNumber emitter::getSseShiftRegNumber(instruction ins)
     }
 }
 
-bool emitter::IsAVXInstruction(instruction ins) const
+bool emitter::IsVexEncodedInstruction(instruction ins) const
 {
     return UseVEXEncoding() && IsSSEOrAVXInstruction(ins);
 }
 
 //------------------------------------------------------------------------
-// IsAvx512Instruction: Answer the question- Can this instruction be Evex encoded.
+// IsEvexEncodedInstruction: Answer the question- Can this instruction be Evex encoded.
 //
 // Arguments:
 //    ins - The instruction to check.
 //
 // Returns:
-//    TRUE if ins can be Evex encoded.
-bool emitter::IsAvx512Instruction(instruction ins) const
+//    `true` if ins can be Evex encoded.
+//
+bool emitter::IsEvexEncodedInstruction(instruction ins) const
 {
     if (!UseEvexEncoding())
     {
@@ -288,11 +291,11 @@ bool emitter::IsAvx512Instruction(instruction ins) const
 //    ins - The instruction to check.
 //
 // Returns:
-//    TRUE if ins is a SIMD instruction.
+//    `true` if ins is a SIMD instruction that is VEX or EVEX encoded.
 //
-bool emitter::IsSimdInstruction(instruction ins) const
+bool emitter::IsVexOrEvexEncodedInstruction(instruction ins) const
 {
-    return IsAvx512Instruction(ins) || IsAVXInstruction(ins);
+    return IsEvexEncodedInstruction(ins) || IsVexEncodedInstruction(ins);
 }
 
 // Returns true if the AVX instruction is a binary operator that requires 3 operands.
@@ -303,7 +306,8 @@ bool emitter::IsSimdInstruction(instruction ins) const
 // to indicate whether a 3-operand instruction.
 bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
 {
-    return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsSimdInstruction(ins);
+    return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) &&
+           IsVexOrEvexEncodedInstruction(ins);
 }
 
 // Returns true if the AVX instruction requires 3 operands that duplicate the source
@@ -313,7 +317,8 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
 // to indicate whether a 3-operand instruction.
 bool emitter::IsDstSrcSrcAVXInstruction(instruction ins)
 {
-    return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsSimdInstruction(ins);
+    return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) &&
+           IsVexOrEvexEncodedInstruction(ins);
 }
 
 //------------------------------------------------------------------------
@@ -763,8 +768,8 @@ bool emitter::TakesSimdPrefix(instruction ins) const
 //------------------------------------------------------------------------
 // TakesEvexPrefix: Checks if the instruction should be EVEX encoded.
 // TODO-XArch-AVX512: This check needs to be updated once AVX512 instructions are added.
-// Eventually, this should evolve to return 'TRUE' for the following cases:
-// - JitConfig.JitStressEVEXEncoding flag is set.
+// Eventually, this should evolve to return `true` for the following cases:
+// - JitConfig.JitStressEvexEncoding flag is set.
 // - Is an new AVX512 instruction.
 // - Uses ZMM vector registers.
 // - Uses upper 128-bit or 256-bit registers for an AVX512VL ins.
@@ -787,9 +792,11 @@ bool emitter::TakesEvexPrefix(instruction ins) const
     }
 
     // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added.
-    return IsAvx512Instruction(ins) && !HasKMaskRegisterDest(ins);
+    return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins);
 }
 
+// Intel AVX-512 encoding is defined in "Intel 64 and IA-32 Architectures Software Developer's Manual, Volume 2",
+// Section 2.6.
 // Add base EVEX prefix without setting W, R, X, or B bits
 // L'L bits will be set based on emitter attr.
 //
@@ -813,6 +820,7 @@ bool emitter::TakesEvexPrefix(instruction ins) const
 // - V'- bit to extend vvvv
 // - aaa - specifies mask register
 //    Rest    - reserved for future use and usage of them will uresult in Undefined instruction exception.
+//
 #define DEFAULT_BYTE_EVEX_PREFIX 0x62F07C0800000000ULL
 
 #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL
@@ -833,7 +841,7 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at
 {
 
     // Only AVX512 instructions require EVEX prefix
-    assert(IsAvx512Instruction(ins));
+    assert(IsEvexEncodedInstruction(ins));
 
     // Shouldn't have already added EVEX prefix
     assert(!hasEvexPrefix(code));
@@ -873,7 +881,7 @@ bool emitter::TakesVexPrefix(instruction ins) const
             break;
     }
 
-    return IsAVXInstruction(ins);
+    return IsVexEncodedInstruction(ins);
 }
 
 // Add base VEX prefix without setting W, R, X, or B bits
@@ -910,7 +918,7 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att
     // emitted, by simply checking that all the requirements were met.
 
     // Only AVX instructions require VEX prefix
-    assert(IsAVXInstruction(ins));
+    assert(IsVexEncodedInstruction(ins));
 
     // Shouldn't have already added VEX prefix
     assert(!hasVexPrefix(code));
@@ -1128,7 +1136,7 @@ unsigned RegEncoding(regNumber reg)
 // AVX:  specific bits within VEX prefix need to be set in bit-inverted form.
 emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 {
-    if (UseEvexEncoding() && IsAvx512Instruction(ins))
+    if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
         if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
         {
@@ -1139,7 +1147,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
             return emitter::code_t(code | 0x0000800000000000ULL);
         }
     }
-    if (UseVEXEncoding() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
         if (TakesVexPrefix(ins))
         {
@@ -1162,7 +1170,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 {
-    if (UseEvexEncoding() && IsAvx512Instruction(ins))
+    if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
         if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
         {
@@ -1173,7 +1181,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
             return code & 0xFF7FFFFFFFFFFFFFULL;
         }
     }
-    if (UseVEXEncoding() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
         if (TakesVexPrefix(ins))
         {
@@ -1190,7 +1198,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 {
-    if (UseEvexEncoding() && IsAvx512Instruction(ins))
+    if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
         if (TakesEvexPrefix(ins))
         {
@@ -1200,7 +1208,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
             return code & 0xFFBFFFFFFFFFFFFFULL;
         }
     }
-    if (UseVEXEncoding() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
         if (TakesVexPrefix(ins))
         {
@@ -1217,7 +1225,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
 {
-    if (UseEvexEncoding() && IsAvx512Instruction(ins))
+    if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
         if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
         {
@@ -1228,7 +1236,7 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
             return code & 0xFFDFFFFFFFFFFFFFULL;
         }
     }
-    if (UseVEXEncoding() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
         if (TakesVexPrefix(ins))
         {
@@ -1246,8 +1254,8 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
 // Adds REX prefix (0x40) without W, R, X or B bits set
 emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
 {
-    assert(!UseVEXEncoding() || !IsAVXInstruction(ins));
-    assert(!UseEvexEncoding() || !IsAvx512Instruction(ins));
+    assert(!UseVEXEncoding() || !IsVexEncodedInstruction(ins));
+    assert(!UseEvexEncoding() || !IsEvexEncodedInstruction(ins));
     return code | 0x4000000000ULL;
 }
 
@@ -1289,7 +1297,7 @@ unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_
     if (hasEvexPrefix(code))
     {
         // Only AVX512 instructions should have an EVEX prefix
-        assert(IsAvx512Instruction(ins));
+        assert(IsEvexEncodedInstruction(ins));
 
         code_t evexPrefix = (code >> 32) & 0xFFFFFFFF;
         code &= 0x00000000FFFFFFFFLL;
@@ -1391,7 +1399,7 @@ unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_
     else if (hasVexPrefix(code))
     {
         // Only AVX instructions should have a VEX prefix
-        assert(UseVEXEncoding() && IsAVXInstruction(ins));
+        assert(UseVEXEncoding() && IsVexEncodedInstruction(ins));
         code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
         code &= 0x00000000FFFFFFFFLL;
 
@@ -1631,7 +1639,7 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c
     if (hasVexPrefix(code))
     {
         // Only AVX instructions should have a VEX prefix
-        assert(UseVEXEncoding() && IsAVXInstruction(ins));
+        assert(UseVEXEncoding() && IsVexEncodedInstruction(ins));
         code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
         code &= 0x00000000FFFFFFFFLL;
 
@@ -1898,7 +1906,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 {
     // In case of AVX instructions, REX prefixes are part of VEX prefix.
     // And hence requires no additional byte to encode REX prefixes.
-    if (IsAVXInstruction(ins))
+    if (IsVexOrEvexEncodedInstruction(ins))
     {
         return 0;
     }
@@ -1919,7 +1927,7 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins)
 //
 unsigned emitter::emitGetEvexPrefixSize(instruction ins)
 {
-    if (IsAvx512Instruction(ins))
+    if (IsEvexEncodedInstruction(ins))
     {
         return 4;
     }
@@ -1931,7 +1939,7 @@ unsigned emitter::emitGetEvexPrefixSize(instruction ins)
 // Size of vex prefix in bytes
 unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr)
 {
-    if (IsAVXInstruction(ins))
+    if (IsVexEncodedInstruction(ins))
     {
         return 3;
     }
@@ -1958,9 +1966,10 @@ unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, c
     unsigned adjustedSize = 0;
 
     // TODO-XArch-AVX512: Remove redundant code and possiblly collapse EVEX and VEX into a single pathway
-    // IsAvx512Instruction(ins) is TRUE for AVX/SSE instructions also which needs to be VEX encoded unless explicitly
+    // IsEvexEncodedInstruction(ins) is also `true` for AVX/SSE instructions, which need to be VEX encoded
+    // unless explicitly
     // asked for EVEX.
-    if (IsAvx512Instruction(ins) && TakesEvexPrefix(ins))
+    if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(ins))
     {
         // EVEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces.
         // Therefore, to estimate the size adding EVEX prefix size and size of instruction opcode bytes will always
@@ -2007,7 +2016,7 @@ unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, c
 
         adjustedSize = evexPrefixAdjustedSize;
     }
-    else if (IsAVXInstruction(ins))
+    else if (IsVexEncodedInstruction(ins))
     {
         // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces.
         // Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always
@@ -2089,7 +2098,7 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod
 {
     unsigned adjustedSize = 0;
 
-    if (IsAVXInstruction(ins))
+    if (IsVexEncodedInstruction(ins))
     {
         // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces.
         // Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always
@@ -2603,7 +2612,7 @@ insTupleType insTupleTypeInfos[] =
 inline bool hasTupleTypeInfo(instruction ins)
 {
     assert((unsigned)ins < ArrLen(insTupleTypeInfos));
-    return ((insTupleTypeInfos[ins] != INS_TT_NONE));
+    return (insTupleTypeInfos[ins] != INS_TT_NONE);
 }
 
 //------------------------------------------------------------------------
@@ -2618,7 +2627,7 @@ inline bool hasTupleTypeInfo(instruction ins)
 inline insTupleType insTupleTypeInfo(instruction ins)
 {
     assert((unsigned)ins < ArrLen(insTupleTypeInfos));
-    assert((insTupleTypeInfos[ins] != INS_TT_NONE));
+    assert(insTupleTypeInfos[ins] != INS_TT_NONE);
     return insTupleTypeInfos[ins];
 }
 
@@ -2729,7 +2738,7 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt
 inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code)
 {
     assert(reg < REG_STK);
-    assert(IsSimdInstruction(ins));
+    assert(IsVexOrEvexEncodedInstruction(ins));
     assert(hasVexOrEvexPrefix(code));
 
     // Get 4-bit register encoding
@@ -2743,7 +2752,7 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
 
     // Both prefix encodes register operand in 1's complement form
     assert(regBits <= 0xF);
-    if (UseEvexEncoding() && IsAvx512Instruction(ins))
+    if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
         if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code))
         {
@@ -2754,7 +2763,7 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
             return code ^ regBits;
         }
     }
-    if (UseVEXEncoding() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
         if (TakesVexPrefix(ins))
         {
@@ -3080,7 +3089,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code)
         (!id->idIsSmallDsc() && (IsExtendedReg(id->idReg3(), attr) || IsExtendedReg(id->idReg4(), attr))))
     {
         sz += emitGetRexPrefixSize(ins);
-        includeRexPrefixSize = !IsAVXInstruction(ins);
+        includeRexPrefixSize = !IsVexEncodedInstruction(ins);
     }
 
     sz += emitInsSize(code, includeRexPrefixSize);
@@ -3733,7 +3742,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code, int val
     }
     else
     {
-        assert(!IsSSEOrAVXInstruction(ins));
+        assert(!IsAvx512OrPriorInstruction(ins));
     }
 
     return valSize + emitInsSizeAM(id, code);
@@ -6006,7 +6015,7 @@ void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int off
 void emitter::emitIns_AR_R_R(
     instruction ins, emitAttr attr, regNumber op2Reg, regNumber op3Reg, regNumber base, int offs)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
 
     instrDesc* id = emitNewInstrAmd(attr, offs);
@@ -6046,7 +6055,7 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre
 void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival)
 {
     noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1));
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
 
     ssize_t    offs = indir->Offset();
     instrDesc* id   = emitNewInstrAmdCns(attr, offs, ival);
@@ -6066,7 +6075,7 @@ void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenT
 void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival)
 {
     noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1));
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
 
     instrDesc* id = emitNewInstrAmdCns(attr, offs, ival);
 
@@ -6113,7 +6122,7 @@ void emitter::emitIns_R_C_I(
 void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival)
 {
     noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1));
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
 
     instrDesc* id = emitNewInstrCns(attr, ival);
 
@@ -6135,7 +6144,7 @@ void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int
 
 void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
 
     ssize_t    offs = indir->Offset();
@@ -6156,7 +6165,7 @@ void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regN
 
 void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
 
     instrDesc* id = emitNewInstrAmd(attr, offs);
@@ -6322,7 +6331,7 @@ void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regN
 void emitter::emitIns_R_R_A_I(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
 
     ssize_t    offs = indir->Offset();
@@ -6344,7 +6353,7 @@ void emitter::emitIns_R_R_A_I(
 void emitter::emitIns_R_R_AR_I(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
 
     instrDesc* id = emitNewInstrAmdCns(attr, offs, ival);
@@ -6456,7 +6465,7 @@ void emitter::emitIns_R_R_R_I(
 void emitter::emitIns_R_R_S_I(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
 
     instrDesc* id = emitNewInstrCns(attr, ival);
@@ -6516,7 +6525,7 @@ void emitter::emitIns_R_R_A_R(
     instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir)
 {
     assert(isAvxBlendv(ins));
-    assert(UseVEXEncoding());
+    assert(UseSimdEncoding());
 
     int        ival = encodeXmmRegAsIval(op3Reg);
     ssize_t    offs = indir->Offset();
@@ -6555,7 +6564,7 @@ void emitter::emitIns_R_R_AR_R(
     instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base, int offs)
 {
     assert(isAvxBlendv(ins));
-    assert(UseVEXEncoding());
+    assert(UseSimdEncoding());
 
     int        ival = encodeXmmRegAsIval(op3Reg);
     instrDesc* id   = emitNewInstrAmdCns(attr, offs, ival);
@@ -7229,7 +7238,7 @@ void emitter::emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs
 //
 void emitter::emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, regNumber reg, int imm)
 {
-    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsAvx512OrPriorInstruction(ins));
     assert(reg != REG_NA);
 
     instrDesc* id = emitNewInstrAmdCns(attr, indir->Offset(), imm);
@@ -10641,7 +10650,7 @@ void emitter::emitDispIns(
 
         case IF_RWR_RRD_RRD:
         {
-            assert(IsAVXInstruction(ins));
+            assert(IsVexEncodedInstruction(ins));
             assert(IsThreeOperandAVXInstruction(ins));
             regNumber reg2 = id->idReg2();
             regNumber reg3 = id->idReg3();
@@ -10664,7 +10673,7 @@ void emitter::emitDispIns(
         }
 
         case IF_RWR_RRD_RRD_CNS:
-            assert(IsAVXInstruction(ins));
+            assert(IsVexEncodedInstruction(ins));
             assert(IsThreeOperandAVXInstruction(ins));
             printf("%s, ", emitRegName(id->idReg1(), attr));
             printf("%s, ", emitRegName(id->idReg2(), attr));
@@ -11474,8 +11483,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
 
     // Emit the REX prefix if required
     // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support.
-    // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doind so currently
-    // since we cannot differentiate EVEX vs VEX without 'code' untill all paths have EVEX support.
+    // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently
+    // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support.
     if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
     {
         code = AddRexWPrefix(ins, code);
@@ -11601,7 +11610,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             code += 4;
         }
     }
-    else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins))
+    else if (!IsSSEInstruction(ins) && !IsVexEncodedInstruction(ins))
     {
         /* Is the operand size larger than a byte? */
 
@@ -12408,7 +12417,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             code += 4;
         }
     }
-    else if (!IsSSEInstruction(ins) && !IsAVXInstruction(ins) && !IsAvx512Instruction(ins))
+    else if (!IsSSEInstruction(ins) && !IsVexEncodedInstruction(ins) && !IsEvexEncodedInstruction(ins))
     {
         // Is the operand size larger than a byte?
         switch (size)
@@ -13186,7 +13195,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
 
     // We would to update GC info correctly
     assert(!IsSSEInstruction(ins));
-    assert(!IsAVXInstruction(ins));
+    assert(!IsVexEncodedInstruction(ins));
 
     // Get the 'base' opcode
     switch (ins)
@@ -13618,7 +13627,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
     else if ((code & 0xFF) == 0x00)
     {
         // This case happens for some SSE/AVX instructions only
-        assert(IsAVXInstruction(ins) || Is4ByteSSEInstruction(ins));
+        assert(IsVexEncodedInstruction(ins) || Is4ByteSSEInstruction(ins));
 
         dst += emitOutputByte(dst, (code >> 8) & 0xFF);
         dst += emitOutputByte(dst, (0xC0 | regCode));
@@ -13818,7 +13827,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
     code_t code;
 
     instruction ins = id->idIns();
-    assert(IsSimdInstruction(ins));
+    assert(IsVexOrEvexEncodedInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins));
     regNumber targetReg = id->idReg1();
     regNumber src1      = id->idReg2();
@@ -13866,7 +13875,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
     else if ((code & 0xFF) == 0x00)
     {
         // This case happens for AVX instructions only
-        assert(IsSimdInstruction(ins));
+        assert(IsVexOrEvexEncodedInstruction(ins));
 
         dst += emitOutputByte(dst, (code >> 8) & 0xFF);
         dst += emitOutputByte(dst, (0xC0 | regCode));
@@ -14264,7 +14273,7 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id)
 
     // We would to update GC info correctly
     assert(!IsSSEInstruction(ins));
-    assert(!IsAVXInstruction(ins));
+    assert(!IsVexEncodedInstruction(ins));
 
 #ifdef TARGET_AMD64
     // all these opcodes take a sign-extended 4-byte immediate, max
@@ -14358,7 +14367,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i)
 
     // SSE/AVX doesnt make any sense here
     assert(!IsSSEInstruction(ins));
-    assert(!IsAVXInstruction(ins));
+    assert(!IsVexEncodedInstruction(ins));
 
     size_t ssz;
     size_t lsz;
@@ -14714,7 +14723,7 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id)
 // Arguments:
 //    id -- Instruction descriptor.
 //    dsp -- Displacemnt.
-//    dspInByte[out] - TRUE if compressed displacement
+//    dspInByte[out] - `true` if compressed displacement
 //
 // Return Value:
 //    compressed displacement value if dspInByte ===  TRUE.
@@ -15367,7 +15376,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             else if ((code & 0xFF) == 0x00)
             {
                 // This case happens for some SSE/AVX instructions only
-                assert(IsAVXInstruction(ins) || Is4ByteSSEInstruction(ins));
+                assert(IsVexEncodedInstruction(ins) || Is4ByteSSEInstruction(ins));
 
                 dst += emitOutputByte(dst, (code >> 8) & 0xFF);
                 dst += emitOutputByte(dst, (0xC0 | regcode));
@@ -15439,7 +15448,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
         case IF_RRW_ARD_CNS:
         case IF_RWR_ARD_CNS:
-            assert(IsSSEOrAVXInstruction(ins));
+            assert(IsAvx512OrPriorInstruction(ins));
             emitGetInsAmdCns(id, &cnsVal);
             code = insCodeRM(ins);
 
@@ -15459,7 +15468,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             break;
 
         case IF_AWR_RRD_CNS:
-            assert(IsSSEOrAVXInstruction(ins));
+            assert(IsAvx512OrPriorInstruction(ins));
             emitGetInsAmdCns(id, &cnsVal);
             code = insCodeMR(ins);
             dst  = emitOutputAM(dst, id, code, &cnsVal);
@@ -15498,7 +15507,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RWR_RRD_ARD_CNS:
         case IF_RWR_RRD_ARD_RRD:
         {
-            assert(IsSSEOrAVXInstruction(ins));
+            assert(IsAvx512OrPriorInstruction(ins));
             emitGetInsAmdCns(id, &cnsVal);
             code = insCodeRM(ins);
             if (EncodedBySSE38orSSE3A(ins))
@@ -15607,7 +15616,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
         case IF_RRW_SRD_CNS:
         case IF_RWR_SRD_CNS:
-            assert(IsSSEOrAVXInstruction(ins));
+            assert(IsAvx512OrPriorInstruction(ins));
             emitGetInsCns(id, &cnsVal);
             code = insCodeRM(ins);
 
@@ -15671,7 +15680,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
         case IF_RWR_RRD_SRD:
         {
-            assert(IsSimdInstruction(ins));
+            assert(IsVexOrEvexEncodedInstruction(ins));
 
             code = insCodeRM(ins);
             code = AddSimdPrefixIfNeeded(ins, code, size);
@@ -15696,7 +15705,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RWR_RRD_SRD_RRD:
         {
             // This should only be called on AVX instructions
-            assert(IsAVXInstruction(ins));
+            assert(IsVexOrEvexEncodedInstruction(ins));
             emitGetInsCns(id, &cnsVal);
 
             code = insCodeRM(ins);
@@ -15836,7 +15845,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RWR_RRD_MRD:
         {
             // This should only be called on AVX instructions
-            assert(IsAVXInstruction(ins));
+            assert(IsVexEncodedInstruction(ins));
 
             code = insCodeRM(ins);
             code = AddVexPrefixIfNeeded(ins, code, size);
@@ -15861,7 +15870,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RWR_RRD_MRD_RRD:
         {
             // This should only be called on AVX instructions
-            assert(IsAVXInstruction(ins));
+            assert(IsVexEncodedInstruction(ins));
             emitGetInsCns(id, &cnsVal);
 
             code = insCodeRM(ins);
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index e2910b3b34192f..13f79bc3833cca 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -98,9 +98,9 @@ static bool IsBMIInstruction(instruction ins);
 
 static regNumber getBmiRegNumber(instruction ins);
 static regNumber getSseShiftRegNumber(instruction ins);
-bool IsAVXInstruction(instruction ins) const;
-bool IsAvx512Instruction(instruction ins) const;
-bool IsSimdInstruction(instruction ins) const;
+bool IsVexEncodedInstruction(instruction ins) const;
+bool IsEvexEncodedInstruction(instruction ins) const;
+bool IsVexOrEvexEncodedInstruction(instruction ins) const;
 
 code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code);
 
@@ -178,7 +178,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
 //    ins - The instruction to check.
 //
 // Returns:
-//    TRUE if W bit needs to be set to 1.
+//    `true` if W bit needs to be set to 1.
 //
 bool IsWEvexOpcodeExtension(instruction ins)
 {
@@ -410,7 +410,7 @@ bool IsWEvexOpcodeExtension(instruction ins)
 //    ins - The instruction to check.
 //
 // Returns:
-//    TRUE if Evex encoding requires KMAsk support.
+//    `true` if Evex encoding requires KMask support.
 //
 bool HasKMaskRegisterDest(instruction ins) const
 {
@@ -474,7 +474,8 @@ void SetUseEvexEncoding(bool value)
 // contains Evex prefix.
 //
 // Returns:
-//    TRUE if target supports either.
+//    `true` if target supports either.
+//
 bool UseSimdEncoding() const
 {
     return UseVEXEncoding() || UseEvexEncoding();
@@ -494,7 +495,8 @@ bool TakesEvexPrefix(instruction ins) const;
 //    code - opcode + prefixes bits at some stage of encoding.
 //
 // Returns:
-//    TRUE if code has an Evex prefix.
+//    `true` if code has an Evex prefix.
+//
 bool hasEvexPrefix(code_t code)
 {
     return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE;
@@ -534,7 +536,8 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size)
 //    size - operand size
 //
 // Returns:
-//    TRUE if code has an Evex prefix.
+//    The updated code (a code_t, not a bool), with a SIMD prefix added if needed and not already present.
+//
 code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size)
 {
     if (TakesEvexPrefix(ins))
@@ -558,7 +561,8 @@ bool TakesSimdPrefix(instruction ins) const;
 //    code - opcode + prefixes bits at some stage of encoding.
 //
 // Returns:
-//    TRUE if code has a SIMD prefix.
+//    `true` if code has a SIMD prefix.
+//
 bool hasVexOrEvexPrefix(code_t code)
 {
     return (hasVexPrefix(code) || hasEvexPrefix(code));
@@ -575,7 +579,8 @@ ssize_t TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte);
 //    code - opcode + prefixes bits at some stage of encoding.
 //
 // Returns:
-//    TRUE if code has an Evex prefix.
+//    `true` if code has an Evex prefix.
+//
 bool codeEvexMigrationCheck(code_t code)
 {
     return hasEvexPrefix(code);
@@ -1011,7 +1016,8 @@ inline bool emitIsUncondJump(instrDesc* jmp)
 //    id - Instruction descriptor.
 //
 // Returns:
-//    TRUE if the instruction does embeddded broadcast.
+//    `true` if the instruction does embedded broadcast.
+//
 inline bool HasEmbeddedBroadcast(instrDesc* id)
 {
     return false;
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index cc552a964ed21c..406becd21fb853 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -101,7 +101,7 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id)
     static char     buf[4][TEMP_BUFFER_LEN];
     const char*     retbuf;
 
-    if (GetEmitter()->IsAVXInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins))
+    if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins))
     {
         sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName);
         retbuf = buf[curBuf];
@@ -1788,7 +1788,17 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
     }
 }
 
-// Conversions to or from floating point values
+//------------------------------------------------------------------------
+// ins_FloatConv: Conversions to or from floating point values.
+//
+// Arguments:
+//    to - Destination type.
+//    from - Source type.
+//    attr - Input size.
+//
+// Returns:
+//    The correct conversion instruction to use based on src and dst types.
+//
 instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
 {
     // AVX: For now we support only conversion from Int/Long -> float
@@ -1887,7 +1897,7 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
     }
 }
 
-instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
+instruction CodeGen::ins_FloatConv(var_types to, var_types from)
 {
     switch (from)
     {
diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
index 103ec6140c2d53..180ad19ad3a96e 100644
--- a/src/coreclr/jit/instr.h
+++ b/src/coreclr/jit/instr.h
@@ -341,7 +341,7 @@ enum insBarrier : unsigned
 
 #if defined(TARGET_XARCH)
 // Represents tupletype attribute of instruction.
-// This is used in determining factor N while calculated compressed displacement in EVEX encoding
+// This is used in determining factor N while calculating compressed displacement in EVEX encoding
 // Reference: Section 2.6.5 in Intel 64 and ia-32 architectures software developer's manual volume 2.
 enum insTupleType : uint32_t
 {
@@ -353,13 +353,13 @@ enum insTupleType : uint32_t
     INS_TT_TUPLE1_SCALAR    = 0x00020,
     INS_TT_TUPLE1_FIXED     = 0x00040,
     INS_TT_TUPLE2           = 0x00080,
-    INS_TT_TUPLE4           = 0x01000,
-    INS_TT_TUPLE8           = 0x02000,
-    INS_TT_HALF_MEM         = 0x04000,
-    INS_TT_QUARTER_MEM      = 0x08000,
-    INS_TT_EIGHTH_MEM       = 0x10000,
-    INS_TT_MEM128           = 0x20000,
-    INS_TT_MOVDDUP          = 0x40000,
+    INS_TT_TUPLE4           = 0x00100,
+    INS_TT_TUPLE8           = 0x00200,
+    INS_TT_HALF_MEM         = 0x00400,
+    INS_TT_QUARTER_MEM      = 0x00800,
+    INS_TT_EIGHTH_MEM       = 0x01000,
+    INS_TT_MEM128           = 0x02000,
+    INS_TT_MOVDDUP          = 0x04000,
     INS_TT_IS_NON_BROADCAST = 0x7FFFC
 };
 #endif
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index 55a39b52bffbe8..59c6dc1221f4d0 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -15,6 +15,7 @@
  *          rm      -- base encoding for reg,R/M  addressing mode
  *          a4      -- base encoding for eax,i32  addressing mode
  *          rr      -- base encoding for register addressing mode
+ *          tt      -- the tupletype for the instruction
  *          flags   -- flags, see INS_FLAGS_* enum
  *
 ******************************************************************************/
@@ -64,7 +65,7 @@ INST5(dec_l,            "dec",              IUM_RW, 0x0008FE,     BAD_CODE,
 
 // Multi-byte opcodes without modrm are represented in mixed endian fashion.
 // See comment around quarter way through this file for more information.
-INST5(bswap,            "bswap",            IUM_RW, 0x0F00C8,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C80F,    INS_FLAGS_None )
+INST5(bswap,            "bswap",            IUM_RW, 0x0F00C8,     BAD_CODE,     BAD_CODE,     BAD_CODE,     0x00C80F,    INS_TT_NONE,    INS_FLAGS_None )
 
 //    id                nm                  um      mr            mi            rm            a4                         tt              flags
 INST4(add,              "add",              IUM_RW, 0x000000,     0x000080,     0x000002,     0x000004,                  INS_TT_NONE,    Writes_OF      | Writes_SF     | Writes_ZF     | Writes_AF     | Writes_PF     | Writes_CF                  | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index eea4ae64e9b64c..2f337078a71de7 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -297,9 +297,11 @@ CONFIG_INTEGER(AltJitAssertOnNYI, W("AltJitAssertOnNYI"), 1) // Controls the Alt
 
 CONFIG_INTEGER(EnableEHWriteThru, W("EnableEHWriteThru"), 1) // Enable the register allocator to support EH-write thru:
                                                              // partial enregistration of vars exposed on EH boundaries
-CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1)   // Enable the enregistration of locals that are
-                                                                     // defined or used in a multireg context.
-CONFIG_INTEGER(JitStressEVEXEncoding, W("JitStressEVEXEncoding"), 0) // Enable EVEX encoding for SIMD instructions.
+CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are
+                                                                   // defined or used in a multireg context.
+#if defined(DEBUG)
+CONFIG_INTEGER(JitStressEvexEncoding, W("JitStressEvexEncoding"), 0) // Enable EVEX encoding for SIMD instructions.
+#endif // DEBUG
 
 // clang-format off