diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index ebccbe7a9a91ed..e3a03d12550e52 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2581,12 +2581,8 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree) regNumber src = genConsumeReg(srcIndir->Addr()); unsigned size = tree->Size(); - // TODO-XARCH-AVX512: Consider enabling it here - unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX) - ? YMM_REGSIZE_BYTES - : XMM_REGSIZE_BYTES; - - if (size >= simdSize) + const unsigned simdSize = compiler->roundDownSIMDSize(size); + if ((size >= simdSize) && (simdSize > 0)) { // Number of SIMD regs needed to save the whole src to regs. unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT); @@ -2603,33 +2599,37 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree) } auto emitSimdLoadStore = [&](bool load) { - unsigned offset = 0; - int regIndex = 0; - instruction simdMov = simdUnalignedMovIns(); + unsigned offset = 0; + int regIndex = 0; + instruction simdMov = simdUnalignedMovIns(); + unsigned curSimdSize = simdSize; do { + assert(curSimdSize >= XMM_REGSIZE_BYTES); if (load) { // vmovdqu ymm, ymmword ptr[src + offset] - GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset); + GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(curSimdSize), tempRegs[regIndex++], src, offset); } else { // vmovdqu ymmword ptr[dst + offset], ymm - GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset); + GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(curSimdSize), tempRegs[regIndex++], dst, offset); } - offset += simdSize; + offset += curSimdSize; if (size == offset) { break; } + // Overlap with the previously processed data. We'll always use SIMD for simplicity assert(size > offset); - if ((size - offset) < simdSize) + unsigned remainder = size - offset; + if (remainder < curSimdSize) { - // Overlap with the previously processed data. We'll always use SIMD for simplicity - // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder. - offset = size - simdSize; + // Switch to smaller SIMD size if necessary + curSimdSize = compiler->roundUpSIMDSize(remainder); + offset = size - curSimdSize; } } while (true); }; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e21307de79e919..e7306c5f0b93ce 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8908,6 +8908,82 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif } + //------------------------------------------------------------------------ + // roundUpSIMDSize: rounds the given size up to the nearest SIMD size + // available on the target. Examples on XARCH: + // + // size: 7 -> XMM + // size: 30 -> YMM (or XMM if target doesn't support AVX) + // size: 70 -> ZMM (or YMM or XMM depending on target) + // + // Arguments: + // size - size of the data to process with SIMD + // + // Notes: + // It's only supposed to be used for scenarios where we can + // perform an overlapped load/store. + // + unsigned int roundUpSIMDSize(unsigned size) + { +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) + unsigned maxSimdSize = maxSIMDStructBytes(); + assert(maxSimdSize <= ZMM_REGSIZE_BYTES); + if (size <= XMM_REGSIZE_BYTES && maxSimdSize > XMM_REGSIZE_BYTES) + { + return XMM_REGSIZE_BYTES; + } + if (size <= YMM_REGSIZE_BYTES && maxSimdSize > YMM_REGSIZE_BYTES) + { + return YMM_REGSIZE_BYTES; + } + return maxSimdSize; +#elif defined(TARGET_ARM64) + assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES); + return FP_REGSIZE_BYTES; +#else + assert(!"roundUpSIMDSize() unimplemented on target arch"); + unreached(); +#endif + } + + //------------------------------------------------------------------------ + // roundDownSIMDSize: rounds the given size down to the nearest SIMD size + // available on the target. Examples on XARCH: + // + // size: 7 -> 0 + // size: 30 -> XMM (not enough for AVX) + // size: 60 -> YMM (or XMM if target doesn't support AVX) + // size: 70 -> ZMM/YMM/XMM whatever the current system can offer + // + // Arguments: + // size - size of the data to process with SIMD + // + unsigned int roundDownSIMDSize(unsigned size) + { +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) + unsigned maxSimdSize = maxSIMDStructBytes(); + assert(maxSimdSize <= ZMM_REGSIZE_BYTES); + if (size >= maxSimdSize) + { + // Size is bigger than max SIMD size the current target supports + return maxSimdSize; + } + if (size >= YMM_REGSIZE_BYTES && maxSimdSize >= YMM_REGSIZE_BYTES) + { + // Size is >= YMM but not enough for ZMM -> YMM + return YMM_REGSIZE_BYTES; + } + // Return 0 if size is even less than XMM, otherwise - XMM + return size >= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : 0; +#elif defined(TARGET_ARM64) + assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES); + return size >= FP_REGSIZE_BYTES ? FP_REGSIZE_BYTES : 0; +#else + assert(!"roundDownSIMDSize() unimplemented on target arch"); + unreached(); +#endif + } + unsigned int minSIMDStructBytes() { return emitTypeSize(TYP_SIMD8); @@ -8955,6 +9031,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX { return false; } + unsigned int roundUpSIMDSize(unsigned size) + { + return 0; + } + unsigned int roundDownSIMDSize(unsigned size) + { + return 0; + } #endif // FEATURE_SIMD public: @@ -8985,9 +9069,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX { maxRegSize = maxSIMDStructBytes(); #if defined(TARGET_XARCH) - // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial - maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES); - threshold = maxRegSize; + if (type != UnrollKind::Memmove) + { + // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial. + // Enabled for Memmove only for now. + maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES); + } + threshold = maxRegSize; #elif defined(TARGET_ARM64) // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.: // diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index c1772f1cdeb80f..2369c53e75e85d 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -7814,7 +7814,7 @@ void emitter::emitIns_I_ARX( void emitter::emitIns_R_ARX( instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp) { - assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE) && (reg != REG_NA)); + assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_64BYTE) && (reg != REG_NA)); noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); if ((ins == INS_lea) && (reg == base) && (index == REG_NA) && (disp == 0)) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 29c3bcb6aff60c..1e71060a025375 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1536,13 +1536,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) // Lowering was expected to get rid of memmove in case of zero assert(size > 0); - // TODO-XARCH-AVX512: Consider enabling it here - unsigned simdSize = - (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX) - ? YMM_REGSIZE_BYTES - : XMM_REGSIZE_BYTES; - - if (size >= simdSize) + const unsigned simdSize = compiler->roundDownSIMDSize(size); + if ((size >= simdSize) && (simdSize > 0)) { unsigned simdRegs = size / simdSize; if ((size % simdSize) != 0)