Skip to content

Commit 3bd006f

Browse files
authored
JIT: Accelerate uint->floating casts on pre-AVX-512 x86 (#124114)
This is another one peeled from #116805 (only one more to go after this 😄) It takes care of uint->floating casts without a helper call on pre-AVX-512 x86. Since baseline x86 is the only platform without native instructions for all 32-bit conversions, adding this allows some more simplification in `fgMorphExpandCast` Typical diff: ```diff - push eax - vzeroupper - push 0 - push ecx - call [CORINFO_HELP_LNG2FLT] - fstp dword ptr [esp] - vmovss xmm0, dword ptr [esp] + vxorps xmm0, xmm0, xmm0 + vcvtsi2sd xmm0, xmm0, ecx + vaddsd xmm1, xmm0, qword ptr [reloc @rwd00] + vblendvpd xmm0, xmm0, xmm1, xmm0 + vcvtpd2ps xmm0, xmm0 vmovd eax, xmm0 - pop ecx ret +RWD00 dq 41F0000000000000h -; Total bytes of code 27 +; Total bytes of code 31 ```
1 parent 6bd4752 commit 3bd006f

File tree

6 files changed

+197

-142

lines changed

src/coreclr/jit/compiler.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6229,6 +6229,7 @@ class Compiler
62296229
void fgConvertBBToThrowBB(BasicBlock* block);
62306230

62316231
bool fgCastNeeded(GenTree* tree, var_types toType);
6232+
bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false);
62326233

62336234
void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB);
62346235
void fgLoopCallMark();

src/coreclr/jit/decomposelongs.cpp

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,8 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
138138
}
139139

140140
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
141+
// On x86, long->floating casts are implemented in DecomposeCast.
142+
// Those nodes, plus any nodes that produce a long, will be examined.
141143
if (!tree->TypeIs(TYP_LONG) &&
142144
!(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
143145
#else
@@ -159,6 +161,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
159161
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
160162
// Here we do a conservative check for specific cases where it is certain the load/store
161163
// can be contained. In those cases, we can skip decomposition.
164+
//
165+
// We also look for longs consumed directly by a long->floating cast. These can skip
166+
// decomposition because the cast is implemented using HWIntrinsics.
162167

163168
GenTree* user = use.User();
164169

@@ -589,21 +594,17 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
589594
// The sequence this creates is simply:
590595
// AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
591596

592-
NamedIntrinsic intrinsicId = NI_Illegal;
593-
GenTree* srcOp = cast->CastOp();
594-
var_types dstType = cast->CastToType();
595-
var_types baseFloatingType = (dstType == TYP_FLOAT) ? TYP_FLOAT : TYP_DOUBLE;
596-
var_types baseIntegralType = cast->IsUnsigned() ? TYP_ULONG : TYP_LONG;
597+
NamedIntrinsic intrinsicId = NI_Illegal;
598+
GenTree* srcOp = cast->CastOp();
597599

598600
assert(!cast->gtOverflow());
599601
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
600602

601603
intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
602604

603-
GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
604-
GenTree* convert =
605-
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
606-
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
605+
GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
606+
GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
607+
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
607608

608609
Range().InsertAfter(cast, createScalar, convert, toScalar);
609610
Range().Remove(cast);

src/coreclr/jit/flowgraph.cpp

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1269,6 +1269,44 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType)
12691269
return true;
12701270
}
12711271

1272+
//-------------------------------------------------------------------------------------
1273+
// fgCastRequiresHelper: Check whether a given cast must be converted to a helper call.
1274+
//
1275+
// Arguments:
1276+
// fromType - The source type of the cast.
1277+
// toType - The target type of the cast.
1278+
// overflow - True if the cast has the GTF_OVERFLOW flag set.
1279+
//
1280+
// Return Value:
1281+
// True if the cast requires a helper call, otherwise false.
1282+
//
1283+
bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */)
1284+
{
1285+
if (overflow && varTypeIsFloating(fromType))
1286+
{
1287+
assert(varTypeIsIntegral(toType));
1288+
return true;
1289+
}
1290+
1291+
#if defined(TARGET_X86) || defined(TARGET_ARM)
1292+
if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
1293+
{
1294+
return true;
1295+
}
1296+
1297+
if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
1298+
{
1299+
#if defined(TARGET_X86)
1300+
return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
1301+
#else
1302+
return true;
1303+
#endif // TARGET_X86
1304+
}
1305+
#endif // TARGET_X86 || TARGET_ARM
1306+
1307+
return false;
1308+
}
1309+
12721310
GenTree* Compiler::fgGetCritSectOfStaticMethod()
12731311
{
12741312
noway_assert(!compIsForInlining());

src/coreclr/jit/importer.cpp

Lines changed: 2 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -8127,28 +8127,8 @@ void Compiler::impImportBlockCode(BasicBlock* block)
81278127
goto _CONV;
81288128

81298129
_CONV:
8130-
// only converts from FLOAT or DOUBLE to an integer type
8131-
// and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls
8132-
8133-
if (varTypeIsFloating(lclTyp))
8134-
{
8135-
callNode = varTypeIsLong(impStackTop().val) ||
8136-
uns // uint->dbl gets turned into uint->long->dbl
8137-
#ifdef TARGET_64BIT
8138-
// TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK?
8139-
// TYP_BYREF could be used as TYP_I_IMPL which is long.
8140-
// TODO-CQ: remove this when we lower casts long/ulong --> float/double
8141-
// and generate SSE2 code instead of going through helper calls.
8142-
|| impStackTop().val->TypeIs(TYP_BYREF)
8143-
#endif
8144-
;
8145-
}
8146-
else
8147-
{
8148-
callNode = varTypeIsFloating(impStackTop().val->TypeGet());
8149-
}
8150-
8151-
op1 = impPopStack().val;
8130+
op1 = impPopStack().val;
8131+
callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl);
81528132

81538133
impBashVarAddrsToI(op1);
81548134

src/coreclr/jit/lowerxarch.cpp

Lines changed: 89 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -827,7 +827,7 @@ void Lowering::LowerCast(GenTree* tree)
827827

828828
GenTree* castOp = tree->AsCast()->CastOp();
829829
var_types dstType = tree->CastToType();
830-
var_types srcType = castOp->TypeGet();
830+
var_types srcType = genActualType(castOp);
831831

832832
// force the srcType to unsigned if GT_UNSIGNED flag is set
833833
if (tree->IsUnsigned())
@@ -844,12 +844,96 @@ void Lowering::LowerCast(GenTree* tree)
844844
// Long types should have been handled by helper call or in DecomposeLongs on x86.
845845
assert(!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit);
846846
}
847-
else if (srcType == TYP_UINT)
847+
848+
#ifdef TARGET_X86
849+
if ((srcType == TYP_UINT) && varTypeIsFloating(dstType) &&
850+
!m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
848851
{
849-
// uint->float casts should have an intermediate cast to long unless
850-
// we have the EVEX unsigned conversion instructions available.
851-
assert(dstType != TYP_FLOAT || m_compiler->canUseEvexEncodingDebugOnly());
852+
// Pre-AVX-512, there was no conversion instruction for uint->floating, so we emulate it
853+
// using signed int conversion. This is necessary only on 32-bit, because x64 simply casts
854+
// the uint up to a signed long before conversion.
855+
//
856+
// This logic depends on the fact that conversion from int to double is lossless. When
857+
// converting to float, we use a double intermediate, and convert to float only after the
858+
// double result is fixed up. This ensures the floating result is rounded correctly.
859+
860+
LABELEDDISPTREERANGE("LowerCast before", BlockRange(), tree);
861+
862+
LIR::Range castRange = LIR::EmptyRange();
863+
864+
// This creates the equivalent of the following C# code:
865+
// var castResult = Sse2.ConvertScalarToVector128Double(Vector128<double>.Zero, (int)castOp);
866+
867+
GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
868+
GenTree* castResult =
869+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, castOp, NI_X86Base_ConvertScalarToVector128Double,
870+
TYP_INT, 16);
871+
872+
castRange.InsertAtEnd(zero);
873+
castRange.InsertAtEnd(castResult);
874+
875+
// We will use the conversion result multiple times, so replace it with a lclVar.
876+
LIR::Use resUse;
877+
LIR::Use::MakeDummyUse(castRange, castResult, &resUse);
878+
resUse.ReplaceWithLclVar(m_compiler);
879+
castResult = resUse.Def();
880+
881+
// If the input had the MSB set, it will have converted as a negative, so we must wrap the
882+
// result back around to positive by adding 2^32. `blendvpd` uses only the MSB of the mask
883+
// element.
884+
//
885+
// This creates the equivalent of the following C# code:
886+
// var addRes = Sse2.AddScalar(castResult, Vector128.CreateScalar(4294967296.0));
887+
// castResult = Sse41.BlendVariable(castResult, addRes, castResult);
888+
889+
GenTreeVecCon* addCns = m_compiler->gtNewVconNode(TYP_SIMD16);
890+
addCns->gtSimdVal.f64[0] = 4294967296.0;
891+
892+
GenTree* addRes =
893+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult, addCns, NI_X86Base_AddScalar, TYP_DOUBLE, 16);
894+
895+
castRange.InsertAtEnd(addCns);
896+
castRange.InsertAtEnd(addRes);
897+
898+
GenTree* resClone1 = m_compiler->gtClone(castResult);
899+
GenTree* resClone2 = m_compiler->gtClone(castResult);
900+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, resClone1, addRes, resClone2,
901+
NI_X86Base_BlendVariable, TYP_DOUBLE, 16);
902+
castRange.InsertAtEnd(resClone1);
903+
castRange.InsertAtEnd(resClone2);
904+
castRange.InsertAtEnd(castResult);
905+
906+
// Convert to float if necessary, then ToScalar() the result out.
907+
if (dstType == TYP_FLOAT)
908+
{
909+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult,
910+
NI_X86Base_ConvertToVector128Single, TYP_DOUBLE, 16);
911+
castRange.InsertAtEnd(castResult);
912+
}
913+
914+
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, castResult, dstType, 16);
915+
castRange.InsertAtEnd(toScalar);
916+
917+
LIR::ReadOnlyRange lowerRange(castRange.FirstNode(), castRange.LastNode());
918+
BlockRange().InsertBefore(tree, std::move(castRange));
919+
920+
LABELEDDISPTREERANGE("LowerCast after", BlockRange(), toScalar);
921+
922+
LIR::Use castUse;
923+
if (BlockRange().TryGetUse(tree, &castUse))
924+
{
925+
castUse.ReplaceWith(toScalar);
926+
}
927+
else
928+
{
929+
toScalar->SetUnusedValue();
930+
}
931+
932+
BlockRange().Remove(tree);
933+
LowerRange(lowerRange);
934+
return;
852935
}
936+
#endif // TARGET_X86
853937

854938
if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType))
855939
{

0 commit comments

Comments (0)