Skip to content

Commit 3bd006f

Browse files
authored
JIT: Accelerate uint->floating casts on pre-AVX-512 x86 (#124114)
This is another one peeled from #116805 (only one more to go after this 😄) It takes care of uint->floating casts without a helper call on pre-AVX-512 x86. Since baseline x86 is the only platform without native instructions for all 32-bit conversions, adding this allows some more simplification in `fgMorphExpandCast` Typical diff: ```diff - push eax - vzeroupper - push 0 - push ecx - call [CORINFO_HELP_LNG2FLT] - fstp dword ptr [esp] - vmovss xmm0, dword ptr [esp] + vxorps xmm0, xmm0, xmm0 + vcvtsi2sd xmm0, xmm0, ecx + vaddsd xmm1, xmm0, qword ptr [reloc @rwd00] + vblendvpd xmm0, xmm0, xmm1, xmm0 + vcvtpd2ps xmm0, xmm0 vmovd eax, xmm0 - pop ecx ret +RWD00 dq 41F0000000000000h -; Total bytes of code 27 +; Total bytes of code 31 ```
1 parent 6bd4752 commit 3bd006f

File tree

6 files changed

+197

-142

lines changed

src/coreclr/jit/compiler.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6229,6 +6229,7 @@ class Compiler
62296229
void fgConvertBBToThrowBB(BasicBlock* block);
62306230

62316231
bool fgCastNeeded(GenTree* tree, var_types toType);
6232+
bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false);
62326233

62336234
void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB);
62346235
void fgLoopCallMark();

src/coreclr/jit/decomposelongs.cpp

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,8 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
138138
}
139139

140140
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
141+
// On x86, long->floating casts are implemented in DecomposeCast.
142+
// Those nodes, plus any nodes that produce a long, will be examined.
141143
if (!tree->TypeIs(TYP_LONG) &&
142144
!(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
143145
#else
@@ -159,6 +161,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
159161
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
160162
// Here we do a conservative check for specific cases where it is certain the load/store
161163
// can be contained. In those cases, we can skip decomposition.
164+
//
165+
// We also look for longs consumed directly by a long->floating cast. These can skip
166+
// decomposition because the cast is implemented using HWIntrinsics.
162167

163168
GenTree* user = use.User();
164169

@@ -589,21 +594,17 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
589594
// The sequence this creates is simply:
590595
// AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
591596

592-
NamedIntrinsic intrinsicId = NI_Illegal;
593-
GenTree* srcOp = cast->CastOp();
594-
var_types dstType = cast->CastToType();
595-
var_types baseFloatingType = (dstType == TYP_FLOAT) ? TYP_FLOAT : TYP_DOUBLE;
596-
var_types baseIntegralType = cast->IsUnsigned() ? TYP_ULONG : TYP_LONG;
597+
NamedIntrinsic intrinsicId = NI_Illegal;
598+
GenTree* srcOp = cast->CastOp();
597599

598600
assert(!cast->gtOverflow());
599601
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
600602

601603
intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
602604

603-
GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
604-
GenTree* convert =
605-
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
606-
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
605+
GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
606+
GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
607+
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
607608

608609
Range().InsertAfter(cast, createScalar, convert, toScalar);
609610
Range().Remove(cast);

src/coreclr/jit/flowgraph.cpp

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1269,6 +1269,44 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType)
12691269
return true;
12701270
}
12711271

1272+
//-------------------------------------------------------------------------------------
1273+
// fgCastRequiresHelper: Check whether a given cast must be converted to a helper call.
1274+
//
1275+
// Arguments:
1276+
// fromType - The source type of the cast.
1277+
// toType - The target type of the cast.
1278+
// overflow - True if the cast has the GTF_OVERFLOW flag set.
1279+
//
1280+
// Return Value:
1281+
// True if the cast requires a helper call, otherwise false.
1282+
//
1283+
bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */)
1284+
{
1285+
if (overflow && varTypeIsFloating(fromType))
1286+
{
1287+
assert(varTypeIsIntegral(toType));
1288+
return true;
1289+
}
1290+
1291+
#if defined(TARGET_X86) || defined(TARGET_ARM)
1292+
if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
1293+
{
1294+
return true;
1295+
}
1296+
1297+
if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
1298+
{
1299+
#if defined(TARGET_X86)
1300+
return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
1301+
#else
1302+
return true;
1303+
#endif // TARGET_X86
1304+
}
1305+
#endif // TARGET_X86 || TARGET_ARM
1306+
1307+
return false;
1308+
}
1309+
12721310
GenTree* Compiler::fgGetCritSectOfStaticMethod()
12731311
{
12741312
noway_assert(!compIsForInlining());

src/coreclr/jit/importer.cpp

Lines changed: 2 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -8127,28 +8127,8 @@ void Compiler::impImportBlockCode(BasicBlock* block)
81278127
goto _CONV;
81288128

81298129
_CONV:
8130-
// only converts from FLOAT or DOUBLE to an integer type
8131-
// and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls
8132-
8133-
if (varTypeIsFloating(lclTyp))
8134-
{
8135-
callNode = varTypeIsLong(impStackTop().val) ||
8136-
uns // uint->dbl gets turned into uint->long->dbl
8137-
#ifdef TARGET_64BIT
8138-
// TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK?
8139-
// TYP_BYREF could be used as TYP_I_IMPL which is long.
8140-
// TODO-CQ: remove this when we lower casts long/ulong --> float/double
8141-
// and generate SSE2 code instead of going through helper calls.
8142-
|| impStackTop().val->TypeIs(TYP_BYREF)
8143-
#endif
8144-
;
8145-
}
8146-
else
8147-
{
8148-
callNode = varTypeIsFloating(impStackTop().val->TypeGet());
8149-
}
8150-
8151-
op1 = impPopStack().val;
8130+
op1 = impPopStack().val;
8131+
callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl);
81528132

81538133
impBashVarAddrsToI(op1);
81548134

src/coreclr/jit/lowerxarch.cpp

Lines changed: 89 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -827,7 +827,7 @@ void Lowering::LowerCast(GenTree* tree)
827827

828828
GenTree* castOp = tree->AsCast()->CastOp();
829829
var_types dstType = tree->CastToType();
830-
var_types srcType = castOp->TypeGet();
830+
var_types srcType = genActualType(castOp);
831831

832832
// force the srcType to unsigned if GT_UNSIGNED flag is set
833833
if (tree->IsUnsigned())
@@ -844,12 +844,96 @@ void Lowering::LowerCast(GenTree* tree)
844844
// Long types should have been handled by helper call or in DecomposeLongs on x86.
845845
assert(!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit);
846846
}
847-
else if (srcType == TYP_UINT)
847+
848+
#ifdef TARGET_X86
849+
if ((srcType == TYP_UINT) && varTypeIsFloating(dstType) &&
850+
!m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
848851
{
849-
// uint->float casts should have an intermediate cast to long unless
850-
// we have the EVEX unsigned conversion instructions available.
851-
assert(dstType != TYP_FLOAT || m_compiler->canUseEvexEncodingDebugOnly());
852+
// Pre-AVX-512, there was no conversion instruction for uint->floating, so we emulate it
853+
// using signed int conversion. This is necessary only on 32-bit, because x64 simply casts
854+
// the uint up to a signed long before conversion.
855+
//
856+
// This logic depends on the fact that conversion from int to double is lossless. When
857+
// converting to float, we use a double intermediate, and convert to float only after the
858+
// double result is fixed up. This ensures the floating result is rounded correctly.
859+
860+
LABELEDDISPTREERANGE("LowerCast before", BlockRange(), tree);
861+
862+
LIR::Range castRange = LIR::EmptyRange();
863+
864+
// This creates the equivalent of the following C# code:
865+
// var castResult = Sse2.ConvertScalarToVector128Double(Vector128<double>.Zero, (int)castOp);
866+
867+
GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
868+
GenTree* castResult =
869+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, castOp, NI_X86Base_ConvertScalarToVector128Double,
870+
TYP_INT, 16);
871+
872+
castRange.InsertAtEnd(zero);
873+
castRange.InsertAtEnd(castResult);
874+
875+
// We will use the conversion result multiple times, so replace it with a lclVar.
876+
LIR::Use resUse;
877+
LIR::Use::MakeDummyUse(castRange, castResult, &resUse);
878+
resUse.ReplaceWithLclVar(m_compiler);
879+
castResult = resUse.Def();
880+
881+
// If the input had the MSB set, it will have converted as a negative, so we must wrap the
882+
// result back around to positive by adding 2^32. `blendvpd` uses only the MSB of the mask
883+
// element.
884+
//
885+
// This creates the equivalent of the following C# code:
886+
// var addRes = Sse2.AddScalar(castResult, Vector128.CreateScalar(4294967296.0));
887+
// castResult = Sse41.BlendVariable(castResult, addRes, castResult);
888+
889+
GenTreeVecCon* addCns = m_compiler->gtNewVconNode(TYP_SIMD16);
890+
addCns->gtSimdVal.f64[0] = 4294967296.0;
891+
892+
GenTree* addRes =
893+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult, addCns, NI_X86Base_AddScalar, TYP_DOUBLE, 16);
894+
895+
castRange.InsertAtEnd(addCns);
896+
castRange.InsertAtEnd(addRes);
897+
898+
GenTree* resClone1 = m_compiler->gtClone(castResult);
899+
GenTree* resClone2 = m_compiler->gtClone(castResult);
900+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, resClone1, addRes, resClone2,
901+
NI_X86Base_BlendVariable, TYP_DOUBLE, 16);
902+
castRange.InsertAtEnd(resClone1);
903+
castRange.InsertAtEnd(resClone2);
904+
castRange.InsertAtEnd(castResult);
905+
906+
// Convert to float if necessary, then ToScalar() the result out.
907+
if (dstType == TYP_FLOAT)
908+
{
909+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult,
910+
NI_X86Base_ConvertToVector128Single, TYP_DOUBLE, 16);
911+
castRange.InsertAtEnd(castResult);
912+
}
913+
914+
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, castResult, dstType, 16);
915+
castRange.InsertAtEnd(toScalar);
916+
917+
LIR::ReadOnlyRange lowerRange(castRange.FirstNode(), castRange.LastNode());
918+
BlockRange().InsertBefore(tree, std::move(castRange));
919+
920+
LABELEDDISPTREERANGE("LowerCast after", BlockRange(), toScalar);
921+
922+
LIR::Use castUse;
923+
if (BlockRange().TryGetUse(tree, &castUse))
924+
{
925+
castUse.ReplaceWith(toScalar);
926+
}
927+
else
928+
{
929+
toScalar->SetUnusedValue();
930+
}
931+
932+
BlockRange().Remove(tree);
933+
LowerRange(lowerRange);
934+
return;
852935
}
936+
#endif // TARGET_X86
853937

854938
if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType))
855939
{

0 commit comments

Comments (0)