From 713e0ffca70474390d7247cfaba50edf4b9d41a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Fri, 26 Jul 2024 09:14:11 +0200 Subject: [PATCH 1/2] Revert "Ensure that WaitForPendingFinalizers has seen the expected Full GC count (#105289)" This reverts commit 54a9efd92de0f776dcec4711b29601a1ff159223. --- .../src/System/Runtime/InternalCalls.cs | 2 +- .../src/System/Runtime/__Finalizer.cs | 9 ++-- .../nativeaot/Runtime/FinalizerHelpers.cpp | 47 +++++------------ .../src/System/Runtime/RuntimeImports.cs | 10 ---- src/coreclr/vm/finalizerthread.cpp | 34 ++----------- src/coreclr/vm/finalizerthread.h | 2 +- .../System.Runtime.Tests/System/GCTests.cs | 50 ------------------- 7 files changed, 22 insertions(+), 132 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs index 2237b50350835f..7ea73ba7c2c387 100644 --- a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs +++ b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs @@ -276,7 +276,7 @@ internal static extern unsafe IntPtr RhpCallPropagateExceptionCallback( // Indicate that the current round of finalizations is complete. 
[DllImport(Redhawk.BaseName)] - internal static extern void RhpSignalFinalizationComplete(uint fCount, int observedFullGcCount); + internal static extern void RhpSignalFinalizationComplete(uint fCount); [DllImport(Redhawk.BaseName)] internal static extern ulong RhpGetTickCount64(); diff --git a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs index 80576c921f8a20..4e695601f19450 100644 --- a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs +++ b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs @@ -29,14 +29,11 @@ public static void ProcessFinalizers() // otherwise memory is low and we should initiate a collection. if (InternalCalls.RhpWaitForFinalizerRequest() != 0) { - int observedFullGcCount = RuntimeImports.RhGetGcCollectionCount(RuntimeImports.RhGetMaxGcGeneration(), false); uint finalizerCount = DrainQueue(); - // Anyone waiting to drain the Q can now wake up. Note that there is a - // race in that another thread starting a drain, as we leave a drain, may - // consider itself satisfied by the drain that just completed. - // Thus we include the Full GC count that we have certaily observed. - InternalCalls.RhpSignalFinalizationComplete(finalizerCount, observedFullGcCount); + // Tell anybody that's interested that the finalization pass is complete (there is a race condition here + // where we might immediately signal a new request as complete, but this is acceptable). 
+ InternalCalls.RhpSignalFinalizationComplete(finalizerCount); } else { diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index b0f9eb0db5aa99..8fa60538189697 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -94,22 +94,6 @@ EXTERN_C void QCALLTYPE RhInitializeFinalizerThread() g_FinalizerEvent.Set(); } -static int32_t g_fullGcCountSeenByFinalization; - -// Indicate that the current round of finalizations is complete. -EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount, int32_t observedFullGcCount) -{ - FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId()); - - g_fullGcCountSeenByFinalization = observedFullGcCount; - g_FinalizerDoneEvent.Set(); - - if (YieldProcessorNormalization::IsMeasurementScheduled()) - { - YieldProcessorNormalization::PerformMeasurement(); - } -} - EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWait) { // This must be called via p/invoke rather than RuntimeImport since it blocks and could starve the GC if @@ -119,14 +103,6 @@ EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWai // Can't call this from the finalizer thread itself. if (ThreadStore::GetCurrentThread() != g_pFinalizerThread) { - // We may see a completion of finalization cycle that might not see objects that became - // F-reachable in recent GCs. In such case we want to wait for a completion of another cycle. - // However, since an object cannot be prevented from promoting, one can only rely on Full GCs - // to collect unreferenced objects deterministically. Thus we only care about Full GCs here. 
- int desiredFullGcCount = - GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration()); - - tryAgain: // Clear any current indication that a finalization pass is finished and wake the finalizer thread up // (if there's no work to do it'll set the done event immediately). g_FinalizerDoneEvent.Reset(); @@ -134,17 +110,6 @@ EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWai // Wait for the finalizer thread to get back to us. g_FinalizerDoneEvent.Wait(INFINITE, false, allowReentrantWait); - - // we use unsigned math here as the collection counts, which are size_t internally, - // can in theory overflow an int and wrap around. - // unsigned math would have more defined/portable behavior in such case - if ((int)((unsigned int)desiredFullGcCount - (unsigned int)g_fullGcCountSeenByFinalization) > 0) - { - // There were some Full GCs happening before we started waiting and possibly not seen by the - // last finalization cycle. This is rare, but we need to be sure we have seen those, - // so we try one more time. - goto tryAgain; - } } } @@ -211,6 +176,18 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest() } while (true); } +// Indicate that the current round of finalizations is complete. +EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount) +{ + FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId()); + g_FinalizerDoneEvent.Set(); + + if (YieldProcessorNormalization::IsMeasurementScheduled()) + { + YieldProcessorNormalization::PerformMeasurement(); + } +} + // // The following helpers are special in that they interact with internal GC state or directly manipulate // managed references so they're called with a special co-operative p/invoke. 
diff --git a/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs b/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs index 70b0cda4d2f6ac..4751e40da3b24a 100644 --- a/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs +++ b/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs @@ -104,15 +104,5 @@ internal static IntPtr RhGetModuleSection(TypeManagerHandle module, ReadyToRunSe [MethodImplAttribute(MethodImplOptions.InternalCall)] [RuntimeImport(RuntimeLibrary, "RhBulkMoveWithWriteBarrier")] internal static extern unsafe void RhBulkMoveWithWriteBarrier(ref byte dmem, ref byte smem, nuint size); - - // Get maximum GC generation number. - [MethodImplAttribute(MethodImplOptions.InternalCall)] - [RuntimeImport(RuntimeLibrary, "RhGetMaxGcGeneration")] - internal static extern int RhGetMaxGcGeneration(); - - // Get count of collections so far. - [MethodImplAttribute(MethodImplOptions.InternalCall)] - [RuntimeImport(RuntimeLibrary, "RhGetGcCollectionCount")] - internal static extern int RhGetGcCollectionCount(int generation, bool getSpecialGCCount); } } diff --git a/src/coreclr/vm/finalizerthread.cpp b/src/coreclr/vm/finalizerthread.cpp index 97ace9a32353b8..e543a3c60c3462 100644 --- a/src/coreclr/vm/finalizerthread.cpp +++ b/src/coreclr/vm/finalizerthread.cpp @@ -404,15 +404,13 @@ VOID FinalizerThread::FinalizerThreadWorker(void *args) } LOG((LF_GC, LL_INFO100, "***** Calling Finalizers\n")); - int observedFullGcCount = - GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration()); FinalizeAllObjects(); // Anyone waiting to drain the Q can now wake up. Note that there is a // race in that another thread starting a drain, as we leave a drain, may - // consider itself satisfied by the drain that just completed. - // Thus we include the Full GC count that we have certaily observed. 
- SignalFinalizationDone(observedFullGcCount); + // consider itself satisfied by the drain that just completed. This is + // acceptable. + SignalFinalizationDone(); } if (s_InitializedFinalizerThreadForPlatform) @@ -540,13 +538,10 @@ void FinalizerThread::FinalizerThreadCreate() } } -static int g_fullGcCountSeenByFinalization; - -void FinalizerThread::SignalFinalizationDone(int observedFullGcCount) +void FinalizerThread::SignalFinalizationDone() { WRAPPER_NO_CONTRACT; - g_fullGcCountSeenByFinalization = observedFullGcCount; hEventFinalizerDone->Set(); } @@ -560,13 +555,6 @@ void FinalizerThread::FinalizerThreadWait() // Can't call this from within a finalized method. if (!IsCurrentThreadFinalizer()) { - // We may see a completion of finalization cycle that might not see objects that became - // F-reachable in recent GCs. In such case we want to wait for a completion of another cycle. - // However, since an object cannot be prevented from promoting, one can only rely on Full GCs - // to collect unreferenced objects deterministically. Thus we only care about Full GCs here. - int desiredFullGcCount = - GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration()); - GCX_PREEMP(); #ifdef FEATURE_COMINTEROP @@ -577,8 +565,8 @@ void FinalizerThread::FinalizerThreadWait() g_pRCWCleanupList->CleanupWrappersInCurrentCtxThread(); #endif // FEATURE_COMINTEROP - tryAgain: hEventFinalizerDone->Reset(); + EnableFinalization(); // Under GC stress the finalizer queue may never go empty as frequent @@ -592,18 +580,6 @@ void FinalizerThread::FinalizerThreadWait() DWORD status; status = hEventFinalizerDone->Wait(INFINITE,TRUE); - - // we use unsigned math here as the collection counts, which are size_t internally, - // can in theory overflow an int and wrap around. 
- // unsigned math would have more defined/portable behavior in such case - if ((int)((unsigned int)desiredFullGcCount - (unsigned int)g_fullGcCountSeenByFinalization) > 0) - { - // There were some Full GCs happening before we started waiting and possibly not seen by the - // last finalization cycle. This is rare, but we need to be sure we have seen those, - // so we try one more time. - goto tryAgain; - } - _ASSERTE(status == WAIT_OBJECT_0); } } diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h index 03aae7b4e9cf6d..b254773883ab8c 100644 --- a/src/coreclr/vm/finalizerthread.h +++ b/src/coreclr/vm/finalizerthread.h @@ -67,7 +67,7 @@ class FinalizerThread static void FinalizerThreadWait(); - static void SignalFinalizationDone(int observedFullGcCount); + static void SignalFinalizationDone(); static VOID FinalizerThreadWorker(void *args); static DWORD WINAPI FinalizerThreadStart(void *args); diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs index ce029fc637284e..137c1dc8246a10 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs @@ -7,7 +7,6 @@ using System.Runtime.InteropServices; using System.Diagnostics; using System.Threading; -using System.Threading.Tasks; using System.Runtime; using Microsoft.DotNet.RemoteExecutor; using Xunit; @@ -293,55 +292,6 @@ private class TestObject } } - [OuterLoop] - [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsPreciseGcSupported))] - public static void WaitForPendingFinalizersRaces() - { - Task.Run(Test); - Task.Run(Test); - Task.Run(Test); - Task.Run(Test); - Task.Run(Test); - Task.Run(Test); - Test(); - - static void Test() - { - for (int i = 0; i < 20000; i++) - { - BoxedFinalized flag = new BoxedFinalized(); - MakeAndNull(flag); - GC.Collect(); - 
GC.WaitForPendingFinalizers(); - Assert.True(flag.finalized); - } - } - - [MethodImpl(MethodImplOptions.NoInlining)] - static void MakeAndNull(BoxedFinalized flag) - { - var deadObj = new TestObjectWithFinalizer(flag); - // it's dead here - }; - } - - class BoxedFinalized - { - public bool finalized; - } - - class TestObjectWithFinalizer - { - BoxedFinalized _flag; - - public TestObjectWithFinalizer(BoxedFinalized flag) - { - _flag = flag; - } - - ~TestObjectWithFinalizer() => _flag.finalized = true; - } - [Fact] public static void SuppressFinalizer_NullObject_ThrowsArgumentNullException() { From 5208698d019c6f24a144c0261b02ae36dbab44dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Fri, 26 Jul 2024 09:14:20 +0200 Subject: [PATCH 2/2] Revert "Port yield normalization from CoreCLR to Native AOT (#103675)" This reverts commit d35f3021b91d67eeac232a0370c6efb6c256f060. --- src/coreclr/gc/env/gcenv.os.h | 6 + src/coreclr/inc/yieldprocessornormalized.h | 39 +- src/coreclr/nativeaot/Runtime/Crst.h | 1 + .../nativeaot/Runtime/FinalizerHelpers.cpp | 8 +- .../eventpipe/gen-eventing-event-inc.lst | 1 - src/coreclr/nativeaot/Runtime/startup.cpp | 2 + .../Runtime/windows/PalRedhawkInline.h | 20 - .../Runtime/yieldprocessornormalized.cpp | 102 +++++- .../Runtime/yieldprocessornormalized.h | 228 +++++++++++- .../utilcode/yieldprocessornormalized.cpp | 1 + src/coreclr/vm/yieldprocessornormalized.cpp | 294 ++++++++++++++- .../vm/yieldprocessornormalizedshared.cpp | 341 ------------------ 12 files changed, 653 insertions(+), 390 deletions(-) delete mode 100644 src/coreclr/vm/yieldprocessornormalizedshared.cpp diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h index aa7223850eaa9b..01ed27dac3e59b 100644 --- a/src/coreclr/gc/env/gcenv.os.h +++ b/src/coreclr/gc/env/gcenv.os.h @@ -6,6 +6,12 @@ #ifndef __GCENV_OS_H__ #define __GCENV_OS_H__ +#ifdef HAS_SYSTEM_YIELDPROCESSOR +// YieldProcessor is defined to Dont_Use_YieldProcessor. 
Restore it to the system-default implementation for the GC. +#undef YieldProcessor +#define YieldProcessor System_YieldProcessor +#endif + #define NUMA_NODE_UNDEFINED UINT16_MAX bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index); diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index e37bf79f0c5089..121e60b033356d 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -3,11 +3,14 @@ #pragma once -#ifdef FEATURE_NATIVEAOT -FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } -#else +// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where +// the intention is to use the system-default implementation of YieldProcessor(). +#define HAS_SYSTEM_YIELDPROCESSOR FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } +#ifdef YieldProcessor +#undef YieldProcessor #endif +#define YieldProcessor Dont_Use_YieldProcessor #define DISABLE_COPY(T) \ T(const T &) = delete; \ @@ -141,9 +144,9 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo { _ASSERTE(count != 0); - if (sizeof(size_t) <= sizeof(unsigned int)) + if (sizeof(SIZE_T) <= sizeof(unsigned int)) { - // On platforms with a small size_t, prevent overflow on the multiply below + // On platforms with a small SIZE_T, prevent overflow on the multiply below const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield; if (count > MaxCount) { @@ -151,7 +154,7 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo } } - size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield; + SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; _ASSERTE(n != 0); do { @@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount( { _ASSERTE(preSkylakeCount != 0); - if 
(sizeof(size_t) <= sizeof(unsigned int)) + if (sizeof(SIZE_T) <= sizeof(unsigned int)) { - // On platforms with a small size_t, prevent overflow on the multiply below + // On platforms with a small SIZE_T, prevent overflow on the multiply below const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield; if (preSkylakeCount > MaxCount) { @@ -197,7 +200,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount( } const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; - size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; + SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; if (n == 0) { n = 1; @@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl _ASSERTE(preSkylakeCount != 0); - if (sizeof(size_t) <= sizeof(unsigned int)) + if (sizeof(SIZE_T) <= sizeof(unsigned int)) { - // On platforms with a small size_t, prevent overflow on the multiply below + // On platforms with a small SIZE_T, prevent overflow on the multiply below const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield; if (preSkylakeCount > MaxCount) { @@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl } const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; - size_t n = - (size_t)preSkylakeCount * + SIZE_T n = + (SIZE_T)preSkylakeCount * YieldProcessorNormalization::s_yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; if (n == 0) @@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized( unsigned int spinIteration) { // This shift value should be adjusted based on the asserted conditions below - const uint8_t MaxShift = 3; - static_assert( - ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, 
""); - static_assert( - ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); + const UINT8 MaxShift = 3; + static_assert_no_msg( + ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration); + static_assert_no_msg( + ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration); unsigned int n; if (spinIteration <= MaxShift && diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h index 4ab9db08e0f5e3..31bf8fde9eec8a 100644 --- a/src/coreclr/nativeaot/Runtime/Crst.h +++ b/src/coreclr/nativeaot/Runtime/Crst.h @@ -20,6 +20,7 @@ enum CrstType CrstRestrictedCallouts, CrstGcStressControl, CrstThreadStore, + CrstYieldProcessorNormalized, CrstEventPipe, CrstEventPipeConfig, CrstGcEvent, diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index 8fa60538189697..dd9f1e096842fb 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext) g_pFinalizerThread = PTR_Thread(pThread); + // We have some time until the first finalization request - use the time to calibrate normalized waits. + EnsureYieldProcessorNormalizedInitialized(); + // Wait for a finalization request. 
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE); ASSERT(uResult == WAIT_OBJECT_0); @@ -181,11 +184,6 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount) { FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId()); g_FinalizerDoneEvent.Set(); - - if (YieldProcessorNormalization::IsMeasurementScheduled()) - { - YieldProcessorNormalization::PerformMeasurement(); - } } // diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst index 0f4c932719a399..901af659ff84b6 100644 --- a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst +++ b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst @@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount ThreadRunning WaitHandleWaitStart WaitHandleWaitStop -YieldProcessorMeasurement diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp index af835018e1823a..db2802dcb115ef 100644 --- a/src/coreclr/nativeaot/Runtime/startup.cpp +++ b/src/coreclr/nativeaot/Runtime/startup.cpp @@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance) #endif #endif // !USE_PORTABLE_HELPERS + InitializeYieldProcessorNormalizedCrst(); + #ifdef STRESS_LOG uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize(); uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel(); diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h index 1f2a74dcd15100..187ad26fb8bf11 100644 --- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h +++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h @@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD return _InterlockedCompareExchange64(pDst, iValue, iComparand); } -#ifdef HOST_X86 -FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile 
*pDst, int64_t iValue) -{ - int64_t iOldValue; - do { - iOldValue = *pDst; - } while (PalInterlockedCompareExchange64(pDst, - iValue, - iOldValue) != iOldValue); - return iOldValue; -} -#else // HOST_X86 -EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t); -#pragma intrinsic(_InterlockedExchange64) -FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue) -{ - return _InterlockedExchange64(pDst, iValue); -} -#endif // HOST_X86 - #if defined(HOST_AMD64) || defined(HOST_ARM64) EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *); #pragma intrinsic(_InterlockedCompareExchange128) diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp index efaf4e8bb20704..444d52b0114c03 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp @@ -15,6 +15,104 @@ #include "volatile.h" #include "yieldprocessornormalized.h" -#include "../../utilcode/yieldprocessornormalized.cpp" +#define ULONGLONG int64_t -#include "../../vm/yieldprocessornormalizedshared.cpp" +static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false; +static CrstStatic s_initializeYieldProcessorNormalizedCrst; + +// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are +// tuned for Skylake processors +unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake +unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; + +void InitializeYieldProcessorNormalizedCrst() +{ + WRAPPER_NO_CONTRACT; + s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized); +} + +static void InitializeYieldProcessorNormalized() +{ + WRAPPER_NO_CONTRACT; + + CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); + + if 
(s_isYieldProcessorNormalizedInitialized) + { + return; + } + + // Intel pre-Skylake processor: measured typically 14-17 cycles per yield + // Intel post-Skylake processor: measured typically 125-150 cycles per yield + const int MeasureDurationMs = 10; + const int NsPerSecond = 1000 * 1000 * 1000; + + ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency(); + + if (ticksPerSecond < 1000 / MeasureDurationMs) + { + // High precision clock not available or clock resolution is too low, resort to defaults + s_isYieldProcessorNormalizedInitialized = true; + return; + } + + // Measure the nanosecond delay per yield + ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs); + unsigned int yieldCount = 0; + ULONGLONG startTicks = PalQueryPerformanceCounter(); + ULONGLONG elapsedTicks; + do + { + // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask + // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the + // low microsecond range. + for (int i = 0; i < 1000; ++i) + { + System_YieldProcessor(); + } + yieldCount += 1000; + + ULONGLONG nowTicks = PalQueryPerformanceCounter(); + elapsedTicks = nowTicks - startTicks; + } while (elapsedTicks < measureDurationTicks); + double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond); + if (nsPerYield < 1) + { + nsPerYield = 1; + } + + // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this + // value is naturally limited to MinNsPerNormalizedYield. + int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5); + if (yieldsPerNormalizedYield < 1) + { + yieldsPerNormalizedYield = 1; + } + _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield); + + // Calculate the maximum number of yields that would be optimal for a late spin iteration. 
Typically, we would not want to + // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a + // better job of allowing other work to run. + int optimalMaxNormalizedYieldsPerSpinIteration = + (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); + if (optimalMaxNormalizedYieldsPerSpinIteration < 1) + { + optimalMaxNormalizedYieldsPerSpinIteration = 1; + } + + g_yieldsPerNormalizedYield = yieldsPerNormalizedYield; + g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; + s_isYieldProcessorNormalizedInitialized = true; + + GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); +} + +void EnsureYieldProcessorNormalizedInitialized() +{ + WRAPPER_NO_CONTRACT; + + if (!s_isYieldProcessorNormalizedInitialized) + { + InitializeYieldProcessorNormalized(); + } +} diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h index 5539ebf90561bc..8c74bf3cfe3002 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h @@ -1,5 +1,229 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#include "PalRedhawk.h" -#include "../../inc/yieldprocessornormalized.h" +#pragma once + +#include <stdint.h> + +// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where +// the intention is to use the system-default implementation of YieldProcessor(). 
+#define HAS_SYSTEM_YIELDPROCESSOR +FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } +#ifdef YieldProcessor +#undef YieldProcessor +#endif +#define YieldProcessor Dont_Use_YieldProcessor +#ifdef PalYieldProcessor +#undef PalYieldProcessor +#endif +#define PalYieldProcessor Dont_Use_PalYieldProcessor + +#define SIZE_T uintptr_t + +const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake +const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake + +extern unsigned int g_yieldsPerNormalizedYield; +extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration; + +void InitializeYieldProcessorNormalizedCrst(); +void EnsureYieldProcessorNormalizedInitialized(); + +class YieldProcessorNormalizationInfo +{ +private: + unsigned int yieldsPerNormalizedYield; + unsigned int optimalMaxNormalizedYieldsPerSpinIteration; + unsigned int optimalMaxYieldsPerSpinIteration; + +public: + YieldProcessorNormalizationInfo() + : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield), + optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration), + optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) + { + } + + friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); + friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int); + friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int); + friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int); +}; + +// See YieldProcessorNormalized() for preliminary info. 
Typical usage: +// if (!condition) +// { +// YieldProcessorNormalizationInfo normalizationInfo; +// do +// { +// YieldProcessorNormalized(normalizationInfo); +// } while (!condition); +// } +FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo) +{ + unsigned int n = normalizationInfo.yieldsPerNormalizedYield; + _ASSERTE(n != 0); + do + { + System_YieldProcessor(); + } while (--n != 0); +} + +// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the +// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following: +// - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value +// for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessary increase CPU usage +// and decrease scalability of the operation. +// while(!condition) +// { +// YieldProcessorNormalized(); +// } +// - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the +// condition, otherwise it may unnecessarily increase latency of the operation +// - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in +// yield count per iteration for each failed check of the condition, the progression can significantly magnify the second +// issue above on later iterations. +// - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each +// issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using +// System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method. 
+FORCEINLINE void YieldProcessorNormalized() +{ + YieldProcessorNormalized(YieldProcessorNormalizationInfo()); +} + +// See YieldProcessorNormalized(count) for preliminary info. Typical usage: +// if (!moreExpensiveCondition) +// { +// YieldProcessorNormalizationInfo normalizationInfo; +// do +// { +// YieldProcessorNormalized(normalizationInfo, 2); +// } while (!moreExpensiveCondition); +// } +FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count) +{ + _ASSERTE(count != 0); + + if (sizeof(SIZE_T) <= sizeof(unsigned int)) + { + // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield + // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized(). + const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield; + if (count > MaxCount) + { + count = MaxCount; + } + } + + SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; + _ASSERTE(n != 0); + do + { + System_YieldProcessor(); + } while (--n != 0); +} + +// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is +// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage: +// while(!moreExpensiveCondition) +// { +// YieldProcessorNormalized(2); +// } +FORCEINLINE void YieldProcessorNormalized(unsigned int count) +{ + YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count); +} + +// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary +// info. 
Typical usage: +// if (!condition) +// { +// YieldProcessorNormalizationInfo normalizationInfo; +// do +// { +// YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100); +// } while (!condition); +// } +FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount( + const YieldProcessorNormalizationInfo &normalizationInfo, + unsigned int preSkylakeCount) +{ + _ASSERTE(preSkylakeCount != 0); + + if (sizeof(SIZE_T) <= sizeof(unsigned int)) + { + // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield + // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized(). + const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield; + if (preSkylakeCount > MaxCount) + { + preSkylakeCount = MaxCount; + } + } + + const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; + SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; + if (n == 0) + { + n = 1; + } + do + { + System_YieldProcessor(); + } while (--n != 0); +} + +// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned +// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in +// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a +// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage: +// while(!condition) +// { +// YieldProcessorNormalizedForPreSkylakeCount(100); +// } +FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount) +{ + YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount); +} + +// See YieldProcessorNormalized() for preliminary info. 
This function is to be used when there is a decent possibility that the +// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait +// iteration exponentially up to a limit. Typical usage: +// if (!conditionThatMayNotBeSatisfiedSoon) +// { +// YieldProcessorNormalizationInfo normalizationInfo; +// do +// { +// YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration++); // spinIteration starts at 0; maybe Sleep(0) occasionally +// } while (!conditionThatMayNotBeSatisfiedSoon); +// } +FORCEINLINE void YieldProcessorWithBackOffNormalized( + const YieldProcessorNormalizationInfo &normalizationInfo, + unsigned int spinIteration) +{ + // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in + // InitializeYieldProcessorNormalized() + const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration = + NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1; + _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); + + // This shift value should be adjusted based on the asserted condition below + const uint8_t MaxShift = 3; + static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); + + unsigned int n; + if (spinIteration <= MaxShift && + ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration) + { + n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; + } + else + { + n = normalizationInfo.optimalMaxYieldsPerSpinIteration; + } + _ASSERTE(n != 0); + do + { + System_YieldProcessor(); + } while (--n != 0); +} diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp index c6aaaa19557fa7..020d8d7cc79e4e 100644 --- a/src/coreclr/utilcode/yieldprocessornormalized.cpp +++ 
b/src/coreclr/utilcode/yieldprocessornormalized.cpp @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#include "stdafx.h" #include "yieldprocessornormalized.h" bool YieldProcessorNormalization::s_isMeasurementScheduled; diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp index 258e30d634c7ce..14166de34dd641 100644 --- a/src/coreclr/vm/yieldprocessornormalized.cpp +++ b/src/coreclr/vm/yieldprocessornormalized.cpp @@ -7,4 +7,296 @@ #include "finalizerthread.h" -#include "yieldprocessornormalizedshared.cpp" +enum class NormalizationState : UINT8 +{ + Uninitialized, + Initialized, + Failed +}; + +static const int NsPerYieldMeasurementCount = 8; +static const unsigned int MeasurementPeriodMs = 4000; + +static const unsigned int NsPerS = 1000 * 1000 * 1000; + +static NormalizationState s_normalizationState = NormalizationState::Uninitialized; +static unsigned int s_previousNormalizationTimeMs; + +static UINT64 s_performanceCounterTicksPerS; +static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; +static int s_nextMeasurementIndex; +static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; + +static unsigned int DetermineMeasureDurationUs() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_PREEMPTIVE; + } + CONTRACTL_END; + + _ASSERTE(s_normalizationState != NormalizationState::Failed); + + // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration + // if the overhead seems high relative to the measure duration. 
+ unsigned int measureDurationUs = 1; + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + UINT64 startTicks = li.QuadPart; + QueryPerformanceCounter(&li); + UINT64 elapsedTicks = li.QuadPart - startTicks; + if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration + { + measureDurationUs *= 4; + } + return measureDurationUs; +} + +static double MeasureNsPerYield(unsigned int measureDurationUs) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_PREEMPTIVE; + } + CONTRACTL_END; + + _ASSERTE(s_normalizationState != NormalizationState::Failed); + + int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; + UINT64 ticksPerS = s_performanceCounterTicksPerS; + UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); + + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + UINT64 startTicks = li.QuadPart; + + for (int i = 0; i < yieldCount; ++i) + { + System_YieldProcessor(); + } + + QueryPerformanceCounter(&li); + UINT64 elapsedTicks = li.QuadPart - startTicks; + while (elapsedTicks < measureDurationTicks) + { + int nextYieldCount = + Max(4, + elapsedTicks == 0 + ? yieldCount / 4 + : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); + for (int i = 0; i < nextYieldCount; ++i) + { + System_YieldProcessor(); + } + + QueryPerformanceCounter(&li); + elapsedTicks = li.QuadPart - startTicks; + yieldCount += nextYieldCount; + } + + // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op + const double MinNsPerYield = 0.1; + + // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to + // really take this long. Limit the maximum to keep the recorded values reasonable. 
+ const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1; + + return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); +} + +void YieldProcessorNormalization::PerformMeasurement() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_PREEMPTIVE; + } + CONTRACTL_END; + + _ASSERTE(s_isMeasurementScheduled); + + double latestNsPerYield; + if (s_normalizationState == NormalizationState::Initialized) + { + if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) + { + return; + } + + int nextMeasurementIndex = s_nextMeasurementIndex; + latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs()); + AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield); + if (++nextMeasurementIndex >= NsPerYieldMeasurementCount) + { + nextMeasurementIndex = 0; + } + s_nextMeasurementIndex = nextMeasurementIndex; + } + else if (s_normalizationState == NormalizationState::Uninitialized) + { + LARGE_INTEGER li; + if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000) + { + // High precision clock not available or clock resolution is too low, resort to defaults + s_normalizationState = NormalizationState::Failed; + return; + } + s_performanceCounterTicksPerS = li.QuadPart; + + unsigned int measureDurationUs = DetermineMeasureDurationUs(); + for (int i = 0; i < NsPerYieldMeasurementCount; ++i) + { + latestNsPerYield = MeasureNsPerYield(measureDurationUs); + AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield); + if (i == 0 || latestNsPerYield < s_establishedNsPerYield) + { + AtomicStore(&s_establishedNsPerYield, latestNsPerYield); + } + + if (i < NsPerYieldMeasurementCount - 1) + { + FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); + } + } + } + else + { + _ASSERTE(s_normalizationState == NormalizationState::Failed); + return; + } + + double establishedNsPerYield = 
s_nsPerYieldMeasurements[0]; + for (int i = 1; i < NsPerYieldMeasurementCount; ++i) + { + double nsPerYield = s_nsPerYieldMeasurements[i]; + if (nsPerYield < establishedNsPerYield) + { + establishedNsPerYield = nsPerYield; + } + } + if (establishedNsPerYield != s_establishedNsPerYield) + { + AtomicStore(&s_establishedNsPerYield, establishedNsPerYield); + } + + FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); + + // Calculate the number of yields required to span the duration of a normalized yield + unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); + _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield); + s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; + + // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to + // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a + // better job of allowing other work to run. 
+ s_optimalMaxNormalizedYieldsPerSpinIteration = + Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); + _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); + + GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); + + s_previousNormalizationTimeMs = GetTickCount(); + s_normalizationState = NormalizationState::Initialized; + s_isMeasurementScheduled = false; +} + + +void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + } + CONTRACTL_END; + + NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState); + if (normalizationState == NormalizationState::Initialized) + { + if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) + { + return; + } + } + else if (normalizationState == NormalizationState::Uninitialized) + { + } + else + { + _ASSERTE(normalizationState == NormalizationState::Failed); + return; + } + + // The !g_fEEStarted check is required for FinalizerThread::EnableFinalization() below + if (s_isMeasurementScheduled || !g_fEEStarted) + { + return; + } + + s_isMeasurementScheduled = true; + FinalizerThread::EnableFinalization(); +} + + +void YieldProcessorNormalization::FireMeasurementEvents() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + } + CONTRACTL_END; + + if (!EventEnabledYieldProcessorMeasurement()) + { + return; + } + + // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the + // recorded information, so try to enumerate the array with some care. 
+ double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield); + int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); + for (int i = 0; i < NsPerYieldMeasurementCount; ++i) + { + double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]); + if (nsPerYield != 0) // the array may not be fully initialized yet + { + FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield); + } + + if (++nextIndex >= NsPerYieldMeasurementCount) + { + nextIndex = 0; + } + } +} + +double YieldProcessorNormalization::AtomicLoad(double *valueRef) +{ + WRAPPER_NO_CONTRACT; + +#ifdef TARGET_64BIT + return VolatileLoadWithoutBarrier(valueRef); +#else + return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); +#endif +} + +void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) +{ + WRAPPER_NO_CONTRACT; + +#ifdef TARGET_64BIT + *valueRef = value; +#else + InterlockedExchangeT(valueRef, value); +#endif +} + diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp deleted file mode 100644 index 05daee21947376..00000000000000 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ /dev/null @@ -1,341 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -enum class NormalizationState : uint8_t -{ - Uninitialized, - Initialized, - Failed -}; - -static const int NsPerYieldMeasurementCount = 8; -static const unsigned int MeasurementPeriodMs = 4000; - -static const unsigned int NsPerS = 1000 * 1000 * 1000; - -static NormalizationState s_normalizationState = NormalizationState::Uninitialized; -static unsigned int s_previousNormalizationTimeMs; - -static uint64_t s_performanceCounterTicksPerS; -static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; -static int s_nextMeasurementIndex; -static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; - -void RhEnableFinalization(); - -inline unsigned int GetTickCountPortable() -{ -#ifdef FEATURE_NATIVEAOT - return (unsigned int)PalGetTickCount64(); -#else - return GetTickCount(); -#endif -} - -static uint64_t GetPerformanceCounter() -{ -#ifdef FEATURE_NATIVEAOT - return PalQueryPerformanceCounter(); -#else - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - return li.QuadPart; -#endif -} - -static unsigned int DetermineMeasureDurationUs() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; -#ifndef FEATURE_NATIVEAOT - MODE_PREEMPTIVE; -#endif - } - CONTRACTL_END; - - _ASSERTE(s_normalizationState != NormalizationState::Failed); - - // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration - // if the overhead seems high relative to the measure duration. 
- unsigned int measureDurationUs = 1; - uint64_t startTicks = GetPerformanceCounter(); - uint64_t elapsedTicks = GetPerformanceCounter() - startTicks; - if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration - { - measureDurationUs *= 4; - } - return measureDurationUs; -} - -static double MeasureNsPerYield(unsigned int measureDurationUs) -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; -#ifndef FEATURE_NATIVEAOT - MODE_PREEMPTIVE; -#endif - } - CONTRACTL_END; - - _ASSERTE(s_normalizationState != NormalizationState::Failed); - - int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; - uint64_t ticksPerS = s_performanceCounterTicksPerS; - uint64_t measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); - - uint64_t startTicks = GetPerformanceCounter(); - - for (int i = 0; i < yieldCount; ++i) - { - System_YieldProcessor(); - } - - uint64_t elapsedTicks = GetPerformanceCounter() - startTicks; - while (elapsedTicks < measureDurationTicks) - { - int nextYieldCount = - max(4, - elapsedTicks == 0 - ? yieldCount / 4 - : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); - for (int i = 0; i < nextYieldCount; ++i) - { - System_YieldProcessor(); - } - - elapsedTicks = GetPerformanceCounter() - startTicks; - yieldCount += nextYieldCount; - } - - // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op - const double MinNsPerYield = 0.1; - - // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to - // really take this long. Limit the maximum to keep the recorded values reasonable. 
- const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1; - - return max(MinNsPerYield, min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); -} - -void YieldProcessorNormalization::PerformMeasurement() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; -#ifndef FEATURE_NATIVEAOT - MODE_PREEMPTIVE; -#endif - } - CONTRACTL_END; - - _ASSERTE(s_isMeasurementScheduled); - - double latestNsPerYield; - if (s_normalizationState == NormalizationState::Initialized) - { - if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs) - { - return; - } - - int nextMeasurementIndex = s_nextMeasurementIndex; - latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs()); - AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield); - if (++nextMeasurementIndex >= NsPerYieldMeasurementCount) - { - nextMeasurementIndex = 0; - } - s_nextMeasurementIndex = nextMeasurementIndex; - } - else if (s_normalizationState == NormalizationState::Uninitialized) - { -#ifdef FEATURE_NATIVEAOT - if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000) -#else - LARGE_INTEGER li; - if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000) -#endif - { - // High precision clock not available or clock resolution is too low, resort to defaults - s_normalizationState = NormalizationState::Failed; - return; - } - -#ifndef FEATURE_NATIVEAOT - s_performanceCounterTicksPerS = li.QuadPart; -#endif - - unsigned int measureDurationUs = DetermineMeasureDurationUs(); - for (int i = 0; i < NsPerYieldMeasurementCount; ++i) - { - latestNsPerYield = MeasureNsPerYield(measureDurationUs); - AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield); - if (i == 0 || latestNsPerYield < s_establishedNsPerYield) - { - AtomicStore(&s_establishedNsPerYield, latestNsPerYield); - } - if (i < NsPerYieldMeasurementCount - 1) - { - 
FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); - } - } - } - else - { - _ASSERTE(s_normalizationState == NormalizationState::Failed); - return; - } - - double establishedNsPerYield = s_nsPerYieldMeasurements[0]; - for (int i = 1; i < NsPerYieldMeasurementCount; ++i) - { - double nsPerYield = s_nsPerYieldMeasurements[i]; - if (nsPerYield < establishedNsPerYield) - { - establishedNsPerYield = nsPerYield; - } - } - if (establishedNsPerYield != s_establishedNsPerYield) - { - AtomicStore(&s_establishedNsPerYield, establishedNsPerYield); - } - - FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); - - // Calculate the number of yields required to span the duration of a normalized yield - unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); - _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield); - s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; - - // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to - // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a - // better job of allowing other work to run. 
- s_optimalMaxNormalizedYieldsPerSpinIteration = - max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); - _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); - - GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); - - s_previousNormalizationTimeMs = GetTickCountPortable(); - s_normalizationState = NormalizationState::Initialized; - s_isMeasurementScheduled = false; -} - - -void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_ANY; - } - CONTRACTL_END; - - NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState); - if (normalizationState == NormalizationState::Initialized) - { - if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs) - { - return; - } - } - else if (normalizationState == NormalizationState::Uninitialized) - { - } - else - { - _ASSERTE(normalizationState == NormalizationState::Failed); - return; - } - -#ifdef FEATURE_NATIVEAOT - if (s_isMeasurementScheduled) -#else - // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below - if (s_isMeasurementScheduled || !g_fEEStarted) -#endif - { - return; - } - - s_isMeasurementScheduled = true; -#ifdef FEATURE_NATIVEAOT - RhEnableFinalization(); -#else - FinalizerThread::EnableFinalization(); -#endif -} - -void YieldProcessorNormalization::FireMeasurementEvents() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_ANY; - } - CONTRACTL_END; - - if (!EventEnabledYieldProcessorMeasurement()) - { - return; - } - - // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the - // recorded information, so try to enumerate the array with some care. 
- double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield); - int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); - for (int i = 0; i < NsPerYieldMeasurementCount; ++i) - { - double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]); - if (nsPerYield != 0) // the array may not be fully initialized yet - { - FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield); - } - - if (++nextIndex >= NsPerYieldMeasurementCount) - { - nextIndex = 0; - } - } -} - -double YieldProcessorNormalization::AtomicLoad(double *valueRef) -{ - WRAPPER_NO_CONTRACT; - -#ifdef TARGET_64BIT - return VolatileLoadWithoutBarrier(valueRef); -#else -#ifdef FEATURE_NATIVEAOT - static_assert(sizeof(int64_t) == sizeof(double), ""); - int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0); - return *(double*)(int64_t*)(&intRes); -#else - return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); -#endif -#endif -} - -void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) -{ - WRAPPER_NO_CONTRACT; - -#ifdef TARGET_64BIT - *valueRef = value; -#else -#ifdef FEATURE_NATIVEAOT - static_assert(sizeof(int64_t) == sizeof(double), ""); - PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value); -#else - InterlockedExchangeT(valueRef, value); -#endif -#endif -} -