From 713e0ffca70474390d7247cfaba50edf4b9d41a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?=
 <MichalStrehovsky@users.noreply.github.com>
Date: Fri, 26 Jul 2024 09:14:11 +0200
Subject: [PATCH 1/2] Revert "Ensure that WaitForPendingFinalizers has seen the
 expected Full GC count (#105289)"

This reverts commit 54a9efd92de0f776dcec4711b29601a1ff159223.
---
 .../src/System/Runtime/InternalCalls.cs       |  2 +-
 .../src/System/Runtime/__Finalizer.cs         |  9 ++--
 .../nativeaot/Runtime/FinalizerHelpers.cpp    | 47 +++++------------
 .../src/System/Runtime/RuntimeImports.cs      | 10 ----
 src/coreclr/vm/finalizerthread.cpp            | 34 ++-----------
 src/coreclr/vm/finalizerthread.h              |  2 +-
 .../System.Runtime.Tests/System/GCTests.cs    | 50 -------------------
 7 files changed, 22 insertions(+), 132 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs
index 2237b50350835f..7ea73ba7c2c387 100644
--- a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs
+++ b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs
@@ -276,7 +276,7 @@ internal static extern unsafe IntPtr RhpCallPropagateExceptionCallback(
 
         // Indicate that the current round of finalizations is complete.
         [DllImport(Redhawk.BaseName)]
-        internal static extern void RhpSignalFinalizationComplete(uint fCount, int observedFullGcCount);
+        internal static extern void RhpSignalFinalizationComplete(uint fCount);
 
         [DllImport(Redhawk.BaseName)]
         internal static extern ulong RhpGetTickCount64();
diff --git a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs
index 80576c921f8a20..4e695601f19450 100644
--- a/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs
+++ b/src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/__Finalizer.cs
@@ -29,14 +29,11 @@ public static void ProcessFinalizers()
                 // otherwise memory is low and we should initiate a collection.
                 if (InternalCalls.RhpWaitForFinalizerRequest() != 0)
                 {
-                    int observedFullGcCount = RuntimeImports.RhGetGcCollectionCount(RuntimeImports.RhGetMaxGcGeneration(), false);
                     uint finalizerCount = DrainQueue();
 
-                    // Anyone waiting to drain the Q can now wake up.  Note that there is a
-                    // race in that another thread starting a drain, as we leave a drain, may
-                    // consider itself satisfied by the drain that just completed.
-                    // Thus we include the Full GC count that we have certaily observed.
-                    InternalCalls.RhpSignalFinalizationComplete(finalizerCount, observedFullGcCount);
+                    // Tell anybody that's interested that the finalization pass is complete (there is a race condition here
+                    // where we might immediately signal a new request as complete, but this is acceptable).
+                    InternalCalls.RhpSignalFinalizationComplete(finalizerCount);
                 }
                 else
                 {
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index b0f9eb0db5aa99..8fa60538189697 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -94,22 +94,6 @@ EXTERN_C void QCALLTYPE RhInitializeFinalizerThread()
     g_FinalizerEvent.Set();
 }
 
-static int32_t g_fullGcCountSeenByFinalization;
-
-// Indicate that the current round of finalizations is complete.
-EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount, int32_t observedFullGcCount)
-{
-    FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
-
-    g_fullGcCountSeenByFinalization = observedFullGcCount;
-    g_FinalizerDoneEvent.Set();
-
-    if (YieldProcessorNormalization::IsMeasurementScheduled())
-    {
-        YieldProcessorNormalization::PerformMeasurement();
-    }
-}
-
 EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWait)
 {
     // This must be called via p/invoke rather than RuntimeImport since it blocks and could starve the GC if
@@ -119,14 +103,6 @@ EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWai
     // Can't call this from the finalizer thread itself.
     if (ThreadStore::GetCurrentThread() != g_pFinalizerThread)
     {
-        // We may see a completion of finalization cycle that might not see objects that became
-        // F-reachable in recent GCs. In such case we want to wait for a completion of another cycle.
-        // However, since an object cannot be prevented from promoting, one can only rely on Full GCs
-        // to collect unreferenced objects deterministically. Thus we only care about Full GCs here.
-        int desiredFullGcCount =
-            GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration());
-
-    tryAgain:
         // Clear any current indication that a finalization pass is finished and wake the finalizer thread up
         // (if there's no work to do it'll set the done event immediately).
         g_FinalizerDoneEvent.Reset();
@@ -134,17 +110,6 @@ EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWai
 
         // Wait for the finalizer thread to get back to us.
         g_FinalizerDoneEvent.Wait(INFINITE, false, allowReentrantWait);
-
-        // we use unsigned math here as the collection counts, which are size_t internally,
-        // can in theory overflow an int and wrap around.
-        // unsigned math would have more defined/portable behavior in such case
-        if ((int)((unsigned int)desiredFullGcCount - (unsigned int)g_fullGcCountSeenByFinalization) > 0)
-        {
-            // There were some Full GCs happening before we started waiting and possibly not seen by the
-            // last finalization cycle. This is rare, but we need to be sure we have seen those,
-            // so we try one more time.
-            goto tryAgain;
-        }
     }
 }
 
@@ -211,6 +176,18 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest()
     } while (true);
 }
 
+// Indicate that the current round of finalizations is complete.
+EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
+{
+    FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
+    g_FinalizerDoneEvent.Set();
+
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        YieldProcessorNormalization::PerformMeasurement();
+    }
+}
+
 //
 // The following helpers are special in that they interact with internal GC state or directly manipulate
 // managed references so they're called with a special co-operative p/invoke.
diff --git a/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs b/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs
index 70b0cda4d2f6ac..4751e40da3b24a 100644
--- a/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs
+++ b/src/coreclr/nativeaot/Test.CoreLib/src/System/Runtime/RuntimeImports.cs
@@ -104,15 +104,5 @@ internal static IntPtr RhGetModuleSection(TypeManagerHandle module, ReadyToRunSe
         [MethodImplAttribute(MethodImplOptions.InternalCall)]
         [RuntimeImport(RuntimeLibrary, "RhBulkMoveWithWriteBarrier")]
         internal static extern unsafe void RhBulkMoveWithWriteBarrier(ref byte dmem, ref byte smem, nuint size);
-
-        // Get maximum GC generation number.
-        [MethodImplAttribute(MethodImplOptions.InternalCall)]
-        [RuntimeImport(RuntimeLibrary, "RhGetMaxGcGeneration")]
-        internal static extern int RhGetMaxGcGeneration();
-
-        // Get count of collections so far.
-        [MethodImplAttribute(MethodImplOptions.InternalCall)]
-        [RuntimeImport(RuntimeLibrary, "RhGetGcCollectionCount")]
-        internal static extern int RhGetGcCollectionCount(int generation, bool getSpecialGCCount);
     }
 }
diff --git a/src/coreclr/vm/finalizerthread.cpp b/src/coreclr/vm/finalizerthread.cpp
index 97ace9a32353b8..e543a3c60c3462 100644
--- a/src/coreclr/vm/finalizerthread.cpp
+++ b/src/coreclr/vm/finalizerthread.cpp
@@ -404,15 +404,13 @@ VOID FinalizerThread::FinalizerThreadWorker(void *args)
         }
         LOG((LF_GC, LL_INFO100, "***** Calling Finalizers\n"));
 
-        int observedFullGcCount =
-            GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration());
         FinalizeAllObjects();
 
         // Anyone waiting to drain the Q can now wake up.  Note that there is a
         // race in that another thread starting a drain, as we leave a drain, may
-        // consider itself satisfied by the drain that just completed.
-        // Thus we include the Full GC count that we have certaily observed.
-        SignalFinalizationDone(observedFullGcCount);
+        // consider itself satisfied by the drain that just completed.  This is
+        // acceptable.
+        SignalFinalizationDone();
     }
 
     if (s_InitializedFinalizerThreadForPlatform)
@@ -540,13 +538,10 @@ void FinalizerThread::FinalizerThreadCreate()
     }
 }
 
-static int g_fullGcCountSeenByFinalization;
-
-void FinalizerThread::SignalFinalizationDone(int observedFullGcCount)
+void FinalizerThread::SignalFinalizationDone()
 {
     WRAPPER_NO_CONTRACT;
 
-    g_fullGcCountSeenByFinalization = observedFullGcCount;
     hEventFinalizerDone->Set();
 }
 
@@ -560,13 +555,6 @@ void FinalizerThread::FinalizerThreadWait()
     // Can't call this from within a finalized method.
     if (!IsCurrentThreadFinalizer())
     {
-        // We may see a completion of finalization cycle that might not see objects that became
-        // F-reachable in recent GCs. In such case we want to wait for a completion of another cycle.
-        // However, since an object cannot be prevented from promoting, one can only rely on Full GCs
-        // to collect unreferenced objects deterministically. Thus we only care about Full GCs here.
-        int desiredFullGcCount =
-            GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration());
-
         GCX_PREEMP();
 
 #ifdef FEATURE_COMINTEROP
@@ -577,8 +565,8 @@ void FinalizerThread::FinalizerThreadWait()
             g_pRCWCleanupList->CleanupWrappersInCurrentCtxThread();
 #endif // FEATURE_COMINTEROP
 
-    tryAgain:
         hEventFinalizerDone->Reset();
+
         EnableFinalization();
 
         // Under GC stress the finalizer queue may never go empty as frequent
@@ -592,18 +580,6 @@ void FinalizerThread::FinalizerThreadWait()
 
         DWORD status;
         status = hEventFinalizerDone->Wait(INFINITE,TRUE);
-
-        // we use unsigned math here as the collection counts, which are size_t internally,
-        // can in theory overflow an int and wrap around.
-        // unsigned math would have more defined/portable behavior in such case
-        if ((int)((unsigned int)desiredFullGcCount - (unsigned int)g_fullGcCountSeenByFinalization) > 0)
-        {
-            // There were some Full GCs happening before we started waiting and possibly not seen by the
-            // last finalization cycle. This is rare, but we need to be sure we have seen those,
-            // so we try one more time.
-            goto tryAgain;
-        }
-
         _ASSERTE(status == WAIT_OBJECT_0);
     }
 }
diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h
index 03aae7b4e9cf6d..b254773883ab8c 100644
--- a/src/coreclr/vm/finalizerthread.h
+++ b/src/coreclr/vm/finalizerthread.h
@@ -67,7 +67,7 @@ class FinalizerThread
 
     static void FinalizerThreadWait();
 
-    static void SignalFinalizationDone(int observedFullGcCount);
+    static void SignalFinalizationDone();
 
     static VOID FinalizerThreadWorker(void *args);
     static DWORD WINAPI FinalizerThreadStart(void *args);
diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs
index ce029fc637284e..137c1dc8246a10 100644
--- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs
+++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/GCTests.cs
@@ -7,7 +7,6 @@
 using System.Runtime.InteropServices;
 using System.Diagnostics;
 using System.Threading;
-using System.Threading.Tasks;
 using System.Runtime;
 using Microsoft.DotNet.RemoteExecutor;
 using Xunit;
@@ -293,55 +292,6 @@ private class TestObject
             }
         }
 
-        [OuterLoop]
-        [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsPreciseGcSupported))]
-        public static void WaitForPendingFinalizersRaces()
-        {
-            Task.Run(Test);
-            Task.Run(Test);
-            Task.Run(Test);
-            Task.Run(Test);
-            Task.Run(Test);
-            Task.Run(Test);
-            Test();
-
-            static void Test()
-            {
-                for (int i = 0; i < 20000; i++)
-                {
-                    BoxedFinalized flag = new BoxedFinalized();
-                    MakeAndNull(flag);
-                    GC.Collect();
-                    GC.WaitForPendingFinalizers();
-                    Assert.True(flag.finalized);
-                }
-            }
-
-            [MethodImpl(MethodImplOptions.NoInlining)]
-            static void MakeAndNull(BoxedFinalized flag)
-            {
-                var deadObj = new TestObjectWithFinalizer(flag);
-                // it's dead here
-            };
-        }
-
-        class BoxedFinalized
-        {
-            public bool finalized;
-        }
-
-        class TestObjectWithFinalizer
-        {
-            BoxedFinalized _flag;
-
-            public TestObjectWithFinalizer(BoxedFinalized flag)
-            {
-                _flag = flag;
-            }
-
-            ~TestObjectWithFinalizer() => _flag.finalized = true;
-        }
-
         [Fact]
         public static void SuppressFinalizer_NullObject_ThrowsArgumentNullException()
         {

From 5208698d019c6f24a144c0261b02ae36dbab44dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?=
 <MichalStrehovsky@users.noreply.github.com>
Date: Fri, 26 Jul 2024 09:14:20 +0200
Subject: [PATCH 2/2] Revert "Port yield normalization from CoreCLR to Native
 AOT (#103675)"

This reverts commit d35f3021b91d67eeac232a0370c6efb6c256f060.
---
 src/coreclr/gc/env/gcenv.os.h                 |   6 +
 src/coreclr/inc/yieldprocessornormalized.h    |  39 +-
 src/coreclr/nativeaot/Runtime/Crst.h          |   1 +
 .../nativeaot/Runtime/FinalizerHelpers.cpp    |   8 +-
 .../eventpipe/gen-eventing-event-inc.lst      |   1 -
 src/coreclr/nativeaot/Runtime/startup.cpp     |   2 +
 .../Runtime/windows/PalRedhawkInline.h        |  20 -
 .../Runtime/yieldprocessornormalized.cpp      | 102 +++++-
 .../Runtime/yieldprocessornormalized.h        | 228 +++++++++++-
 .../utilcode/yieldprocessornormalized.cpp     |   1 +
 src/coreclr/vm/yieldprocessornormalized.cpp   | 294 ++++++++++++++-
 .../vm/yieldprocessornormalizedshared.cpp     | 341 ------------------
 12 files changed, 653 insertions(+), 390 deletions(-)
 delete mode 100644 src/coreclr/vm/yieldprocessornormalizedshared.cpp

diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h
index aa7223850eaa9b..01ed27dac3e59b 100644
--- a/src/coreclr/gc/env/gcenv.os.h
+++ b/src/coreclr/gc/env/gcenv.os.h
@@ -6,6 +6,12 @@
 #ifndef __GCENV_OS_H__
 #define __GCENV_OS_H__
 
+#ifdef HAS_SYSTEM_YIELDPROCESSOR
+// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
+#undef YieldProcessor
+#define YieldProcessor System_YieldProcessor
+#endif
+
 #define NUMA_NODE_UNDEFINED UINT16_MAX
 
 bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index e37bf79f0c5089..121e60b033356d 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,14 @@
 
 #pragma once
 
-#ifdef FEATURE_NATIVEAOT
-FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
-#else
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
 #endif
+#define YieldProcessor Dont_Use_YieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \
@@ -141,9 +144,9 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 {
     _ASSERTE(count != 0);
 
-    if (sizeof(size_t) <= sizeof(unsigned int))
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small size_t, prevent overflow on the multiply below
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
@@ -151,7 +154,7 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
         }
     }
 
-    size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
     _ASSERTE(n != 0);
     do
     {
@@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 {
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(size_t) <= sizeof(unsigned int))
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small size_t, prevent overflow on the multiply below
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -197,7 +200,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
     {
         n = 1;
@@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
 
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(size_t) <= sizeof(unsigned int))
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small size_t, prevent overflow on the multiply below
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    size_t n =
-        (size_t)preSkylakeCount *
+    SIZE_T n =
+        (SIZE_T)preSkylakeCount *
         YieldProcessorNormalization::s_yieldsPerNormalizedYield /
         PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
@@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     unsigned int spinIteration)
 {
     // This shift value should be adjusted based on the asserted conditions below
-    const uint8_t MaxShift = 3;
-    static_assert(
-        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
-    static_assert(
-        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
+    const UINT8 MaxShift = 3;
+    static_assert_no_msg(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert_no_msg(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h
index 4ab9db08e0f5e3..31bf8fde9eec8a 100644
--- a/src/coreclr/nativeaot/Runtime/Crst.h
+++ b/src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,6 +20,7 @@ enum CrstType
     CrstRestrictedCallouts,
     CrstGcStressControl,
     CrstThreadStore,
+    CrstYieldProcessorNormalized,
     CrstEventPipe,
     CrstEventPipeConfig,
     CrstGcEvent,
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index 8fa60538189697..dd9f1e096842fb 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
+    // We have some time until the first finalization request - use the time to calibrate normalized waits.
+    EnsureYieldProcessorNormalizedInitialized();
+
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
@@ -181,11 +184,6 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
 {
     FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
     g_FinalizerDoneEvent.Set();
-
-    if (YieldProcessorNormalization::IsMeasurementScheduled())
-    {
-        YieldProcessorNormalization::PerformMeasurement();
-    }
 }
 
 //
diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
index 0f4c932719a399..901af659ff84b6 100644
--- a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
+++ b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
@@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount
 ThreadRunning
 WaitHandleWaitStart
 WaitHandleWaitStop
-YieldProcessorMeasurement
diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp
index af835018e1823a..db2802dcb115ef 100644
--- a/src/coreclr/nativeaot/Runtime/startup.cpp
+++ b/src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance)
 #endif
 #endif // !USE_PORTABLE_HELPERS
 
+    InitializeYieldProcessorNormalizedCrst();
+
 #ifdef STRESS_LOG
     uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
     uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 1f2a74dcd15100..187ad26fb8bf11 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
     return _InterlockedCompareExchange64(pDst, iValue, iComparand);
 }
 
-#ifdef HOST_X86
-FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
-{
-    int64_t iOldValue;
-    do {
-        iOldValue = *pDst;
-    } while (PalInterlockedCompareExchange64(pDst,
-                                          iValue,
-                                          iOldValue) != iOldValue);
-    return iOldValue;
-}
-#else // HOST_X86
-EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
-#pragma intrinsic(_InterlockedExchange64)
-FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
-{
-    return _InterlockedExchange64(pDst, iValue);
-}
-#endif // HOST_X86
-
 #if defined(HOST_AMD64) || defined(HOST_ARM64)
 EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
 #pragma intrinsic(_InterlockedCompareExchange128)
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
index efaf4e8bb20704..444d52b0114c03 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,6 +15,104 @@
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
 
-#include "../../utilcode/yieldprocessornormalized.cpp"
+#define ULONGLONG int64_t
 
-#include "../../vm/yieldprocessornormalizedshared.cpp"
+static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
+static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+
+// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
+// tuned for Skylake processors
+unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
+unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
+
+void InitializeYieldProcessorNormalizedCrst()
+{
+    WRAPPER_NO_CONTRACT;
+    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
+}
+
+static void InitializeYieldProcessorNormalized()
+{
+    WRAPPER_NO_CONTRACT;
+
+    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+
+    if (s_isYieldProcessorNormalizedInitialized)
+    {
+        return;
+    }
+
+    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
+    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
+    const int MeasureDurationMs = 10;
+    const int NsPerSecond = 1000 * 1000 * 1000;
+
+    ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
+
+    if (ticksPerSecond < 1000 / MeasureDurationMs)
+    {
+        // High precision clock not available or clock resolution is too low, resort to defaults
+        s_isYieldProcessorNormalizedInitialized = true;
+        return;
+    }
+
+    // Measure the nanosecond delay per yield
+    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
+    unsigned int yieldCount = 0;
+    ULONGLONG startTicks = PalQueryPerformanceCounter();
+    ULONGLONG elapsedTicks;
+    do
+    {
+        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
+        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
+        // low microsecond range.
+        for (int i = 0; i < 1000; ++i)
+        {
+            System_YieldProcessor();
+        }
+        yieldCount += 1000;
+
+        ULONGLONG nowTicks = PalQueryPerformanceCounter();
+        elapsedTicks = nowTicks - startTicks;
+    } while (elapsedTicks < measureDurationTicks);
+    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
+    if (nsPerYield < 1)
+    {
+        nsPerYield = 1;
+    }
+
+    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
+    // value is naturally limited to MinNsPerNormalizedYield.
+    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
+    if (yieldsPerNormalizedYield < 1)
+    {
+        yieldsPerNormalizedYield = 1;
+    }
+    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    int optimalMaxNormalizedYieldsPerSpinIteration =
+        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
+    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+    {
+        optimalMaxNormalizedYieldsPerSpinIteration = 1;
+    }
+
+    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
+    s_isYieldProcessorNormalizedInitialized = true;
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+}
+
+void EnsureYieldProcessorNormalizedInitialized()
+{
+    WRAPPER_NO_CONTRACT;
+
+    if (!s_isYieldProcessorNormalizedInitialized)
+    {
+        InitializeYieldProcessorNormalized();
+    }
+}
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
index 5539ebf90561bc..8c74bf3cfe3002 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
@@ -1,5 +1,229 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#include "PalRedhawk.h"
-#include "../../inc/yieldprocessornormalized.h"
+#pragma once
+
+#include <limits.h>
+
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
+FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
+#endif
+#define YieldProcessor Dont_Use_YieldProcessor
+#ifdef PalYieldProcessor
+#undef PalYieldProcessor
+#endif
+#define PalYieldProcessor Dont_Use_PalYieldProcessor
+
+#define SIZE_T uintptr_t // unsigned type used for the yield-count products computed in the functions below
+
+const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+
+extern unsigned int g_yieldsPerNormalizedYield; // number of System_YieldProcessor() calls issued per normalized yield
+extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration; // cap, in normalized yields, used by the back-off variant
+
+void InitializeYieldProcessorNormalizedCrst();
+void EnsureYieldProcessorNormalizedInitialized(); // runs the initialization only if it has not completed yet
+
+class YieldProcessorNormalizationInfo
+{
+private:
+    unsigned int yieldsPerNormalizedYield; // snapshot of g_yieldsPerNormalizedYield taken at construction
+    unsigned int optimalMaxNormalizedYieldsPerSpinIteration; // snapshot of g_optimalMaxNormalizedYieldsPerSpinIteration
+    unsigned int optimalMaxYieldsPerSpinIteration; // product of the two fields above
+
+public:
+    YieldProcessorNormalizationInfo()
+        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) // reads the members initialized above; relies on declaration order
+    {
+    }
+
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); // the yield helpers read the private fields directly
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+};
+
+// See YieldProcessorNormalized() for preliminary info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
+{
+    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0); // with n == 0 the pre-decrement in the loop below would wrap around instead of stopping
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
+// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
+//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
+//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU
+//     usage and decrease scalability of the operation.
+//         while(!condition)
+//         {
+//             YieldProcessorNormalized();
+//         }
+//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
+//     condition, otherwise it may unnecessarily increase latency of the operation.
+//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
+//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
+//     issue above on later iterations.
+//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
+//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
+//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
+FORCEINLINE void YieldProcessorNormalized()
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo()); // snapshot the globals, then issue one normalized yield
+}
+
+// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
+//     if (!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo, 2);
+//         } while (!moreExpensiveCondition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
+{
+    _ASSERTE(count != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int)) // compile-time constant: only true where SIZE_T is no wider than unsigned int
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        if (count > MaxCount)
+        {
+            count = MaxCount;
+        }
+    }
+
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; // total number of System_YieldProcessor() calls to issue
+    _ASSERTE(n != 0); // the do/while below assumes at least one iteration
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
+// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
+//     while(!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalized(2);
+//     }
+FORCEINLINE void YieldProcessorNormalized(unsigned int count)
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count); // snapshot the globals, then delegate to the overload above
+}
+
+// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
+// info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int preSkylakeCount)
+{
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int)) // compile-time constant: only true where SIZE_T is no wider than unsigned int
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount;
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; // the scaled count is divided by 8 below
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    if (n == 0) // the integer division can round down to zero for small counts
+    {
+        n = 1; // always issue at least one yield
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
+// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
+// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
+// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
+//     while(!condition)
+//     {
+//         YieldProcessorNormalizedForPreSkylakeCount(100);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
+{
+    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount); // snapshot the globals, then delegate
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
+// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
+// iteration exponentially up to a limit. Typical usage:
+//     if (!conditionThatMayNotBeSatisfiedSoon)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
+//         } while (!conditionThatMayNotBeSatisfiedSoon);
+//     }
+FORCEINLINE void YieldProcessorWithBackOffNormalized(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int spinIteration)
+{
+    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
+    // InitializeYieldProcessorNormalized()
+    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
+    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    // This shift value should be adjusted so that the static_assert condition below continues to hold
+    const uint8_t MaxShift = 3;
+    static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
+
+    unsigned int n;
+    if (spinIteration <= MaxShift && // also bounds the shift amounts used below
+        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; // 2^spinIteration normalized yields
+    }
+    else
+    {
+        n = normalizationInfo.optimalMaxYieldsPerSpinIteration; // back-off limit reached: use the precomputed maximum
+    }
+    _ASSERTE(n != 0); // the do/while below assumes at least one iteration
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index c6aaaa19557fa7..020d8d7cc79e4e 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -1,6 +1,7 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#include "stdafx.h"
 #include "yieldprocessornormalized.h"
 
 bool YieldProcessorNormalization::s_isMeasurementScheduled;
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 258e30d634c7ce..14166de34dd641 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -7,4 +7,296 @@
 
 #include "finalizerthread.h"
 
-#include "yieldprocessornormalizedshared.cpp"
+enum class NormalizationState : UINT8
+{
+    Uninitialized, // initial state: PerformMeasurement() has not completed a first pass
+    Initialized, // set at the end of a successful PerformMeasurement()
+    Failed // set when the performance counter frequency is unavailable or too low
+};
+
+static const int NsPerYieldMeasurementCount = 8; // number of slots in s_nsPerYieldMeasurements
+static const unsigned int MeasurementPeriodMs = 4000; // minimum time between two measurements once initialized
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs; // GetTickCount() value recorded at the end of the last measurement
+
+static UINT64 s_performanceCounterTicksPerS; // filled from QueryPerformanceFrequency() during the first measurement
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; // recorded measurements, overwritten in rotation
+static int s_nextMeasurementIndex; // slot that the next periodic measurement will overwrite
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; // minimum of the recorded measurements once measured
+
+static unsigned int DetermineMeasureDurationUs()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+    // if the overhead seems high relative to the measure duration.
+    unsigned int measureDurationUs = 1;
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+    QueryPerformanceCounter(&li); // back-to-back read: the difference approximates the cost of one query
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
+    {
+        measureDurationUs *= 4;
+    }
+    return measureDurationUs; // either 1 or 4
+}
+
+static double MeasureNsPerYield(unsigned int measureDurationUs)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; // initial guess from the current estimate
+    UINT64 ticksPerS = s_performanceCounterTicksPerS;
+    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); // target duration in counter ticks
+
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+
+    for (int i = 0; i < yieldCount; ++i)
+    {
+        System_YieldProcessor();
+    }
+
+    QueryPerformanceCounter(&li);
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    while (elapsedTicks < measureDurationTicks) // the yields so far finished early: keep going until the target is covered
+    {
+        int nextYieldCount =
+            Max(4, // never add fewer than 4 yields per round
+                elapsedTicks == 0
+                    ? yieldCount / 4 // no measurable time yet, so nothing to extrapolate from
+                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); // extrapolate for the remaining ticks
+        for (int i = 0; i < nextYieldCount; ++i)
+        {
+            System_YieldProcessor();
+        }
+
+        QueryPerformanceCounter(&li);
+        elapsedTicks = li.QuadPart - startTicks;
+        yieldCount += nextYieldCount;
+    }
+
+    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+    const double MinNsPerYield = 0.1;
+
+    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+    // really take this long. Limit the maximum to keep the recorded values reasonable.
+    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); // clamped to [MinNsPerYield, MaxNsPerYield]
+}
+
+void YieldProcessorNormalization::PerformMeasurement()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_isMeasurementScheduled);
+
+    double latestNsPerYield; // assigned on both paths that reach the summary code after this if/else chain
+    if (s_normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) // measured too recently
+        {
+            return; // NOTE(review): s_isMeasurementScheduled is still set on this path; confirm that is intended
+        }
+
+        int nextMeasurementIndex = s_nextMeasurementIndex;
+        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield); // replace one slot per periodic measurement
+        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount) // wrap around to the first slot
+        {
+            nextMeasurementIndex = 0;
+        }
+        s_nextMeasurementIndex = nextMeasurementIndex;
+    }
+    else if (s_normalizationState == NormalizationState::Uninitialized)
+    {
+        LARGE_INTEGER li;
+        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+        {
+            // High precision clock not available or clock resolution is too low, resort to defaults
+            s_normalizationState = NormalizationState::Failed;
+            return;
+        }
+        s_performanceCounterTicksPerS = li.QuadPart;
+
+        unsigned int measureDurationUs = DetermineMeasureDurationUs();
+        for (int i = 0; i < NsPerYieldMeasurementCount; ++i) // first pass: fill every slot
+        {
+            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
+            if (i == 0 || latestNsPerYield < s_establishedNsPerYield) // track the running minimum, replacing the default on i == 0
+            {
+                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
+            }
+
+            if (i < NsPerYieldMeasurementCount - 1) // the event for the last measurement is fired after the if/else chain
+            {
+                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+            }
+        }
+    }
+    else
+    {
+        _ASSERTE(s_normalizationState == NormalizationState::Failed);
+        return;
+    }
+
+    double establishedNsPerYield = s_nsPerYieldMeasurements[0]; // established value = minimum over all recorded measurements
+    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+    {
+        double nsPerYield = s_nsPerYieldMeasurements[i];
+        if (nsPerYield < establishedNsPerYield)
+        {
+            establishedNsPerYield = nsPerYield;
+        }
+    }
+    if (establishedNsPerYield != s_establishedNsPerYield) // store only when the minimum actually changed
+    {
+        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+    }
+
+    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    s_optimalMaxNormalizedYieldsPerSpinIteration =
+        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+
+    s_previousNormalizationTimeMs = GetTickCount(); // starts the next MeasurementPeriodMs window
+    s_normalizationState = NormalizationState::Initialized;
+    s_isMeasurementScheduled = false; // lets ScheduleMeasurementIfNecessary() schedule the next measurement
+}
+
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+    if (normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) // last measurement is still recent enough
+        {
+            return;
+        }
+    }
+    else if (normalizationState == NormalizationState::Uninitialized)
+    { // nothing to check: fall through and schedule the first measurement
+    }
+    else
+    {
+        _ASSERTE(normalizationState == NormalizationState::Failed);
+        return;
+    }
+
+    // g_fEEStarted must be set before FinalizerThread::EnableFinalization() below may be called, so bail out when it is not
+    if (s_isMeasurementScheduled || !g_fEEStarted)
+    {
+        return;
+    }
+
+    s_isMeasurementScheduled = true; // cleared again at the end of PerformMeasurement()
+    FinalizerThread::EnableFinalization();
+}
+
+
+void YieldProcessorNormalization::FireMeasurementEvents()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    if (!EventEnabledYieldProcessorMeasurement())
+    {
+        return;
+    }
+
+    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+    // recorded information, so try to enumerate the array with some care.
+    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
+    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); // start at the slot that would be overwritten next
+    for (int i = 0; i < NsPerYieldMeasurementCount; ++i) // visits every slot exactly once, wrapping at the end of the array
+    {
+        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        if (nsPerYield != 0) // the array may not be fully initialized yet
+        {
+            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+        }
+
+        if (++nextIndex >= NsPerYieldMeasurementCount)
+        {
+            nextIndex = 0;
+        }
+    }
+}
+
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    return VolatileLoadWithoutBarrier(valueRef); // NOTE(review): presumably an 8-byte load is atomic on 64-bit targets; confirm
+#else
+    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); // identical comparand and exchange value: used only for the value it returns
+#endif
+}
+
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    *valueRef = value; // NOTE(review): plain store; presumably relies on aligned 8-byte stores being atomic on 64-bit targets
+#else
+    InterlockedExchangeT(valueRef, value); // interlocked exchange on non-64-bit targets; the previous value is discarded
+#endif
+}
+
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
deleted file mode 100644
index 05daee21947376..00000000000000
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-enum class NormalizationState : uint8_t
-{
-    Uninitialized,
-    Initialized,
-    Failed
-};
-
-static const int NsPerYieldMeasurementCount = 8;
-static const unsigned int MeasurementPeriodMs = 4000;
-
-static const unsigned int NsPerS = 1000 * 1000 * 1000;
-
-static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
-static unsigned int s_previousNormalizationTimeMs;
-
-static uint64_t s_performanceCounterTicksPerS;
-static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
-static int s_nextMeasurementIndex;
-static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
-
-void RhEnableFinalization();
-
-inline unsigned int GetTickCountPortable()
-{
-#ifdef FEATURE_NATIVEAOT
-    return (unsigned int)PalGetTickCount64();
-#else
-    return GetTickCount();
-#endif
-}
-
-static uint64_t GetPerformanceCounter()
-{
-#ifdef FEATURE_NATIVEAOT
-    return PalQueryPerformanceCounter();
-#else
-    LARGE_INTEGER li;
-    QueryPerformanceCounter(&li);
-    return li.QuadPart;
-#endif
-}
-
-static unsigned int DetermineMeasureDurationUs()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
-    // if the overhead seems high relative to the measure duration.
-    unsigned int measureDurationUs = 1;
-    uint64_t startTicks = GetPerformanceCounter();
-    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
-    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
-    {
-        measureDurationUs *= 4;
-    }
-    return measureDurationUs;
-}
-
-static double MeasureNsPerYield(unsigned int measureDurationUs)
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
-    uint64_t ticksPerS = s_performanceCounterTicksPerS;
-    uint64_t measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
-
-    uint64_t startTicks = GetPerformanceCounter();
-
-    for (int i = 0; i < yieldCount; ++i)
-    {
-        System_YieldProcessor();
-    }
-
-    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
-    while (elapsedTicks < measureDurationTicks)
-    {
-        int nextYieldCount =
-            max(4,
-                elapsedTicks == 0
-                    ? yieldCount / 4
-                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
-        for (int i = 0; i < nextYieldCount; ++i)
-        {
-            System_YieldProcessor();
-        }
-
-        elapsedTicks = GetPerformanceCounter() - startTicks;
-        yieldCount += nextYieldCount;
-    }
-
-    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
-    const double MinNsPerYield = 0.1;
-
-    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
-    // really take this long. Limit the maximum to keep the recorded values reasonable.
-    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
-
-    return max(MinNsPerYield, min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
-}
-
-void YieldProcessorNormalization::PerformMeasurement()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_isMeasurementScheduled);
-
-    double latestNsPerYield;
-    if (s_normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-
-        int nextMeasurementIndex = s_nextMeasurementIndex;
-        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
-        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
-        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
-        {
-            nextMeasurementIndex = 0;
-        }
-        s_nextMeasurementIndex = nextMeasurementIndex;
-    }
-    else if (s_normalizationState == NormalizationState::Uninitialized)
-    {
-#ifdef FEATURE_NATIVEAOT
-        if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000)
-#else
-        LARGE_INTEGER li;
-        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
-#endif
-        {
-            // High precision clock not available or clock resolution is too low, resort to defaults
-            s_normalizationState = NormalizationState::Failed;
-            return;
-        }
-
-#ifndef FEATURE_NATIVEAOT
-        s_performanceCounterTicksPerS = li.QuadPart;
-#endif
-
-        unsigned int measureDurationUs = DetermineMeasureDurationUs();
-        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-        {
-            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
-            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
-            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
-            {
-                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
-            }
-            if (i < NsPerYieldMeasurementCount - 1)
-            {
-                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-            }
-        }
-    }
-    else
-    {
-        _ASSERTE(s_normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
-    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = s_nsPerYieldMeasurements[i];
-        if (nsPerYield < establishedNsPerYield)
-        {
-            establishedNsPerYield = nsPerYield;
-        }
-    }
-    if (establishedNsPerYield != s_establishedNsPerYield)
-    {
-        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
-    }
-
-    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-
-    // Calculate the number of yields required to span the duration of a normalized yield
-    unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
-    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
-    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    s_optimalMaxNormalizedYieldsPerSpinIteration =
-        max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
-    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-
-    s_previousNormalizationTimeMs = GetTickCountPortable();
-    s_normalizationState = NormalizationState::Initialized;
-    s_isMeasurementScheduled = false;
-}
-
-
-void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
-    if (normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-    }
-    else if (normalizationState == NormalizationState::Uninitialized)
-    {
-    }
-    else
-    {
-        _ASSERTE(normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-#ifdef FEATURE_NATIVEAOT
-    if (s_isMeasurementScheduled)
-#else
-    // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
-    if (s_isMeasurementScheduled || !g_fEEStarted)
-#endif
-    {
-        return;
-    }
-
-    s_isMeasurementScheduled = true;
-#ifdef FEATURE_NATIVEAOT
-    RhEnableFinalization();
-#else
-    FinalizerThread::EnableFinalization();
-#endif
-}
-
-void YieldProcessorNormalization::FireMeasurementEvents()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    if (!EventEnabledYieldProcessorMeasurement())
-    {
-        return;
-    }
-
-    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
-    // recorded information, so try to enumerate the array with some care.
-    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
-    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
-    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
-        if (nsPerYield != 0) // the array may not be fully initialized yet
-        {
-            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
-        }
-
-        if (++nextIndex >= NsPerYieldMeasurementCount)
-        {
-            nextIndex = 0;
-        }
-    }
-}
-
-double YieldProcessorNormalization::AtomicLoad(double *valueRef)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    return VolatileLoadWithoutBarrier(valueRef);
-#else
-#ifdef FEATURE_NATIVEAOT
-    static_assert(sizeof(int64_t) == sizeof(double), "");
-    int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0);
-    return *(double*)(int64_t*)(&intRes);
-#else
-    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
-#endif
-#endif
-}
-
-void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    *valueRef = value;
-#else
-#ifdef FEATURE_NATIVEAOT
-    static_assert(sizeof(int64_t) == sizeof(double), "");
-    PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value);
-#else
-    InterlockedExchangeT(valueRef, value);
-#endif
-#endif
-}
-