diff --git a/src/coreclr/debug/daccess/dacdbiimpl.cpp b/src/coreclr/debug/daccess/dacdbiimpl.cpp
index 715c87dedafef2..8b38fdd7a82c89 100644
--- a/src/coreclr/debug/daccess/dacdbiimpl.cpp
+++ b/src/coreclr/debug/daccess/dacdbiimpl.cpp
@@ -4713,8 +4713,16 @@ void DacDbiInterfaceImpl::GetThreadAllocInfo(VMPTR_Thread        vmThread,
 
     Thread * pThread = vmThread.GetDacPtr();
     gc_alloc_context* allocContext = pThread->GetAllocContext();
-    threadAllocInfo->m_allocBytesSOH = allocContext->alloc_bytes - (allocContext->alloc_limit - allocContext->alloc_ptr);
-    threadAllocInfo->m_allocBytesUOH = allocContext->alloc_bytes_uoh;
+    if (allocContext != nullptr)
+    {
+        threadAllocInfo->m_allocBytesSOH = allocContext->alloc_bytes - (allocContext->alloc_limit - allocContext->alloc_ptr); // exclude the unused space between alloc_ptr and alloc_limit
+        threadAllocInfo->m_allocBytesUOH = allocContext->alloc_bytes_uoh;
+    }
+    else // GetAllocContext() returned null: report zero allocated bytes
+    {
+        threadAllocInfo->m_allocBytesSOH = 0;
+        threadAllocInfo->m_allocBytesUOH = 0;
+    }
 }
 
 // Set and reset the TSNC_DebuggerUserSuspend bit on the state of the specified thread
diff --git a/src/coreclr/debug/daccess/request.cpp b/src/coreclr/debug/daccess/request.cpp
index 5353c93a892289..bd62b8d39b7df1 100644
--- a/src/coreclr/debug/daccess/request.cpp
+++ b/src/coreclr/debug/daccess/request.cpp
@@ -720,8 +720,18 @@ ClrDataAccess::GetThreadAllocData(CLRDATA_ADDRESS addr, struct DacpAllocData *da
 
     Thread* thread = PTR_Thread(TO_TADDR(addr));
 
-    data->allocBytes = TO_CDADDR(thread->m_alloc_context.alloc_bytes);
-    data->allocBytesLoh = TO_CDADDR(thread->m_alloc_context.alloc_bytes_uoh);
+    gc_alloc_context* pAllocContext = thread->GetAllocContext();
+
+    if (pAllocContext != NULL)
+    {
+        data->allocBytes = TO_CDADDR(pAllocContext->alloc_bytes);
+        data->allocBytesLoh = TO_CDADDR(pAllocContext->alloc_bytes_uoh);
+    }
+    else
+    {
+        data->allocBytes = TO_CDADDR(0);
+        data->allocBytesLoh = TO_CDADDR(0);
+    }
 
     SOSDacLeave();
     return hr;
@@ -816,8 +826,18 @@ HRESULT ClrDataAccess::GetThreadDataImpl(CLRDATA_ADDRESS threadAddr, struct Dacp
     threadData->osThreadId = (DWORD)thread->m_OSThreadId;
     threadData->state = thread->m_State;
     threadData->preemptiveGCDisabled = thread->m_fPreemptiveGCDisabled;
-    threadData->allocContextPtr = TO_CDADDR(thread->m_alloc_context.alloc_ptr);
-    threadData->allocContextLimit = TO_CDADDR(thread->m_alloc_context.alloc_limit);
+
+    gc_alloc_context* allocContext = thread->GetAllocContext();
+    if (allocContext)
+    {
+        threadData->allocContextPtr = TO_CDADDR(allocContext->alloc_ptr);
+        threadData->allocContextLimit = TO_CDADDR(allocContext->alloc_limit);
+    }
+    else
+    {
+        threadData->allocContextPtr = TO_CDADDR(0);
+        threadData->allocContextLimit = TO_CDADDR(0);
+    }
 
     threadData->fiberData = (CLRDATA_ADDRESS)NULL;
 
diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt
index f60713ef587e1c..255378fe1d19f0 100644
--- a/src/coreclr/vm/CMakeLists.txt
+++ b/src/coreclr/vm/CMakeLists.txt
@@ -621,8 +621,8 @@ if(CLR_CMAKE_TARGET_ARCH_AMD64)
         ${ARCH_SOURCES_DIR}/GenericComPlusCallStubs.asm
         ${ARCH_SOURCES_DIR}/getstate.asm
         ${ARCH_SOURCES_DIR}/JitHelpers_Fast.asm
+        ${ARCH_SOURCES_DIR}/JitHelpers_FastMP.asm
         ${ARCH_SOURCES_DIR}/JitHelpers_FastWriteBarriers.asm
-        ${ARCH_SOURCES_DIR}/JitHelpers_InlineGetThread.asm
         ${ARCH_SOURCES_DIR}/JitHelpers_SingleAppDomain.asm
         ${ARCH_SOURCES_DIR}/JitHelpers_Slow.asm
         ${ARCH_SOURCES_DIR}/patchedcode.asm
diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc
index 2d14b9c31e8fca..8e67aedd9c6dc0 100644
--- a/src/coreclr/vm/amd64/AsmMacros.inc
+++ b/src/coreclr/vm/amd64/AsmMacros.inc
@@ -206,6 +206,26 @@ INLINE_GETTHREAD macro Reg
 
         endm
 
+;
+; Inlined macro to load the address of the current thread's allocation context
+; (the thread-local t_thread_alloc_context) into Reg. Trashes rax and r11.
+;
+
+INLINE_GET_ALLOC_CONTEXT macro Reg
+
+        EXTERN _tls_index: DWORD
+        EXTERN t_thread_alloc_context: DWORD
+
+        mov     r11d, [_tls_index]                                  ; r11 = TLS index of this module
+        mov     rax, gs:[OFFSET__TEB__ThreadLocalStoragePointer]    ; rax = TLS pointer array read from the TEB
+        mov     rax, [rax + r11 * 8]                                ; rax = TLS block of this module for the current thread
+        mov     r11d, SECTIONREL t_thread_alloc_context             ; r11 = section-relative offset of the variable
+        add     rax, r11                                            ; rax = address of t_thread_alloc_context for this thread
+        mov     Reg, rax
+
+        endm
+
+
 ; if you change this code there will be corresponding code in JITInterfaceGen.cpp which will need to be changed
 ;
 
diff --git a/src/coreclr/vm/amd64/JitHelpers_FastMP.asm b/src/coreclr/vm/amd64/JitHelpers_FastMP.asm
new file mode 100644
index 00000000000000..9849b8d8016d70
--- /dev/null
+++ b/src/coreclr/vm/amd64/JitHelpers_FastMP.asm
@@ -0,0 +1,75 @@
+; Licensed to the .NET Foundation under one or more agreements.
+; The .NET Foundation licenses this file to you under the MIT license.
+
+; ***********************************************************************
+; File: JitHelpers_FastMP.asm, see history in jithelp.asm
+;
+; ***********************************************************************
+
+include AsmMacros.inc
+include asmconstants.inc
+
+CopyValueClassUnchecked equ     ?CopyValueClassUnchecked@@YAXPEAX0PEAVMethodTable@@@Z
+JIT_Box                 equ     ?JIT_Box@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@PEAX@Z
+
+extern CopyValueClassUnchecked:proc
+extern JIT_Box:proc
+
+; HCIMPL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* unboxedData)
+NESTED_ENTRY JIT_BoxFastMP, _TEXT
+
+        ; m_BaseSize is guaranteed to be a multiple of 8.
+        mov     r8d, [rcx + OFFSET__MethodTable__m_BaseSize]            ; r8 = size of the boxed object (rcx = type)
+
+        INLINE_GET_ALLOC_CONTEXT r11
+        mov     r10, [r11 + OFFSETOF__gc_alloc_context__alloc_limit]    ; r10 = alloc_limit
+        mov     rax, [r11 + OFFSETOF__gc_alloc_context__alloc_ptr]      ; rax = alloc_ptr, where the new object will start
+
+        add     r8, rax                                                 ; r8 = alloc_ptr after this allocation
+
+        cmp     r8, r10
+        ja      AllocFailed                                             ; does not fit below alloc_limit: fall back to JIT_Box
+
+        test    rdx, rdx
+        je      NullRef                                                 ; unboxedData is null: fall back to JIT_Box
+
+        mov     [r11 + OFFSETOF__gc_alloc_context__alloc_ptr], r8       ; commit the allocation
+        mov     [rax], rcx                                              ; first field of the new object = MethodTable pointer
+
+        ; Check whether the object contains pointers
+        test    dword ptr [rcx + OFFSETOF__MethodTable__m_dwFlags], MethodTable__enum_flag_ContainsPointers
+        jnz     ContainsPointers
+
+        ; We have no pointers - emit a simple inline copy loop
+        ; Copy the contents from the end
+        mov     ecx, [rcx + OFFSET__MethodTable__m_BaseSize]            ; rcx is reused as a byte offset from here on
+        sub     ecx, 18h  ; sizeof(ObjHeader) + sizeof(Object) + last slot
+
+align 16
+    CopyLoop:
+        mov     r8, [rdx+rcx]                                           ; read 8 bytes of unboxedData at offset rcx
+        mov     [rax+rcx+8], r8                                         ; write them into the object, 8 bytes past its start
+        sub     ecx, 8                                                  ; step back to the previous 8-byte slot
+        jge     CopyLoop                                                ; continue while the offset is still >= 0
+        REPRET
+
+    ContainsPointers:
+        ; Do call to CopyValueClassUnchecked(object, data, pMT)
+        push_vol_reg rax
+        alloc_stack 20h
+        END_PROLOGUE
+
+        mov     r8, rcx                                                 ; third argument: pMT
+        lea     rcx, [rax + 8]                                          ; first argument: object data, just past the MethodTable pointer
+        call    CopyValueClassUnchecked                                 ; rdx still holds unboxedData (second argument)
+
+        add     rsp, 20h                                                ; undo alloc_stack 20h
+        pop     rax                                                     ; restore the new object pointer as the return value
+        ret
+
+    AllocFailed:
+    NullRef:
+        jmp     JIT_Box                                                 ; rcx and rdx are unchanged here, so JIT_Box sees the original arguments
+NESTED_END JIT_BoxFastMP, _TEXT
+
+        end
diff --git a/src/coreclr/vm/amd64/JitHelpers_InlineGetThread.asm b/src/coreclr/vm/amd64/JitHelpers_InlineGetThread.asm
deleted file mode 100644
index bf79668e567e29..00000000000000
--- a/src/coreclr/vm/amd64/JitHelpers_InlineGetThread.asm
+++ /dev/null
@@ -1,263 +0,0 @@
-; Licensed to the .NET Foundation under one or more agreements.
-; The .NET Foundation licenses this file to you under the MIT license.
-
-; ***********************************************************************
-; File: JitHelpers_InlineGetThread.asm, see history in jithelp.asm
-;
-; Notes: These routinues will be patched at runtime with the location in
-;        the TLS to find the Thread* and are the fastest implementation
-;        of their specific functionality.
-; ***********************************************************************
-
-include AsmMacros.inc
-include asmconstants.inc
-
-; Min amount of stack space that a nested function should allocate.
-MIN_SIZE equ 28h
-
-JIT_NEW                 equ     ?JIT_New@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@@Z
-CopyValueClassUnchecked equ     ?CopyValueClassUnchecked@@YAXPEAX0PEAVMethodTable@@@Z
-JIT_Box                 equ     ?JIT_Box@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@PEAX@Z
-g_pStringClass          equ     ?g_pStringClass@@3PEAVMethodTable@@EA
-FramedAllocateString    equ     ?FramedAllocateString@@YAPEAVStringObject@@K@Z
-JIT_NewArr1             equ     ?JIT_NewArr1@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@_J@Z
-
-INVALIDGCVALUE          equ     0CCCCCCCDh
-
-extern JIT_NEW:proc
-extern CopyValueClassUnchecked:proc
-extern JIT_Box:proc
-extern g_pStringClass:QWORD
-extern FramedAllocateString:proc
-extern JIT_NewArr1:proc
-
-extern JIT_InternalThrow:proc
-
-; IN: rcx: MethodTable*
-; OUT: rax: new object
-LEAF_ENTRY JIT_TrialAllocSFastMP_InlineGetThread, _TEXT
-        mov     edx, [rcx + OFFSET__MethodTable__m_BaseSize]
-
-        ; m_BaseSize is guaranteed to be a multiple of 8.
-
-        INLINE_GETTHREAD r11
-        mov     r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit]
-        mov     rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr]
-
-        add     rdx, rax
-
-        cmp     rdx, r10
-        ja      AllocFailed
-
-        mov     [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], rdx
-        mov     [rax], rcx
-
-        ret
-
-    AllocFailed:
-        jmp     JIT_NEW
-LEAF_END JIT_TrialAllocSFastMP_InlineGetThread, _TEXT
-
-; HCIMPL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* unboxedData)
-NESTED_ENTRY JIT_BoxFastMP_InlineGetThread, _TEXT
-
-        ; m_BaseSize is guaranteed to be a multiple of 8.
-        mov     r8d, [rcx + OFFSET__MethodTable__m_BaseSize]
-
-        INLINE_GETTHREAD r11
-        mov     r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit]
-        mov     rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr]
-
-        add     r8, rax
-
-        cmp     r8, r10
-        ja      AllocFailed
-
-        test    rdx, rdx
-        je      NullRef
-
-        mov     [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8
-        mov     [rax], rcx
-
-        ; Check whether the object contains pointers
-        test    dword ptr [rcx + OFFSETOF__MethodTable__m_dwFlags], MethodTable__enum_flag_ContainsPointers
-        jnz     ContainsPointers
-
-        ; We have no pointers - emit a simple inline copy loop
-        ; Copy the contents from the end
-        mov     ecx, [rcx + OFFSET__MethodTable__m_BaseSize]
-        sub     ecx, 18h  ; sizeof(ObjHeader) + sizeof(Object) + last slot
-
-align 16
-    CopyLoop:
-        mov     r8, [rdx+rcx]
-        mov     [rax+rcx+8], r8
-        sub     ecx, 8
-        jge     CopyLoop
-        REPRET
-
-    ContainsPointers:
-        ; Do call to CopyValueClassUnchecked(object, data, pMT)
-        push_vol_reg rax
-        alloc_stack 20h
-        END_PROLOGUE
-
-        mov     r8, rcx
-        lea     rcx, [rax + 8]
-        call    CopyValueClassUnchecked
-
-        add     rsp, 20h
-        pop     rax
-        ret
-
-    AllocFailed:
-    NullRef:
-        jmp     JIT_Box
-NESTED_END JIT_BoxFastMP_InlineGetThread, _TEXT
-
-LEAF_ENTRY AllocateStringFastMP_InlineGetThread, _TEXT
-        ; We were passed the number of characters in ECX
-
-        ; we need to load the method table for string from the global
-        mov     r9, [g_pStringClass]
-
-        ; Instead of doing elaborate overflow checks, we just limit the number of elements
-        ; to (LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR) or less.
-        ; This will avoid all overflow problems, as well as making sure
-        ; big string objects are correctly allocated in the big object heap.
-
-        cmp     ecx, (ASM_LARGE_OBJECT_SIZE - 256)/2
-        jae     OversizedString
-
-        ; Calculate the final size to allocate.
-        ; We need to calculate baseSize + cnt*2, then round that up by adding 7 and anding ~7.
-
-        lea     edx, [STRING_BASE_SIZE + ecx*2 + 7]
-        and     edx, -8
-
-        INLINE_GETTHREAD r11
-        mov     r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit]
-        mov     rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr]
-
-        add     rdx, rax
-
-        cmp     rdx, r10
-        ja      AllocFailed
-
-        mov     [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], rdx
-        mov     [rax], r9
-
-        mov     [rax + OFFSETOF__StringObject__m_StringLength], ecx
-
-        ret
-
-    OversizedString:
-    AllocFailed:
-        jmp     FramedAllocateString
-LEAF_END AllocateStringFastMP_InlineGetThread, _TEXT
-
-; HCIMPL2(Object*, JIT_NewArr1VC_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size)
-LEAF_ENTRY JIT_NewArr1VC_MP_InlineGetThread, _TEXT
-        ; We were passed a (shared) method table in RCX, which contains the element type.
-
-        ; The element count is in RDX
-
-        ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need
-        ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray.
-
-        ; Do a conservative check here.  This is to avoid overflow while doing the calculations.  We don't
-        ; have to worry about "large" objects, since the allocation quantum is never big enough for
-        ; LARGE_OBJECT_SIZE.
-
-        ; For Value Classes, this needs to be 2^16 - slack (2^32 / max component size),
-        ; The slack includes the size for the array header and round-up ; for alignment.  Use 256 for the
-        ; slack value out of laziness.
-
-        ; In both cases we do a final overflow check after adding to the alloc_ptr.
-
-        cmp     rdx, (65535 - 256)
-        jae     OversizedArray
-
-        movzx   r8d, word ptr [rcx + OFFSETOF__MethodTable__m_dwFlags]  ; component size is low 16 bits
-        imul    r8d, edx
-        add     r8d, dword ptr [rcx + OFFSET__MethodTable__m_BaseSize]
-
-        ; round the size to a multiple of 8
-
-        add     r8d, 7
-        and     r8d, -8
-
-
-        INLINE_GETTHREAD r11
-        mov     r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit]
-        mov     rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr]
-
-        add     r8, rax
-        jc      AllocFailed
-
-        cmp     r8, r10
-        ja      AllocFailed
-
-        mov     [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8
-        mov     [rax], rcx
-
-        mov     dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx
-
-        ret
-
-    OversizedArray:
-    AllocFailed:
-        jmp     JIT_NewArr1
-LEAF_END JIT_NewArr1VC_MP_InlineGetThread, _TEXT
-
-
-; HCIMPL2(Object*, JIT_NewArr1OBJ_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size)
-LEAF_ENTRY JIT_NewArr1OBJ_MP_InlineGetThread, _TEXT
-        ; We were passed a (shared) method table in RCX, which contains the element type.
-
-        ; The element count is in RDX
-
-        ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need
-        ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray.
-
-        ; Verifies that LARGE_OBJECT_SIZE fits in 32-bit.  This allows us to do array size
-        ; arithmetic using 32-bit registers.
-        .erre ASM_LARGE_OBJECT_SIZE lt 100000000h
-
-        cmp     rdx, (ASM_LARGE_OBJECT_SIZE - 256)/8 ; sizeof(void*)
-        jae     OversizedArray
-
-        ; In this case we know the element size is sizeof(void *), or 8 for x64
-        ; This helps us in two ways - we can shift instead of multiplying, and
-        ; there's no need to align the size either
-
-        mov     r8d, dword ptr [rcx + OFFSET__MethodTable__m_BaseSize]
-        lea     r8d, [r8d + edx * 8]
-
-        ; No need for rounding in this case - element size is 8, and m_BaseSize is guaranteed
-        ; to be a multiple of 8.
-
-        INLINE_GETTHREAD r11
-        mov     r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit]
-        mov     rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr]
-
-        add     r8, rax
-
-        cmp     r8, r10
-        ja      AllocFailed
-
-        mov     [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8
-        mov     [rax], rcx
-
-        mov     dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx
-
-        ret
-
-    OversizedArray:
-    AllocFailed:
-        jmp     JIT_NewArr1
-LEAF_END JIT_NewArr1OBJ_MP_InlineGetThread, _TEXT
-
-
-        end
-
diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h
index b51088a6b47930..60892996320047 100644
--- a/src/coreclr/vm/amd64/asmconstants.h
+++ b/src/coreclr/vm/amd64/asmconstants.h
@@ -111,12 +111,6 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__Thread__m_pFrame
 #define Thread_m_pFrame OFFSETOF__Thread__m_pFrame
 
 
-#define               OFFSET__Thread__m_alloc_context__alloc_ptr 0x48
-ASMCONSTANTS_C_ASSERT(OFFSET__Thread__m_alloc_context__alloc_ptr == offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr));
-
-#define               OFFSET__Thread__m_alloc_context__alloc_limit 0x50
-ASMCONSTANTS_C_ASSERT(OFFSET__Thread__m_alloc_context__alloc_limit == offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_limit));
-
 #define               OFFSETOF__gc_alloc_context__alloc_ptr 0x0
 ASMCONSTANT_OFFSETOF_ASSERT(gc_alloc_context, alloc_ptr);
 
diff --git a/src/coreclr/vm/comutilnative.cpp b/src/coreclr/vm/comutilnative.cpp
index a3c9d0a848cdff..0756c41e410ccd 100644
--- a/src/coreclr/vm/comutilnative.cpp
+++ b/src/coreclr/vm/comutilnative.cpp
@@ -900,7 +900,7 @@ FCIMPL0(INT64, GCInterface::GetAllocatedBytesForCurrentThread)
 
     INT64 currentAllocated = 0;
     Thread *pThread = GetThread();
-    gc_alloc_context* ac = pThread->GetAllocContext();
+    gc_alloc_context* ac = &t_thread_alloc_context;
     currentAllocated = ac->alloc_bytes + ac->alloc_bytes_uoh - (ac->alloc_limit - ac->alloc_ptr);
 
     return currentAllocated;
@@ -987,7 +987,10 @@ extern "C" INT64 QCALLTYPE GCInterface_GetTotalAllocatedBytesPrecise()
     for (Thread *pThread = ThreadStore::GetThreadList(NULL); pThread; pThread = ThreadStore::GetThreadList(pThread))
     {
         gc_alloc_context* ac = pThread->GetAllocContext();
-        allocated -= ac->alloc_limit - ac->alloc_ptr;
+        if (ac != nullptr)
+        {
+            allocated -= ac->alloc_limit - ac->alloc_ptr;
+        }
     }
 
     ThreadSuspend::RestartEE(FALSE, TRUE);
diff --git a/src/coreclr/vm/gccover.cpp b/src/coreclr/vm/gccover.cpp
index 70ab39ea681b02..67d4bdf4e25521 100644
--- a/src/coreclr/vm/gccover.cpp
+++ b/src/coreclr/vm/gccover.cpp
@@ -1859,7 +1859,7 @@ void DoGcStress (PCONTEXT regs, NativeCodeVersion nativeCodeVersion)
     // BUG(github #10318) - when not using allocation contexts, the alloc lock
     // must be acquired here. Until fixed, this assert prevents random heap corruption.
     assert(GCHeapUtilities::UseThreadAllocationContexts());
-    GCHeapUtilities::GetGCHeap()->StressHeap(GetThread()->GetAllocContext());
+    GCHeapUtilities::GetGCHeap()->StressHeap(&t_thread_alloc_context);
 
     // StressHeap can exit early w/o forcing a SuspendEE to trigger the instruction update
     // We can not rely on the return code to determine if the instruction update happened
diff --git a/src/coreclr/vm/gcenv.ee.cpp b/src/coreclr/vm/gcenv.ee.cpp
index a4a538780aa4ad..62708eb53f080e 100644
--- a/src/coreclr/vm/gcenv.ee.cpp
+++ b/src/coreclr/vm/gcenv.ee.cpp
@@ -291,8 +291,10 @@ void GCToEEInterface::GcScanRoots(promote_func* fn, int condemned, int max_gen,
     Thread* pThread = NULL;
     while ((pThread = ThreadStore::GetThreadList(pThread)) != NULL)
     {
-        if (GCHeapUtilities::GetGCHeap()->IsThreadUsingAllocationContextHeap(
-            pThread->GetAllocContext(), sc->thread_number))
+        gc_alloc_context* palloc_context = pThread->GetAllocContext();
+        if (palloc_context != nullptr
+            && GCHeapUtilities::GetGCHeap()->IsThreadUsingAllocationContextHeap(
+                palloc_context, sc->thread_number))
         {
             STRESS_LOG2(LF_GC | LF_GCROOTS, LL_INFO100, "{ Starting scan of Thread %p ID = %x\n", pThread, pThread->GetThreadId());
 
@@ -435,13 +437,12 @@ gc_alloc_context * GCToEEInterface::GetAllocContext()
 {
     WRAPPER_NO_CONTRACT;
 
-    Thread* pThread = ::GetThreadNULLOk();
-    if (!pThread)
+    if (!::GetThreadNULLOk())
     {
         return nullptr;
     }
 
-    return pThread->GetAllocContext();
+    return &t_thread_alloc_context;
 }
 
 void GCToEEInterface::GcEnumAllocContexts(enum_alloc_context_func* fn, void* param)
@@ -458,7 +459,11 @@ void GCToEEInterface::GcEnumAllocContexts(enum_alloc_context_func* fn, void* par
         Thread * pThread = NULL;
         while ((pThread = ThreadStore::GetThreadList(pThread)) != NULL)
         {
-            fn(pThread->GetAllocContext(), param);
+            gc_alloc_context* palloc_context = pThread->GetAllocContext();
+            if (palloc_context != nullptr)
+            {
+                fn(palloc_context, param);
+            }
         }
     }
     else
diff --git a/src/coreclr/vm/gcheaputilities.cpp b/src/coreclr/vm/gcheaputilities.cpp
index 2f588ae6bdaec1..a365300be4f61a 100644
--- a/src/coreclr/vm/gcheaputilities.cpp
+++ b/src/coreclr/vm/gcheaputilities.cpp
@@ -43,6 +43,12 @@ bool g_sw_ww_enabled_for_gc_heap = false;
 
 GVAL_IMPL_INIT(gc_alloc_context, g_global_alloc_context, {});
 
+// On MP systems, each thread has its own allocation context so allocations can avoid lock prefixes and
+// cross-processor cache snooping. MSVC builds use the selectany declaration in gcheaputilities.h instead.
+#ifndef _MSC_VER
+__thread gc_alloc_context t_thread_alloc_context;
+#endif
+
 enum GC_LOAD_STATUS {
     GC_LOAD_STATUS_BEFORE_START,
     GC_LOAD_STATUS_START,
diff --git a/src/coreclr/vm/gcheaputilities.h b/src/coreclr/vm/gcheaputilities.h
index c652cc52bf417c..c20c574d470619 100644
--- a/src/coreclr/vm/gcheaputilities.h
+++ b/src/coreclr/vm/gcheaputilities.h
@@ -26,6 +26,14 @@ GVAL_DECL(gc_alloc_context, g_global_alloc_context);
 }
 #endif // !DACCESS_COMPILE
 
+// On MP systems, each thread has its own allocation context so allocations can avoid
+// lock prefixes and expensive cross-processor cache snooping.
+#ifdef _MSC_VER
+EXTERN_C __declspec(selectany) __declspec(thread) gc_alloc_context t_thread_alloc_context;
+#else
+EXTERN_C __thread gc_alloc_context t_thread_alloc_context;
+#endif
+
 extern "C" uint32_t* g_card_bundle_table;
 extern "C" uint8_t* g_ephemeral_low;
 extern "C" uint8_t* g_ephemeral_high;
diff --git a/src/coreclr/vm/gchelpers.cpp b/src/coreclr/vm/gchelpers.cpp
index 63754563b496b4..4835b6b320aa87 100644
--- a/src/coreclr/vm/gchelpers.cpp
+++ b/src/coreclr/vm/gchelpers.cpp
@@ -46,7 +46,7 @@ inline gc_alloc_context* GetThreadAllocContext()
 
     assert(GCHeapUtilities::UseThreadAllocationContexts());
 
-    return & GetThread()->m_alloc_context;
+    return &t_thread_alloc_context;
 }
 
 // When not using per-thread allocation contexts, we (the EE) need to take care that
diff --git a/src/coreclr/vm/gcstress.h b/src/coreclr/vm/gcstress.h
index d46ef841f76718..3cd7894d9b3efb 100644
--- a/src/coreclr/vm/gcstress.h
+++ b/src/coreclr/vm/gcstress.h
@@ -289,7 +289,7 @@ namespace _GCStress
             // BUG(github #10318) - when not using allocation contexts, the alloc lock
             // must be acquired here. Until fixed, this assert prevents random heap corruption.
             _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
-            GCHeapUtilities::GetGCHeap()->StressHeap(GetThread()->GetAllocContext());
+            GCHeapUtilities::GetGCHeap()->StressHeap(&t_thread_alloc_context);
         }
 
         FORCEINLINE
diff --git a/src/coreclr/vm/i386/jitinterfacex86.cpp b/src/coreclr/vm/i386/jitinterfacex86.cpp
index 67decfd147f986..492bc66bd1867a 100644
--- a/src/coreclr/vm/i386/jitinterfacex86.cpp
+++ b/src/coreclr/vm/i386/jitinterfacex86.cpp
@@ -230,15 +230,15 @@ void JIT_TrialAlloc::EmitCore(CPUSTUBLINKER *psl, CodeLabel *noLock, CodeLabel *
                  && "EAX should contain size for allocation and it doesnt!!!");
 
-        // Fetch current thread into EDX, preserving EAX and ECX
+        // Fetch current thread's allocation context into EDX, preserving EAX and ECX
-        psl->X86EmitCurrentThreadFetch(kEDX, (1 << kEAX) | (1 << kECX));
+        psl->X86EmitCurrentThreadAllocContextFetch(kEDX, (1 << kEAX) | (1 << kECX));
 
         // Try the allocation.
 
 
         if (flags & (ALIGN8 | SIZE_IN_EAX | ALIGN8OBJ))
         {
-            // MOV EBX, [edx]Thread.m_alloc_context.alloc_ptr
-            psl->X86EmitOffsetModRM(0x8B, kEBX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr));
+            // MOV EBX, [edx]gc_alloc_context.alloc_ptr
+            psl->X86EmitOffsetModRM(0x8B, kEBX, kEDX, offsetof(gc_alloc_context, alloc_ptr));
             // add EAX, EBX
             psl->Emit16(0xC303);
             if (flags & ALIGN8)
@@ -246,20 +246,20 @@ void JIT_TrialAlloc::EmitCore(CPUSTUBLINKER *psl, CodeLabel *noLock, CodeLabel *
         }
         else
         {
-            // add             eax, [edx]Thread.m_alloc_context.alloc_ptr
-            psl->X86EmitOffsetModRM(0x03, kEAX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr));
+            // add             eax, [edx]gc_alloc_context.alloc_ptr
+            psl->X86EmitOffsetModRM(0x03, kEAX, kEDX, offsetof(gc_alloc_context, alloc_ptr));
         }
 
-        // cmp             eax, [edx]Thread.m_alloc_context.alloc_limit
-        psl->X86EmitOffsetModRM(0x3b, kEAX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_limit));
+        // cmp             eax, [edx]gc_alloc_context.alloc_limit
+        psl->X86EmitOffsetModRM(0x3b, kEAX, kEDX, offsetof(gc_alloc_context, alloc_limit));
 
         // ja              noAlloc
         psl->X86EmitCondJump(noAlloc, X86CondCode::kJA);
 
         // Fill in the allocation and get out.
 
-        // mov             [edx]Thread.m_alloc_context.alloc_ptr, eax
-        psl->X86EmitIndexRegStore(kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr), kEAX);
+        // mov             [edx]gc_alloc_context.alloc_ptr, eax
+        psl->X86EmitIndexRegStore(kEDX, offsetof(gc_alloc_context, alloc_ptr), kEAX);
 
         if (flags & (ALIGN8 | SIZE_IN_EAX | ALIGN8OBJ))
         {
diff --git a/src/coreclr/vm/i386/stublinkerx86.cpp b/src/coreclr/vm/i386/stublinkerx86.cpp
index 8fb36cc501c3c2..413bbfedb6cc29 100644
--- a/src/coreclr/vm/i386/stublinkerx86.cpp
+++ b/src/coreclr/vm/i386/stublinkerx86.cpp
@@ -2447,6 +2447,73 @@ VOID StubLinkerCPU::X86EmitCurrentThreadFetch(X86Reg dstreg, unsigned preservedR
 #endif // TARGET_UNIX
 }
 
+#ifdef TARGET_UNIX
+namespace
+{
+    gc_alloc_context* STDCALL GetAllocContextHelper() // target of the call emitted by X86EmitCurrentThreadAllocContextFetch
+    {
+        return &t_thread_alloc_context; // address of the calling thread's allocation context
+    }
+}
+#endif // TARGET_UNIX
+
+VOID StubLinkerCPU::X86EmitCurrentThreadAllocContextFetch(X86Reg dstreg, unsigned preservedRegSet) // emits code that leaves the address of the current thread's gc_alloc_context in dstreg
+{
+    CONTRACTL
+    {
+        STANDARD_VM_CHECK;
+
+        // It doesn't make sense to have the destination register be preserved
+        PRECONDITION((preservedRegSet & (1 << dstreg)) == 0);
+        AMD64_ONLY(PRECONDITION(dstreg < 8)); // code below doesn't support high registers
+    }
+    CONTRACTL_END;
+
+#ifdef TARGET_UNIX
+
+    X86EmitPushRegs(preservedRegSet & ((1 << kEAX) | (1 << kEDX) | (1 << kECX))); // save the requested registers among EAX/EDX/ECX around the call
+
+    // call GetAllocContextHelper
+    X86EmitCall(NewExternalCodeLabel((LPVOID)GetAllocContextHelper), sizeof(void*));
+
+    // mov dstreg, eax
+    X86EmitMovRegReg(dstreg, kEAX);
+
+    X86EmitPopRegs(preservedRegSet & ((1 << kEAX) | (1 << kEDX) | (1 << kECX))); // restore the registers pushed above
+
+#ifdef _DEBUG
+    // Trash caller saved regs that we were not told to preserve, and that aren't the dstreg.
+    preservedRegSet |= 1 << dstreg;
+    if (!(preservedRegSet & (1 << kEAX)))
+        X86EmitDebugTrashReg(kEAX);
+    if (!(preservedRegSet & (1 << kEDX)))
+        X86EmitDebugTrashReg(kEDX);
+    if (!(preservedRegSet & (1 << kECX)))
+        X86EmitDebugTrashReg(kECX);
+#endif // _DEBUG
+
+#else // TARGET_UNIX
+
+#ifdef TARGET_AMD64
+    BYTE code[] = { 0x65,0x48,0x8b,0x04,0x25 };    // mov dstreg, qword ptr gs:[IMM32]
+    static const int regByteIndex = 3;
+#elif defined(TARGET_X86)
+    BYTE code[] = { 0x64,0x8b,0x05 };              // mov dstreg, dword ptr fs:[IMM32]
+    static const int regByteIndex = 2;
+#endif
+    code[regByteIndex] |= (dstreg << 3); // place dstreg in the reg field (bits 3-5) of the ModRM byte
+
+    EmitBytes(code, sizeof(code));
+    Emit32(offsetof(TEB, ThreadLocalStoragePointer)); // IMM32 = offset of the TLS pointer array within the TEB
+
+    X86EmitIndexRegLoad(dstreg, dstreg, sizeof(void *) * _tls_index); // dstreg = entry _tls_index of the TLS pointer array
+
+    _ASSERTE(Thread::GetOffsetOfThreadStatic(&t_thread_alloc_context) < INT_MAX);
+    X86EmitAddReg(dstreg, (int32_t)Thread::GetOffsetOfThreadStatic(&t_thread_alloc_context)); // presumably the variable's offset inside that TLS block, giving its address
+
+#endif // TARGET_UNIX
+}
+
 #if defined(FEATURE_COMINTEROP) && defined(TARGET_X86)
 
 #if defined(PROFILING_SUPPORTED)
diff --git a/src/coreclr/vm/i386/stublinkerx86.h b/src/coreclr/vm/i386/stublinkerx86.h
index 35aec1598fd559..3741d87d79995e 100644
--- a/src/coreclr/vm/i386/stublinkerx86.h
+++ b/src/coreclr/vm/i386/stublinkerx86.h
@@ -218,6 +218,8 @@ class StubLinkerCPU : public StubLinker
 
         VOID X86EmitCurrentThreadFetch(X86Reg dstreg, unsigned preservedRegSet);
 
+        VOID X86EmitCurrentThreadAllocContextFetch(X86Reg dstreg, unsigned preservedRegSet);
+
         VOID X86EmitIndexRegLoad(X86Reg dstreg, X86Reg srcreg, int32_t ofs = 0);
         VOID X86EmitIndexRegStore(X86Reg dstreg, int32_t ofs, X86Reg srcreg);
 #if defined(TARGET_AMD64)
diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp
index 13fbeaab2332d5..16771a339cfd8a 100644
--- a/src/coreclr/vm/jithelpers.cpp
+++ b/src/coreclr/vm/jithelpers.cpp
@@ -2105,48 +2105,42 @@ HCIMPLEND
 //*************************************************************
 // Allocation fast path for typical objects
 //
-HCIMPL1(Object*, JIT_NewS_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_)
+HCIMPL1_RAW(Object*, JIT_NewS_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_)
 {
-    FCALL_CONTRACT;
-
-    do
-    {
-        _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
+    CONTRACTL {
+        THROWS;
+        DISABLED(GC_TRIGGERS);
+        MODE_COOPERATIVE;
+    } CONTRACTL_END;
 
-        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
-        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
-        // some reshuffling of intermediate values into nonvolatile registers around the call.
-        Thread *thread = GetThread();
+    _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
+    gc_alloc_context *allocContext = &t_thread_alloc_context; // the current thread's allocation context (thread-local variable)
 
-        TypeHandle typeHandle(typeHnd_);
-        _ASSERTE(!typeHandle.IsTypeDesc()); // heap objects must have method tables
-        MethodTable *methodTable = typeHandle.AsMethodTable();
+    TypeHandle typeHandle(typeHnd_);
+    _ASSERTE(!typeHandle.IsTypeDesc()); // heap objects must have method tables
+    MethodTable *methodTable = typeHandle.AsMethodTable();
 
-        SIZE_T size = methodTable->GetBaseSize();
-        _ASSERTE(size % DATA_ALIGNMENT == 0);
+    SIZE_T size = methodTable->GetBaseSize();
+    _ASSERTE(size % DATA_ALIGNMENT == 0);
 
-        gc_alloc_context *allocContext = thread->GetAllocContext();
-        BYTE *allocPtr = allocContext->alloc_ptr;
-        _ASSERTE(allocPtr <= allocContext->alloc_limit);
-        if (size > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
-        {
-            break;
-        }
-        allocContext->alloc_ptr = allocPtr + size;
+    BYTE *allocPtr = allocContext->alloc_ptr;
+    _ASSERTE(allocPtr <= allocContext->alloc_limit);
+    if (size > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr)) // not enough room left between alloc_ptr and alloc_limit
+    {
+        // Tail call to the slow helper
+        return HCCALL1(JIT_New, typeHnd_);
+    }
 
-        _ASSERTE(allocPtr != nullptr);
-        Object *object = reinterpret_cast<Object *>(allocPtr);
-        _ASSERTE(object->HasEmptySyncBlockInfo());
-        object->SetMethodTable(methodTable);
+    allocContext->alloc_ptr = allocPtr + size; // bump the allocation pointer past the new object
 
-        return object;
-    } while (false);
+    _ASSERTE(allocPtr != nullptr);
+    Object *object = reinterpret_cast<Object *>(allocPtr); // the object occupies the bytes that were just reserved
+    _ASSERTE(object->HasEmptySyncBlockInfo());
+    object->SetMethodTable(methodTable);
 
-    // Tail call to the slow helper
-    ENDFORBIDGC();
-    return HCCALL1(JIT_New, typeHnd_);
+    return object;
 }
-HCIMPLEND
+HCIMPLEND_RAW
 
 #include <optdefault.h>
 
@@ -2219,84 +2213,57 @@ HCIMPLEND
 //*************************************************************
 // Allocation fast path for typical objects
 //
-HCIMPL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength)
+HCIMPL1_RAW(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength)
 {
-    FCALL_CONTRACT;
-
-    do
-    {
-        _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
+    CONTRACTL {
+        THROWS;
+        DISABLED(GC_TRIGGERS);
+        MODE_COOPERATIVE;
+    } CONTRACTL_END;
 
-        // Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow
-        // problems, as well as making sure big string objects are correctly allocated in the big object heap.
-        if (stringLength >= (LARGE_OBJECT_SIZE - 256) / sizeof(WCHAR))
-        {
-            break;
-        }
+    _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
 
-        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
-        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
-        // some reshuffling of intermediate values into nonvolatile registers around the call.
-        Thread *thread = GetThread();
+    // Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow
+    // problems, as well as making sure big string objects are correctly allocated in the big object heap.
+    if (stringLength >= (LARGE_OBJECT_SIZE - 256) / sizeof(WCHAR))
+    {
+        // Tail call to the slow helper
+        return HCCALL1(FramedAllocateString, stringLength);
+    }
 
-        SIZE_T totalSize = StringObject::GetSize(stringLength);
+    gc_alloc_context *allocContext = &t_thread_alloc_context;
 
-        // The method table's base size includes space for a terminating null character
-        _ASSERTE(totalSize >= g_pStringClass->GetBaseSize());
-        _ASSERTE((totalSize - g_pStringClass->GetBaseSize()) / sizeof(WCHAR) == stringLength);
+    SIZE_T totalSize = StringObject::GetSize(stringLength);
 
-        SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
-        _ASSERTE(alignedTotalSize >= totalSize);
-        totalSize = alignedTotalSize;
+    // The method table's base size includes space for a terminating null character
+    _ASSERTE(totalSize >= g_pStringClass->GetBaseSize());
+    _ASSERTE((totalSize - g_pStringClass->GetBaseSize()) / sizeof(WCHAR) == stringLength);
 
-        gc_alloc_context *allocContext = thread->GetAllocContext();
-        BYTE *allocPtr = allocContext->alloc_ptr;
-        _ASSERTE(allocPtr <= allocContext->alloc_limit);
-        if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
-        {
-            break;
-        }
-        allocContext->alloc_ptr = allocPtr + totalSize;
+    SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
+    _ASSERTE(alignedTotalSize >= totalSize);
+    totalSize = alignedTotalSize;
 
-        _ASSERTE(allocPtr != nullptr);
-        StringObject *stringObject = reinterpret_cast<StringObject *>(allocPtr);
-        stringObject->SetMethodTable(g_pStringClass);
-        stringObject->SetStringLength(stringLength);
-        _ASSERTE(stringObject->GetBuffer()[stringLength] == W('\0'));
+    BYTE *allocPtr = allocContext->alloc_ptr;
+    _ASSERTE(allocPtr <= allocContext->alloc_limit);
+    if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
+    {
+        // Tail call to the slow helper
+        return HCCALL1(FramedAllocateString, stringLength);
+    }
+    allocContext->alloc_ptr = allocPtr + totalSize;
 
-        return stringObject;
-    } while (false);
+    _ASSERTE(allocPtr != nullptr);
+    StringObject *stringObject = reinterpret_cast<StringObject *>(allocPtr);
+    stringObject->SetMethodTable(g_pStringClass);
+    stringObject->SetStringLength(stringLength);
+    _ASSERTE(stringObject->GetBuffer()[stringLength] == W('\0'));
 
-    // Tail call to the slow helper
-    ENDFORBIDGC();
-    return HCCALL1(FramedAllocateString, stringLength);
+    return stringObject;
 }
-HCIMPLEND
+HCIMPLEND_RAW
 
 #include <optdefault.h>
 
-/*********************************************************************/
-/* We don't use HCIMPL macros because this is not a real helper call */
-/* This function just needs mangled arguments like a helper call     */
-
-HCIMPL1_RAW(StringObject*, UnframedAllocateString, DWORD stringLength)
-{
-    // This isn't _really_ an FCALL and therefore shouldn't have the
-    // SO_TOLERANT part of the FCALL_CONTRACT b/c it is not entered
-    // from managed code.
-    CONTRACTL {
-        THROWS;
-        GC_TRIGGERS;
-        MODE_COOPERATIVE;
-    } CONTRACTL_END;
-
-    STRINGREF result;
-    result = AllocateString(stringLength);
-
-    return((StringObject*) OBJECTREFToObject(result));
-}
-HCIMPLEND_RAW
-
 HCIMPL1(StringObject*, FramedAllocateString, DWORD stringLength)
 {
     FCALL_CONTRACT;
@@ -2356,129 +2323,123 @@ HCIMPLEND
 //*************************************************************
 // Array allocation fast path for arrays of value type elements
 //
-HCIMPL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size)
+HCIMPL2_RAW(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size)
 {
-    FCALL_CONTRACT;
-
-    do
-    {
-        _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
+    CONTRACTL {
+        THROWS;
+        DISABLED(GC_TRIGGERS);
+        MODE_COOPERATIVE;
+    } CONTRACTL_END;
 
-        // Do a conservative check here.  This is to avoid overflow while doing the calculations.  We don't
-        // have to worry about "large" objects, since the allocation quantum is never big enough for
-        // LARGE_OBJECT_SIZE.
-        //
-        // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size),
-        // The slack includes the size for the array header and round-up ; for alignment.  Use 256 for the
-        // slack value out of laziness.
-        SIZE_T componentCount = static_cast<SIZE_T>(size);
-        if (componentCount >= static_cast<SIZE_T>(65535 - 256))
-        {
-            break;
-        }
+    _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
 
-        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
-        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
-        // some reshuffling of intermediate values into nonvolatile registers around the call.
-        Thread *thread = GetThread();
+    // Do a conservative check here.  This is to avoid overflow while doing the calculations.  We don't
+    // have to worry about "large" objects, since the allocation quantum is never big enough for
+    // LARGE_OBJECT_SIZE.
+    //
+    // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size).
+    // The slack includes the size for the array header and round-up for alignment.  Use 256 for the
+    // slack value out of laziness.
+    SIZE_T componentCount = static_cast<SIZE_T>(size);
+    if (componentCount >= static_cast<SIZE_T>(65535 - 256))
+    {
+        // Tail call to the slow helper
+        return HCCALL2(JIT_NewArr1, arrayMT, size);
+    }
 
-        MethodTable *pArrayMT = (MethodTable *)arrayMT;
+    gc_alloc_context *allocContext = &t_thread_alloc_context;
 
-        _ASSERTE(pArrayMT->HasComponentSize());
-        SIZE_T componentSize = pArrayMT->RawGetComponentSize();
-        SIZE_T totalSize = componentCount * componentSize;
-        _ASSERTE(totalSize / componentSize == componentCount);
+    MethodTable *pArrayMT = (MethodTable *)arrayMT;
 
-        SIZE_T baseSize = pArrayMT->GetBaseSize();
-        totalSize += baseSize;
-        _ASSERTE(totalSize >= baseSize);
+    _ASSERTE(pArrayMT->HasComponentSize());
+    SIZE_T componentSize = pArrayMT->RawGetComponentSize();
+    SIZE_T totalSize = componentCount * componentSize;
+    _ASSERTE(totalSize / componentSize == componentCount);
 
-        SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
-        _ASSERTE(alignedTotalSize >= totalSize);
-        totalSize = alignedTotalSize;
+    SIZE_T baseSize = pArrayMT->GetBaseSize();
+    totalSize += baseSize;
+    _ASSERTE(totalSize >= baseSize);
 
-        gc_alloc_context *allocContext = thread->GetAllocContext();
-        BYTE *allocPtr = allocContext->alloc_ptr;
-        _ASSERTE(allocPtr <= allocContext->alloc_limit);
-        if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
-        {
-            break;
-        }
-        allocContext->alloc_ptr = allocPtr + totalSize;
+    SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
+    _ASSERTE(alignedTotalSize >= totalSize);
+    totalSize = alignedTotalSize;
 
-        _ASSERTE(allocPtr != nullptr);
-        ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
-        array->SetMethodTable(pArrayMT);
-        _ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
-        array->m_NumComponents = static_cast<DWORD>(componentCount);
+    BYTE *allocPtr = allocContext->alloc_ptr;
+    _ASSERTE(allocPtr <= allocContext->alloc_limit);
+    if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
+    {
+        // Tail call to the slow helper
+        return HCCALL2(JIT_NewArr1, arrayMT, size);
+    }
+    allocContext->alloc_ptr = allocPtr + totalSize;
 
-        return array;
-    } while (false);
+    _ASSERTE(allocPtr != nullptr);
+    ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
+    array->SetMethodTable(pArrayMT);
+    _ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
+    array->m_NumComponents = static_cast<DWORD>(componentCount);
 
-    // Tail call to the slow helper
-    ENDFORBIDGC();
-    return HCCALL2(JIT_NewArr1, arrayMT, size);
+    return array;
 }
-HCIMPLEND
+HCIMPLEND_RAW
 
 //*************************************************************
 // Array allocation fast path for arrays of object elements
 //
-HCIMPL2(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size)
+HCIMPL2_RAW(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size)
 {
-    FCALL_CONTRACT;
-
-    do
-    {
-        _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
+    CONTRACTL {
+        THROWS;
+        DISABLED(GC_TRIGGERS);
+        MODE_COOPERATIVE;
+    } CONTRACTL_END;
 
-        // Make sure that the total size cannot reach LARGE_OBJECT_SIZE, which also allows us to avoid overflow checks. The
-        // "256" slack is to cover the array header size and round-up, using a constant value here out of laziness.
-        SIZE_T componentCount = static_cast<SIZE_T>(size);
-        if (componentCount >= static_cast<SIZE_T>((LARGE_OBJECT_SIZE - 256) / sizeof(void *)))
-        {
-            break;
-        }
+    _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
 
-        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
-        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
-        // some reshuffling of intermediate values into nonvolatile registers around the call.
-        Thread *thread = GetThread();
+    // Make sure that the total size cannot reach LARGE_OBJECT_SIZE, which also allows us to avoid overflow checks. The
+    // "256" slack is to cover the array header size and round-up, using a constant value here out of laziness.
+    SIZE_T componentCount = static_cast<SIZE_T>(size);
+    if (componentCount >= static_cast<SIZE_T>((LARGE_OBJECT_SIZE - 256) / sizeof(void *)))
+    {
+        // Tail call to the slow helper
+        return HCCALL2(JIT_NewArr1, arrayMT, size);
+    }
 
-        SIZE_T totalSize = componentCount * sizeof(void *);
-        _ASSERTE(totalSize / sizeof(void *) == componentCount);
+    // NOTE(review): 'thread' is not used anywhere else in this function now that the allocation context is taken
+    // from the t_thread_alloc_context thread-local below, so the old rationale for calling GetThread() early (freeing
+    // up volatile registers for intermediate values) no longer applies here. Consider removing this call.
+    Thread *thread = GetThread();
 
-        MethodTable *pArrayMT = (MethodTable *)arrayMT;
+    SIZE_T totalSize = componentCount * sizeof(void *);
+    _ASSERTE(totalSize / sizeof(void *) == componentCount);
 
-        SIZE_T baseSize = pArrayMT->GetBaseSize();
-        totalSize += baseSize;
-        _ASSERTE(totalSize >= baseSize);
+    MethodTable *pArrayMT = (MethodTable *)arrayMT;
 
-        _ASSERTE(ALIGN_UP(totalSize, DATA_ALIGNMENT) == totalSize);
+    SIZE_T baseSize = pArrayMT->GetBaseSize();
+    totalSize += baseSize;
+    _ASSERTE(totalSize >= baseSize);
 
-        gc_alloc_context *allocContext = thread->GetAllocContext();
-        BYTE *allocPtr = allocContext->alloc_ptr;
-        _ASSERTE(allocPtr <= allocContext->alloc_limit);
-        if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
-        {
-            break;
-        }
-        allocContext->alloc_ptr = allocPtr + totalSize;
+    _ASSERTE(ALIGN_UP(totalSize, DATA_ALIGNMENT) == totalSize);
 
-        _ASSERTE(allocPtr != nullptr);
-        ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
-        array->SetMethodTable(pArrayMT);
-        _ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
-        array->m_NumComponents = static_cast<DWORD>(componentCount);
+    gc_alloc_context *allocContext = &t_thread_alloc_context;
+    BYTE *allocPtr = allocContext->alloc_ptr;
+    _ASSERTE(allocPtr <= allocContext->alloc_limit);
+    if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
+    {
+        // Tail call to the slow helper
+        return HCCALL2(JIT_NewArr1, arrayMT, size);
+    }
+    allocContext->alloc_ptr = allocPtr + totalSize;
 
-        return array;
-    } while (false);
+    _ASSERTE(allocPtr != nullptr);
+    ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
+    array->SetMethodTable(pArrayMT);
+    _ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
+    array->m_NumComponents = static_cast<DWORD>(componentCount);
 
-    // Tail call to the slow helper
-    ENDFORBIDGC();
-    return HCCALL2(JIT_NewArr1, arrayMT, size);
+    return array;
 }
-HCIMPLEND
+HCIMPLEND_RAW
 
 #include <optdefault.h>
 
diff --git a/src/coreclr/vm/jitinterface.h b/src/coreclr/vm/jitinterface.h
index 51e2b959d6f694..848350ebb47cda 100644
--- a/src/coreclr/vm/jitinterface.h
+++ b/src/coreclr/vm/jitinterface.h
@@ -204,7 +204,6 @@ extern FCDECL1(Object*, JIT_NewS_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_)
 extern FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_);
 
 extern FCDECL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength);
-extern FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength);
 extern FCDECL1(StringObject*, FramedAllocateString, DWORD stringLength);
 
 extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
@@ -315,14 +314,6 @@ class WriteBarrierManager
 
 #endif // TARGET_AMD64
 
-#ifdef HOST_64BIT
-EXTERN_C FCDECL1(Object*, JIT_TrialAllocSFastMP_InlineGetThread, CORINFO_CLASS_HANDLE typeHnd_);
-EXTERN_C FCDECL2(Object*, JIT_BoxFastMP_InlineGetThread, CORINFO_CLASS_HANDLE type, void* data);
-EXTERN_C FCDECL2(Object*, JIT_NewArr1VC_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
-EXTERN_C FCDECL2(Object*, JIT_NewArr1OBJ_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
-
-#endif // HOST_64BIT
-
 EXTERN_C FCDECL2_VV(INT64, JIT_LMul, INT64 val1, INT64 val2);
 
 #ifndef HOST_64BIT
@@ -1073,6 +1064,7 @@ EXTERN_C FCDECL2(LPVOID, ArrayStoreCheck, Object** pElement, PtrArray** pArray);
 // means that the caller does not care whether the string is pinned or not.
 OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok, void** ppPinnedString = nullptr);
 
+FCDECL2(Object*, JIT_Box_MP_FastPortable, CORINFO_CLASS_HANDLE type, void* data);
 FCDECL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* data);
 FCDECL0(VOID, JIT_PollGC);
 
diff --git a/src/coreclr/vm/jitinterfacegen.cpp b/src/coreclr/vm/jitinterfacegen.cpp
index 68ab56aeb96ef3..6cb3e4dead267f 100644
--- a/src/coreclr/vm/jitinterfacegen.cpp
+++ b/src/coreclr/vm/jitinterfacegen.cpp
@@ -20,26 +20,16 @@
 
 #ifdef HOST_64BIT
 
-// These are the fastest(?) versions of JIT helpers as they have the code to GetThread patched into them
-// that does not make a call.
-EXTERN_C Object* JIT_TrialAllocSFastMP_InlineGetThread(CORINFO_CLASS_HANDLE typeHnd_);
-EXTERN_C Object* JIT_BoxFastMP_InlineGetThread (CORINFO_CLASS_HANDLE type, void* unboxedData);
-EXTERN_C Object* AllocateStringFastMP_InlineGetThread (CLR_I4 cch);
-EXTERN_C Object* JIT_NewArr1OBJ_MP_InlineGetThread (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
-EXTERN_C Object* JIT_NewArr1VC_MP_InlineGetThread (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
-
-// This next set is the fast version that invoke GetThread but is still faster than the VM implementation (i.e.
-// the "slow" versions).
-EXTERN_C Object* JIT_TrialAllocSFastMP(CORINFO_CLASS_HANDLE typeHnd_);
-EXTERN_C Object* JIT_TrialAllocSFastSP(CORINFO_CLASS_HANDLE typeHnd_);
+// These are the multi-processor-optimized versions of the allocation helpers
+// that must be written in assembly.
 EXTERN_C Object* JIT_BoxFastMP (CORINFO_CLASS_HANDLE type, void* unboxedData);
+
+// These are the single-processor-optimized versions of the allocation helpers.
+EXTERN_C Object* JIT_TrialAllocSFastSP(CORINFO_CLASS_HANDLE typeHnd_);
 EXTERN_C Object* JIT_BoxFastUP (CORINFO_CLASS_HANDLE type, void* unboxedData);
-EXTERN_C Object* AllocateStringFastMP (CLR_I4 cch);
 EXTERN_C Object* AllocateStringFastUP (CLR_I4 cch);
 
-EXTERN_C Object* JIT_NewArr1OBJ_MP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
 EXTERN_C Object* JIT_NewArr1OBJ_UP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
-EXTERN_C Object* JIT_NewArr1VC_MP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
 EXTERN_C Object* JIT_NewArr1VC_UP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
 
 #ifdef TARGET_AMD64
@@ -83,13 +73,13 @@ void InitJITHelpers1()
         // if (multi-proc || server GC)
         if (GCHeapUtilities::UseThreadAllocationContexts())
         {
-            SetJitHelperFunction(CORINFO_HELP_NEWSFAST, JIT_TrialAllocSFastMP_InlineGetThread);
-            SetJitHelperFunction(CORINFO_HELP_NEWSFAST_ALIGN8, JIT_TrialAllocSFastMP_InlineGetThread);
-            SetJitHelperFunction(CORINFO_HELP_BOX, JIT_BoxFastMP_InlineGetThread);
-            SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_InlineGetThread);
-            SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_InlineGetThread);
+            SetJitHelperFunction(CORINFO_HELP_NEWSFAST, JIT_NewS_MP_FastPortable);
+            SetJitHelperFunction(CORINFO_HELP_NEWSFAST_ALIGN8, JIT_NewS_MP_FastPortable);
+            SetJitHelperFunction(CORINFO_HELP_BOX, JIT_BoxFastMP);
+            SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_FastPortable);
+            SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_FastPortable);
 
-            ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP_InlineGetThread), ECall::FastAllocateString);
+            ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateString_MP_FastPortable), ECall::FastAllocateString);
         }
         else
         {
diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp
index 7411d62a285f89..96e88cde3dffa2 100644
--- a/src/coreclr/vm/threads.cpp
+++ b/src/coreclr/vm/threads.cpp
@@ -371,6 +371,7 @@ void SetThread(Thread* t)
     if (t != NULL)
     {
         EnsureTlsDestructionMonitor();
+        t->InitAllocContext();
     }
 
     // Clear or set the app domain to the one domain based on if the thread is being nulled out or set
@@ -1003,6 +1004,20 @@ HRESULT Thread::DetachThread(BOOL fDLLThreadDetach)
         m_ThreadHandleForClose = hThread;
     }
 
+    if (GCHeapUtilities::IsGCHeapInitialized())
+    {
+        // If the GC heap is initialized, we need to fix the alloc context for this detaching thread.
+        GCX_COOP();
+        // GetTotalAllocatedBytes reads dead_threads_non_alloc_bytes, but it will suspend the EE; being in COOP mode, we cannot race with that.
+        // However, other threads could be terminating and doing the same Add, so the update must be interlocked.
+        InterlockedExchangeAdd64((LONG64*)&dead_threads_non_alloc_bytes, t_thread_alloc_context.alloc_limit - t_thread_alloc_context.alloc_ptr);
+        GCHeapUtilities::GetGCHeap()->FixAllocContext(&t_thread_alloc_context, NULL, NULL);
+        t_thread_alloc_context.init(); // re-initialize the context.
+
+        // Clear out the alloc context pointer for this thread. When TLS is gone, this pointer will point into freed memory.
+        m_alloc_context = nullptr;
+    }
+
     // We need to make sure that TLS are touched last here.
     SetThread(NULL);
 
@@ -1411,7 +1426,7 @@ Thread::Thread()
 
     m_pBlockingLock = NULL;
 
-    m_alloc_context.init();
+    m_alloc_context = nullptr;
     m_thAllocContextObj = 0;
 
     m_UserInterrupt = 0;
@@ -2871,14 +2886,14 @@ void Thread::OnThreadTerminate(BOOL holdingLock)
     {
         // Guaranteed to NOT be a shutdown case, because we tear down the heap before
         // we tear down any threads during shutdown.
-        if (ThisThreadID == CurrentThreadID)
+        if (ThisThreadID == CurrentThreadID && GetAllocContext() != nullptr)
         {
             GCX_COOP();
             // GetTotalAllocatedBytes reads dead_threads_non_alloc_bytes, but will suspend EE, being in COOP mode we cannot race with that
             // however, there could be other threads terminating and doing the same Add.
-            InterlockedExchangeAdd64((LONG64*)&dead_threads_non_alloc_bytes, m_alloc_context.alloc_limit - m_alloc_context.alloc_ptr);
-            GCHeapUtilities::GetGCHeap()->FixAllocContext(&m_alloc_context, NULL, NULL);
-            m_alloc_context.init();
+            InterlockedExchangeAdd64((LONG64*)&dead_threads_non_alloc_bytes, GetAllocContext()->alloc_limit - GetAllocContext()->alloc_ptr);
+            GCHeapUtilities::GetGCHeap()->FixAllocContext(GetAllocContext(), NULL, NULL);
+            GetAllocContext()->init(); // re-initialize the context.
         }
     }
 
@@ -2930,15 +2945,6 @@ void Thread::OnThreadTerminate(BOOL holdingLock)
 
         }
 
-        if  (GCHeapUtilities::IsGCHeapInitialized() && ThisThreadID != CurrentThreadID)
-        {
-            // We must be holding the ThreadStore lock in order to clean up alloc context.
-            // We should never call FixAllocContext during GC.
-            dead_threads_non_alloc_bytes += m_alloc_context.alloc_limit - m_alloc_context.alloc_ptr;
-            GCHeapUtilities::GetGCHeap()->FixAllocContext(&m_alloc_context, NULL, NULL);
-            m_alloc_context.init();
-        }
-
         SetThreadState(TS_Dead);
         ThreadStore::s_pThreadStore->m_DeadThreadCount++;
         ThreadStore::s_pThreadStore->IncrementDeadThreadCountForGCTrigger();
diff --git a/src/coreclr/vm/threads.h b/src/coreclr/vm/threads.h
index c6bdda4e012b86..f4554c9018a01e 100644
--- a/src/coreclr/vm/threads.h
+++ b/src/coreclr/vm/threads.h
@@ -1016,13 +1016,14 @@ class Thread
     // Lock thread is trying to acquire
     VolatilePtr<DeadlockAwareLock> m_pBlockingLock;
 
-public:
+    // We store a pointer to this thread's alloc context here for easier introspection
+    // from other threads and diagnostic tools
+    gc_alloc_context*        m_alloc_context;
 
-    // on MP systems, each thread has its own allocation chunk so we can avoid
-    // lock prefixes and expensive MP cache snooping stuff
-    gc_alloc_context        m_alloc_context;
+public:
+    inline void InitAllocContext() { LIMITED_METHOD_CONTRACT; m_alloc_context = &t_thread_alloc_context; }
 
-    inline gc_alloc_context *GetAllocContext() { LIMITED_METHOD_CONTRACT; return &m_alloc_context; }
+    inline gc_alloc_context *GetAllocContext() { LIMITED_METHOD_CONTRACT; return m_alloc_context; }
 
     // This is the type handle of the first object in the alloc context at the time
     // we fire the AllocationTick event. It's only for tooling purpose.
diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp
index 40ae02264804fd..267b0359740ff0 100644
--- a/src/coreclr/vm/threadsuspend.cpp
+++ b/src/coreclr/vm/threadsuspend.cpp
@@ -2360,7 +2360,7 @@ void Thread::PerformPreemptiveGC()
         // BUG(github #10318) - when not using allocation contexts, the alloc lock
         // must be acquired here. Until fixed, this assert prevents random heap corruption.
         _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
-        GCHeapUtilities::GetGCHeap()->StressHeap(GetThread()->GetAllocContext());
+        GCHeapUtilities::GetGCHeap()->StressHeap(&t_thread_alloc_context);
         m_bGCStressing = FALSE;
     }
     m_GCOnTransitionsOK = TRUE;