diff --git a/src/coreclr/debug/daccess/dacdbiimpl.cpp b/src/coreclr/debug/daccess/dacdbiimpl.cpp index 715c87dedafef2..8b38fdd7a82c89 100644 --- a/src/coreclr/debug/daccess/dacdbiimpl.cpp +++ b/src/coreclr/debug/daccess/dacdbiimpl.cpp @@ -4713,8 +4713,16 @@ void DacDbiInterfaceImpl::GetThreadAllocInfo(VMPTR_Thread vmThread, Thread * pThread = vmThread.GetDacPtr(); gc_alloc_context* allocContext = pThread->GetAllocContext(); - threadAllocInfo->m_allocBytesSOH = allocContext->alloc_bytes - (allocContext->alloc_limit - allocContext->alloc_ptr); - threadAllocInfo->m_allocBytesUOH = allocContext->alloc_bytes_uoh; + if (allocContext != nullptr) + { + threadAllocInfo->m_allocBytesSOH = allocContext->alloc_bytes - (allocContext->alloc_limit - allocContext->alloc_ptr); + threadAllocInfo->m_allocBytesUOH = allocContext->alloc_bytes_uoh; + } + else + { + threadAllocInfo->m_allocBytesSOH = 0; + threadAllocInfo->m_allocBytesUOH = 0; + } } // Set and reset the TSNC_DebuggerUserSuspend bit on the state of the specified thread diff --git a/src/coreclr/debug/daccess/request.cpp b/src/coreclr/debug/daccess/request.cpp index 5353c93a892289..bd62b8d39b7df1 100644 --- a/src/coreclr/debug/daccess/request.cpp +++ b/src/coreclr/debug/daccess/request.cpp @@ -720,8 +720,18 @@ ClrDataAccess::GetThreadAllocData(CLRDATA_ADDRESS addr, struct DacpAllocData *da Thread* thread = PTR_Thread(TO_TADDR(addr)); - data->allocBytes = TO_CDADDR(thread->m_alloc_context.alloc_bytes); - data->allocBytesLoh = TO_CDADDR(thread->m_alloc_context.alloc_bytes_uoh); + gc_alloc_context* pAllocContext = thread->GetAllocContext(); + + if (pAllocContext != NULL) + { + data->allocBytes = TO_CDADDR(pAllocContext->alloc_bytes); + data->allocBytesLoh = TO_CDADDR(pAllocContext->alloc_bytes_uoh); + } + else + { + data->allocBytes = TO_CDADDR(0); + data->allocBytesLoh = TO_CDADDR(0); + } SOSDacLeave(); return hr; @@ -816,8 +826,18 @@ HRESULT ClrDataAccess::GetThreadDataImpl(CLRDATA_ADDRESS threadAddr, struct Dacp 
threadData->osThreadId = (DWORD)thread->m_OSThreadId; threadData->state = thread->m_State; threadData->preemptiveGCDisabled = thread->m_fPreemptiveGCDisabled; - threadData->allocContextPtr = TO_CDADDR(thread->m_alloc_context.alloc_ptr); - threadData->allocContextLimit = TO_CDADDR(thread->m_alloc_context.alloc_limit); + + gc_alloc_context* allocContext = thread->GetAllocContext(); + if (allocContext) + { + threadData->allocContextPtr = TO_CDADDR(allocContext->alloc_ptr); + threadData->allocContextLimit = TO_CDADDR(allocContext->alloc_limit); + } + else + { + threadData->allocContextPtr = TO_CDADDR(0); + threadData->allocContextLimit = TO_CDADDR(0); + } threadData->fiberData = (CLRDATA_ADDRESS)NULL; diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index f60713ef587e1c..255378fe1d19f0 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -621,8 +621,8 @@ if(CLR_CMAKE_TARGET_ARCH_AMD64) ${ARCH_SOURCES_DIR}/GenericComPlusCallStubs.asm ${ARCH_SOURCES_DIR}/getstate.asm ${ARCH_SOURCES_DIR}/JitHelpers_Fast.asm + ${ARCH_SOURCES_DIR}/JitHelpers_FastMP.asm ${ARCH_SOURCES_DIR}/JitHelpers_FastWriteBarriers.asm - ${ARCH_SOURCES_DIR}/JitHelpers_InlineGetThread.asm ${ARCH_SOURCES_DIR}/JitHelpers_SingleAppDomain.asm ${ARCH_SOURCES_DIR}/JitHelpers_Slow.asm ${ARCH_SOURCES_DIR}/patchedcode.asm diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 2d14b9c31e8fca..8e67aedd9c6dc0 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -206,6 +206,26 @@ INLINE_GETTHREAD macro Reg endm +; +; Inlined macro to get the current thread's allocation context +; Trashes rax and r11 +; + +INLINE_GET_ALLOC_CONTEXT macro Reg + + EXTERN _tls_index: DWORD + EXTERN t_thread_alloc_context: DWORD + + mov r11d, [_tls_index] + mov rax, gs:[OFFSET__TEB__ThreadLocalStoragePointer] + mov rax, [rax + r11 * 8] + mov r11d, SECTIONREL t_thread_alloc_context + add rax, r11 + mov Reg, rax + + 
endm + + ; if you change this code there will be corresponding code in JITInterfaceGen.cpp which will need to be changed ; diff --git a/src/coreclr/vm/amd64/JitHelpers_FastMP.asm b/src/coreclr/vm/amd64/JitHelpers_FastMP.asm new file mode 100644 index 00000000000000..9849b8d8016d70 --- /dev/null +++ b/src/coreclr/vm/amd64/JitHelpers_FastMP.asm @@ -0,0 +1,75 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. + +; *********************************************************************** +; File: JitHelpers_FastMP.asm, see history in jithelp.asm +; +; *********************************************************************** + +include AsmMacros.inc +include asmconstants.inc + +CopyValueClassUnchecked equ ?CopyValueClassUnchecked@@YAXPEAX0PEAVMethodTable@@@Z +JIT_Box equ ?JIT_Box@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@PEAX@Z + +extern CopyValueClassUnchecked:proc +extern JIT_Box:proc + +; HCIMPL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* unboxedData) +NESTED_ENTRY JIT_BoxFastMP, _TEXT + + ; m_BaseSize is guaranteed to be a multiple of 8. 
+ mov r8d, [rcx + OFFSET__MethodTable__m_BaseSize] + + INLINE_GET_ALLOC_CONTEXT r11 + mov r10, [r11 + OFFSETOF__gc_alloc_context__alloc_limit] + mov rax, [r11 + OFFSETOF__gc_alloc_context__alloc_ptr] + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + test rdx, rdx + je NullRef + + mov [r11 + OFFSETOF__gc_alloc_context__alloc_ptr], r8 + mov [rax], rcx + + ; Check whether the object contains pointers + test dword ptr [rcx + OFFSETOF__MethodTable__m_dwFlags], MethodTable__enum_flag_ContainsPointers + jnz ContainsPointers + + ; We have no pointers - emit a simple inline copy loop + ; Copy the contents from the end + mov ecx, [rcx + OFFSET__MethodTable__m_BaseSize] + sub ecx, 18h ; sizeof(ObjHeader) + sizeof(Object) + last slot + +align 16 + CopyLoop: + mov r8, [rdx+rcx] + mov [rax+rcx+8], r8 + sub ecx, 8 + jge CopyLoop + REPRET + + ContainsPointers: + ; Do call to CopyValueClassUnchecked(object, data, pMT) + push_vol_reg rax + alloc_stack 20h + END_PROLOGUE + + mov r8, rcx + lea rcx, [rax + 8] + call CopyValueClassUnchecked + + add rsp, 20h + pop rax + ret + + AllocFailed: + NullRef: + jmp JIT_Box +NESTED_END JIT_BoxFastMP, _TEXT + + end diff --git a/src/coreclr/vm/amd64/JitHelpers_InlineGetThread.asm b/src/coreclr/vm/amd64/JitHelpers_InlineGetThread.asm deleted file mode 100644 index bf79668e567e29..00000000000000 --- a/src/coreclr/vm/amd64/JitHelpers_InlineGetThread.asm +++ /dev/null @@ -1,263 +0,0 @@ -; Licensed to the .NET Foundation under one or more agreements. -; The .NET Foundation licenses this file to you under the MIT license. - -; *********************************************************************** -; File: JitHelpers_InlineGetThread.asm, see history in jithelp.asm -; -; Notes: These routinues will be patched at runtime with the location in -; the TLS to find the Thread* and are the fastest implementation -; of their specific functionality. 
-; *********************************************************************** - -include AsmMacros.inc -include asmconstants.inc - -; Min amount of stack space that a nested function should allocate. -MIN_SIZE equ 28h - -JIT_NEW equ ?JIT_New@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@@Z -CopyValueClassUnchecked equ ?CopyValueClassUnchecked@@YAXPEAX0PEAVMethodTable@@@Z -JIT_Box equ ?JIT_Box@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@PEAX@Z -g_pStringClass equ ?g_pStringClass@@3PEAVMethodTable@@EA -FramedAllocateString equ ?FramedAllocateString@@YAPEAVStringObject@@K@Z -JIT_NewArr1 equ ?JIT_NewArr1@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@_J@Z - -INVALIDGCVALUE equ 0CCCCCCCDh - -extern JIT_NEW:proc -extern CopyValueClassUnchecked:proc -extern JIT_Box:proc -extern g_pStringClass:QWORD -extern FramedAllocateString:proc -extern JIT_NewArr1:proc - -extern JIT_InternalThrow:proc - -; IN: rcx: MethodTable* -; OUT: rax: new object -LEAF_ENTRY JIT_TrialAllocSFastMP_InlineGetThread, _TEXT - mov edx, [rcx + OFFSET__MethodTable__m_BaseSize] - - ; m_BaseSize is guaranteed to be a multiple of 8. - - INLINE_GETTHREAD r11 - mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] - mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] - - add rdx, rax - - cmp rdx, r10 - ja AllocFailed - - mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], rdx - mov [rax], rcx - - ret - - AllocFailed: - jmp JIT_NEW -LEAF_END JIT_TrialAllocSFastMP_InlineGetThread, _TEXT - -; HCIMPL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* unboxedData) -NESTED_ENTRY JIT_BoxFastMP_InlineGetThread, _TEXT - - ; m_BaseSize is guaranteed to be a multiple of 8. 
- mov r8d, [rcx + OFFSET__MethodTable__m_BaseSize] - - INLINE_GETTHREAD r11 - mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] - mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] - - add r8, rax - - cmp r8, r10 - ja AllocFailed - - test rdx, rdx - je NullRef - - mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 - mov [rax], rcx - - ; Check whether the object contains pointers - test dword ptr [rcx + OFFSETOF__MethodTable__m_dwFlags], MethodTable__enum_flag_ContainsPointers - jnz ContainsPointers - - ; We have no pointers - emit a simple inline copy loop - ; Copy the contents from the end - mov ecx, [rcx + OFFSET__MethodTable__m_BaseSize] - sub ecx, 18h ; sizeof(ObjHeader) + sizeof(Object) + last slot - -align 16 - CopyLoop: - mov r8, [rdx+rcx] - mov [rax+rcx+8], r8 - sub ecx, 8 - jge CopyLoop - REPRET - - ContainsPointers: - ; Do call to CopyValueClassUnchecked(object, data, pMT) - push_vol_reg rax - alloc_stack 20h - END_PROLOGUE - - mov r8, rcx - lea rcx, [rax + 8] - call CopyValueClassUnchecked - - add rsp, 20h - pop rax - ret - - AllocFailed: - NullRef: - jmp JIT_Box -NESTED_END JIT_BoxFastMP_InlineGetThread, _TEXT - -LEAF_ENTRY AllocateStringFastMP_InlineGetThread, _TEXT - ; We were passed the number of characters in ECX - - ; we need to load the method table for string from the global - mov r9, [g_pStringClass] - - ; Instead of doing elaborate overflow checks, we just limit the number of elements - ; to (LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR) or less. - ; This will avoid all overflow problems, as well as making sure - ; big string objects are correctly allocated in the big object heap. - - cmp ecx, (ASM_LARGE_OBJECT_SIZE - 256)/2 - jae OversizedString - - ; Calculate the final size to allocate. - ; We need to calculate baseSize + cnt*2, then round that up by adding 7 and anding ~7. 
- - lea edx, [STRING_BASE_SIZE + ecx*2 + 7] - and edx, -8 - - INLINE_GETTHREAD r11 - mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] - mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] - - add rdx, rax - - cmp rdx, r10 - ja AllocFailed - - mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], rdx - mov [rax], r9 - - mov [rax + OFFSETOF__StringObject__m_StringLength], ecx - - ret - - OversizedString: - AllocFailed: - jmp FramedAllocateString -LEAF_END AllocateStringFastMP_InlineGetThread, _TEXT - -; HCIMPL2(Object*, JIT_NewArr1VC_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size) -LEAF_ENTRY JIT_NewArr1VC_MP_InlineGetThread, _TEXT - ; We were passed a (shared) method table in RCX, which contains the element type. - - ; The element count is in RDX - - ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need - ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray. - - ; Do a conservative check here. This is to avoid overflow while doing the calculations. We don't - ; have to worry about "large" objects, since the allocation quantum is never big enough for - ; LARGE_OBJECT_SIZE. - - ; For Value Classes, this needs to be 2^16 - slack (2^32 / max component size), - ; The slack includes the size for the array header and round-up ; for alignment. Use 256 for the - ; slack value out of laziness. - - ; In both cases we do a final overflow check after adding to the alloc_ptr. 
- - cmp rdx, (65535 - 256) - jae OversizedArray - - movzx r8d, word ptr [rcx + OFFSETOF__MethodTable__m_dwFlags] ; component size is low 16 bits - imul r8d, edx - add r8d, dword ptr [rcx + OFFSET__MethodTable__m_BaseSize] - - ; round the size to a multiple of 8 - - add r8d, 7 - and r8d, -8 - - - INLINE_GETTHREAD r11 - mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] - mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] - - add r8, rax - jc AllocFailed - - cmp r8, r10 - ja AllocFailed - - mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 - mov [rax], rcx - - mov dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx - - ret - - OversizedArray: - AllocFailed: - jmp JIT_NewArr1 -LEAF_END JIT_NewArr1VC_MP_InlineGetThread, _TEXT - - -; HCIMPL2(Object*, JIT_NewArr1OBJ_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size) -LEAF_ENTRY JIT_NewArr1OBJ_MP_InlineGetThread, _TEXT - ; We were passed a (shared) method table in RCX, which contains the element type. - - ; The element count is in RDX - - ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need - ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray. - - ; Verifies that LARGE_OBJECT_SIZE fits in 32-bit. This allows us to do array size - ; arithmetic using 32-bit registers. - .erre ASM_LARGE_OBJECT_SIZE lt 100000000h - - cmp rdx, (ASM_LARGE_OBJECT_SIZE - 256)/8 ; sizeof(void*) - jae OversizedArray - - ; In this case we know the element size is sizeof(void *), or 8 for x64 - ; This helps us in two ways - we can shift instead of multiplying, and - ; there's no need to align the size either - - mov r8d, dword ptr [rcx + OFFSET__MethodTable__m_BaseSize] - lea r8d, [r8d + edx * 8] - - ; No need for rounding in this case - element size is 8, and m_BaseSize is guaranteed - ; to be a multiple of 8. 
- - INLINE_GETTHREAD r11 - mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] - mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] - - add r8, rax - - cmp r8, r10 - ja AllocFailed - - mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 - mov [rax], rcx - - mov dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx - - ret - - OversizedArray: - AllocFailed: - jmp JIT_NewArr1 -LEAF_END JIT_NewArr1OBJ_MP_InlineGetThread, _TEXT - - - end - diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index b51088a6b47930..60892996320047 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -111,12 +111,6 @@ ASMCONSTANTS_C_ASSERT(OFFSETOF__Thread__m_pFrame #define Thread_m_pFrame OFFSETOF__Thread__m_pFrame -#define OFFSET__Thread__m_alloc_context__alloc_ptr 0x48 -ASMCONSTANTS_C_ASSERT(OFFSET__Thread__m_alloc_context__alloc_ptr == offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr)); - -#define OFFSET__Thread__m_alloc_context__alloc_limit 0x50 -ASMCONSTANTS_C_ASSERT(OFFSET__Thread__m_alloc_context__alloc_limit == offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_limit)); - #define OFFSETOF__gc_alloc_context__alloc_ptr 0x0 ASMCONSTANT_OFFSETOF_ASSERT(gc_alloc_context, alloc_ptr); diff --git a/src/coreclr/vm/comutilnative.cpp b/src/coreclr/vm/comutilnative.cpp index a3c9d0a848cdff..0756c41e410ccd 100644 --- a/src/coreclr/vm/comutilnative.cpp +++ b/src/coreclr/vm/comutilnative.cpp @@ -900,7 +900,7 @@ FCIMPL0(INT64, GCInterface::GetAllocatedBytesForCurrentThread) INT64 currentAllocated = 0; Thread *pThread = GetThread(); - gc_alloc_context* ac = pThread->GetAllocContext(); + gc_alloc_context* ac = &t_thread_alloc_context; currentAllocated = ac->alloc_bytes + ac->alloc_bytes_uoh - (ac->alloc_limit - ac->alloc_ptr); return currentAllocated; @@ -987,7 +987,10 @@ extern "C" INT64 QCALLTYPE GCInterface_GetTotalAllocatedBytesPrecise() for 
(Thread *pThread = ThreadStore::GetThreadList(NULL); pThread; pThread = ThreadStore::GetThreadList(pThread)) { gc_alloc_context* ac = pThread->GetAllocContext(); - allocated -= ac->alloc_limit - ac->alloc_ptr; + if (ac != nullptr) + { + allocated -= ac->alloc_limit - ac->alloc_ptr; + } } ThreadSuspend::RestartEE(FALSE, TRUE); diff --git a/src/coreclr/vm/gccover.cpp b/src/coreclr/vm/gccover.cpp index 70ab39ea681b02..67d4bdf4e25521 100644 --- a/src/coreclr/vm/gccover.cpp +++ b/src/coreclr/vm/gccover.cpp @@ -1859,7 +1859,7 @@ void DoGcStress (PCONTEXT regs, NativeCodeVersion nativeCodeVersion) // BUG(github #10318) - when not using allocation contexts, the alloc lock // must be acquired here. Until fixed, this assert prevents random heap corruption. assert(GCHeapUtilities::UseThreadAllocationContexts()); - GCHeapUtilities::GetGCHeap()->StressHeap(GetThread()->GetAllocContext()); + GCHeapUtilities::GetGCHeap()->StressHeap(&t_thread_alloc_context); // StressHeap can exit early w/o forcing a SuspendEE to trigger the instruction update // We can not rely on the return code to determine if the instruction update happened diff --git a/src/coreclr/vm/gcenv.ee.cpp b/src/coreclr/vm/gcenv.ee.cpp index a4a538780aa4ad..62708eb53f080e 100644 --- a/src/coreclr/vm/gcenv.ee.cpp +++ b/src/coreclr/vm/gcenv.ee.cpp @@ -291,8 +291,10 @@ void GCToEEInterface::GcScanRoots(promote_func* fn, int condemned, int max_gen, Thread* pThread = NULL; while ((pThread = ThreadStore::GetThreadList(pThread)) != NULL) { - if (GCHeapUtilities::GetGCHeap()->IsThreadUsingAllocationContextHeap( - pThread->GetAllocContext(), sc->thread_number)) + gc_alloc_context* palloc_context = pThread->GetAllocContext(); + if (palloc_context != nullptr + && GCHeapUtilities::GetGCHeap()->IsThreadUsingAllocationContextHeap( + palloc_context, sc->thread_number)) { STRESS_LOG2(LF_GC | LF_GCROOTS, LL_INFO100, "{ Starting scan of Thread %p ID = %x\n", pThread, pThread->GetThreadId()); @@ -435,13 +437,12 @@ gc_alloc_context * 
GCToEEInterface::GetAllocContext() { WRAPPER_NO_CONTRACT; - Thread* pThread = ::GetThreadNULLOk(); - if (!pThread) + if (!::GetThreadNULLOk()) { return nullptr; } - return pThread->GetAllocContext(); + return &t_thread_alloc_context; } void GCToEEInterface::GcEnumAllocContexts(enum_alloc_context_func* fn, void* param) @@ -458,7 +459,11 @@ void GCToEEInterface::GcEnumAllocContexts(enum_alloc_context_func* fn, void* par Thread * pThread = NULL; while ((pThread = ThreadStore::GetThreadList(pThread)) != NULL) { - fn(pThread->GetAllocContext(), param); + gc_alloc_context* palloc_context = pThread->GetAllocContext(); + if (palloc_context != nullptr) + { + fn(palloc_context, param); + } } } else diff --git a/src/coreclr/vm/gcheaputilities.cpp b/src/coreclr/vm/gcheaputilities.cpp index 2f588ae6bdaec1..a365300be4f61a 100644 --- a/src/coreclr/vm/gcheaputilities.cpp +++ b/src/coreclr/vm/gcheaputilities.cpp @@ -43,6 +43,12 @@ bool g_sw_ww_enabled_for_gc_heap = false; GVAL_IMPL_INIT(gc_alloc_context, g_global_alloc_context, {}); +// on MP systems, each thread has its own allocation chunk so we can avoid +// lock prefixes and expensive MP cache snooping stuff +#ifndef _MSC_VER +__thread gc_alloc_context t_thread_alloc_context; +#endif + enum GC_LOAD_STATUS { GC_LOAD_STATUS_BEFORE_START, GC_LOAD_STATUS_START, diff --git a/src/coreclr/vm/gcheaputilities.h b/src/coreclr/vm/gcheaputilities.h index c652cc52bf417c..c20c574d470619 100644 --- a/src/coreclr/vm/gcheaputilities.h +++ b/src/coreclr/vm/gcheaputilities.h @@ -26,6 +26,14 @@ GVAL_DECL(gc_alloc_context, g_global_alloc_context); } #endif // !DACCESS_COMPILE +// on MP systems, each thread has its own allocation chunk so we can avoid +// lock prefixes and expensive MP cache snooping stuff +#ifdef _MSC_VER +EXTERN_C __declspec(selectany) __declspec(thread) gc_alloc_context t_thread_alloc_context; +#else +EXTERN_C __thread gc_alloc_context t_thread_alloc_context; +#endif + extern "C" uint32_t* g_card_bundle_table; extern "C" uint8_t* 
g_ephemeral_low; extern "C" uint8_t* g_ephemeral_high; diff --git a/src/coreclr/vm/gchelpers.cpp b/src/coreclr/vm/gchelpers.cpp index 63754563b496b4..4835b6b320aa87 100644 --- a/src/coreclr/vm/gchelpers.cpp +++ b/src/coreclr/vm/gchelpers.cpp @@ -46,7 +46,7 @@ inline gc_alloc_context* GetThreadAllocContext() assert(GCHeapUtilities::UseThreadAllocationContexts()); - return & GetThread()->m_alloc_context; + return &t_thread_alloc_context; } // When not using per-thread allocation contexts, we (the EE) need to take care that diff --git a/src/coreclr/vm/gcstress.h b/src/coreclr/vm/gcstress.h index d46ef841f76718..3cd7894d9b3efb 100644 --- a/src/coreclr/vm/gcstress.h +++ b/src/coreclr/vm/gcstress.h @@ -289,7 +289,7 @@ namespace _GCStress // BUG(github #10318) - when not using allocation contexts, the alloc lock // must be acquired here. Until fixed, this assert prevents random heap corruption. _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); - GCHeapUtilities::GetGCHeap()->StressHeap(GetThread()->GetAllocContext()); + GCHeapUtilities::GetGCHeap()->StressHeap(&t_thread_alloc_context); } FORCEINLINE diff --git a/src/coreclr/vm/i386/jitinterfacex86.cpp b/src/coreclr/vm/i386/jitinterfacex86.cpp index 67decfd147f986..492bc66bd1867a 100644 --- a/src/coreclr/vm/i386/jitinterfacex86.cpp +++ b/src/coreclr/vm/i386/jitinterfacex86.cpp @@ -230,15 +230,15 @@ void JIT_TrialAlloc::EmitCore(CPUSTUBLINKER *psl, CodeLabel *noLock, CodeLabel * && "EAX should contain size for allocation and it doesnt!!!"); // Fetch current thread into EDX, preserving EAX and ECX - psl->X86EmitCurrentThreadFetch(kEDX, (1 << kEAX) | (1 << kECX)); + psl->X86EmitCurrentThreadAllocContextFetch(kEDX, (1 << kEAX) | (1 << kECX)); // Try the allocation. 
if (flags & (ALIGN8 | SIZE_IN_EAX | ALIGN8OBJ)) { - // MOV EBX, [edx]Thread.m_alloc_context.alloc_ptr - psl->X86EmitOffsetModRM(0x8B, kEBX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr)); + // MOV EBX, [edx]gc_alloc_context.alloc_ptr + psl->X86EmitOffsetModRM(0x8B, kEBX, kEDX, offsetof(gc_alloc_context, alloc_ptr)); // add EAX, EBX psl->Emit16(0xC303); if (flags & ALIGN8) @@ -246,20 +246,20 @@ void JIT_TrialAlloc::EmitCore(CPUSTUBLINKER *psl, CodeLabel *noLock, CodeLabel * } else { - // add eax, [edx]Thread.m_alloc_context.alloc_ptr - psl->X86EmitOffsetModRM(0x03, kEAX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr)); + // add eax, [edx]gc_alloc_context.alloc_ptr + psl->X86EmitOffsetModRM(0x03, kEAX, kEDX, offsetof(gc_alloc_context, alloc_ptr)); } - // cmp eax, [edx]Thread.m_alloc_context.alloc_limit - psl->X86EmitOffsetModRM(0x3b, kEAX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_limit)); + // cmp eax, [edx]gc_alloc_context.alloc_limit + psl->X86EmitOffsetModRM(0x3b, kEAX, kEDX, offsetof(gc_alloc_context, alloc_limit)); // ja noAlloc psl->X86EmitCondJump(noAlloc, X86CondCode::kJA); // Fill in the allocation and get out. 
- // mov [edx]Thread.m_alloc_context.alloc_ptr, eax - psl->X86EmitIndexRegStore(kEDX, offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr), kEAX); + // mov [edx]gc_alloc_context.alloc_ptr, eax + psl->X86EmitIndexRegStore(kEDX, offsetof(gc_alloc_context, alloc_ptr), kEAX); if (flags & (ALIGN8 | SIZE_IN_EAX | ALIGN8OBJ)) { diff --git a/src/coreclr/vm/i386/stublinkerx86.cpp b/src/coreclr/vm/i386/stublinkerx86.cpp index 8fb36cc501c3c2..413bbfedb6cc29 100644 --- a/src/coreclr/vm/i386/stublinkerx86.cpp +++ b/src/coreclr/vm/i386/stublinkerx86.cpp @@ -2447,6 +2447,73 @@ VOID StubLinkerCPU::X86EmitCurrentThreadFetch(X86Reg dstreg, unsigned preservedR #endif // TARGET_UNIX } +#ifdef TARGET_UNIX +namespace +{ + gc_alloc_context* STDCALL GetAllocContextHelper() + { + return &t_thread_alloc_context; + } +} +#endif + +VOID StubLinkerCPU::X86EmitCurrentThreadAllocContextFetch(X86Reg dstreg, unsigned preservedRegSet) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + // It doesn't make sense to have the destination register be preserved + PRECONDITION((preservedRegSet & (1 << dstreg)) == 0); + AMD64_ONLY(PRECONDITION(dstreg < 8)); // code below doesn't support high registers + } + CONTRACTL_END; + +#ifdef TARGET_UNIX + + X86EmitPushRegs(preservedRegSet & ((1 << kEAX) | (1 << kEDX) | (1 << kECX))); + + // call GetAllocContextHelper + X86EmitCall(NewExternalCodeLabel((LPVOID)GetAllocContextHelper), sizeof(void*)); + + // mov dstreg, eax + X86EmitMovRegReg(dstreg, kEAX); + + X86EmitPopRegs(preservedRegSet & ((1 << kEAX) | (1 << kEDX) | (1 << kECX))); + +#ifdef _DEBUG + // Trash caller saved regs that we were not told to preserve, and that aren't the dstreg. 
+ preservedRegSet |= 1 << dstreg; + if (!(preservedRegSet & (1 << kEAX))) + X86EmitDebugTrashReg(kEAX); + if (!(preservedRegSet & (1 << kEDX))) + X86EmitDebugTrashReg(kEDX); + if (!(preservedRegSet & (1 << kECX))) + X86EmitDebugTrashReg(kECX); +#endif // _DEBUG + +#else // TARGET_UNIX + +#ifdef TARGET_AMD64 + BYTE code[] = { 0x65,0x48,0x8b,0x04,0x25 }; // mov dstreg, qword ptr gs:[IMM32] + static const int regByteIndex = 3; +#elif defined(TARGET_X86) + BYTE code[] = { 0x64,0x8b,0x05 }; // mov dstreg, dword ptr fs:[IMM32] + static const int regByteIndex = 2; +#endif + code[regByteIndex] |= (dstreg << 3); + + EmitBytes(code, sizeof(code)); + Emit32(offsetof(TEB, ThreadLocalStoragePointer)); + + X86EmitIndexRegLoad(dstreg, dstreg, sizeof(void *) * _tls_index); + + _ASSERTE(Thread::GetOffsetOfThreadStatic(&t_thread_alloc_context) < INT_MAX); + X86EmitAddReg(dstreg, (int32_t)Thread::GetOffsetOfThreadStatic(&t_thread_alloc_context)); + +#endif // TARGET_UNIX +} + #if defined(FEATURE_COMINTEROP) && defined(TARGET_X86) #if defined(PROFILING_SUPPORTED) diff --git a/src/coreclr/vm/i386/stublinkerx86.h b/src/coreclr/vm/i386/stublinkerx86.h index 35aec1598fd559..3741d87d79995e 100644 --- a/src/coreclr/vm/i386/stublinkerx86.h +++ b/src/coreclr/vm/i386/stublinkerx86.h @@ -218,6 +218,8 @@ class StubLinkerCPU : public StubLinker VOID X86EmitCurrentThreadFetch(X86Reg dstreg, unsigned preservedRegSet); + VOID X86EmitCurrentThreadAllocContextFetch(X86Reg dstreg, unsigned preservedRegSet); + VOID X86EmitIndexRegLoad(X86Reg dstreg, X86Reg srcreg, int32_t ofs = 0); VOID X86EmitIndexRegStore(X86Reg dstreg, int32_t ofs, X86Reg srcreg); #if defined(TARGET_AMD64) diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index 13fbeaab2332d5..16771a339cfd8a 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -2105,48 +2105,42 @@ HCIMPLEND //************************************************************* // Allocation fast path for typical objects 
// -HCIMPL1(Object*, JIT_NewS_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_) +HCIMPL1_RAW(Object*, JIT_NewS_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_) { - FCALL_CONTRACT; - - do - { - _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); + CONTRACTL { + THROWS; + DISABLED(GC_TRIGGERS); + MODE_COOPERATIVE; + } CONTRACTL_END; - // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler - // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates - // some reshuffling of intermediate values into nonvolatile registers around the call. - Thread *thread = GetThread(); + _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); + gc_alloc_context *allocContext = &t_thread_alloc_context; - TypeHandle typeHandle(typeHnd_); - _ASSERTE(!typeHandle.IsTypeDesc()); // heap objects must have method tables - MethodTable *methodTable = typeHandle.AsMethodTable(); + TypeHandle typeHandle(typeHnd_); + _ASSERTE(!typeHandle.IsTypeDesc()); // heap objects must have method tables + MethodTable *methodTable = typeHandle.AsMethodTable(); - SIZE_T size = methodTable->GetBaseSize(); - _ASSERTE(size % DATA_ALIGNMENT == 0); + SIZE_T size = methodTable->GetBaseSize(); + _ASSERTE(size % DATA_ALIGNMENT == 0); - gc_alloc_context *allocContext = thread->GetAllocContext(); - BYTE *allocPtr = allocContext->alloc_ptr; - _ASSERTE(allocPtr <= allocContext->alloc_limit); - if (size > static_cast(allocContext->alloc_limit - allocPtr)) - { - break; - } - allocContext->alloc_ptr = allocPtr + size; + BYTE *allocPtr = allocContext->alloc_ptr; + _ASSERTE(allocPtr <= allocContext->alloc_limit); + if (size > static_cast(allocContext->alloc_limit - allocPtr)) + { + // Tail call to the slow helper + return HCCALL1(JIT_New, typeHnd_); + } - _ASSERTE(allocPtr != nullptr); - Object *object = reinterpret_cast(allocPtr); - _ASSERTE(object->HasEmptySyncBlockInfo()); - 
object->SetMethodTable(methodTable); + allocContext->alloc_ptr = allocPtr + size; - return object; - } while (false); + _ASSERTE(allocPtr != nullptr); + Object *object = reinterpret_cast(allocPtr); + _ASSERTE(object->HasEmptySyncBlockInfo()); + object->SetMethodTable(methodTable); - // Tail call to the slow helper - ENDFORBIDGC(); - return HCCALL1(JIT_New, typeHnd_); + return object; } -HCIMPLEND +HCIMPLEND_RAW #include @@ -2219,84 +2213,57 @@ HCIMPLEND //************************************************************* // Allocation fast path for typical objects // -HCIMPL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength) +HCIMPL1_RAW(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength) { - FCALL_CONTRACT; - - do - { - _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); + CONTRACTL { + THROWS; + DISABLED(GC_TRIGGERS); + MODE_COOPERATIVE; + } CONTRACTL_END; - // Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow - // problems, as well as making sure big string objects are correctly allocated in the big object heap. - if (stringLength >= (LARGE_OBJECT_SIZE - 256) / sizeof(WCHAR)) - { - break; - } + _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); - // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler - // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates - // some reshuffling of intermediate values into nonvolatile registers around the call. - Thread *thread = GetThread(); + // Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow + // problems, as well as making sure big string objects are correctly allocated in the big object heap. 
+ if (stringLength >= (LARGE_OBJECT_SIZE - 256) / sizeof(WCHAR)) + { + // Tail call to the slow helper + return HCCALL1(FramedAllocateString, stringLength); + } - SIZE_T totalSize = StringObject::GetSize(stringLength); + gc_alloc_context *allocContext = &t_thread_alloc_context; - // The method table's base size includes space for a terminating null character - _ASSERTE(totalSize >= g_pStringClass->GetBaseSize()); - _ASSERTE((totalSize - g_pStringClass->GetBaseSize()) / sizeof(WCHAR) == stringLength); + SIZE_T totalSize = StringObject::GetSize(stringLength); - SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT); - _ASSERTE(alignedTotalSize >= totalSize); - totalSize = alignedTotalSize; + // The method table's base size includes space for a terminating null character + _ASSERTE(totalSize >= g_pStringClass->GetBaseSize()); + _ASSERTE((totalSize - g_pStringClass->GetBaseSize()) / sizeof(WCHAR) == stringLength); - gc_alloc_context *allocContext = thread->GetAllocContext(); - BYTE *allocPtr = allocContext->alloc_ptr; - _ASSERTE(allocPtr <= allocContext->alloc_limit); - if (totalSize > static_cast(allocContext->alloc_limit - allocPtr)) - { - break; - } - allocContext->alloc_ptr = allocPtr + totalSize; + SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT); + _ASSERTE(alignedTotalSize >= totalSize); + totalSize = alignedTotalSize; - _ASSERTE(allocPtr != nullptr); - StringObject *stringObject = reinterpret_cast(allocPtr); - stringObject->SetMethodTable(g_pStringClass); - stringObject->SetStringLength(stringLength); - _ASSERTE(stringObject->GetBuffer()[stringLength] == W('\0')); + BYTE *allocPtr = allocContext->alloc_ptr; + _ASSERTE(allocPtr <= allocContext->alloc_limit); + if (totalSize > static_cast(allocContext->alloc_limit - allocPtr)) + { + // Tail call to the slow helper + return HCCALL1(FramedAllocateString, stringLength); + } + allocContext->alloc_ptr = allocPtr + totalSize; - return stringObject; - } while (false); + _ASSERTE(allocPtr != nullptr); 
+ StringObject *stringObject = reinterpret_cast(allocPtr); + stringObject->SetMethodTable(g_pStringClass); + stringObject->SetStringLength(stringLength); + _ASSERTE(stringObject->GetBuffer()[stringLength] == W('\0')); - // Tail call to the slow helper - ENDFORBIDGC(); - return HCCALL1(FramedAllocateString, stringLength); + return stringObject; } -HCIMPLEND +HCIMPLEND_RAW #include -/*********************************************************************/ -/* We don't use HCIMPL macros because this is not a real helper call */ -/* This function just needs mangled arguments like a helper call */ - -HCIMPL1_RAW(StringObject*, UnframedAllocateString, DWORD stringLength) -{ - // This isn't _really_ an FCALL and therefore shouldn't have the - // SO_TOLERANT part of the FCALL_CONTRACT b/c it is not entered - // from managed code. - CONTRACTL { - THROWS; - GC_TRIGGERS; - MODE_COOPERATIVE; - } CONTRACTL_END; - - STRINGREF result; - result = AllocateString(stringLength); - - return((StringObject*) OBJECTREFToObject(result)); -} -HCIMPLEND_RAW - HCIMPL1(StringObject*, FramedAllocateString, DWORD stringLength) { FCALL_CONTRACT; @@ -2356,129 +2323,123 @@ HCIMPLEND //************************************************************* // Array allocation fast path for arrays of value type elements // -HCIMPL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size) +HCIMPL2_RAW(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size) { - FCALL_CONTRACT; - - do - { - _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); + CONTRACTL { + THROWS; + DISABLED(GC_TRIGGERS); + MODE_COOPERATIVE; + } CONTRACTL_END; - // Do a conservative check here. This is to avoid overflow while doing the calculations. We don't - // have to worry about "large" objects, since the allocation quantum is never big enough for - // LARGE_OBJECT_SIZE. 
- // - // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size), - // The slack includes the size for the array header and round-up ; for alignment. Use 256 for the - // slack value out of laziness. - SIZE_T componentCount = static_cast(size); - if (componentCount >= static_cast(65535 - 256)) - { - break; - } + _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); - // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler - // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates - // some reshuffling of intermediate values into nonvolatile registers around the call. - Thread *thread = GetThread(); + // Do a conservative check here. This is to avoid overflow while doing the calculations. We don't + // have to worry about "large" objects, since the allocation quantum is never big enough for + // LARGE_OBJECT_SIZE. + // + // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size), + // The slack includes the size for the array header and round-up ; for alignment. Use 256 for the + // slack value out of laziness. 
+ SIZE_T componentCount = static_cast(size); + if (componentCount >= static_cast(65535 - 256)) + { + // Tail call to the slow helper + return HCCALL2(JIT_NewArr1, arrayMT, size); + } - MethodTable *pArrayMT = (MethodTable *)arrayMT; + gc_alloc_context *allocContext = &t_thread_alloc_context; - _ASSERTE(pArrayMT->HasComponentSize()); - SIZE_T componentSize = pArrayMT->RawGetComponentSize(); - SIZE_T totalSize = componentCount * componentSize; - _ASSERTE(totalSize / componentSize == componentCount); + MethodTable *pArrayMT = (MethodTable *)arrayMT; - SIZE_T baseSize = pArrayMT->GetBaseSize(); - totalSize += baseSize; - _ASSERTE(totalSize >= baseSize); + _ASSERTE(pArrayMT->HasComponentSize()); + SIZE_T componentSize = pArrayMT->RawGetComponentSize(); + SIZE_T totalSize = componentCount * componentSize; + _ASSERTE(totalSize / componentSize == componentCount); - SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT); - _ASSERTE(alignedTotalSize >= totalSize); - totalSize = alignedTotalSize; + SIZE_T baseSize = pArrayMT->GetBaseSize(); + totalSize += baseSize; + _ASSERTE(totalSize >= baseSize); - gc_alloc_context *allocContext = thread->GetAllocContext(); - BYTE *allocPtr = allocContext->alloc_ptr; - _ASSERTE(allocPtr <= allocContext->alloc_limit); - if (totalSize > static_cast(allocContext->alloc_limit - allocPtr)) - { - break; - } - allocContext->alloc_ptr = allocPtr + totalSize; + SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT); + _ASSERTE(alignedTotalSize >= totalSize); + totalSize = alignedTotalSize; - _ASSERTE(allocPtr != nullptr); - ArrayBase *array = reinterpret_cast(allocPtr); - array->SetMethodTable(pArrayMT); - _ASSERTE(static_cast(componentCount) == componentCount); - array->m_NumComponents = static_cast(componentCount); + BYTE *allocPtr = allocContext->alloc_ptr; + _ASSERTE(allocPtr <= allocContext->alloc_limit); + if (totalSize > static_cast(allocContext->alloc_limit - allocPtr)) + { + // Tail call to the slow helper + return 
HCCALL2(JIT_NewArr1, arrayMT, size); + } + allocContext->alloc_ptr = allocPtr + totalSize; - return array; - } while (false); + _ASSERTE(allocPtr != nullptr); + ArrayBase *array = reinterpret_cast(allocPtr); + array->SetMethodTable(pArrayMT); + _ASSERTE(static_cast(componentCount) == componentCount); + array->m_NumComponents = static_cast(componentCount); - // Tail call to the slow helper - ENDFORBIDGC(); - return HCCALL2(JIT_NewArr1, arrayMT, size); + return array; } -HCIMPLEND +HCIMPLEND_RAW //************************************************************* // Array allocation fast path for arrays of object elements // -HCIMPL2(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size) +HCIMPL2_RAW(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size) { - FCALL_CONTRACT; - - do - { - _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); + CONTRACTL { + THROWS; + DISABLED(GC_TRIGGERS); + MODE_COOPERATIVE; + } CONTRACTL_END; - // Make sure that the total size cannot reach LARGE_OBJECT_SIZE, which also allows us to avoid overflow checks. The - // "256" slack is to cover the array header size and round-up, using a constant value here out of laziness. - SIZE_T componentCount = static_cast(size); - if (componentCount >= static_cast((LARGE_OBJECT_SIZE - 256) / sizeof(void *))) - { - break; - } + _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); - // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler - // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates - // some reshuffling of intermediate values into nonvolatile registers around the call. - Thread *thread = GetThread(); + // Make sure that the total size cannot reach LARGE_OBJECT_SIZE, which also allows us to avoid overflow checks. 
The + // "256" slack is to cover the array header size and round-up, using a constant value here out of laziness. + SIZE_T componentCount = static_cast(size); + if (componentCount >= static_cast((LARGE_OBJECT_SIZE - 256) / sizeof(void *))) + { + // Tail call to the slow helper + return HCCALL2(JIT_NewArr1, arrayMT, size); + } - SIZE_T totalSize = componentCount * sizeof(void *); - _ASSERTE(totalSize / sizeof(void *) == componentCount); + // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler + // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates + // some reshuffling of intermediate values into nonvolatile registers around the call. + Thread *thread = GetThread(); - MethodTable *pArrayMT = (MethodTable *)arrayMT; + SIZE_T totalSize = componentCount * sizeof(void *); + _ASSERTE(totalSize / sizeof(void *) == componentCount); - SIZE_T baseSize = pArrayMT->GetBaseSize(); - totalSize += baseSize; - _ASSERTE(totalSize >= baseSize); + MethodTable *pArrayMT = (MethodTable *)arrayMT; - _ASSERTE(ALIGN_UP(totalSize, DATA_ALIGNMENT) == totalSize); + SIZE_T baseSize = pArrayMT->GetBaseSize(); + totalSize += baseSize; + _ASSERTE(totalSize >= baseSize); - gc_alloc_context *allocContext = thread->GetAllocContext(); - BYTE *allocPtr = allocContext->alloc_ptr; - _ASSERTE(allocPtr <= allocContext->alloc_limit); - if (totalSize > static_cast(allocContext->alloc_limit - allocPtr)) - { - break; - } - allocContext->alloc_ptr = allocPtr + totalSize; + _ASSERTE(ALIGN_UP(totalSize, DATA_ALIGNMENT) == totalSize); - _ASSERTE(allocPtr != nullptr); - ArrayBase *array = reinterpret_cast(allocPtr); - array->SetMethodTable(pArrayMT); - _ASSERTE(static_cast(componentCount) == componentCount); - array->m_NumComponents = static_cast(componentCount); + gc_alloc_context *allocContext = &t_thread_alloc_context; + BYTE *allocPtr = allocContext->alloc_ptr; + _ASSERTE(allocPtr 
<= allocContext->alloc_limit); + if (totalSize > static_cast(allocContext->alloc_limit - allocPtr)) + { + // Tail call to the slow helper + return HCCALL2(JIT_NewArr1, arrayMT, size); + } + allocContext->alloc_ptr = allocPtr + totalSize; - return array; - } while (false); + _ASSERTE(allocPtr != nullptr); + ArrayBase *array = reinterpret_cast(allocPtr); + array->SetMethodTable(pArrayMT); + _ASSERTE(static_cast(componentCount) == componentCount); + array->m_NumComponents = static_cast(componentCount); - // Tail call to the slow helper - ENDFORBIDGC(); - return HCCALL2(JIT_NewArr1, arrayMT, size); + return array; } -HCIMPLEND +HCIMPLEND_RAW #include diff --git a/src/coreclr/vm/jitinterface.h b/src/coreclr/vm/jitinterface.h index 51e2b959d6f694..848350ebb47cda 100644 --- a/src/coreclr/vm/jitinterface.h +++ b/src/coreclr/vm/jitinterface.h @@ -204,7 +204,6 @@ extern FCDECL1(Object*, JIT_NewS_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_) extern FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_); extern FCDECL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength); -extern FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength); extern FCDECL1(StringObject*, FramedAllocateString, DWORD stringLength); extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); @@ -315,14 +314,6 @@ class WriteBarrierManager #endif // TARGET_AMD64 -#ifdef HOST_64BIT -EXTERN_C FCDECL1(Object*, JIT_TrialAllocSFastMP_InlineGetThread, CORINFO_CLASS_HANDLE typeHnd_); -EXTERN_C FCDECL2(Object*, JIT_BoxFastMP_InlineGetThread, CORINFO_CLASS_HANDLE type, void* data); -EXTERN_C FCDECL2(Object*, JIT_NewArr1VC_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); -EXTERN_C FCDECL2(Object*, JIT_NewArr1OBJ_MP_InlineGetThread, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); - -#endif // HOST_64BIT - EXTERN_C FCDECL2_VV(INT64, JIT_LMul, INT64 val1, INT64 val2); #ifndef HOST_64BIT @@ -1073,6 +1064,7 @@ EXTERN_C FCDECL2(LPVOID, 
ArrayStoreCheck, Object** pElement, PtrArray** pArray); // means that the caller does not care whether the string is pinned or not. OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok, void** ppPinnedString = nullptr); +FCDECL2(Object*, JIT_Box_MP_FastPortable, CORINFO_CLASS_HANDLE type, void* data); FCDECL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* data); FCDECL0(VOID, JIT_PollGC); diff --git a/src/coreclr/vm/jitinterfacegen.cpp b/src/coreclr/vm/jitinterfacegen.cpp index 68ab56aeb96ef3..6cb3e4dead267f 100644 --- a/src/coreclr/vm/jitinterfacegen.cpp +++ b/src/coreclr/vm/jitinterfacegen.cpp @@ -20,26 +20,16 @@ #ifdef HOST_64BIT -// These are the fastest(?) versions of JIT helpers as they have the code to GetThread patched into them -// that does not make a call. -EXTERN_C Object* JIT_TrialAllocSFastMP_InlineGetThread(CORINFO_CLASS_HANDLE typeHnd_); -EXTERN_C Object* JIT_BoxFastMP_InlineGetThread (CORINFO_CLASS_HANDLE type, void* unboxedData); -EXTERN_C Object* AllocateStringFastMP_InlineGetThread (CLR_I4 cch); -EXTERN_C Object* JIT_NewArr1OBJ_MP_InlineGetThread (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); -EXTERN_C Object* JIT_NewArr1VC_MP_InlineGetThread (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); - -// This next set is the fast version that invoke GetThread but is still faster than the VM implementation (i.e. -// the "slow" versions). -EXTERN_C Object* JIT_TrialAllocSFastMP(CORINFO_CLASS_HANDLE typeHnd_); -EXTERN_C Object* JIT_TrialAllocSFastSP(CORINFO_CLASS_HANDLE typeHnd_); +// These are the multi-processor-optimized versions of the allocation helpers +// that must be written in assembly. EXTERN_C Object* JIT_BoxFastMP (CORINFO_CLASS_HANDLE type, void* unboxedData); + +// These are the single-processor-optimized versions of the allocation helpers. 
+EXTERN_C Object* JIT_TrialAllocSFastSP(CORINFO_CLASS_HANDLE typeHnd_); EXTERN_C Object* JIT_BoxFastUP (CORINFO_CLASS_HANDLE type, void* unboxedData); -EXTERN_C Object* AllocateStringFastMP (CLR_I4 cch); EXTERN_C Object* AllocateStringFastUP (CLR_I4 cch); -EXTERN_C Object* JIT_NewArr1OBJ_MP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); EXTERN_C Object* JIT_NewArr1OBJ_UP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); -EXTERN_C Object* JIT_NewArr1VC_MP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); EXTERN_C Object* JIT_NewArr1VC_UP (CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); #ifdef TARGET_AMD64 @@ -83,13 +73,13 @@ void InitJITHelpers1() // if (multi-proc || server GC) if (GCHeapUtilities::UseThreadAllocationContexts()) { - SetJitHelperFunction(CORINFO_HELP_NEWSFAST, JIT_TrialAllocSFastMP_InlineGetThread); - SetJitHelperFunction(CORINFO_HELP_NEWSFAST_ALIGN8, JIT_TrialAllocSFastMP_InlineGetThread); - SetJitHelperFunction(CORINFO_HELP_BOX, JIT_BoxFastMP_InlineGetThread); - SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_InlineGetThread); - SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_InlineGetThread); + SetJitHelperFunction(CORINFO_HELP_NEWSFAST, JIT_NewS_MP_FastPortable); + SetJitHelperFunction(CORINFO_HELP_NEWSFAST_ALIGN8, JIT_NewS_MP_FastPortable); + SetJitHelperFunction(CORINFO_HELP_BOX, JIT_BoxFastMP); + SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_FastPortable); + SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_FastPortable); - ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP_InlineGetThread), ECall::FastAllocateString); + ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateString_MP_FastPortable), ECall::FastAllocateString); } else { diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp index 7411d62a285f89..96e88cde3dffa2 100644 --- a/src/coreclr/vm/threads.cpp +++ b/src/coreclr/vm/threads.cpp @@ -371,6 +371,7 @@ void SetThread(Thread* t) 
if (t != NULL) { EnsureTlsDestructionMonitor(); + t->InitAllocContext(); } // Clear or set the app domain to the one domain based on if the thread is being nulled out or set @@ -1003,6 +1004,20 @@ HRESULT Thread::DetachThread(BOOL fDLLThreadDetach) m_ThreadHandleForClose = hThread; } + if (GCHeapUtilities::IsGCHeapInitialized()) + { + // If the GC heap is initialized, we need to fix the alloc context for this detaching thread. + GCX_COOP(); + // GetTotalAllocatedBytes reads dead_threads_non_alloc_bytes, but will suspend EE, being in COOP mode we cannot race with that + // however, there could be other threads terminating and doing the same Add. + InterlockedExchangeAdd64((LONG64*)&dead_threads_non_alloc_bytes, t_thread_alloc_context.alloc_limit - t_thread_alloc_context.alloc_ptr); + GCHeapUtilities::GetGCHeap()->FixAllocContext(&t_thread_alloc_context, NULL, NULL); + t_thread_alloc_context.init(); // re-initialize the context. + + // Clear out the alloc context pointer for this thread. When TLS is gone, this pointer will point into freed memory. + m_alloc_context = nullptr; + } + // We need to make sure that TLS are touched last here. SetThread(NULL); @@ -1411,7 +1426,7 @@ Thread::Thread() m_pBlockingLock = NULL; - m_alloc_context.init(); + m_alloc_context = nullptr; m_thAllocContextObj = 0; m_UserInterrupt = 0; @@ -2871,14 +2886,14 @@ void Thread::OnThreadTerminate(BOOL holdingLock) { // Guaranteed to NOT be a shutdown case, because we tear down the heap before // we tear down any threads during shutdown. - if (ThisThreadID == CurrentThreadID) + if (ThisThreadID == CurrentThreadID && GetAllocContext() != nullptr) { GCX_COOP(); // GetTotalAllocatedBytes reads dead_threads_non_alloc_bytes, but will suspend EE, being in COOP mode we cannot race with that // however, there could be other threads terminating and doing the same Add. 
- InterlockedExchangeAdd64((LONG64*)&dead_threads_non_alloc_bytes, m_alloc_context.alloc_limit - m_alloc_context.alloc_ptr); - GCHeapUtilities::GetGCHeap()->FixAllocContext(&m_alloc_context, NULL, NULL); - m_alloc_context.init(); + InterlockedExchangeAdd64((LONG64*)&dead_threads_non_alloc_bytes, GetAllocContext()->alloc_limit - GetAllocContext()->alloc_ptr); + GCHeapUtilities::GetGCHeap()->FixAllocContext(GetAllocContext(), NULL, NULL); + GetAllocContext()->init(); // re-initialize the context. } } @@ -2930,15 +2945,6 @@ void Thread::OnThreadTerminate(BOOL holdingLock) } - if (GCHeapUtilities::IsGCHeapInitialized() && ThisThreadID != CurrentThreadID) - { - // We must be holding the ThreadStore lock in order to clean up alloc context. - // We should never call FixAllocContext during GC. - dead_threads_non_alloc_bytes += m_alloc_context.alloc_limit - m_alloc_context.alloc_ptr; - GCHeapUtilities::GetGCHeap()->FixAllocContext(&m_alloc_context, NULL, NULL); - m_alloc_context.init(); - } - SetThreadState(TS_Dead); ThreadStore::s_pThreadStore->m_DeadThreadCount++; ThreadStore::s_pThreadStore->IncrementDeadThreadCountForGCTrigger(); diff --git a/src/coreclr/vm/threads.h b/src/coreclr/vm/threads.h index c6bdda4e012b86..f4554c9018a01e 100644 --- a/src/coreclr/vm/threads.h +++ b/src/coreclr/vm/threads.h @@ -1016,13 +1016,14 @@ class Thread // Lock thread is trying to acquire VolatilePtr m_pBlockingLock; -public: + // We store a pointer to this thread's alloc context here for easier introspection + // from other threads and diagnostic tools + gc_alloc_context* m_alloc_context; - // on MP systems, each thread has its own allocation chunk so we can avoid - // lock prefixes and expensive MP cache snooping stuff - gc_alloc_context m_alloc_context; +public: + inline void InitAllocContext() { LIMITED_METHOD_CONTRACT; m_alloc_context = &t_thread_alloc_context; } - inline gc_alloc_context *GetAllocContext() { LIMITED_METHOD_CONTRACT; return &m_alloc_context; } + inline 
gc_alloc_context *GetAllocContext() { LIMITED_METHOD_CONTRACT; return m_alloc_context; } // This is the type handle of the first object in the alloc context at the time // we fire the AllocationTick event. It's only for tooling purpose. diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 40ae02264804fd..267b0359740ff0 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -2360,7 +2360,7 @@ void Thread::PerformPreemptiveGC() // BUG(github #10318) - when not using allocation contexts, the alloc lock // must be acquired here. Until fixed, this assert prevents random heap corruption. _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); - GCHeapUtilities::GetGCHeap()->StressHeap(GetThread()->GetAllocContext()); + GCHeapUtilities::GetGCHeap()->StressHeap(&t_thread_alloc_context); m_bGCStressing = FALSE; } m_GCOnTransitionsOK = TRUE;