diff --git a/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.cpp b/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.cpp
index cc409cf2167a16..283926cabdedab 100644
--- a/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.cpp
+++ b/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.cpp
@@ -164,63 +164,146 @@ static PTR_VOID GetUnwindDataBlob(TADDR moduleBase, PTR_RUNTIME_FUNCTION pRuntim
 #endif
 }
 
+// index nodes are searched linearly.
+// 16 * sizeof(uint32_t) == 64, which is a typical cache line size
+// thus we expect at most one cache miss on every level of the index
+#define INDEX_BRANCHING_FACTOR 16
+// T_RUNTIME_FUNCTION is larger than uint32_t, so we have a smaller granularity at last level
+#define FUNCTABLE_INDEX_GRANULARITY 8
+#define INDEX_ALIGNMENT 64
 
 CoffNativeCodeManager::CoffNativeCodeManager(TADDR moduleBase,
-                                             PTR_VOID pvManagedCodeStartRange, uint32_t cbManagedCodeRange,
-                                             PTR_RUNTIME_FUNCTION pRuntimeFunctionTable, uint32_t nRuntimeFunctionTable,
-                                             PTR_PTR_VOID pClasslibFunctions, uint32_t nClasslibFunctions)
+    PTR_VOID pvManagedCodeStartRange, uint32_t cbManagedCodeRange,
+    PTR_RUNTIME_FUNCTION pRuntimeFunctionTable, uint32_t nRuntimeFunctionTable,
+    PTR_PTR_VOID pClasslibFunctions, uint32_t nClasslibFunctions)
     : m_moduleBase(moduleBase),
-      m_pvManagedCodeStartRange(pvManagedCodeStartRange), m_cbManagedCodeRange(cbManagedCodeRange),
-      m_pRuntimeFunctionTable(pRuntimeFunctionTable), m_nRuntimeFunctionTable(nRuntimeFunctionTable),
-      m_pClasslibFunctions(pClasslibFunctions), m_nClasslibFunctions(nClasslibFunctions)
+    m_pvManagedCodeStartRange(pvManagedCodeStartRange), m_cbManagedCodeRange(cbManagedCodeRange),
+    m_pRuntimeFunctionTable(pRuntimeFunctionTable), m_nRuntimeFunctionTable(nRuntimeFunctionTable),
+    m_pClasslibFunctions(pClasslibFunctions), m_nClasslibFunctions(nClasslibFunctions),
+    m_initializedIndices(0), m_indexCount(0), m_indices{ 0 }
 {
 }
 
 CoffNativeCodeManager::~CoffNativeCodeManager()
 {
+    for (uint32_t i = 0; i < m_indexCount; i++)
+    {
+        uint32_t* ptr = m_indices[i];
+        if (ptr)
+        {
+            _aligned_free(ptr);
+            m_indices[i] = nullptr;
+        }
+    }
+
+    m_indexCount = 0;
+}
+
+// Allocates zero-filled storage for every level of the function table index.
+// Level 0 has one slot per FUNCTABLE_INDEX_GRANULARITY table entries; each further
+// level has one slot per INDEX_BRANCHING_FACTOR slots of the level below it.
+// Returns false if an allocation fails; levels already counted are freed by the destructor.
+bool CoffNativeCodeManager::AllocFuncTableIndex()
+{
+    uint32_t indexSize = (m_nRuntimeFunctionTable + FUNCTABLE_INDEX_GRANULARITY - 1) / FUNCTABLE_INDEX_GRANULARITY;
+    uint32_t* index = m_indices[m_indexCount] = (uint32_t*)_aligned_malloc(indexSize * sizeof(uint32_t), INDEX_ALIGNMENT);
+    if (!index)
+        return false;
+
+    memset(index, 0, indexSize * sizeof(uint32_t));
+    m_indexCount++;
+
+    while (indexSize > INDEX_BRANCHING_FACTOR)
+    {
+        _ASSERTE(m_indexCount < sizeof(m_indices) / sizeof(m_indices[0])); // m_indices has a fixed number of levels
+        indexSize = (indexSize + INDEX_BRANCHING_FACTOR - 1) / INDEX_BRANCHING_FACTOR;
+        index = m_indices[m_indexCount] = (uint32_t*)_aligned_malloc(indexSize * sizeof(uint32_t), INDEX_ALIGNMENT);
+        if (!index)
+            return false;
+
+        memset(index, 0, indexSize * sizeof(uint32_t));
+        m_indexCount++;
+    }
+
+    return true;
 }
 
-static int LookupUnwindInfoForMethod(uint32_t relativePc,
-                                     PTR_RUNTIME_FUNCTION pRuntimeFunctionTable,
-                                     int low,
-                                     int high)
+// Fills in the index levels allocated by AllocFuncTableIndex and publishes them
+// through m_initializedIndices. May run on several threads at once; every thread
+// writes the same values, each starting at a different offset.
+// Returns the published array of index levels.
+NOINLINE
+uint32_t** CoffNativeCodeManager::InitFuncTableIndex()
 {
-    // Binary search the RUNTIME_FUNCTION table
-    // Use linear search once we get down to a small number of elements
-    // to avoid Binary search overhead.
-    while (high - low > 10)
+    // max offset is beyond the range of managed methods.
+    uint32_t maxOffset = (uint32_t)((TADDR)m_pvManagedCodeStartRange + m_cbManagedCodeRange - m_moduleBase);
+
+    // It is possible to see several threads come here at once.
+    // We can spin-wait for one thread to do the work or just let all threads do the initialization.
+    // Either way it will take roughly the same time as for the first thread to complete the work.
+    // Yet we can make this complete faster if threads help each other by working on different
+    // parts of the index.
+    uint32_t perThreadBias = (uint32_t)(((size_t)&perThreadBias * 11400714819323198485ull) >> 32);
+
+    // let's build the index for the runtime table. for every granule that has elements we will have an index entry
+    uint32_t indexSize = (m_nRuntimeFunctionTable + FUNCTABLE_INDEX_GRANULARITY - 1) / FUNCTABLE_INDEX_GRANULARITY;
+    uint32_t indexCount = 0;
+    uint32_t* index = m_indices[indexCount++];
+
+    // every index N will contain the lowest value from the granule N + 1
+    // when we scan the indices and see that the value at N is higher than the target, we know
+    // that the granule N must be searched for the entry as the next granule will have higher addresses.
+    uint32_t start = (perThreadBias % indexSize) | 1;
+    for (uint32_t i = start; i < indexSize; i++)
     {
-        int middle = low + (high - low) / 2;
-
-        PTR_RUNTIME_FUNCTION pFunctionEntry = pRuntimeFunctionTable + middle;
-        if (relativePc < pFunctionEntry->BeginAddress)
-        {
-            high = middle - 1;
-        }
-        else
-        {
-            low = middle;
-        }
+        if (index[i - 1] == 0)
+        {
+            _ASSERTE(i * FUNCTABLE_INDEX_GRANULARITY < m_nRuntimeFunctionTable);
+            index[i - 1] = m_pRuntimeFunctionTable[i * FUNCTABLE_INDEX_GRANULARITY].BeginAddress;
+        }
     }
 
-    for (int i = low; i < high; i++)
+    for (uint32_t i = 1; i < start; i++)
     {
-        PTR_RUNTIME_FUNCTION pNextFunctionEntry = pRuntimeFunctionTable + (i + 1);
-        if (relativePc < pNextFunctionEntry->BeginAddress)
+        if (index[i - 1] == 0)
         {
-            high = i;
-            break;
+            _ASSERTE(i * FUNCTABLE_INDEX_GRANULARITY < m_nRuntimeFunctionTable);
+            index[i - 1] = m_pRuntimeFunctionTable[i * FUNCTABLE_INDEX_GRANULARITY].BeginAddress;
         }
     }
 
-    PTR_RUNTIME_FUNCTION pFunctionEntry = pRuntimeFunctionTable + high;
-    if (relativePc >= pFunctionEntry->BeginAddress)
+    // we put the maxOffset at the end of the index.
+    // there is no N + 1 granule to get the value from, so the last slot will contain the sentinel.
+    index[indexSize - 1] = maxOffset;
+
+    // Now build an N-ary tree of indices.
+    // Example: at branching factor 16 a program with 32K methods will have 3 sub-index levels.
+    uint32_t* prevIdx = index;
+    while (indexSize > INDEX_BRANCHING_FACTOR)
     {
-        return high;
+        uint32_t prevSize = indexSize;
+        indexSize = (indexSize + INDEX_BRANCHING_FACTOR - 1) / INDEX_BRANCHING_FACTOR;
+        index = m_indices[indexCount++];
+
+        start = (perThreadBias % indexSize) | 1;
+        for (uint32_t i = start; i < indexSize; i++)
+        {
+            _ASSERTE(i * INDEX_BRANCHING_FACTOR < prevSize);
+            index[i - 1] = prevIdx[i * INDEX_BRANCHING_FACTOR];
+        }
+
+        for (uint32_t i = 1; i < start; i++)
+        {
+            _ASSERTE(i * INDEX_BRANCHING_FACTOR < prevSize);
+            index[i - 1] = prevIdx[i * INDEX_BRANCHING_FACTOR];
+        }
+
+        index[indexSize - 1] = maxOffset;
+        prevIdx = index;
     }
 
-    ASSERT_UNCONDITIONALLY("Invalid code address");
-    return -1;
+    WriteRelease64((LONG64*)&m_initializedIndices, (LONG64)m_indices); // NOTE(review): 64-bit store assumes 8-byte pointers
+    return m_initializedIndices;
 }
 
 struct CoffNativeMethodInfo
@@ -233,6 +316,42 @@ struct CoffNativeMethodInfo
 // Ensure that CoffNativeMethodInfo fits into the space reserved by MethodInfo
 static_assert(sizeof(CoffNativeMethodInfo) <= sizeof(MethodInfo), "CoffNativeMethodInfo too big");
 
+// Returns the position in m_pRuntimeFunctionTable of the entry just before the first one
+// whose BeginAddress is greater than relativePc, building the index on first use.
+// NOTE(review): yields -1 if relativePc is below the first entry; callers no longer check for that.
+FORCEINLINE
+int CoffNativeCodeManager::LookupUnwindInfoIdx(uint32_t relativePc)
+{
+    uint32_t** indices = m_initializedIndices;
+    if (!indices)
+        indices = InitFuncTableIndex();
+
+    uint32_t idx = 0;
+    for (int j = m_indexCount - 1; j >= 0; j--) // walk from the smallest (top) level down to level 0
+    {
+        uint32_t* index = indices[j];
+        idx *= INDEX_BRANCHING_FACTOR;
+
+        while ((uint32_t)index[idx] < relativePc)
+            idx++;
+    }
+
+    for (idx *= FUNCTABLE_INDEX_GRANULARITY; idx < m_nRuntimeFunctionTable; idx++)
+    {
+        uint32_t curAddr = m_pRuntimeFunctionTable[idx].BeginAddress;
+        if (curAddr > relativePc)
+            return idx - 1;
+    }
+
+    // we can only get here if we are looking for a location inside the very last managed function.
+    _ASSERTE(m_pRuntimeFunctionTable[idx - 1].BeginAddress <= relativePc);
+#if defined(TARGET_AMD64)
+    _ASSERTE(m_pRuntimeFunctionTable[idx - 1].EndAddress > relativePc);
+#endif
+
+    return idx - 1;
+}
+
 bool CoffNativeCodeManager::FindMethodInfo(PTR_VOID ControlPC,
                                            MethodInfo * pMethodInfoOut)
 {
@@ -244,16 +363,10 @@ bool CoffNativeCodeManager::FindMethodInfo(PTR_VOID ControlPC,
     }
 
     CoffNativeMethodInfo * pMethodInfo = (CoffNativeMethodInfo *)pMethodInfoOut;
-
     TADDR relativePC = dac_cast<TADDR>(ControlPC) - m_moduleBase;
-
-    int MethodIndex = LookupUnwindInfoForMethod((uint32_t)relativePC, m_pRuntimeFunctionTable,
-        0, m_nRuntimeFunctionTable - 1);
-    if (MethodIndex < 0)
-        return false;
+    int MethodIndex = LookupUnwindInfoIdx((uint32_t)relativePC);
 
     PTR_RUNTIME_FUNCTION pRuntimeFunction = m_pRuntimeFunctionTable + MethodIndex;
-
     pMethodInfo->runtimeFunction = pRuntimeFunction;
 
     // The runtime function could correspond to a funclet.  We need to get to the
@@ -965,10 +1078,7 @@ PTR_VOID CoffNativeCodeManager::GetAssociatedData(PTR_VOID ControlPC)
     }
 
     TADDR relativePC = dac_cast<TADDR>(ControlPC) - m_moduleBase;
-
-    int MethodIndex = LookupUnwindInfoForMethod((uint32_t)relativePC, m_pRuntimeFunctionTable, 0, m_nRuntimeFunctionTable - 1);
-    if (MethodIndex < 0)
-        return NULL;
+    int MethodIndex = LookupUnwindInfoIdx((uint32_t)relativePC);
 
     PTR_RUNTIME_FUNCTION pRuntimeFunction = m_pRuntimeFunctionTable + MethodIndex;
 
@@ -1008,6 +1118,9 @@ bool RhRegisterOSModule(void * pModule,
     if (pCoffNativeCodeManager == nullptr)
         return false;
 
+    if (!pCoffNativeCodeManager->AllocFuncTableIndex())
+        return false; // NOTE(review): pCoffNativeCodeManager is not released on this path - confirm intended
+
     RegisterCodeManager(pCoffNativeCodeManager, pvManagedCodeStartRange, cbManagedCodeRange);
 
     if (!RegisterUnboxingStubs(pvUnboxingStubsStartRange, cbUnboxingStubsRange))
diff --git a/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.h b/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.h
index 445a998fdb00bc..b7ddd0fe6741a8 100644
--- a/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.h
+++ b/src/coreclr/nativeaot/Runtime/windows/CoffNativeCodeManager.h
@@ -44,6 +44,14 @@ class CoffNativeCodeManager : public ICodeManager
     PTR_PTR_VOID m_pClasslibFunctions;
     uint32_t m_nClasslibFunctions;
 
+    // used to publish a reference to the index once initialized.
+    // if the reference is not null, the index can be accessed through it.
+    uint32_t** volatile m_initializedIndices;
+    uint32_t m_indexCount;
+    uint32_t* m_indices[8];
+
+    int LookupUnwindInfoIdx(uint32_t relativePc);
+
 public:
     CoffNativeCodeManager(TADDR moduleBase,
                           PTR_VOID pvManagedCodeStartRange, uint32_t cbManagedCodeRange,
@@ -51,6 +59,9 @@ class CoffNativeCodeManager : public ICodeManager
                           PTR_PTR_VOID pClasslibFunctions, uint32_t nClasslibFunctions);
     ~CoffNativeCodeManager();
 
+    bool AllocFuncTableIndex();
+    uint32_t** InitFuncTableIndex();
+
     //
     // Code manager methods
     //