From 2eac3009222086044ba0a67c37fda1894fec4587 Mon Sep 17 00:00:00 2001 From: ehds Date: Tue, 7 Mar 2023 22:59:34 +0800 Subject: [PATCH 1/4] fix compiler optimize thread local variable access --- CMakeLists.txt | 5 +++++ src/bthread/task_group.cpp | 19 +++++++------------ src/butil/thread_local.h | 26 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35365b270b..c38922fc16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ option(WITH_RDMA "With RDMA" OFF) option(BUILD_UNIT_TESTS "Whether to build unit tests" OFF) option(BUILD_BRPC_TOOLS "Whether to build brpc tools" ON) option(DOWNLOAD_GTEST "Download and build a fresh copy of googletest. Requires Internet access." ON) +option(NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS "Whether to disable compiler optimize thread local variable access." OFF) # Enable MACOSX_RPATH. Run "cmake --help-policy CMP0042" for policy details. if(POLICY CMP0042) @@ -112,6 +113,10 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-deprecated-declarations -Wno-inconsistent-missing-override") endif() +if(NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS) + set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -DNOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS") +endif() + set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DGFLAGS_NS=${GFLAGS_NS}") if(WITH_MESALINK) set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -DUSE_MESALINK") diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 6f5a4abdc7..469e1b0b8d 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -57,7 +57,7 @@ const bool ALLOW_UNUSED dummy_show_per_worker_usage_in_vars = ::GFLAGS_NS::RegisterFlagValidator(&FLAGS_show_per_worker_usage_in_vars, pass_bool); -__thread TaskGroup* tls_task_group = NULL; +BAIDU_VOLATILE_THREAD_LOCAL(TaskGroup*, tls_task_group, NULL); // Sync with TaskMeta::local_storage when a bthread is created or destroyed. // During running, the two fields may be inconsistent, use tls_bls as the // groundtruth. @@ -68,7 +68,7 @@ extern void return_keytable(bthread_keytable_pool_t*, KeyTable*); // [Hacky] This is a special TLS set by bthread-rpc privately... to save // overhead of creation keytable, may be removed later. -BAIDU_THREAD_LOCAL void* tls_unique_user_ptr = NULL; +BAIDU_VOLATILE_THREAD_LOCAL(void*, tls_unique_user_ptr, NULL); const TaskStatistics EMPTY_STAT = { 0, 0 }; @@ -248,9 +248,6 @@ int TaskGroup::init(size_t runqueue_capacity) { return 0; } -#if defined(__linux__) && defined(__aarch64__) && defined(__clang__) - __attribute__((optnone)) -#endif void TaskGroup::task_runner(intptr_t skip_remained) { // NOTE: tls_task_group is volatile since tasks are moved around // different groups. @@ -301,7 +298,7 @@ void TaskGroup::task_runner(intptr_t skip_remained) { } // Group is probably changed - g = tls_task_group; + g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); // TODO: Save thread_return (void)thread_return; @@ -570,9 +567,6 @@ void TaskGroup::sched(TaskGroup** pg) { sched_to(pg, next_tid); } -#if defined(__linux__) && defined(__aarch64__) && defined(__clang__) - __attribute__((optnone)) -#endif void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { TaskGroup* g = *pg; #ifndef NDEBUG @@ -614,7 +608,7 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { if (next_meta->stack != cur_meta->stack) { jump_stack(cur_meta->stack, next_meta->stack); // probably went to another group, need to assign g again. - g = tls_task_group; + g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); } #ifndef NDEBUG else { @@ -633,12 +627,13 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { RemainedFn fn = g->_last_context_remained; g->_last_context_remained = NULL; fn(g->_last_context_remained_arg); - g = tls_task_group; + g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); } // Restore errno errno = saved_errno; - tls_unique_user_ptr = saved_unique_user_ptr; + // tls_unique_user_ptr probably changed. + BAIDU_SET_VOLATILE_THREAD_LOCAL(tls_unique_user_ptr, saved_unique_user_ptr); #ifndef NDEBUG --g->_sched_recursive_guard; diff --git a/src/butil/thread_local.h b/src/butil/thread_local.h index 1c462b0fff..e3d541efbf 100644 --- a/src/butil/thread_local.h +++ b/src/butil/thread_local.h @@ -30,6 +30,32 @@ #define BAIDU_THREAD_LOCAL __thread #endif // _MSC_VER +#define BAIDU_VOLATILE_THREAD_LOCAL(type, var_name, default_value) \ + __thread type var_name = default_value; \ + static __attribute__((noinline, unused)) type get_##var_name(void) { \ + asm volatile(""); \ + return var_name; \ + } \ + static __attribute__((noinline, unused)) type *get_ptr_##var_name(void) { \ + type *ptr = &var_name; \ + asm volatile("" : "+rm"(ptr)); \ + return ptr; \ + } \ + static __attribute__((noinline, unused)) void set_##var_name(type v) { \ + asm volatile(""); \ + var_name = v; \ + } + +#if defined(NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS) +#define BAIDU_GET_VOLATILE_THREAD_LOCAL(var_name) get_##var_name() +#define BAIDU_GET_PTR_VOLATILE_THREAD_LOCAL(var_name) get_ptr_##var_name() +#define BAIDU_SET_VOLATILE_THREAD_LOCAL(var_name, value) set_##var_name(value) +#else +#define BAIDU_GET_VOLATILE_THREAD_LOCAL(var_name) var_name +#define BAIDU_GET_PTR_VOLATILE_THREAD_LOCAL(var_name) &##var_name +#define BAIDU_SET_VOLATILE_THREAD_LOCAL(var_name, value) var_name = value +#endif + namespace butil { // Get a thread-local object typed T. The object will be default-constructed From 20f5566ca7773d7fb454639b406be4a98d2338e1 Mon Sep 17 00:00:00 2001 From: ehds Date: Tue, 7 Mar 2023 23:30:52 +0800 Subject: [PATCH 2/4] change __thread to BAIDU_THREAD_LOCAL --- src/butil/thread_local.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/butil/thread_local.h b/src/butil/thread_local.h index e3d541efbf..e81f631cc1 100644 --- a/src/butil/thread_local.h +++ b/src/butil/thread_local.h @@ -31,7 +31,7 @@ #endif // _MSC_VER #define BAIDU_VOLATILE_THREAD_LOCAL(type, var_name, default_value) \ - __thread type var_name = default_value; \ + BAIDU_THREAD_LOCAL type var_name = default_value; \ static __attribute__((noinline, unused)) type get_##var_name(void) { \ asm volatile(""); \ return var_name; \ From 0964a6c0aac0eb959a55ccf32cbb66b325b36c20 Mon Sep 17 00:00:00 2001 From: ehds Date: Fri, 10 Mar 2023 12:13:13 +0800 Subject: [PATCH 3/4] Add NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS according to compiler and arch --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c38922fc16..f8ccacc613 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,6 @@ option(WITH_RDMA "With RDMA" OFF) option(BUILD_UNIT_TESTS "Whether to build unit tests" OFF) option(BUILD_BRPC_TOOLS "Whether to build brpc tools" ON) option(DOWNLOAD_GTEST "Download and build a fresh copy of googletest. Requires Internet access." ON) -option(NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS "Whether to disable compiler optimize thread local variable access." OFF) # Enable MACOSX_RPATH. Run "cmake --help-policy CMP0042" for policy details. if(POLICY CMP0042) @@ -113,10 +112,6 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-deprecated-declarations -Wno-inconsistent-missing-override") endif() -if(NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS) - set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -DNOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS") -endif() - set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DGFLAGS_NS=${GFLAGS_NS}") if(WITH_MESALINK) set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -DUSE_MESALINK") @@ -155,6 +150,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() endif() +if(CMAKE_CXX_COMPILER_ID MATCHES "(AppleClang)|(Clang)" AND CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") + # disable thread local access optimization. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS") +endif() + find_package(Protobuf REQUIRED) find_package(Threads REQUIRED) From b87dfe898ed33097f5da55eeeeba9aaa8347ba90 Mon Sep 17 00:00:00 2001 From: ehds Date: Fri, 10 Mar 2023 13:30:22 +0800 Subject: [PATCH 4/4] move thread local access optimization condition to thread_local.h --- CMakeLists.txt | 5 ----- src/butil/thread_local.h | 5 ++++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8ccacc613..35365b270b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,11 +150,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() endif() -if(CMAKE_CXX_COMPILER_ID MATCHES "(AppleClang)|(Clang)" AND CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") - # disable thread local access optimization. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS") -endif() - find_package(Protobuf REQUIRED) find_package(Threads REQUIRED) diff --git a/src/butil/thread_local.h b/src/butil/thread_local.h index e81f631cc1..4f77e95870 100644 --- a/src/butil/thread_local.h +++ b/src/butil/thread_local.h @@ -46,7 +46,10 @@ var_name = v; \ } -#if defined(NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS) +#if defined(__clang__) && (defined(__aarch64__) || defined(__arm64__)) +// Clang compiler is incorrectly caching the address of thread_local variables +// across a suspend-point. The following macros used to disable the volatile +// thread local access optimization. #define BAIDU_GET_VOLATILE_THREAD_LOCAL(var_name) get_##var_name() #define BAIDU_GET_PTR_VOLATILE_THREAD_LOCAL(var_name) get_ptr_##var_name() #define BAIDU_SET_VOLATILE_THREAD_LOCAL(var_name, value) set_##var_name(value)