From e135b737208be2dda93e500623d4cc99d4c99e4d Mon Sep 17 00:00:00 2001 From: jinhelin Date: Sat, 23 May 2026 21:44:49 +0800 Subject: [PATCH 1/4] feat: exclude rss file from tici memory control --- dbms/src/Common/BackgroundTask.cpp | 1 + dbms/src/Common/MemoryAllocTrace.cpp | 79 ++++++--- dbms/src/Common/MemoryAllocTrace.h | 1 + dbms/src/Common/MemoryTracker.cpp | 46 ++++- dbms/src/Common/MemoryTracker.h | 5 +- dbms/src/Server/Server.cpp | 17 +- dbms/src/Storages/KVStore/ProxyStateMachine.h | 11 ++ ...-20-tici-rss-file-memory-control-design.md | 160 ++++++++++++++++++ 8 files changed, 284 insertions(+), 36 deletions(-) create mode 100644 docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md diff --git a/dbms/src/Common/BackgroundTask.cpp b/dbms/src/Common/BackgroundTask.cpp index a3fc69bbeee..e1f7951863b 100644 --- a/dbms/src/Common/BackgroundTask.cpp +++ b/dbms/src/Common/BackgroundTask.cpp @@ -67,6 +67,7 @@ void CollectProcInfoBackgroundTask::memCheckJob() // Update the memory usage of the current process. Defined in Common/MemoryTracker.cpp auto res = get_process_mem_usage(); real_rss = res.resident_bytes; + rss_file = res.rss_file_bytes; proc_num_threads = res.cur_proc_num_threads; proc_virt_size = res.cur_virt_bytes; baseline_of_query_mem_tracker = root_of_query_mem_trackers->get(); diff --git a/dbms/src/Common/MemoryAllocTrace.cpp b/dbms/src/Common/MemoryAllocTrace.cpp index 1edc0c5c2bf..f10b8f50905 100644 --- a/dbms/src/Common/MemoryAllocTrace.cpp +++ b/dbms/src/Common/MemoryAllocTrace.cpp @@ -16,6 +16,8 @@ #include // Included for `USE_JEMALLOC` #include +#include +#include #if USE_JEMALLOC #include @@ -38,45 +40,76 @@ std::tuple getAllocDeallocPtr() #endif } -bool process_mem_usage(double & resident_set, Int64 & cur_proc_num_threads, UInt64 & cur_virt_size) +static bool parseStatusFieldKb(const std::string & line, std::string_view field_name, UInt64 & out_kb) { - resident_set = 0.0; + if (line.rfind(field_name, 0) != 0) + return false; + + const auto colon_pos = line.find(':'); + if (colon_pos == std::string::npos) + return false; + + std::istringstream iss(line.substr(colon_pos + 1)); + iss >> out_kb; + return !iss.fail(); +} + +static bool parseStatusFieldInt(const std::string & line, std::string_view field_name, Int64 & out_value) +{ + if (line.rfind(field_name, 0) != 0) + return false; - // 'file' stat seems to give the most reliable results - std::ifstream stat_stream("/proc/self/stat", std::ios_base::in); - // if "/proc/self/stat" is not supported - if (!stat_stream.is_open()) + const auto colon_pos = line.find(':'); + if (colon_pos == std::string::npos) return false; - // dummy vars for leading entries in stat that we don't care about - std::string pid, comm, state, ppid, pgrp, session, tty_nr; - std::string tpgid, flags, minflt, cminflt, majflt, cmajflt; - std::string utime, stime, cutime, cstime, priority, nice; - std::string itrealvalue, starttime; + std::istringstream iss(line.substr(colon_pos + 1)); + iss >> out_value; + return !iss.fail(); +} - // the field we want - Int64 rss; +bool process_mem_usage( + UInt64 & resident_bytes, + UInt64 & rss_file_bytes, + Int64 & cur_proc_num_threads, + UInt64 & cur_virt_size) +{ + std::ifstream status_stream("/proc/self/status", std::ios_base::in); + if (!status_stream.is_open()) + return false; - stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr >> tpgid >> flags >> minflt >> cminflt - >> majflt >> cmajflt >> utime >> stime >> cutime >> cstime >> priority >> nice >> cur_proc_num_threads - >> itrealvalue >> starttime >> cur_virt_size >> rss; // don't care about the rest + UInt64 vm_rss_kb = 0; + UInt64 rss_file_kb = 0; + UInt64 vm_size_kb = 0; + Int64 threads = 1; - stat_stream.close(); + std::string line; + while (std::getline(status_stream, line)) + { + if (parseStatusFieldKb(line, "VmRSS", vm_rss_kb)) + resident_bytes = vm_rss_kb * 1024; + if (parseStatusFieldKb(line, "RssFile", rss_file_kb)) + rss_file_bytes = rss_file_kb * 1024; + if (parseStatusFieldKb(line, "VmSize", vm_size_kb)) + cur_virt_size = vm_size_kb * 1024; + if (parseStatusFieldInt(line, "Threads", threads)) + cur_proc_num_threads = threads; + } - Int64 page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages - resident_set = rss * page_size_kb; return true; } ProcessMemoryUsage get_process_mem_usage() { - double resident_set; + UInt64 raw_rss = 0; + UInt64 rss_file = 0; Int64 cur_proc_num_threads = 1; UInt64 cur_virt_size = 0; - process_mem_usage(resident_set, cur_proc_num_threads, cur_virt_size); - resident_set *= 1024; // transfrom from KB to bytes + process_mem_usage(raw_rss, rss_file, cur_proc_num_threads, cur_virt_size); + return ProcessMemoryUsage{ - static_cast(resident_set), + raw_rss, + rss_file, cur_virt_size, cur_proc_num_threads, }; diff --git a/dbms/src/Common/MemoryAllocTrace.h b/dbms/src/Common/MemoryAllocTrace.h index d6af277a952..4570bffaae6 100644 --- a/dbms/src/Common/MemoryAllocTrace.h +++ b/dbms/src/Common/MemoryAllocTrace.h @@ -25,6 +25,7 @@ std::tuple getAllocDeallocPtr(); struct ProcessMemoryUsage { UInt64 resident_bytes; + UInt64 rss_file_bytes; UInt64 cur_virt_bytes; Int64 cur_proc_num_threads; }; diff --git a/dbms/src/Common/MemoryTracker.cpp b/dbms/src/Common/MemoryTracker.cpp index 7e875babc57..c5d10c6f4bb 100644 --- a/dbms/src/Common/MemoryTracker.cpp +++ b/dbms/src/Common/MemoryTracker.cpp @@ -31,8 +31,27 @@ extern const Metric MemoryTrackingSharedColumnData; extern const Metric MemoryTrackingKVStore; } // namespace CurrentMetrics -std::atomic real_rss{0}, proc_num_threads{1}, baseline_of_query_mem_tracker{0}; +std::atomic real_rss{0}, rss_file{0}, proc_num_threads{1}, baseline_of_query_mem_tracker{0}; std::atomic proc_virt_size{0}; +std::atomic_bool exclude_rss_file_from_memory_control{false}; + +void setExcludeRssFileFromMemoryControl(bool value) +{ + exclude_rss_file_from_memory_control.store(value, std::memory_order_relaxed); +} + +bool getExcludeRssFileFromMemoryControl() +{ + return exclude_rss_file_from_memory_control.load(std::memory_order_relaxed); +} + +static Int64 getMemoryControlRss(Int64 current_real_rss, Int64 current_rss_file) +{ + if (!getExcludeRssFileFromMemoryControl()) + return current_real_rss; + return current_real_rss > current_rss_file ? current_real_rss - current_rss_file : 0; +} + MemoryTracker::~MemoryTracker() { // Destruction of global root mem tracker means the process is shutting down, log and metrics models may have been released! @@ -133,9 +152,12 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) { Int64 current_limit = limit.load(std::memory_order_relaxed); Int64 current_accuracy_diff_for_test = accuracy_diff_for_test.load(std::memory_order_relaxed); + const auto current_real_rss = real_rss.load(std::memory_order_relaxed); + const auto current_rss_file = rss_file.load(std::memory_order_relaxed); + const auto current_memory_control_rss = getMemoryControlRss(current_real_rss, current_rss_file); if (unlikely( !next.load(std::memory_order_relaxed) && current_accuracy_diff_for_test && current_limit - && real_rss > current_accuracy_diff_for_test + current_limit)) + && current_memory_control_rss > current_accuracy_diff_for_test + current_limit)) { DB::FmtBuffer fmt_buf; fmt_buf.append("Memory tracker accuracy "); @@ -144,10 +166,13 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) fmt_buf.fmtAppend(" {}", tmp_decr); fmt_buf.fmtAppend( - ": fault injected. real_rss ({}) is much larger than limit ({}). Debug info, threads of process: {}, " + ": fault injected. memory_control_rss ({}) is much larger than limit ({}). Debug info, " + "real_rss: {}, rss_file: {}, threads of process: {}, " "memory usage tracked by ProcessList: peak {}, current {}. Virtual memory size: {}.", - formatReadableSizeWithBinarySuffix(real_rss), + formatReadableSizeWithBinarySuffix(current_memory_control_rss), formatReadableSizeWithBinarySuffix(current_limit), + formatReadableSizeWithBinarySuffix(current_real_rss), + formatReadableSizeWithBinarySuffix(current_rss_file), proc_num_threads.load(), (root_of_query_mem_trackers ? formatReadableSizeWithBinarySuffix(root_of_query_mem_trackers->peak) : "0"), @@ -181,7 +206,7 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) Int64 current_bytes_rss_larger_than_limit = bytes_rss_larger_than_limit.load(std::memory_order_relaxed); bool is_rss_too_large = (!next.load(std::memory_order_relaxed) && current_limit - && real_rss > current_limit + current_bytes_rss_larger_than_limit + && current_memory_control_rss > current_limit + current_bytes_rss_larger_than_limit && will_be > baseline_of_query_mem_tracker); if (is_rss_too_large || unlikely(current_limit && will_be > current_limit)) { @@ -207,11 +232,14 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) else { // RSS too large fmt_buf.fmtAppend( - " exceeded caused by 'RSS(Resident Set Size) much larger than limit' : process memory size would " - "be {} for (attempt to allocate chunk of {} bytes), limit of memory for data computing : {}.", - formatReadableSizeWithBinarySuffix(real_rss), + " exceeded caused by 'memory_control_rss much larger than limit' : memory_control_rss would " + "be {} for (attempt to allocate chunk of {} bytes), limit of memory for data computing : {}. " + "real_rss={}, rss_file={}.", + formatReadableSizeWithBinarySuffix(current_memory_control_rss), size, - formatReadableSizeWithBinarySuffix(current_limit)); + formatReadableSizeWithBinarySuffix(current_limit), + formatReadableSizeWithBinarySuffix(current_real_rss), + formatReadableSizeWithBinarySuffix(current_rss_file)); } fmt_buf.fmtAppend(" Memory Usage of Storage: {}", storageMemoryUsageDetail()); diff --git a/dbms/src/Common/MemoryTracker.h b/dbms/src/Common/MemoryTracker.h index 98775a7a13b..61d52bca79b 100644 --- a/dbms/src/Common/MemoryTracker.h +++ b/dbms/src/Common/MemoryTracker.h @@ -22,8 +22,11 @@ #include #include -extern std::atomic real_rss, proc_num_threads, baseline_of_query_mem_tracker; +extern std::atomic real_rss, rss_file, proc_num_threads, baseline_of_query_mem_tracker; extern std::atomic proc_virt_size; + +void setExcludeRssFileFromMemoryControl(bool value); +bool getExcludeRssFileFromMemoryControl(); namespace CurrentMetrics { extern const Metric MemoryTracking; diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 94c3fde88f5..4c469655579 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -1274,6 +1275,18 @@ try GRPCCompletionQueuePool::global_instance = std::make_unique(size); } + const auto tici_reader_addr = config().getString("tici.reader-node.addr", ""); + const auto tici_reader_port = config().getInt("tici.reader-node.port", 0); + const bool tici_reader_enabled = !tici_reader_addr.empty() || tici_reader_port > 0; + const bool exclude_rss_file_from_memory_control + = tici_reader_enabled && config().getBool("tici.exclude-rss-file-from-memory-control", true); + setExcludeRssFileFromMemoryControl(exclude_rss_file_from_memory_control); + LOG_INFO( + log, + "TiCI memory control config: reader_enabled={} exclude_rss_file_from_memory_control={}", + tici_reader_enabled, + exclude_rss_file_from_memory_control); + /// startup flash service for handling coprocessor and MPP requests. FlashGrpcServerHolder flash_grpc_server_holder(this->context(), this->config(), raft_config, log); @@ -1289,9 +1302,7 @@ try proxy_machine.runKVStore(tmt_context); - auto tici_reader_addr = config().getString("tici.reader-node.addr", ""); - auto tici_reader_port = config().getInt("tici.reader-node.port", 0); - if (!tici_reader_addr.empty() || tici_reader_port > 0) + if (tici_reader_enabled) { Stopwatch watch; auto service_addr = config().getString("flash.service_addr"); diff --git a/dbms/src/Storages/KVStore/ProxyStateMachine.h b/dbms/src/Storages/KVStore/ProxyStateMachine.h index d7a21d223f9..2d7f585efba 100644 --- a/dbms/src/Storages/KVStore/ProxyStateMachine.h +++ b/dbms/src/Storages/KVStore/ProxyStateMachine.h @@ -90,6 +90,17 @@ struct TiFlashProxyConfig } }, settings.max_memory_usage_for_all_queries.get()); + + const auto tici_reader_addr = config.getString("tici.reader-node.addr", ""); + const auto tici_reader_port = config.getInt("tici.reader-node.port", 0); + const bool tici_reader_enabled = !tici_reader_addr.empty() || tici_reader_port > 0; + const bool exclude_rss_file_from_memory_control + = tici_reader_enabled && config.getBool("tici.exclude-rss-file-from-memory-control", true); + if (exclude_rss_file_from_memory_control) + { + LOG_INFO(log, "Enable proxy adjusted RSS memory control for TiCI"); + addExtraArgs("exclude-rss-file-from-memory-control", "1"); + } } static TiFlashProxyConfig genForTest() { return TiFlashProxyConfig{}; } diff --git a/docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md b/docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md new file mode 100644 index 00000000000..9be2d61e907 --- /dev/null +++ b/docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md @@ -0,0 +1,160 @@ +# TiCI RssFile 内存流控设计 + +## 背景 + +TiCI reader 启用后会通过 Tantivy 大量使用 mmap 读取索引文件。Linux 会把这些文件映射页计入进程 RSS,并体现在 `/proc/self/status` 的 `RssFile` 中。由于这些页主要是 file-backed page cache,不应和匿名内存一样触发 TiFlash 的内存流控。 + +当前 TiFlash 有两类受影响的流控: + +- TiFlash 计算层的 `MemoryTracker` 会读取进程 RSS,超过配置阈值时拒绝查询请求。 +- `tiflash-proxy` 会通过 `memory-usage-limit`、`memory-usage-high-water` 和 `reject-messages-on-memory-ratio` 控制 Raft append message 和 snapshot 接收。 + +目标是在 TiCI reader 启用时,让这两处流控使用排除 `RssFile` 后的 RSS,同时保留原始 RSS 和 `RssFile` 用于观测与排障。 + +## 配置语义 + +新增 TiCI 相关配置项: + +```toml +[tici] +exclude-rss-file-from-memory-control = true +``` + +有效开关定义为: + +```text +effective_exclude_rss_file = + tici_reader_enabled && tici.exclude-rss-file-from-memory-control +``` + +其中 `tici_reader_enabled` 沿用现有启动条件: + +```text +tici.reader-node.addr 非空 || tici.reader-node.port > 0 +``` + +因此: + +- 未启用 TiCI reader 时,不改变现有内存流控行为。 +- 启用 TiCI reader 时,默认从流控 RSS 中排除 `RssFile`。 +- 如果需要排障或灰度回退,可以显式设置 `exclude-rss-file-from-memory-control = false`。 + +## TiFlash 侧内存采集 + +TiFlash C++ 侧维护 2 个进程内存采样值: + +- `raw_rss`:原始进程 RSS。 +- `rss_file`:`/proc/self/status` 中的 `RssFile`。 + +`MemoryTracker` 在执行内存流控检查时根据配置计算实际用于流控的 RSS: + +```text +if effective_exclude_rss_file: + memory_control_rss = max(raw_rss - rss_file, 0) +else: + memory_control_rss = raw_rss +``` + +实现位置: + +- 复用 `libs/libprocess_metrics` 已有的 `ProcessMetricsInfo.rss` 和 `ProcessMetricsInfo.rss_file`。 +- `get_process_mem_usage()` 只返回原始 RSS 和 `RssFile` 等进程采样值,不接收流控配置。 +- 在 `CollectProcInfoBackgroundTask::memCheckJob()` 中定期更新原始 RSS 和 `RssFile`,更新频率沿用当前 `MemTrackThread` 的 100 ms 周期。 +- `MemoryTracker` 根据 `effective_exclude_rss_file` 选择使用 `raw_rss` 或 `max(raw_rss - rss_file, 0)`。 + +失败策略: + +- 如果读取 `/proc/self/status` 失败,按现有逻辑回退到整体 RSS。 +- 如果 `rss_file > raw_rss`,`memory_control_rss` 取 0,避免无符号下溢。 + +## Proxy 侧内存采集 + +`tiflash-proxy` 不修改 `tikv_util::sys` 的全局 memory usage,避免影响 proxy 内部其他依赖 `GLOBAL_MEMORY_USAGE` 的逻辑。 + +proxy 内部新增独立的 `MemoryControlRssState`,用于缓存 raw RSS、`RssFile`、`memory_control_rss` 和有效性标记: + +```rust +struct MemoryControlInfo +{ + raw_rss: u64, + rss_file: u64, + memory_control_rss: u64, + valid: bool, +} +``` + +实现位置: + +- 在 `contrib/tiflash-proxy/proxy_components/proxy_server/src/run.rs` 中新增状态结构和刷新 helper。 +- 在 `TiKvServer` 中保存 `Arc`。 +- 新增 `TiKvServer::init_memory_control_rss_refresher(effective_exclude_rss_file)`,用 `self.core.background_worker.spawn_interval_task(...)` 启动周期刷新任务。 +- `run_impl` 在 `register_memory_usage_high_water(high_water)` 之后调用该初始化方法。 + +刷新规则: + +- 刷新周期使用 1 s,与 TiKV 现有全局 memory usage 刷新频率一致。 +- 启动周期任务前先同步刷新一次,避免 proxy 刚开始接收消息时缓存为空。 +- 开关关闭时仍可刷新 raw RSS,并令 `memory_control_rss = raw_rss`,这样拒绝路径不需要按开关分支处理。 +- Linux 上从 `/proc/self/status` 读取 `VmRSS` 和 `RssFile`;非 Linux 或读取失败时标记 invalid。 + +`TiFlashGrpcMessageFilter` 构造时接收 `Arc` 和 `high_water`,在拒绝路径中只读取缓存值,不解析 `/proc`。 + +## Proxy 拒绝逻辑 + +修改范围限制在: + +- `contrib/tiflash-proxy/proxy_components/proxy_server/src/run.rs` 中的 `should_reject_raft_message` +- `contrib/tiflash-proxy/proxy_components/proxy_server/src/run.rs` 中的 `should_reject_snapshot` + +逻辑为: + +```text +if reject_messages_on_memory_ratio <= 0: + return false + +if this is raft message and message is not MsgAppend: + return false + +info = memory_control_rss_state.load() +if info.valid: + return info.memory_control_rss >= high_water + +return memory_usage_reaches_high_water(...) +``` + +说明: + +- `should_reject_snapshot` 没有 message type 判断,其余逻辑一致。 +- `MemoryControlRssState` invalid 时,fallback 到原有 `memory_usage_reaches_high_water`,避免采集异常导致流控失效。 +- `tikv_server_memory_usage` 等 proxy 全局指标保持原语义,不改成 adjusted RSS。 + +## 观测与日志 + +需要保留原始 RSS 和 adjusted RSS 的可解释性: + +- TiFlash 侧继续暴露 `tiflash_process_rss_by_type_bytes{type="file"}`。 +- 在启动日志中打印 `effective_exclude_rss_file` 的最终值。 +- 在内存拒绝日志中区分 raw RSS、`RssFile` 和 `memory_control_rss`。 +- proxy 在 high-water 判断相关 debug 日志中包含 `memory_control_rss`、raw RSS、`RssFile` 和 high-water。 + +## 测试与验证 + +本变更不新增单元测试。实现完成后按仓库约束运行: + +```bash +time (make format && make check) +``` + +手工验证场景: + +- TiCI 未启用:`memory_control_rss == raw_rss`,计算层和 proxy 行为保持原样。 +- TiCI 启用且使用默认配置:`memory_control_rss == max(raw_rss - rss_file, 0)`。 +- TiCI 启用但显式关闭开关:`memory_control_rss == raw_rss`。 +- proxy 侧采集返回 invalid:fallback 到原有 `memory_usage_reaches_high_water`。 +- 日志或指标可以解释 raw RSS、`RssFile` 和流控 RSS 的差异。 + +## 非目标 + +- 不改变 `tikv_util::sys::record_global_memory_usage()` 和 `memory_usage_reaches_high_water()` 的全局语义。 +- 不尝试区分 TiCI mmap 文件和其他 file-backed RSS,当前策略是在 TiCI reader 启用时排除全部 `RssFile`。 +- 不为监控或该流控调整新增单元测试。 From 8c0b5342f61a2f3eb84203e2408b679aa5daa574 Mon Sep 17 00:00:00 2001 From: jinhelin Date: Sat, 23 May 2026 23:21:01 +0800 Subject: [PATCH 2/4] ci --- dbms/src/Common/MemoryTracker.cpp | 41 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/dbms/src/Common/MemoryTracker.cpp b/dbms/src/Common/MemoryTracker.cpp index c5d10c6f4bb..e4ebf0840f0 100644 --- a/dbms/src/Common/MemoryTracker.cpp +++ b/dbms/src/Common/MemoryTracker.cpp @@ -33,7 +33,7 @@ extern const Metric MemoryTrackingKVStore; std::atomic real_rss{0}, rss_file{0}, proc_num_threads{1}, baseline_of_query_mem_tracker{0}; std::atomic proc_virt_size{0}; -std::atomic_bool exclude_rss_file_from_memory_control{false}; +std::atomic exclude_rss_file_from_memory_control{false}; void setExcludeRssFileFromMemoryControl(bool value) { @@ -45,11 +45,24 @@ bool getExcludeRssFileFromMemoryControl() return exclude_rss_file_from_memory_control.load(std::memory_order_relaxed); } -static Int64 getMemoryControlRss(Int64 current_real_rss, Int64 current_rss_file) +struct MemoryControlRssInfo { + Int64 real_rss; + Int64 rss_file; + Int64 memory_control_rss; +}; + +static MemoryControlRssInfo getMemoryControlRss() +{ + const Int64 current_real_rss = real_rss.load(std::memory_order_relaxed); + const Int64 current_rss_file = rss_file.load(std::memory_order_relaxed); if (!getExcludeRssFileFromMemoryControl()) - return current_real_rss; - return current_real_rss > current_rss_file ? current_real_rss - current_rss_file : 0; + return {current_real_rss, current_rss_file, current_real_rss}; + return { + current_real_rss, + current_rss_file, + current_real_rss > current_rss_file ? current_real_rss - current_rss_file : 0, + }; } MemoryTracker::~MemoryTracker() @@ -152,12 +165,10 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) { Int64 current_limit = limit.load(std::memory_order_relaxed); Int64 current_accuracy_diff_for_test = accuracy_diff_for_test.load(std::memory_order_relaxed); - const auto current_real_rss = real_rss.load(std::memory_order_relaxed); - const auto current_rss_file = rss_file.load(std::memory_order_relaxed); - const auto current_memory_control_rss = getMemoryControlRss(current_real_rss, current_rss_file); + const auto rss_info = getMemoryControlRss(); if (unlikely( !next.load(std::memory_order_relaxed) && current_accuracy_diff_for_test && current_limit - && current_memory_control_rss > current_accuracy_diff_for_test + current_limit)) + && rss_info.memory_control_rss > current_accuracy_diff_for_test + current_limit)) { DB::FmtBuffer fmt_buf; fmt_buf.append("Memory tracker accuracy "); @@ -169,10 +180,10 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) ": fault injected. memory_control_rss ({}) is much larger than limit ({}). Debug info, " "real_rss: {}, rss_file: {}, threads of process: {}, " "memory usage tracked by ProcessList: peak {}, current {}. Virtual memory size: {}.", - formatReadableSizeWithBinarySuffix(current_memory_control_rss), + formatReadableSizeWithBinarySuffix(rss_info.memory_control_rss), formatReadableSizeWithBinarySuffix(current_limit), - formatReadableSizeWithBinarySuffix(current_real_rss), - formatReadableSizeWithBinarySuffix(current_rss_file), + formatReadableSizeWithBinarySuffix(rss_info.real_rss), + formatReadableSizeWithBinarySuffix(rss_info.rss_file), proc_num_threads.load(), (root_of_query_mem_trackers ? formatReadableSizeWithBinarySuffix(root_of_query_mem_trackers->peak) : "0"), @@ -206,7 +217,7 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) Int64 current_bytes_rss_larger_than_limit = bytes_rss_larger_than_limit.load(std::memory_order_relaxed); bool is_rss_too_large = (!next.load(std::memory_order_relaxed) && current_limit - && current_memory_control_rss > current_limit + current_bytes_rss_larger_than_limit + && rss_info.memory_control_rss > current_limit + current_bytes_rss_larger_than_limit && will_be > baseline_of_query_mem_tracker); if (is_rss_too_large || unlikely(current_limit && will_be > current_limit)) { @@ -235,11 +246,11 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) " exceeded caused by 'memory_control_rss much larger than limit' : memory_control_rss would " "be {} for (attempt to allocate chunk of {} bytes), limit of memory for data computing : {}. " "real_rss={}, rss_file={}.", - formatReadableSizeWithBinarySuffix(current_memory_control_rss), + formatReadableSizeWithBinarySuffix(rss_info.memory_control_rss), size, formatReadableSizeWithBinarySuffix(current_limit), - formatReadableSizeWithBinarySuffix(current_real_rss), - formatReadableSizeWithBinarySuffix(current_rss_file)); + formatReadableSizeWithBinarySuffix(rss_info.real_rss), + formatReadableSizeWithBinarySuffix(rss_info.rss_file)); } fmt_buf.fmtAppend(" Memory Usage of Storage: {}", storageMemoryUsageDetail()); From a0ae7605085610904178ef67dcd4c7281046f4e8 Mon Sep 17 00:00:00 2001 From: jinhelin Date: Sat, 23 May 2026 23:25:52 +0800 Subject: [PATCH 3/4] ci --- dbms/src/Server/Server.cpp | 4 + ...-20-tici-rss-file-memory-control-design.md | 160 ------------------ 2 files changed, 4 insertions(+), 160 deletions(-) delete mode 100644 docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 4c469655579..cc194faf22b 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -1275,6 +1275,10 @@ try GRPCCompletionQueuePool::global_instance = std::make_unique(size); } + // TiCI reader uses mmap heavily; file-backed RSS (RssFile) should not trigger memory control. + // Effective value is true only when TiCI reader is enabled AND + // `tici.exclude-rss-file-from-memory-control` is true (default: true). + // If reader is disabled, this stays false regardless of the config item. const auto tici_reader_addr = config().getString("tici.reader-node.addr", ""); const auto tici_reader_port = config().getInt("tici.reader-node.port", 0); const bool tici_reader_enabled = !tici_reader_addr.empty() || tici_reader_port > 0; diff --git a/docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md b/docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md deleted file mode 100644 index 9be2d61e907..00000000000 --- a/docs/superpowers/specs/2026-05-20-tici-rss-file-memory-control-design.md +++ /dev/null @@ -1,160 +0,0 @@ -# TiCI RssFile 内存流控设计 - -## 背景 - -TiCI reader 启用后会通过 Tantivy 大量使用 mmap 读取索引文件。Linux 会把这些文件映射页计入进程 RSS,并体现在 `/proc/self/status` 的 `RssFile` 中。由于这些页主要是 file-backed page cache,不应和匿名内存一样触发 TiFlash 的内存流控。 - -当前 TiFlash 有两类受影响的流控: - -- TiFlash 计算层的 `MemoryTracker` 会读取进程 RSS,超过配置阈值时拒绝查询请求。 -- `tiflash-proxy` 会通过 `memory-usage-limit`、`memory-usage-high-water` 和 `reject-messages-on-memory-ratio` 控制 Raft append message 和 snapshot 接收。 - -目标是在 TiCI reader 启用时,让这两处流控使用排除 `RssFile` 后的 RSS,同时保留原始 RSS 和 `RssFile` 用于观测与排障。 - -## 配置语义 - -新增 TiCI 相关配置项: - -```toml -[tici] -exclude-rss-file-from-memory-control = true -``` - -有效开关定义为: - -```text -effective_exclude_rss_file = - tici_reader_enabled && tici.exclude-rss-file-from-memory-control -``` - -其中 `tici_reader_enabled` 沿用现有启动条件: - -```text -tici.reader-node.addr 非空 || tici.reader-node.port > 0 -``` - -因此: - -- 未启用 TiCI reader 时,不改变现有内存流控行为。 -- 启用 TiCI reader 时,默认从流控 RSS 中排除 `RssFile`。 -- 如果需要排障或灰度回退,可以显式设置 `exclude-rss-file-from-memory-control = false`。 - -## TiFlash 侧内存采集 - -TiFlash C++ 侧维护 2 个进程内存采样值: - -- `raw_rss`:原始进程 RSS。 -- `rss_file`:`/proc/self/status` 中的 `RssFile`。 - -`MemoryTracker` 在执行内存流控检查时根据配置计算实际用于流控的 RSS: - -```text -if effective_exclude_rss_file: - memory_control_rss = max(raw_rss - rss_file, 0) -else: - memory_control_rss = raw_rss -``` - -实现位置: - -- 复用 `libs/libprocess_metrics` 已有的 `ProcessMetricsInfo.rss` 和 `ProcessMetricsInfo.rss_file`。 -- `get_process_mem_usage()` 只返回原始 RSS 和 `RssFile` 等进程采样值,不接收流控配置。 -- 在 `CollectProcInfoBackgroundTask::memCheckJob()` 中定期更新原始 RSS 和 `RssFile`,更新频率沿用当前 `MemTrackThread` 的 100 ms 周期。 -- `MemoryTracker` 根据 `effective_exclude_rss_file` 选择使用 `raw_rss` 或 `max(raw_rss - rss_file, 0)`。 - -失败策略: - -- 如果读取 `/proc/self/status` 失败,按现有逻辑回退到整体 RSS。 -- 如果 `rss_file > raw_rss`,`memory_control_rss` 取 0,避免无符号下溢。 - -## Proxy 侧内存采集 - -`tiflash-proxy` 不修改 `tikv_util::sys` 的全局 memory usage,避免影响 proxy 内部其他依赖 `GLOBAL_MEMORY_USAGE` 的逻辑。 - -proxy 内部新增独立的 `MemoryControlRssState`,用于缓存 raw RSS、`RssFile`、`memory_control_rss` 和有效性标记: - -```rust -struct MemoryControlInfo -{ - raw_rss: u64, - rss_file: u64, - memory_control_rss: u64, - valid: bool, -} -``` - -实现位置: - -- 在 `contrib/tiflash-proxy/proxy_components/proxy_server/src/run.rs` 中新增状态结构和刷新 helper。 -- 在 `TiKvServer` 中保存 `Arc`。 -- 新增 `TiKvServer::init_memory_control_rss_refresher(effective_exclude_rss_file)`,用 `self.core.background_worker.spawn_interval_task(...)` 启动周期刷新任务。 -- `run_impl` 在 `register_memory_usage_high_water(high_water)` 之后调用该初始化方法。 - -刷新规则: - -- 刷新周期使用 1 s,与 TiKV 现有全局 memory usage 刷新频率一致。 -- 启动周期任务前先同步刷新一次,避免 proxy 刚开始接收消息时缓存为空。 -- 开关关闭时仍可刷新 raw RSS,并令 `memory_control_rss = raw_rss`,这样拒绝路径不需要按开关分支处理。 -- Linux 上从 `/proc/self/status` 读取 `VmRSS` 和 `RssFile`;非 Linux 或读取失败时标记 invalid。 - -`TiFlashGrpcMessageFilter` 构造时接收 `Arc` 和 `high_water`,在拒绝路径中只读取缓存值,不解析 `/proc`。 - -## Proxy 拒绝逻辑 - -修改范围限制在: - -- `contrib/tiflash-proxy/proxy_components/proxy_server/src/run.rs` 中的 `should_reject_raft_message` -- `contrib/tiflash-proxy/proxy_components/proxy_server/src/run.rs` 中的 `should_reject_snapshot` - -逻辑为: - -```text -if reject_messages_on_memory_ratio <= 0: - return false - -if this is raft message and message is not MsgAppend: - return false - -info = memory_control_rss_state.load() -if info.valid: - return info.memory_control_rss >= high_water - -return memory_usage_reaches_high_water(...) -``` - -说明: - -- `should_reject_snapshot` 没有 message type 判断,其余逻辑一致。 -- `MemoryControlRssState` invalid 时,fallback 到原有 `memory_usage_reaches_high_water`,避免采集异常导致流控失效。 -- `tikv_server_memory_usage` 等 proxy 全局指标保持原语义,不改成 adjusted RSS。 - -## 观测与日志 - -需要保留原始 RSS 和 adjusted RSS 的可解释性: - -- TiFlash 侧继续暴露 `tiflash_process_rss_by_type_bytes{type="file"}`。 -- 在启动日志中打印 `effective_exclude_rss_file` 的最终值。 -- 在内存拒绝日志中区分 raw RSS、`RssFile` 和 `memory_control_rss`。 -- proxy 在 high-water 判断相关 debug 日志中包含 `memory_control_rss`、raw RSS、`RssFile` 和 high-water。 - -## 测试与验证 - -本变更不新增单元测试。实现完成后按仓库约束运行: - -```bash -time (make format && make check) -``` - -手工验证场景: - -- TiCI 未启用:`memory_control_rss == raw_rss`,计算层和 proxy 行为保持原样。 -- TiCI 启用且使用默认配置:`memory_control_rss == max(raw_rss - rss_file, 0)`。 -- TiCI 启用但显式关闭开关:`memory_control_rss == raw_rss`。 -- proxy 侧采集返回 invalid:fallback 到原有 `memory_usage_reaches_high_water`。 -- 日志或指标可以解释 raw RSS、`RssFile` 和流控 RSS 的差异。 - -## 非目标 - -- 不改变 `tikv_util::sys::record_global_memory_usage()` 和 `memory_usage_reaches_high_water()` 的全局语义。 -- 不尝试区分 TiCI mmap 文件和其他 file-backed RSS,当前策略是在 TiCI reader 启用时排除全部 `RssFile`。 -- 不为监控或该流控调整新增单元测试。 From fe21da50e96d61e39d969c963e6c674633c7fe6a Mon Sep 17 00:00:00 2001 From: jinhelin Date: Sat, 23 May 2026 23:31:19 +0800 Subject: [PATCH 4/4] ci --- dbms/src/Storages/KVStore/ProxyStateMachine.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dbms/src/Storages/KVStore/ProxyStateMachine.h b/dbms/src/Storages/KVStore/ProxyStateMachine.h index 2d7f585efba..d7a21d223f9 100644 --- a/dbms/src/Storages/KVStore/ProxyStateMachine.h +++ b/dbms/src/Storages/KVStore/ProxyStateMachine.h @@ -90,17 +90,6 @@ struct TiFlashProxyConfig } }, settings.max_memory_usage_for_all_queries.get()); - - const auto tici_reader_addr = config.getString("tici.reader-node.addr", ""); - const auto tici_reader_port = config.getInt("tici.reader-node.port", 0); - const bool tici_reader_enabled = !tici_reader_addr.empty() || tici_reader_port > 0; - const bool exclude_rss_file_from_memory_control - = tici_reader_enabled && config.getBool("tici.exclude-rss-file-from-memory-control", true); - if (exclude_rss_file_from_memory_control) - { - LOG_INFO(log, "Enable proxy adjusted RSS memory control for TiCI"); - addExtraArgs("exclude-rss-file-from-memory-control", "1"); - } } static TiFlashProxyConfig genForTest() { return TiFlashProxyConfig{}; }