From 0e73d4573d4bf28beaa63098a6512ae45dab3642 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Thu, 11 Sep 2025 19:39:55 +0800 Subject: [PATCH 01/15] commit for tmp files --- eloq_data_store_service/purger_prompt.md | 65 +++ .../single_object_path_purge.cc | 382 ++++++++++++++++++ 2 files changed, 447 insertions(+) create mode 100644 eloq_data_store_service/purger_prompt.md create mode 100644 eloq_data_store_service/single_object_path_purge.cc diff --git a/eloq_data_store_service/purger_prompt.md b/eloq_data_store_service/purger_prompt.md new file mode 100644 index 0000000..242fae4 --- /dev/null +++ b/eloq_data_store_service/purger_prompt.md @@ -0,0 +1,65 @@ +# This is a prompt file for an LLM to generate a purger for RocksDBCloud S3 files + +## Background + +* RocksDB has the Manifest file as the entry point of each DB instance. + - It contains the meta info of the DB files. + - It also contains a set of SST files. + +* RocksDBCloud extends RocksDB with the capability of storing DB files in S3 storage. + - It also supports forking a DB instance from an existing DB instance. + - Each RocksDBCloud DB instance has a CLOUDMANIFEST file as the entry point of the DB instance. + - The CLOUDMANIFEST file contains a list of the epoch-postfixed Manifest files that appeared in its forking history. + - The DB forking process can be described as the following steps (assuming both DB instances are in the same S3 object path, e.g. s3://bucket_name/object_path/) + - Copy the old CLOUDMANIFEST file (e.g. CLOUDMANIFEST-0) to a new one, giving it a different suffix to distinguish the two files, e.g. CLOUDMANIFEST-1. + - Generate a new epoch (e.g. 582107b0a928437c) to be used as the postfix of a new RocksDB Manifest file (e.g. Manifest-${epoch}). + - Add the new Manifest file and the epoch into the CLOUDMANIFEST file as the current epoch. + - Then, a new DB instance can be started from the new CLOUDMANIFEST file. + - All new SST files created by this new DB instance will be postfixed with the same epoch (e.g. 
002434.sst-${epoch}). + +## Problem + +* Just like in RocksDB, the SST files in RocksDBCloud can be deleted after compaction or after the DB instance is removed. +* But when a forking relation exists between RocksDBCloud DB instances, we cannot simply delete those files, since they can be referenced by other DB instances. +* So we have to prevent the ordinary SST file deletion that happens in normal RocksDB, and instead introduce a purger program to perform the cleanup job. + +## The Purger + +* The existing algorithm, which has already been implemented + - List all SST files under the object_path. + - List all CLOUDMANIFEST files under the object_path. + - Load all current Manifest files. + - Create a live file list by listing all live file numbers in each Manifest and finding out + the correct epoch to form the complete SST file name (e.g. 002442.sst-582107b0a928437c). + - Create a candidate obsolete file list by finding all SST files which are under the + object_path but not in the live file list. + - There can be some newly created SST files that are not yet in the Manifest file, so we only treat + a candidate file older than the Manifest file as an obsolete file. + - The above algorithm is implemented in single_object_path_purge.cc; + please refer to it for details. + +* The problem of the currently implemented purger + - In RocksDB, the SST files are generated by memory table flush and SST compaction. + * They are executed in parallel by a group of background threads, each of which can update the Manifest file. + * The Manifest file is updated each time a memory table is flushed. + * The Manifest file is updated at the end of each compaction round, and several SST files can be generated during a compaction. + - So the Manifest file can be updated before a new SST file is listed in the Manifest file, and that new SST file will then be wrongly treated as obsolete. + +## The improvement of the purger algorithm + +* Based on the observation that all file numbers are monotonically increasing. 
+* We implement a rocksdb::EventListener and subscribe to the FlushBegin and CompactionBegin events +* When a Flush or Compaction happens, we get the max file number from the db, and add it to a time-based sliding window state +* At the end of the sliding window, we find the smallest file number in the state and write it to a file in the S3 object path, + e.g. smallest_new_file_number-${epoch} +* If no flush or compaction happens during the time-based sliding window, then update the smallest_new_file_number-${epoch} file with UINT64_MAX +* When the purger checks a candidate obsolete SST file, if its file number is smaller than the file number in smallest_new_file_number-${epoch}, then + it can be treated as obsolete. + + +## Your tasks + +* Implement the EventListener. + - Reference the RocksDBEventLisener found in rocksdb_data_store_common.h +* Implement the time-based sliding window to update the smallest_new_file_number-${epoch} file. +* Implement a standalone purger program by referencing single_object_path_purge.cc diff --git a/eloq_data_store_service/single_object_path_purge.cc b/eloq_data_store_service/single_object_path_purge.cc new file mode 100644 index 0000000..0f45c99 --- /dev/null +++ b/eloq_data_store_service/single_object_path_purge.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2017 Rockset. +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cloud/cloud_manifest.h" +#include "cloud/filename.h" +#include "cloud/manifest_reader.h" +#include "cloud/purge.h" +#include "file/filename.h" +#include "rocksdb/cloud/cloud_file_system_impl.h" +#include "rocksdb/cloud/cloud_storage_provider.h" + +/** + * Purger implementation for the single object path case. This refactored + * version separates the main loop (Purger) from a single purge cycle + * (RunSinglePurgeCycle) and uses helper methods for each logical step. 
+ * + * Prerequisites for running the purger remain unchanged: it only runs when + * the source and destination buckets are either the same or not both valid + * with non-empty object paths that differ. If both are valid and differ, + * we skip running. + */ +namespace ROCKSDB_NAMESPACE { + +namespace { // anonymous namespace for refactored free helper utilities + +// Type aliases local to this translation unit (were previously in header) +using PurgerAllFiles = + std::vector>; +using PurgerCloudManifestMap = + std::unordered_map>; +using PurgerLiveFileSet = std::unordered_set; +using PurgerEpochManifestMap = + std::unordered_map; + +struct PurgerCycleState { + PurgerAllFiles all_files; // (name, metadata) + std::vector + cloud_manifest_files; // names of CLOUDMANIFEST* objects + PurgerCloudManifestMap cloudmanifests; // loaded cloud manifest objects + PurgerLiveFileSet live_file_names; // logical filenames considered live + PurgerEpochManifestMap + current_epoch_manifest_files; // epoch -> manifest file metadata + std::vector obsolete_files; // files selected for deletion +}; + +static bool PrerequisitesMet(const CloudFileSystemImpl &cfs) { + const CloudFileSystemOptions &cfs_opts = cfs.GetCloudFileSystemOptions(); + if (cfs_opts.src_bucket.IsValid() && + !cfs_opts.src_bucket.GetObjectPath().empty() && + cfs_opts.dest_bucket.IsValid() && + !cfs_opts.dest_bucket.GetObjectPath().empty() && + cfs_opts.src_bucket != cfs_opts.dest_bucket) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Single Object Path Purger is not running because the " + "prerequisites are not met."); + return false; + } + return true; +} + +static IOStatus ListAllFiles(CloudFileSystemImpl &cfs, + PurgerAllFiles *all_files) { + const std::string &dest_object_path = cfs.GetDestObjectPath(); + IOStatus s = cfs.GetStorageProvider()->ListCloudObjects( + cfs.GetDestBucketName(), dest_object_path, all_files); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to 
list files in destination object path %s: %s", + dest_object_path.c_str(), s.ToString().c_str()); + } + + return s; +} + +static IOStatus ListCloudManifests( + CloudFileSystemImpl &cfs, std::vector *cloud_manifest_files) { + IOStatus s = cfs.GetStorageProvider()->ListCloudObjectsWithPrefix( + cfs.GetDestBucketName(), cfs.GetDestObjectPath(), "CLOUDMANIFEST", + cloud_manifest_files); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to list cloud manifest files in bucket %s: %s", + cfs.GetDestBucketName().c_str(), s.ToString().c_str()); + } else { + for (const auto &f : *cloud_manifest_files) { + Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, + "[pg] Found cloud manifest file %s", f.c_str()); + } + } + return s; +} + +static IOStatus LoadCloudManifests( + CloudFileSystemImpl &cfs, + const std::vector &cloud_manifest_files, + PurgerCloudManifestMap *manifests) { + const FileOptions file_opts; + IODebugContext *dbg = nullptr; + IOStatus overall = IOStatus::OK(); + + for (const auto &cloud_manifest_file : cloud_manifest_files) { + std::string cloud_manifest_file_path = + cfs.GetDestObjectPath() + pathsep + cloud_manifest_file; + std::unique_ptr file; + IOStatus s = cfs.NewSequentialFileCloud(cfs.GetDestBucketName(), + cloud_manifest_file_path, file_opts, + &file, dbg); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to open cloud manifest file %s: %s", + cloud_manifest_file.c_str(), s.ToString().c_str()); + if (overall.ok()) overall = s; + continue; + } + + std::unique_ptr cloud_manifest; + s = CloudManifest::LoadFromLog( + std::unique_ptr( + new SequentialFileReader(std::move(file), cloud_manifest_file)), + &cloud_manifest); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to load cloud manifest from file %s: %s", + cloud_manifest_file.c_str(), s.ToString().c_str()); + if (overall.ok()) overall = s; + continue; + } + + Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, + "[pg] Loaded 
cloud manifest file %s with current epoch %s", + cloud_manifest_file.c_str(), cloud_manifest->GetCurrentEpoch().c_str()); + + (*manifests)[cloud_manifest_file] = std::move(cloud_manifest); + } + return overall; +} + +static IOStatus CollectLiveFiles(CloudFileSystemImpl &cfs, + const PurgerCloudManifestMap &cloudmanifests, + PurgerLiveFileSet *live_files, + PurgerEpochManifestMap *epoch_manifest_infos) { + const CloudFileSystemOptions &cfs_opts = cfs.GetCloudFileSystemOptions(); + const std::string &dest_object_path = cfs_opts.dest_bucket.GetObjectPath(); + IOStatus overall = IOStatus::OK(); + + std::unique_ptr manifest_reader(new ManifestReader( + cfs.info_log_, &cfs, cfs_opts.dest_bucket.GetBucketName())); + + std::set + live_file_numbers; // temporary container reused per manifest + + for (auto &entry : cloudmanifests) { + const auto &cloud_manifest_name = entry.first; + CloudManifest *cloud_manifest_ptr = entry.second.get(); + + live_file_numbers.clear(); + + std::string current_epoch = cloud_manifest_ptr->GetCurrentEpoch(); + auto manifest_file = ManifestFileWithEpoch(dest_object_path, current_epoch); + + CloudObjectInformation manifest_file_info; + IOStatus s = cfs.GetStorageProvider()->GetCloudObjectMetadata( + cfs.GetDestBucketName(), manifest_file, &manifest_file_info); + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to get metadata for manifest file %s: %s", + manifest_file.c_str(), s.ToString().c_str()); + if (overall.ok()) overall = s; + continue; + } + + Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, + "[pg] Current epoch Manifest file %s of CloudManifest %s has size %lu " + "and content hash %s and timestamp %lu", + manifest_file.c_str(), cloud_manifest_name.c_str(), + manifest_file_info.size, manifest_file_info.content_hash.c_str(), + manifest_file_info.modification_time); + + (*epoch_manifest_infos)[current_epoch] = manifest_file_info; + + s = manifest_reader->GetLiveFiles(dest_object_path, current_epoch, + 
&live_file_numbers); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to get live files from cloud manifest file %s: %s", + cloud_manifest_name.c_str(), s.ToString().c_str()); + if (overall.ok()) overall = s; + continue; + } + + for (const auto &num : live_file_numbers) { + std::string file_name = MakeTableFileName(num); + file_name = + cfs.RemapFilenameWithCloudManifest(file_name, cloud_manifest_ptr); + live_files->insert(file_name); + Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, + "[pg] Live file %s found in cloud manifest %s", file_name.c_str(), + cloud_manifest_name.c_str()); + } + } + return overall; +} + +static void SelectObsoleteFiles( + CloudFileSystemImpl &cfs, const PurgerAllFiles &all_files, + const PurgerLiveFileSet &live_files, + const PurgerEpochManifestMap &epoch_manifest_infos, + std::vector *obsolete_files) { + for (const auto &candidate : all_files) { + Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, + "[pg] Checking candidate file %s", candidate.first.c_str()); + const std::string &candidate_file_path = candidate.first; + + // Skip files that are not SST files + if (!ends_with(RemoveEpoch(candidate_file_path), ".sst")) { + continue; + } + + const std::string candidate_file_epoch = GetEpoch(candidate_file_path); + const CloudObjectInformation &candidate_file_info = candidate.second; + uint64_t candidate_modification_time = + candidate_file_info.modification_time; + + // Give max value to manifest modification time + // if the candidate file epoch is not current epoch + uint64_t manifest_modification_time = std::numeric_limits::max(); + auto it_epoch = epoch_manifest_infos.find(candidate_file_epoch); + if (it_epoch != epoch_manifest_infos.end()) { + manifest_modification_time = it_epoch->second.modification_time; + } + + if (live_files.find(candidate_file_path) != live_files.end()) { + continue; + } + + if (candidate_modification_time < manifest_modification_time) { + obsolete_files->push_back(candidate_file_path); + 
Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, + "[pg] Candidate file %s is obsolete and will be deleted", + candidate_file_path.c_str()); + } else { + Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, + "[pg] Candidate file %s is not obsolete because its modification " + "time %lu is later than the current epoch manifest file's " + "modification time %lu", + candidate_file_path.c_str(), candidate_modification_time, + manifest_modification_time); + } + } +} + +static void DeleteObsoleteFiles( + CloudFileSystemImpl &cfs, const std::vector &obsolete_files) { + const std::string &dest_object_path = cfs.GetDestObjectPath(); + size_t deleted = 0; + size_t failures = 0; + for (const auto &file_to_delete : obsolete_files) { + std::string file_path = dest_object_path + pathsep + file_to_delete; + Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, + "[pg] Deleting obsolete file %s from destination bucket", + file_to_delete.c_str()); + IOStatus s = cfs.GetStorageProvider()->DeleteCloudObject( + cfs.GetDestBucketName(), file_path); + if (!s.ok()) { + ++failures; + Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, + "[pg] Failed to delete obsolete file %s: %s", file_path.c_str(), + s.ToString().c_str()); + } else { + ++deleted; + } + } + Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, + "[pg] Obsolete deletion summary: requested=%zu deleted=%zu failures=%zu", + obsolete_files.size(), deleted, failures); +} + +static void RunSinglePurgeCycle(CloudFileSystemImpl &cfs) { + PurgerCycleState state; // fresh state each cycle + + Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, + "[pg] Single Object Path Purger started a new cycle"); + + if (!ListAllFiles(cfs, &state.all_files).ok()) { + return; + } + + if (!ListCloudManifests(cfs, &state.cloud_manifest_files).ok()) { + return; + } + + if (!LoadCloudManifests(cfs, state.cloud_manifest_files, + &state.cloudmanifests) + .ok()) { + return; + } + + if (!CollectLiveFiles(cfs, state.cloudmanifests, &state.live_file_names, + &state.current_epoch_manifest_files) 
+ .ok()) { + return; + } + + SelectObsoleteFiles(cfs, state.all_files, state.live_file_names, + state.current_epoch_manifest_files, + &state.obsolete_files); + + DeleteObsoleteFiles(cfs, state.obsolete_files); + + Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, + "[pg] Purge cycle summary: total_listed=%zu manifests_listed=%zu " + "manifests_loaded=%zu live_files=%zu obsolete_selected=%zu", + state.all_files.size(), state.cloud_manifest_files.size(), + state.cloudmanifests.size(), state.live_file_names.size(), + state.obsolete_files.size()); +} + +} // anonymous namespace + +// ------------- Main purger thread ------------- // + +void CloudFileSystemImpl::Purger() { + Log(InfoLogLevel::INFO_LEVEL, info_log_, + "[pg] Single Object Path Purger thread started"); + + if (!PrerequisitesMet(*this)) { + return; + } + + const auto periodicity_ms = + GetCloudFileSystemOptions().purger_periodicity_millis; + + while (true) { + // Wait for next cycle or termination request + std::unique_lock lk(purger_lock_); + purger_cv_.wait_for(lk, std::chrono::milliseconds(periodicity_ms), + [&]() { return !purger_is_running_; }); + if (!purger_is_running_) { + break; // shutdown requested + } + lk.unlock(); // release lock during IO work + + RunSinglePurgeCycle(*this); + } + + Log(InfoLogLevel::INFO_LEVEL, info_log_, + "[pg] Single Object Path Purger thread exiting"); +} + +IOStatus CloudFileSystemImpl::FindObsoleteFiles( + const std::string & /*bucket_name_prefix*/, + std::vector * /*pathnames*/) { + return IOStatus::NotSupported( + "Single Object Path Purger does not support FindObsoleteFiles"); +} +IOStatus CloudFileSystemImpl::FindObsoleteDbid( + const std::string & /*bucket_name_prefix*/, + std::vector * /*to_delete_list*/) { + return IOStatus::NotSupported( + "Single Object Path Purger does not support FindObsoleteDbid"); +} + +IOStatus CloudFileSystemImpl::extractParents( + const std::string & /*bucket_name_prefix*/, const DbidList & /*dbid_list*/, + DbidParents * /*parents*/) { + 
return IOStatus::NotSupported( + "Single Object Path Purger does not support extractParents"); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE From f476d080ea5de1ee1dc6f5d1e0e6f95dde920a51 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Fri, 12 Sep 2025 13:52:30 +0800 Subject: [PATCH 02/15] Initial commit --- eloq_data_store_service/CMakeLists.txt | 21 + .../build_eloq_store.cmake | 4 +- eloq_data_store_service/improved_purger.cpp | 486 ++++++++++++++++++ .../purger_event_listener.cpp | 218 ++++++++ .../purger_event_listener.h | 129 +++++ .../purger_sliding_window.cpp | 251 +++++++++ .../purger_sliding_window.h | 162 ++++++ 7 files changed, 1270 insertions(+), 1 deletion(-) create mode 100644 eloq_data_store_service/improved_purger.cpp create mode 100644 eloq_data_store_service/purger_event_listener.cpp create mode 100644 eloq_data_store_service/purger_event_listener.h create mode 100644 eloq_data_store_service/purger_sliding_window.cpp create mode 100644 eloq_data_store_service/purger_sliding_window.h diff --git a/eloq_data_store_service/CMakeLists.txt b/eloq_data_store_service/CMakeLists.txt index 56b03c0..e56b67c 100644 --- a/eloq_data_store_service/CMakeLists.txt +++ b/eloq_data_store_service/CMakeLists.txt @@ -385,6 +385,8 @@ if ((WITH_DATA_STORE STREQUAL "ELOQDSS_ROCKSDB_CLOUD_S3") OR rocksdb_data_store_common.cpp rocksdb_cloud_data_store.cpp rocksdb_config.cpp + purger_event_listener.cpp + purger_sliding_window.cpp ) elseif (WITH_DATA_STORE STREQUAL "ELOQDSS_ROCKSDB") SET(RESELOQ_SOURCES ${RESELOQ_SOURCES} @@ -433,6 +435,25 @@ if ((WITH_DATA_STORE STREQUAL "ELOQDSS_ROCKSDB_CLOUD_S3") OR INSTALL_RPATH_USE_LINK_PATH TRUE) install(TARGETS rocksdb_cloud_dump RUNTIME DESTINATION bin) + + # Add improved purger utility + add_executable(improved_purger + improved_purger.cpp + rocksdb_data_store_common.cpp + rocksdb_config.cpp + purger_event_listener.cpp + purger_sliding_window.cpp + data_store_service_config.cpp + INIReader.cpp + ini.c + 
ds_request.pb.cc) + target_link_libraries(improved_purger ${DYNAMIC_LIB} ${ROCKSDB_LIBRARIES} ${GFLAGS_LIBRARY}) + set_target_properties(improved_purger PROPERTIES + BUILD_RPATH "$ORIGIN/../lib" + INSTALL_RPATH "$ORIGIN/../lib" + INSTALL_RPATH_USE_LINK_PATH TRUE) + install(TARGETS improved_purger + RUNTIME DESTINATION bin) endif() set_target_properties(dss_server PROPERTIES diff --git a/eloq_data_store_service/build_eloq_store.cmake b/eloq_data_store_service/build_eloq_store.cmake index 50474b7..8eb63d7 100644 --- a/eloq_data_store_service/build_eloq_store.cmake +++ b/eloq_data_store_service/build_eloq_store.cmake @@ -96,7 +96,9 @@ set(ELOQ_STORE_SOURCES ${ELOQ_STORE_SOURCE_DIR}/object_store.cpp ${ELOQ_STORE_SOURCE_DIR}/types.cpp ${ELOQ_STORE_SOURCE_DIR}/kv_options.cpp - ${ELOQ_STORE_SOURCE_DIR}/eloqstore_module.cpp) + ${ELOQ_STORE_SOURCE_DIR}/eloqstore_module.cpp + purger_event_listener.cpp + purger_sliding_window.cpp) add_library(eloqstore STATIC ${ELOQ_STORE_SOURCES} ${INI_SOURCES}) diff --git a/eloq_data_store_service/improved_purger.cpp b/eloq_data_store_service/improved_purger.cpp new file mode 100644 index 0000000..6bc8fd8 --- /dev/null +++ b/eloq_data_store_service/improved_purger.cpp @@ -0,0 +1,486 @@ +/** + * Copyright (C) 2025 EloqData Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under either of the following two licenses: + * 1. GNU Affero General Public License, version 3, as published by the Free + * Software Foundation. + * 2. GNU General Public License as published by the Free Software + * Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License or GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * and GNU General Public License V2 along with this program. If not, see + * . + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cloud/cloud_manifest.h" +#include "cloud/filename.h" +#include "cloud/manifest_reader.h" +#include "file/filename.h" +#include "rocksdb/cloud/cloud_file_system_impl.h" +#include "rocksdb/cloud/cloud_storage_provider.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" + +#include "purger_sliding_window.h" + +// Command line flags +DEFINE_string(s3_url, "", "S3 URL in format s3://bucket/path (required)"); +DEFINE_int32(purge_interval_seconds, 300, "Purge cycle interval in seconds (default: 5 minutes)"); +DEFINE_bool(dry_run, false, "Dry run mode - list obsolete files but don't delete them"); +DEFINE_string(aws_region, "us-west-2", "AWS region (default: us-west-2)"); +DEFINE_int32(file_number_grace_minutes, 5, "Grace period for file number threshold in minutes (default: 5)"); + +namespace ROCKSDB_NAMESPACE { + +/** + * @brief Parse S3 URL into bucket and object path components + * @param s3_url S3 URL in format s3://bucket/path + * @param bucket_name Output bucket name + * @param object_path Output object path + * @return true if parsing succeeded, false otherwise + */ +bool ParseS3Url(const std::string& s3_url, std::string* bucket_name, std::string* object_path) { + std::regex s3_regex(R"(s3://([^/]+)(/.*)?)", std::regex_constants::icase); + std::smatch matches; + + if (!std::regex_match(s3_url, matches, s3_regex)) { + return false; + } + + *bucket_name = matches[1].str(); + *object_path = matches.size() > 2 ? 
matches[2].str() : ""; + + // Remove leading slash from object path + if (!object_path->empty() && (*object_path)[0] == '/') { + *object_path = object_path->substr(1); + } + + return true; +} + +/** + * @brief Enhanced purger with file number threshold support + */ +class ImprovedPurger { +public: + // Type aliases + using PurgerAllFiles = std::vector>; + using PurgerCloudManifestMap = std::unordered_map>; + using PurgerLiveFileSet = std::unordered_set; + using PurgerEpochManifestMap = std::unordered_map; + using PurgerFileNumberThresholds = std::unordered_map; // epoch -> threshold + + struct PurgerCycleState { + PurgerAllFiles all_files; + std::vector cloud_manifest_files; + PurgerCloudManifestMap cloudmanifests; + PurgerLiveFileSet live_file_names; + PurgerEpochManifestMap current_epoch_manifest_files; + PurgerFileNumberThresholds file_number_thresholds; // NEW: epoch -> min file number + std::vector obsolete_files; + }; + +private: + std::shared_ptr cfs_; + std::string bucket_name_; + std::string object_path_; + bool dry_run_; + int file_number_grace_minutes_; + +public: + ImprovedPurger(std::shared_ptr cfs, + const std::string& bucket_name, + const std::string& object_path, + bool dry_run, + int file_number_grace_minutes) + : cfs_(cfs), + bucket_name_(bucket_name), + object_path_(object_path), + dry_run_(dry_run), + file_number_grace_minutes_(file_number_grace_minutes) {} + + /** + * @brief Run a single purge cycle with improved file number checking + */ + void RunSinglePurgeCycle() { + PurgerCycleState state; + + LOG(INFO) << "[ImprovedPurger] Starting purge cycle for " << bucket_name_ << "/" << object_path_; + + if (!ListAllFiles(&state.all_files)) { + return; + } + + if (!ListCloudManifests(&state.cloud_manifest_files)) { + return; + } + + if (!LoadCloudManifests(state.cloud_manifest_files, &state.cloudmanifests)) { + return; + } + + if (!CollectLiveFiles(state.cloudmanifests, &state.live_file_names, + &state.current_epoch_manifest_files)) { + return; + } + 
+ // NEW: Load file number thresholds from S3 + LoadFileNumberThresholds(state.cloudmanifests, &state.file_number_thresholds); + + // Enhanced selection with file number checking + SelectObsoleteFilesWithThreshold(state.all_files, state.live_file_names, + state.current_epoch_manifest_files, + state.file_number_thresholds, + &state.obsolete_files); + + if (dry_run_) { + LOG(INFO) << "[ImprovedPurger] DRY RUN: Would delete " << state.obsolete_files.size() << " files"; + for (const auto& file : state.obsolete_files) { + LOG(INFO) << "[ImprovedPurger] DRY RUN: Would delete " << file; + } + } else { + DeleteObsoleteFiles(state.obsolete_files); + } + + LOG(INFO) << "[ImprovedPurger] Purge cycle summary: total_files=" << state.all_files.size() + << " manifests=" << state.cloudmanifests.size() + << " live_files=" << state.live_file_names.size() + << " obsolete_selected=" << state.obsolete_files.size() + << " thresholds_loaded=" << state.file_number_thresholds.size(); + } + +private: + bool ListAllFiles(PurgerAllFiles* all_files) { + IOStatus s = cfs_->GetStorageProvider()->ListCloudObjects( + bucket_name_, object_path_, all_files); + + if (!s.ok()) { + LOG(ERROR) << "[ImprovedPurger] Failed to list files in " << object_path_ + << ": " << s.ToString(); + return false; + } + + LOG(INFO) << "[ImprovedPurger] Listed " << all_files->size() << " files"; + return true; + } + + bool ListCloudManifests(std::vector* cloud_manifest_files) { + IOStatus s = cfs_->GetStorageProvider()->ListCloudObjectsWithPrefix( + bucket_name_, object_path_, "CLOUDMANIFEST", cloud_manifest_files); + + if (!s.ok()) { + LOG(ERROR) << "[ImprovedPurger] Failed to list cloud manifests: " << s.ToString(); + return false; + } + + LOG(INFO) << "[ImprovedPurger] Found " << cloud_manifest_files->size() << " cloud manifest files"; + return true; + } + + bool LoadCloudManifests(const std::vector& cloud_manifest_files, + PurgerCloudManifestMap* manifests) { + const FileOptions file_opts; + IODebugContext* dbg = 
nullptr; + bool success = true; + + for (const auto& cloud_manifest_file : cloud_manifest_files) { + std::string full_path = object_path_ + "/" + cloud_manifest_file; + std::unique_ptr file; + + IOStatus s = cfs_->NewSequentialFileCloud(bucket_name_, full_path, file_opts, &file, dbg); + if (!s.ok()) { + LOG(ERROR) << "[ImprovedPurger] Failed to open manifest " << cloud_manifest_file + << ": " << s.ToString(); + success = false; + continue; + } + + std::unique_ptr cloud_manifest; + s = CloudManifest::LoadFromLog( + std::unique_ptr( + new SequentialFileReader(std::move(file), cloud_manifest_file)), + &cloud_manifest); + + if (!s.ok()) { + LOG(ERROR) << "[ImprovedPurger] Failed to load manifest " << cloud_manifest_file + << ": " << s.ToString(); + success = false; + continue; + } + + LOG(INFO) << "[ImprovedPurger] Loaded manifest " << cloud_manifest_file + << " with epoch " << cloud_manifest->GetCurrentEpoch(); + + (*manifests)[cloud_manifest_file] = std::move(cloud_manifest); + } + + return success; + } + + bool CollectLiveFiles(const PurgerCloudManifestMap& cloudmanifests, + PurgerLiveFileSet* live_files, + PurgerEpochManifestMap* epoch_manifest_infos) { + std::unique_ptr manifest_reader( + new ManifestReader(cfs_->info_log_, cfs_.get(), bucket_name_)); + + std::set live_file_numbers; + bool success = true; + + for (const auto& entry : cloudmanifests) { + const std::string& cloud_manifest_name = entry.first; + CloudManifest* cloud_manifest_ptr = entry.second.get(); + + live_file_numbers.clear(); + std::string current_epoch = cloud_manifest_ptr->GetCurrentEpoch(); + std::string manifest_file = ManifestFileWithEpoch(object_path_, current_epoch); + + CloudObjectInformation manifest_file_info; + IOStatus s = cfs_->GetStorageProvider()->GetCloudObjectMetadata( + bucket_name_, manifest_file, &manifest_file_info); + + if (!s.ok()) { + LOG(ERROR) << "[ImprovedPurger] Failed to get metadata for manifest " + << manifest_file << ": " << s.ToString(); + success = false; + 
continue; + } + + (*epoch_manifest_infos)[current_epoch] = manifest_file_info; + + s = manifest_reader->GetLiveFiles(object_path_, current_epoch, &live_file_numbers); + if (!s.ok()) { + LOG(ERROR) << "[ImprovedPurger] Failed to get live files from manifest " + << cloud_manifest_name << ": " << s.ToString(); + success = false; + continue; + } + + for (uint64_t num : live_file_numbers) { + std::string file_name = MakeTableFileName(num); + file_name = cfs_->RemapFilenameWithCloudManifest(file_name, cloud_manifest_ptr); + live_files->insert(file_name); + DLOG(INFO) << "[ImprovedPurger] Live file: " << file_name; + } + } + + return success; + } + + void LoadFileNumberThresholds(const PurgerCloudManifestMap& cloudmanifests, + PurgerFileNumberThresholds* thresholds) { + for (const auto& entry : cloudmanifests) { + CloudManifest* manifest = entry.second.get(); + std::string epoch = manifest->GetCurrentEpoch(); + + // Create S3 file number updater to read threshold + auto s3_updater = std::make_unique( + bucket_name_, object_path_, epoch, cfs_->GetStorageProvider()); + + uint64_t threshold = s3_updater->ReadSmallestFileNumber(); + (*thresholds)[epoch] = threshold; + + if (threshold == std::numeric_limits::max()) { + LOG(INFO) << "[ImprovedPurger] No file number threshold found for epoch " << epoch + << " (using conservative approach)"; + } else { + LOG(INFO) << "[ImprovedPurger] Loaded file number threshold " << threshold + << " for epoch " << epoch; + } + } + } + + void SelectObsoleteFilesWithThreshold(const PurgerAllFiles& all_files, + const PurgerLiveFileSet& live_files, + const PurgerEpochManifestMap& epoch_manifest_infos, + const PurgerFileNumberThresholds& thresholds, + std::vector* obsolete_files) { + for (const auto& candidate : all_files) { + const std::string& candidate_file_path = candidate.first; + const CloudObjectInformation& candidate_file_info = candidate.second; + + // Skip non-SST files + if (!ends_with(RemoveEpoch(candidate_file_path), ".sst")) { + 
continue; + } + + // Skip live files + if (live_files.find(candidate_file_path) != live_files.end()) { + continue; + } + + std::string candidate_epoch = GetEpoch(candidate_file_path); + uint64_t candidate_modification_time = candidate_file_info.modification_time; + + // Get manifest modification time + uint64_t manifest_modification_time = std::numeric_limits::max(); + auto epoch_it = epoch_manifest_infos.find(candidate_epoch); + if (epoch_it != epoch_manifest_infos.end()) { + manifest_modification_time = epoch_it->second.modification_time; + } + + // NEW: Check file number threshold + bool safe_by_file_number = true; + auto threshold_it = thresholds.find(candidate_epoch); + if (threshold_it != thresholds.end()) { + uint64_t threshold = threshold_it->second; + if (threshold != std::numeric_limits::max()) { + // Extract file number from candidate file name + uint64_t file_number = 0; + std::string base_name = RemoveEpoch(candidate_file_path); + if (ParseFileName(base_name, &file_number, nullptr)) { + if (file_number >= threshold) { + safe_by_file_number = false; + DLOG(INFO) << "[ImprovedPurger] File " << candidate_file_path + << " protected by file number threshold (file_num=" + << file_number << ", threshold=" << threshold << ")"; + } + } + } + } + + // Apply both time-based and file number-based checks + if (safe_by_file_number && candidate_modification_time < manifest_modification_time) { + obsolete_files->push_back(candidate_file_path); + DLOG(INFO) << "[ImprovedPurger] File " << candidate_file_path << " is obsolete"; + } else { + DLOG(INFO) << "[ImprovedPurger] File " << candidate_file_path + << " is protected (safe_by_file_number=" << safe_by_file_number + << ", candidate_time=" << candidate_modification_time + << ", manifest_time=" << manifest_modification_time << ")"; + } + } + } + + void DeleteObsoleteFiles(const std::vector& obsolete_files) { + size_t deleted = 0; + size_t failures = 0; + + for (const auto& file_to_delete : obsolete_files) { + std::string 
file_path = object_path_ + "/" + file_to_delete; + LOG(INFO) << "[ImprovedPurger] Deleting obsolete file " << file_to_delete; + + IOStatus s = cfs_->GetStorageProvider()->DeleteCloudObject(bucket_name_, file_path); + if (!s.ok()) { + ++failures; + LOG(ERROR) << "[ImprovedPurger] Failed to delete " << file_path + << ": " << s.ToString(); + } else { + ++deleted; + } + } + + LOG(INFO) << "[ImprovedPurger] Deletion summary: requested=" << obsolete_files.size() + << " deleted=" << deleted << " failures=" << failures; + } +}; + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char* argv[]) { + // Initialize gflags and glog + gflags::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + + // Set log level to INFO by default + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 0; // INFO level + + if (FLAGS_s3_url.empty()) { + std::cerr << "Error: --s3_url is required\n"; + std::cerr << "Usage: " << argv[0] << " --s3_url=s3://bucket/path [options]\n"; + std::cerr << "Options:\n"; + std::cerr << " --purge_interval_seconds=300 Purge cycle interval (default: 5 minutes)\n"; + std::cerr << " --dry_run=false Dry run mode - don't actually delete files\n"; + std::cerr << " --aws_region=us-west-2 AWS region\n"; + std::cerr << " --file_number_grace_minutes=5 Grace period for file number threshold\n"; + return 1; + } + + std::string bucket_name, object_path; + if (!ROCKSDB_NAMESPACE::ParseS3Url(FLAGS_s3_url, &bucket_name, &object_path)) { + std::cerr << "Error: Invalid S3 URL format. Expected: s3://bucket/path\n"; + return 1; + } + + LOG(INFO) << "Starting improved purger for S3 URL: " << FLAGS_s3_url; + LOG(INFO) << "Parsed - Bucket: " << bucket_name << ", Object Path: " << object_path; + LOG(INFO) << "Configuration - Purge Interval: " << FLAGS_purge_interval_seconds + << "s, Dry Run: " << (FLAGS_dry_run ? 
"true" : "false") + << ", AWS Region: " << FLAGS_aws_region + << ", File Number Grace: " << FLAGS_file_number_grace_minutes << " minutes"; + + try { + // Create CloudFileSystemOptions + ROCKSDB_NAMESPACE::CloudFileSystemOptions cfs_options; + cfs_options.src_bucket.SetBucketName(bucket_name, ""); + cfs_options.src_bucket.SetRegion(FLAGS_aws_region); + cfs_options.dest_bucket = cfs_options.src_bucket; + cfs_options.dest_bucket.SetObjectPath(object_path); + cfs_options.purger_periodicity_millis = FLAGS_purge_interval_seconds * 1000; + + // Create CloudFileSystem + std::shared_ptr cloud_fs; + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::CloudFileSystem::NewAwsFileSystem( + ROCKSDB_NAMESPACE::FileSystem::Default(), cfs_options, &cloud_fs); + + if (!s.ok()) { + LOG(FATAL) << "Failed to create CloudFileSystem: " << s.ToString(); + return 1; + } + + auto cfs_impl = std::dynamic_pointer_cast(cloud_fs); + if (!cfs_impl) { + LOG(FATAL) << "Failed to cast to CloudFileSystemImpl"; + return 1; + } + + // Create and run improved purger + ROCKSDB_NAMESPACE::ImprovedPurger purger( + cfs_impl, bucket_name, object_path, + FLAGS_dry_run, FLAGS_file_number_grace_minutes); + + // Run purge cycles + while (true) { + auto start_time = std::chrono::steady_clock::now(); + + purger.RunSinglePurgeCycle(); + + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time); + LOG(INFO) << "Purge cycle completed in " << duration.count() << "ms"; + + // Sleep until next cycle + std::this_thread::sleep_for(std::chrono::seconds(FLAGS_purge_interval_seconds)); + } + + } catch (const std::exception& e) { + LOG(FATAL) << "Exception: " << e.what(); + return 1; + } + + return 0; +} + diff --git a/eloq_data_store_service/purger_event_listener.cpp b/eloq_data_store_service/purger_event_listener.cpp new file mode 100644 index 0000000..6452a65 --- /dev/null +++ b/eloq_data_store_service/purger_event_listener.cpp @@ -0,0 +1,218 @@ +/** + * Copyright 
(C) 2025 EloqData Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under either of the following two licenses: + * 1. GNU Affero General Public License, version 3, as published by the Free + * Software Foundation. + * 2. GNU General Public License as published by the Free Software + * Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License or GNU General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * and GNU General Public License V2 along with this program. If not, see + * . + * + */ + +#include "purger_event_listener.h" + +#include + +#include + +namespace EloqDS +{ + +PurgerEventListener::PurgerEventListener( + const std::string &epoch, + const std::string &bucket_name, + const std::string &s3_object_path, + std::shared_ptr storage_provider, + std::chrono::milliseconds window_duration, + std::chrono::milliseconds s3_update_interval) + : epoch_(epoch), bucket_name_(bucket_name), s3_object_path_(s3_object_path) +{ + sliding_window_ = std::make_unique(window_duration, + s3_update_interval, + epoch_, + bucket_name_, + s3_object_path_, + storage_provider); + + LOG(INFO) << "PurgerEventListener created for epoch " << epoch_ + << ", bucket: " << bucket_name_ + << ", object_path: " << s3_object_path_ + << ", window_duration: " << window_duration.count() << "ms" + << ", s3_update_interval: " << s3_update_interval.count() << "ms"; +} + +PurgerEventListener::~PurgerEventListener() +{ + Stop(); +} + +void PurgerEventListener::OnFlushBegin( + rocksdb::DB *db, const rocksdb::FlushJobInfo &flush_job_info) +{ + // Log flush begin event (similar to existing RocksDBEventListener) + if (flush_job_info.triggered_writes_slowdown || + 
flush_job_info.triggered_writes_stop) + { + LOG(INFO) << "[PurgerEventListener] Flush begin, file: " + << flush_job_info.file_path + << ", job_id: " << flush_job_info.job_id + << ", thread: " << flush_job_info.thread_id + << ", file_number: " << flush_job_info.file_number + << ", triggered_writes_slowdown: " + << flush_job_info.triggered_writes_slowdown + << ", triggered_writes_stop: " + << flush_job_info.triggered_writes_stop + << ", smallest_seqno: " << flush_job_info.smallest_seqno + << ", largest_seqno: " << flush_job_info.largest_seqno + << ", flush_reason: " + << GetFlushReason(flush_job_info.flush_reason) + << ", epoch: " << epoch_; + } + + // Update sliding window with current max file number + UpdateSlidingWindow(db); + + DLOG(INFO) << "[PurgerEventListener] OnFlushBegin processed for epoch " + << epoch_ << ", job_id: " << flush_job_info.job_id; +} + +void PurgerEventListener::OnFlushCompleted( + rocksdb::DB *db, const rocksdb::FlushJobInfo &flush_job_info) +{ + // Log flush completion event + if (flush_job_info.triggered_writes_slowdown || + flush_job_info.triggered_writes_stop) + { + LOG(INFO) << "[PurgerEventListener] Flush completed, file: " + << flush_job_info.file_path + << ", job_id: " << flush_job_info.job_id + << ", thread: " << flush_job_info.thread_id + << ", file_number: " << flush_job_info.file_number + << ", triggered_writes_slowdown: " + << flush_job_info.triggered_writes_slowdown + << ", triggered_writes_stop: " + << flush_job_info.triggered_writes_stop + << ", smallest_seqno: " << flush_job_info.smallest_seqno + << ", largest_seqno: " << flush_job_info.largest_seqno + << ", flush_reason: " + << GetFlushReason(flush_job_info.flush_reason) + << ", epoch: " << epoch_; + } + + DLOG(INFO) << "[PurgerEventListener] OnFlushCompleted processed for epoch " + << epoch_ << ", job_id: " << flush_job_info.job_id; +} + +void PurgerEventListener::OnCompactionBegin( + rocksdb::DB *db, const rocksdb::CompactionJobInfo &ci) +{ + DLOG(INFO) << 
"[PurgerEventListener] Compaction begin, job_id: " + << ci.job_id << ", thread: " << ci.thread_id + << ", output_level: " << ci.output_level + << ", input_files_size: " << ci.input_files.size() + << ", compaction_reason: " + << static_cast(ci.compaction_reason) + << ", epoch: " << epoch_; + + // Update sliding window with current max file number + UpdateSlidingWindow(db); + + DLOG(INFO) << "[PurgerEventListener] OnCompactionBegin processed for epoch " + << epoch_ << ", job_id: " << ci.job_id; +} + +void PurgerEventListener::OnCompactionCompleted( + rocksdb::DB *db, const rocksdb::CompactionJobInfo &ci) +{ + DLOG(INFO) << "[PurgerEventListener] Compaction completed, job_id: " + << ci.job_id << ", thread: " << ci.thread_id + << ", output_level: " << ci.output_level + << ", input_files_size: " << ci.input_files.size() + << ", output_files_size: " << ci.output_files.size() + << ", compaction_reason: " + << static_cast(ci.compaction_reason) + << ", epoch: " << epoch_; + + DLOG(INFO) + << "[PurgerEventListener] OnCompactionCompleted processed for epoch " + << epoch_ << ", job_id: " << ci.job_id; +} + +void PurgerEventListener::Stop() +{ + if (sliding_window_) + { + sliding_window_->Stop(); + sliding_window_.reset(); + } + + LOG(INFO) << "PurgerEventListener stopped for epoch " << epoch_; +} + +std::string PurgerEventListener::GetFlushReason( + rocksdb::FlushReason flush_reason) +{ + switch (flush_reason) + { + case rocksdb::FlushReason::kOthers: + return "Others"; + case rocksdb::FlushReason::kGetLiveFiles: + return "GetLiveFiles"; + case rocksdb::FlushReason::kShutDown: + return "ShutDown"; + case rocksdb::FlushReason::kExternalFileIngestion: + return "ExternalFileIngestion"; + case rocksdb::FlushReason::kManualCompaction: + return "ManualCompaction"; + case rocksdb::FlushReason::kWriteBufferManager: + return "WriteBufferManager"; + case rocksdb::FlushReason::kWriteBufferFull: + return "WriteBufferFull"; + case rocksdb::FlushReason::kTest: + return "Test"; + case 
rocksdb::FlushReason::kDeleteFiles: + return "DeleteFiles"; + case rocksdb::FlushReason::kAutoCompaction: + return "AutoCompaction"; + case rocksdb::FlushReason::kManualFlush: + return "ManualFlush"; + case rocksdb::FlushReason::kErrorRecovery: + return "ErrorRecovery"; + default: + return "Unknown"; + } +} + +void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db) +{ + if (!db) + { + LOG(ERROR) << "[PurgerEventListener] DB pointer is null for epoch " + << epoch_; + return; + } + + // Get current max file number from RocksDB + uint64_t max_file_number = db->GetNextFileNumber() - 1; + + if (sliding_window_) + { + sliding_window_->AddFileNumber(max_file_number); + } + + DLOG(INFO) << "[PurgerEventListener] Added file number to sliding window: " + << max_file_number << ", epoch: " << epoch_; +} + +} // namespace EloqDS diff --git a/eloq_data_store_service/purger_event_listener.h b/eloq_data_store_service/purger_event_listener.h new file mode 100644 index 0000000..d2bec63 --- /dev/null +++ b/eloq_data_store_service/purger_event_listener.h @@ -0,0 +1,129 @@ +/** + * Copyright (C) 2025 EloqData Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under either of the following two licenses: + * 1. GNU Affero General Public License, version 3, as published by the Free + * Software Foundation. + * 2. GNU General Public License as published by the Free Software + * Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License or GNU General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * and GNU General Public License V2 along with this program. If not, see + * . 
+ * + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include "purger_sliding_window.h" + +namespace EloqDS +{ + +/** + * @brief Enhanced EventListener for tracking file numbers to improve purger safety + * + * This listener subscribes to FlushBegin and CompactionBegin events to capture + * the maximum file number at the time of these operations. The file numbers are + * fed into a sliding window which periodically updates S3 with the smallest + * file number threshold to prevent premature deletion by the purger. + */ +class PurgerEventListener : public rocksdb::EventListener +{ +public: + /** + * @brief Constructor for PurgerEventListener + * @param epoch The epoch string for this DB instance + * @param bucket_name S3 bucket name + * @param s3_object_path S3 object path + * @param storage_provider Cloud storage provider for S3 operations + * @param window_duration Duration to keep entries in sliding window (default: 5 minutes) + * @param s3_update_interval Interval for updating S3 file (default: 1 minute) + */ + PurgerEventListener(const std::string& epoch, + const std::string& bucket_name, + const std::string& s3_object_path, + std::shared_ptr storage_provider, + std::chrono::milliseconds window_duration = std::chrono::minutes(5), + std::chrono::milliseconds s3_update_interval = std::chrono::minutes(1)); + + /** + * @brief Destructor - stops the sliding window + */ + ~PurgerEventListener(); + + /** + * @brief Called when a flush operation begins + * @param db Pointer to the database instance + * @param flush_job_info Information about the flush operation + */ + void OnFlushBegin(rocksdb::DB* db, + const rocksdb::FlushJobInfo& flush_job_info) override; + + /** + * @brief Called when a flush operation completes + * @param db Pointer to the database instance + * @param flush_job_info Information about the flush operation + */ + void OnFlushCompleted(rocksdb::DB* db, + const rocksdb::FlushJobInfo& flush_job_info) override; + + 
/** + * @brief Called when a compaction operation begins + * @param db Pointer to the database instance + * @param ci Information about the compaction operation + */ + void OnCompactionBegin(rocksdb::DB* db, + const rocksdb::CompactionJobInfo& ci) override; + + /** + * @brief Called when a compaction operation completes + * @param db Pointer to the database instance + * @param ci Information about the compaction operation + */ + void OnCompactionCompleted(rocksdb::DB* db, + const rocksdb::CompactionJobInfo& ci) override; + + /** + * @brief Stop the event listener and cleanup resources + */ + void Stop(); + + /** + * @brief Get flush reason string for logging + * @param flush_reason The flush reason enum value + * @return String representation of the flush reason + */ + std::string GetFlushReason(rocksdb::FlushReason flush_reason); + +private: + std::string epoch_; + std::string bucket_name_; + std::string s3_object_path_; + + std::unique_ptr sliding_window_; + + /** + * @brief Update sliding window with current max file number from DB + * @param db Pointer to the database instance + */ + void UpdateSlidingWindow(rocksdb::DB* db); +}; + +} // namespace EloqDS + diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp new file mode 100644 index 0000000..b428c11 --- /dev/null +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -0,0 +1,251 @@ +/** + * Copyright (C) 2025 EloqData Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under either of the following two licenses: + * 1. GNU Affero General Public License, version 3, as published by the Free + * Software Foundation. + * 2. GNU General Public License as published by the Free Software + * Foundation; version 2 of the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License or GNU General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * and GNU General Public License V2 along with this program. If not, see + * . + * + */ + +#include "purger_sliding_window.h" + +#include +#include + +namespace EloqDS +{ + +// S3FileNumberUpdater implementation + +S3FileNumberUpdater::S3FileNumberUpdater( + const std::string& bucket_name, + const std::string& s3_object_path, + const std::string& epoch, + std::shared_ptr storage_provider) + : bucket_name_(bucket_name), + s3_object_path_(s3_object_path), + epoch_(epoch), + storage_provider_(storage_provider) +{ +} + +void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number) +{ + std::string content = std::to_string(file_number); + std::string object_key = GetS3ObjectKey(); + + rocksdb::IOStatus s = storage_provider_->PutCloudObject( + bucket_name_, object_key, content); + + if (!s.ok()) { + LOG(ERROR) << "Failed to update smallest file number to S3: " + << s.ToString() << ", object_key: " << object_key + << ", file_number: " << file_number; + } else { + DLOG(INFO) << "Updated smallest file number in S3: " << file_number + << ", object_key: " << object_key; + } +} + +uint64_t S3FileNumberUpdater::ReadSmallestFileNumber() +{ + std::string object_key = GetS3ObjectKey(); + std::string content; + + rocksdb::IOStatus s = storage_provider_->GetCloudObject( + bucket_name_, object_key, content); + + if (!s.ok()) { + DLOG(INFO) << "Failed to read smallest file number from S3: " + << s.ToString() << ", object_key: " << object_key + << ", returning UINT64_MIN"; + return std::numeric_limits::min(); + } + + try { + uint64_t file_number = std::stoull(content); + DLOG(INFO) << "Read smallest file 
number from S3: " << file_number + << ", object_key: " << object_key; + return file_number; + } catch (const std::exception& e) { + LOG(ERROR) << "Failed to parse smallest file number from S3 content: '" + << content << "', error: " << e.what() + << ", object_key: " << object_key; + return std::numeric_limits::min(); + } +} + +void S3FileNumberUpdater::WriteNoActivityMarker() +{ + UpdateSmallestFileNumber(std::numeric_limits::max()); +} + +std::string S3FileNumberUpdater::GetS3ObjectKey() const +{ + std::ostringstream oss; + oss << s3_object_path_; + if (!s3_object_path_.empty() && s3_object_path_.back() != '/') { + oss << "/"; + } + oss << "smallest_new_file_number-" << epoch_; + return oss.str(); +} + +// SlidingWindow implementation + +SlidingWindow::SlidingWindow( + std::chrono::milliseconds window_duration, + std::chrono::milliseconds s3_update_interval, + const std::string& epoch, + const std::string& bucket_name, + const std::string& s3_object_path, + std::shared_ptr storage_provider) + : window_duration_(window_duration), + s3_update_interval_(s3_update_interval), + epoch_(epoch), + should_stop_(false) +{ + s3_updater_ = std::make_unique( + bucket_name, s3_object_path, epoch, storage_provider); + + // Start the timer thread + timer_thread_ = std::make_unique(&SlidingWindow::TimerWorker, this); + + DLOG(INFO) << "SlidingWindow started for epoch " << epoch_ + << ", window_duration: " << window_duration_.count() << "ms" + << ", s3_update_interval: " << s3_update_interval_.count() << "ms"; +} + +SlidingWindow::~SlidingWindow() +{ + Stop(); +} + +void SlidingWindow::AddFileNumber(uint64_t file_number) +{ + std::lock_guard lock(window_mutex_); + + window_entries_.emplace_back(file_number); + + DLOG(INFO) << "Added file number to sliding window: " << file_number + << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); +} + +uint64_t SlidingWindow::GetSmallestFileNumber() +{ + std::lock_guard lock(window_mutex_); + + if (window_entries_.empty()) { 
+ return std::numeric_limits::max(); + } + + uint64_t smallest = window_entries_[0].file_number; + for (const auto& entry : window_entries_) { + if (entry.file_number < smallest) { + smallest = entry.file_number; + } + } + + DLOG(INFO) << "Current smallest file number: " << smallest + << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); + + return smallest; +} + +void SlidingWindow::Stop() +{ + // Signal the timer thread to stop + { + std::lock_guard lock(window_mutex_); + if (should_stop_) { + return; // Already stopped + } + should_stop_ = true; + } + cv_.notify_all(); + + // Wait for the timer thread to finish + if (timer_thread_ && timer_thread_->joinable()) { + timer_thread_->join(); + timer_thread_.reset(); + } + + DLOG(INFO) << "SlidingWindow stopped for epoch " << epoch_; +} + +void SlidingWindow::TimerWorker() +{ + std::unique_lock lock(window_mutex_); + + while (!should_stop_) { + // Wait for the specified interval or stop signal + cv_.wait_for(lock, s3_update_interval_, [this] { return should_stop_; }); + + if (should_stop_) { + break; + } + + // Perform periodic tasks + CleanupExpiredEntries(); + + // Release lock during S3 operation to avoid blocking AddFileNumber + lock.unlock(); + FlushToS3(); + lock.lock(); + } + + DLOG(INFO) << "SlidingWindow timer thread exiting for epoch " << epoch_; +} + +void SlidingWindow::FlushToS3() +{ + uint64_t smallest = GetSmallestFileNumber(); + + if (smallest == std::numeric_limits::max()) { + // No activity marker + s3_updater_->WriteNoActivityMarker(); + DLOG(INFO) << "Wrote no activity marker to S3 for epoch " << epoch_; + } else { + s3_updater_->UpdateSmallestFileNumber(smallest); + DLOG(INFO) << "Updated S3 with smallest file number: " << smallest + << ", epoch: " << epoch_; + } +} + +void SlidingWindow::CleanupExpiredEntries() +{ + // This function is called with window_mutex_ already locked + + auto now = std::chrono::steady_clock::now(); + auto cutoff_time = now - window_duration_; + + 
size_t original_size = window_entries_.size(); + + // Remove expired entries from the front of the deque + while (!window_entries_.empty() && + window_entries_.front().timestamp < cutoff_time) { + window_entries_.pop_front(); + } + + size_t removed = original_size - window_entries_.size(); + if (removed > 0) { + DLOG(INFO) << "Cleaned up " << removed << " expired entries from sliding window" + << ", epoch: " << epoch_ << ", remaining: " << window_entries_.size(); + } +} + +} // namespace EloqDS + diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h new file mode 100644 index 0000000..d641d25 --- /dev/null +++ b/eloq_data_store_service/purger_sliding_window.h @@ -0,0 +1,162 @@ +/** + * Copyright (C) 2025 EloqData Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under either of the following two licenses: + * 1. GNU Affero General Public License, version 3, as published by the Free + * Software Foundation. + * 2. GNU General Public License as published by the Free Software + * Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License or GNU General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * and GNU General Public License V2 along with this program. If not, see + * . 
+ * + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace EloqDS +{ + +/** + * @brief S3 file updater for writing smallest file number to S3 + */ +class S3FileNumberUpdater +{ +public: + S3FileNumberUpdater(const std::string& bucket_name, + const std::string& s3_object_path, + const std::string& epoch, + std::shared_ptr storage_provider); + + ~S3FileNumberUpdater() = default; + + /** + * @brief Update the smallest file number in S3 + * @param file_number The smallest file number to write + */ + void UpdateSmallestFileNumber(uint64_t file_number); + + /** + * @brief Read the smallest file number from S3 + * @return The smallest file number, or UINT64_MAX if not found + */ + uint64_t ReadSmallestFileNumber(); + + /** + * @brief Write no activity marker (UINT64_MAX) to S3 + */ + void WriteNoActivityMarker(); + +private: + std::string bucket_name_; + std::string s3_object_path_; + std::string epoch_; + std::shared_ptr storage_provider_; + + std::string GetS3ObjectKey() const; +}; + +/** + * @brief Time-based sliding window for tracking file numbers with automatic S3 updates + */ +class SlidingWindow +{ +public: + /** + * @brief Constructor for sliding window + * @param window_duration Duration to keep entries in the window + * @param s3_update_interval Interval for updating S3 file + * @param epoch The epoch string for this DB instance + * @param bucket_name S3 bucket name + * @param s3_object_path S3 object path + * @param storage_provider Cloud storage provider for S3 operations + */ + SlidingWindow(std::chrono::milliseconds window_duration, + std::chrono::milliseconds s3_update_interval, + const std::string& epoch, + const std::string& bucket_name, + const std::string& s3_object_path, + std::shared_ptr storage_provider); + + /** + * @brief Destructor - stops the timer thread + */ + ~SlidingWindow(); + + /** + * @brief Add a file number to the sliding window + * @param file_number The file 
number to add + */ + void AddFileNumber(uint64_t file_number); + + /** + * @brief Get the smallest file number in the current window + * @return The smallest file number, or UINT64_MAX if window is empty + */ + uint64_t GetSmallestFileNumber(); + + /** + * @brief Stop the sliding window and cleanup + */ + void Stop(); + +private: + struct WindowEntry + { + uint64_t file_number; + std::chrono::steady_clock::time_point timestamp; + + WindowEntry(uint64_t num) + : file_number(num), timestamp(std::chrono::steady_clock::now()) {} + }; + + std::deque window_entries_; + std::chrono::milliseconds window_duration_; + std::chrono::milliseconds s3_update_interval_; + std::string epoch_; + + std::unique_ptr s3_updater_; + + // Threading + std::unique_ptr timer_thread_; + std::mutex window_mutex_; + std::condition_variable cv_; + bool should_stop_; + + /** + * @brief Timer thread worker function + */ + void TimerWorker(); + + /** + * @brief Flush current minimum file number to S3 + */ + void FlushToS3(); + + /** + * @brief Remove expired entries from the window + */ + void CleanupExpiredEntries(); +}; + +} // namespace EloqDS + From 992ac5ba6cde3704bb24ad7ec3987c41c6ae7989 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Mon, 15 Sep 2025 16:06:29 +0800 Subject: [PATCH 03/15] Remove improved_purger --- eloq_data_store_service/CMakeLists.txt | 19 - eloq_data_store_service/improved_purger.cpp | 486 ------------------ .../purger_event_listener.cpp | 23 +- .../purger_event_listener.h | 4 +- eloq_data_store_service/purger_prompt.md | 65 --- .../purger_sliding_window.cpp | 59 ++- .../purger_sliding_window.h | 62 ++- 7 files changed, 93 insertions(+), 625 deletions(-) delete mode 100644 eloq_data_store_service/improved_purger.cpp delete mode 100644 eloq_data_store_service/purger_prompt.md diff --git a/eloq_data_store_service/CMakeLists.txt b/eloq_data_store_service/CMakeLists.txt index e56b67c..8a29c9f 100644 --- a/eloq_data_store_service/CMakeLists.txt +++ 
b/eloq_data_store_service/CMakeLists.txt @@ -435,25 +435,6 @@ if ((WITH_DATA_STORE STREQUAL "ELOQDSS_ROCKSDB_CLOUD_S3") OR INSTALL_RPATH_USE_LINK_PATH TRUE) install(TARGETS rocksdb_cloud_dump RUNTIME DESTINATION bin) - - # Add improved purger utility - add_executable(improved_purger - improved_purger.cpp - rocksdb_data_store_common.cpp - rocksdb_config.cpp - purger_event_listener.cpp - purger_sliding_window.cpp - data_store_service_config.cpp - INIReader.cpp - ini.c - ds_request.pb.cc) - target_link_libraries(improved_purger ${DYNAMIC_LIB} ${ROCKSDB_LIBRARIES} ${GFLAGS_LIBRARY}) - set_target_properties(improved_purger PROPERTIES - BUILD_RPATH "$ORIGIN/../lib" - INSTALL_RPATH "$ORIGIN/../lib" - INSTALL_RPATH_USE_LINK_PATH TRUE) - install(TARGETS improved_purger - RUNTIME DESTINATION bin) endif() set_target_properties(dss_server PROPERTIES diff --git a/eloq_data_store_service/improved_purger.cpp b/eloq_data_store_service/improved_purger.cpp deleted file mode 100644 index 6bc8fd8..0000000 --- a/eloq_data_store_service/improved_purger.cpp +++ /dev/null @@ -1,486 +0,0 @@ -/** - * Copyright (C) 2025 EloqData Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under either of the following two licenses: - * 1. GNU Affero General Public License, version 3, as published by the Free - * Software Foundation. - * 2. GNU General Public License as published by the Free Software - * Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License or GNU General Public License for more - * details. - * - * You should have received a copy of the GNU Affero General Public License - * and GNU General Public License V2 along with this program. If not, see - * . 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "cloud/cloud_manifest.h" -#include "cloud/filename.h" -#include "cloud/manifest_reader.h" -#include "file/filename.h" -#include "rocksdb/cloud/cloud_file_system_impl.h" -#include "rocksdb/cloud/cloud_storage_provider.h" -#include "rocksdb/env.h" -#include "rocksdb/options.h" - -#include "purger_sliding_window.h" - -// Command line flags -DEFINE_string(s3_url, "", "S3 URL in format s3://bucket/path (required)"); -DEFINE_int32(purge_interval_seconds, 300, "Purge cycle interval in seconds (default: 5 minutes)"); -DEFINE_bool(dry_run, false, "Dry run mode - list obsolete files but don't delete them"); -DEFINE_string(aws_region, "us-west-2", "AWS region (default: us-west-2)"); -DEFINE_int32(file_number_grace_minutes, 5, "Grace period for file number threshold in minutes (default: 5)"); - -namespace ROCKSDB_NAMESPACE { - -/** - * @brief Parse S3 URL into bucket and object path components - * @param s3_url S3 URL in format s3://bucket/path - * @param bucket_name Output bucket name - * @param object_path Output object path - * @return true if parsing succeeded, false otherwise - */ -bool ParseS3Url(const std::string& s3_url, std::string* bucket_name, std::string* object_path) { - std::regex s3_regex(R"(s3://([^/]+)(/.*)?)", std::regex_constants::icase); - std::smatch matches; - - if (!std::regex_match(s3_url, matches, s3_regex)) { - return false; - } - - *bucket_name = matches[1].str(); - *object_path = matches.size() > 2 ? 
matches[2].str() : ""; - - // Remove leading slash from object path - if (!object_path->empty() && (*object_path)[0] == '/') { - *object_path = object_path->substr(1); - } - - return true; -} - -/** - * @brief Enhanced purger with file number threshold support - */ -class ImprovedPurger { -public: - // Type aliases - using PurgerAllFiles = std::vector>; - using PurgerCloudManifestMap = std::unordered_map>; - using PurgerLiveFileSet = std::unordered_set; - using PurgerEpochManifestMap = std::unordered_map; - using PurgerFileNumberThresholds = std::unordered_map; // epoch -> threshold - - struct PurgerCycleState { - PurgerAllFiles all_files; - std::vector cloud_manifest_files; - PurgerCloudManifestMap cloudmanifests; - PurgerLiveFileSet live_file_names; - PurgerEpochManifestMap current_epoch_manifest_files; - PurgerFileNumberThresholds file_number_thresholds; // NEW: epoch -> min file number - std::vector obsolete_files; - }; - -private: - std::shared_ptr cfs_; - std::string bucket_name_; - std::string object_path_; - bool dry_run_; - int file_number_grace_minutes_; - -public: - ImprovedPurger(std::shared_ptr cfs, - const std::string& bucket_name, - const std::string& object_path, - bool dry_run, - int file_number_grace_minutes) - : cfs_(cfs), - bucket_name_(bucket_name), - object_path_(object_path), - dry_run_(dry_run), - file_number_grace_minutes_(file_number_grace_minutes) {} - - /** - * @brief Run a single purge cycle with improved file number checking - */ - void RunSinglePurgeCycle() { - PurgerCycleState state; - - LOG(INFO) << "[ImprovedPurger] Starting purge cycle for " << bucket_name_ << "/" << object_path_; - - if (!ListAllFiles(&state.all_files)) { - return; - } - - if (!ListCloudManifests(&state.cloud_manifest_files)) { - return; - } - - if (!LoadCloudManifests(state.cloud_manifest_files, &state.cloudmanifests)) { - return; - } - - if (!CollectLiveFiles(state.cloudmanifests, &state.live_file_names, - &state.current_epoch_manifest_files)) { - return; - } - 
- // NEW: Load file number thresholds from S3 - LoadFileNumberThresholds(state.cloudmanifests, &state.file_number_thresholds); - - // Enhanced selection with file number checking - SelectObsoleteFilesWithThreshold(state.all_files, state.live_file_names, - state.current_epoch_manifest_files, - state.file_number_thresholds, - &state.obsolete_files); - - if (dry_run_) { - LOG(INFO) << "[ImprovedPurger] DRY RUN: Would delete " << state.obsolete_files.size() << " files"; - for (const auto& file : state.obsolete_files) { - LOG(INFO) << "[ImprovedPurger] DRY RUN: Would delete " << file; - } - } else { - DeleteObsoleteFiles(state.obsolete_files); - } - - LOG(INFO) << "[ImprovedPurger] Purge cycle summary: total_files=" << state.all_files.size() - << " manifests=" << state.cloudmanifests.size() - << " live_files=" << state.live_file_names.size() - << " obsolete_selected=" << state.obsolete_files.size() - << " thresholds_loaded=" << state.file_number_thresholds.size(); - } - -private: - bool ListAllFiles(PurgerAllFiles* all_files) { - IOStatus s = cfs_->GetStorageProvider()->ListCloudObjects( - bucket_name_, object_path_, all_files); - - if (!s.ok()) { - LOG(ERROR) << "[ImprovedPurger] Failed to list files in " << object_path_ - << ": " << s.ToString(); - return false; - } - - LOG(INFO) << "[ImprovedPurger] Listed " << all_files->size() << " files"; - return true; - } - - bool ListCloudManifests(std::vector* cloud_manifest_files) { - IOStatus s = cfs_->GetStorageProvider()->ListCloudObjectsWithPrefix( - bucket_name_, object_path_, "CLOUDMANIFEST", cloud_manifest_files); - - if (!s.ok()) { - LOG(ERROR) << "[ImprovedPurger] Failed to list cloud manifests: " << s.ToString(); - return false; - } - - LOG(INFO) << "[ImprovedPurger] Found " << cloud_manifest_files->size() << " cloud manifest files"; - return true; - } - - bool LoadCloudManifests(const std::vector& cloud_manifest_files, - PurgerCloudManifestMap* manifests) { - const FileOptions file_opts; - IODebugContext* dbg = 
nullptr; - bool success = true; - - for (const auto& cloud_manifest_file : cloud_manifest_files) { - std::string full_path = object_path_ + "/" + cloud_manifest_file; - std::unique_ptr file; - - IOStatus s = cfs_->NewSequentialFileCloud(bucket_name_, full_path, file_opts, &file, dbg); - if (!s.ok()) { - LOG(ERROR) << "[ImprovedPurger] Failed to open manifest " << cloud_manifest_file - << ": " << s.ToString(); - success = false; - continue; - } - - std::unique_ptr cloud_manifest; - s = CloudManifest::LoadFromLog( - std::unique_ptr( - new SequentialFileReader(std::move(file), cloud_manifest_file)), - &cloud_manifest); - - if (!s.ok()) { - LOG(ERROR) << "[ImprovedPurger] Failed to load manifest " << cloud_manifest_file - << ": " << s.ToString(); - success = false; - continue; - } - - LOG(INFO) << "[ImprovedPurger] Loaded manifest " << cloud_manifest_file - << " with epoch " << cloud_manifest->GetCurrentEpoch(); - - (*manifests)[cloud_manifest_file] = std::move(cloud_manifest); - } - - return success; - } - - bool CollectLiveFiles(const PurgerCloudManifestMap& cloudmanifests, - PurgerLiveFileSet* live_files, - PurgerEpochManifestMap* epoch_manifest_infos) { - std::unique_ptr manifest_reader( - new ManifestReader(cfs_->info_log_, cfs_.get(), bucket_name_)); - - std::set live_file_numbers; - bool success = true; - - for (const auto& entry : cloudmanifests) { - const std::string& cloud_manifest_name = entry.first; - CloudManifest* cloud_manifest_ptr = entry.second.get(); - - live_file_numbers.clear(); - std::string current_epoch = cloud_manifest_ptr->GetCurrentEpoch(); - std::string manifest_file = ManifestFileWithEpoch(object_path_, current_epoch); - - CloudObjectInformation manifest_file_info; - IOStatus s = cfs_->GetStorageProvider()->GetCloudObjectMetadata( - bucket_name_, manifest_file, &manifest_file_info); - - if (!s.ok()) { - LOG(ERROR) << "[ImprovedPurger] Failed to get metadata for manifest " - << manifest_file << ": " << s.ToString(); - success = false; - 
continue; - } - - (*epoch_manifest_infos)[current_epoch] = manifest_file_info; - - s = manifest_reader->GetLiveFiles(object_path_, current_epoch, &live_file_numbers); - if (!s.ok()) { - LOG(ERROR) << "[ImprovedPurger] Failed to get live files from manifest " - << cloud_manifest_name << ": " << s.ToString(); - success = false; - continue; - } - - for (uint64_t num : live_file_numbers) { - std::string file_name = MakeTableFileName(num); - file_name = cfs_->RemapFilenameWithCloudManifest(file_name, cloud_manifest_ptr); - live_files->insert(file_name); - DLOG(INFO) << "[ImprovedPurger] Live file: " << file_name; - } - } - - return success; - } - - void LoadFileNumberThresholds(const PurgerCloudManifestMap& cloudmanifests, - PurgerFileNumberThresholds* thresholds) { - for (const auto& entry : cloudmanifests) { - CloudManifest* manifest = entry.second.get(); - std::string epoch = manifest->GetCurrentEpoch(); - - // Create S3 file number updater to read threshold - auto s3_updater = std::make_unique( - bucket_name_, object_path_, epoch, cfs_->GetStorageProvider()); - - uint64_t threshold = s3_updater->ReadSmallestFileNumber(); - (*thresholds)[epoch] = threshold; - - if (threshold == std::numeric_limits::max()) { - LOG(INFO) << "[ImprovedPurger] No file number threshold found for epoch " << epoch - << " (using conservative approach)"; - } else { - LOG(INFO) << "[ImprovedPurger] Loaded file number threshold " << threshold - << " for epoch " << epoch; - } - } - } - - void SelectObsoleteFilesWithThreshold(const PurgerAllFiles& all_files, - const PurgerLiveFileSet& live_files, - const PurgerEpochManifestMap& epoch_manifest_infos, - const PurgerFileNumberThresholds& thresholds, - std::vector* obsolete_files) { - for (const auto& candidate : all_files) { - const std::string& candidate_file_path = candidate.first; - const CloudObjectInformation& candidate_file_info = candidate.second; - - // Skip non-SST files - if (!ends_with(RemoveEpoch(candidate_file_path), ".sst")) { - 
continue; - } - - // Skip live files - if (live_files.find(candidate_file_path) != live_files.end()) { - continue; - } - - std::string candidate_epoch = GetEpoch(candidate_file_path); - uint64_t candidate_modification_time = candidate_file_info.modification_time; - - // Get manifest modification time - uint64_t manifest_modification_time = std::numeric_limits::max(); - auto epoch_it = epoch_manifest_infos.find(candidate_epoch); - if (epoch_it != epoch_manifest_infos.end()) { - manifest_modification_time = epoch_it->second.modification_time; - } - - // NEW: Check file number threshold - bool safe_by_file_number = true; - auto threshold_it = thresholds.find(candidate_epoch); - if (threshold_it != thresholds.end()) { - uint64_t threshold = threshold_it->second; - if (threshold != std::numeric_limits::max()) { - // Extract file number from candidate file name - uint64_t file_number = 0; - std::string base_name = RemoveEpoch(candidate_file_path); - if (ParseFileName(base_name, &file_number, nullptr)) { - if (file_number >= threshold) { - safe_by_file_number = false; - DLOG(INFO) << "[ImprovedPurger] File " << candidate_file_path - << " protected by file number threshold (file_num=" - << file_number << ", threshold=" << threshold << ")"; - } - } - } - } - - // Apply both time-based and file number-based checks - if (safe_by_file_number && candidate_modification_time < manifest_modification_time) { - obsolete_files->push_back(candidate_file_path); - DLOG(INFO) << "[ImprovedPurger] File " << candidate_file_path << " is obsolete"; - } else { - DLOG(INFO) << "[ImprovedPurger] File " << candidate_file_path - << " is protected (safe_by_file_number=" << safe_by_file_number - << ", candidate_time=" << candidate_modification_time - << ", manifest_time=" << manifest_modification_time << ")"; - } - } - } - - void DeleteObsoleteFiles(const std::vector& obsolete_files) { - size_t deleted = 0; - size_t failures = 0; - - for (const auto& file_to_delete : obsolete_files) { - std::string 
file_path = object_path_ + "/" + file_to_delete; - LOG(INFO) << "[ImprovedPurger] Deleting obsolete file " << file_to_delete; - - IOStatus s = cfs_->GetStorageProvider()->DeleteCloudObject(bucket_name_, file_path); - if (!s.ok()) { - ++failures; - LOG(ERROR) << "[ImprovedPurger] Failed to delete " << file_path - << ": " << s.ToString(); - } else { - ++deleted; - } - } - - LOG(INFO) << "[ImprovedPurger] Deletion summary: requested=" << obsolete_files.size() - << " deleted=" << deleted << " failures=" << failures; - } -}; - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char* argv[]) { - // Initialize gflags and glog - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - - // Set log level to INFO by default - FLAGS_logtostderr = 1; - FLAGS_minloglevel = 0; // INFO level - - if (FLAGS_s3_url.empty()) { - std::cerr << "Error: --s3_url is required\n"; - std::cerr << "Usage: " << argv[0] << " --s3_url=s3://bucket/path [options]\n"; - std::cerr << "Options:\n"; - std::cerr << " --purge_interval_seconds=300 Purge cycle interval (default: 5 minutes)\n"; - std::cerr << " --dry_run=false Dry run mode - don't actually delete files\n"; - std::cerr << " --aws_region=us-west-2 AWS region\n"; - std::cerr << " --file_number_grace_minutes=5 Grace period for file number threshold\n"; - return 1; - } - - std::string bucket_name, object_path; - if (!ROCKSDB_NAMESPACE::ParseS3Url(FLAGS_s3_url, &bucket_name, &object_path)) { - std::cerr << "Error: Invalid S3 URL format. Expected: s3://bucket/path\n"; - return 1; - } - - LOG(INFO) << "Starting improved purger for S3 URL: " << FLAGS_s3_url; - LOG(INFO) << "Parsed - Bucket: " << bucket_name << ", Object Path: " << object_path; - LOG(INFO) << "Configuration - Purge Interval: " << FLAGS_purge_interval_seconds - << "s, Dry Run: " << (FLAGS_dry_run ? 
"true" : "false") - << ", AWS Region: " << FLAGS_aws_region - << ", File Number Grace: " << FLAGS_file_number_grace_minutes << " minutes"; - - try { - // Create CloudFileSystemOptions - ROCKSDB_NAMESPACE::CloudFileSystemOptions cfs_options; - cfs_options.src_bucket.SetBucketName(bucket_name, ""); - cfs_options.src_bucket.SetRegion(FLAGS_aws_region); - cfs_options.dest_bucket = cfs_options.src_bucket; - cfs_options.dest_bucket.SetObjectPath(object_path); - cfs_options.purger_periodicity_millis = FLAGS_purge_interval_seconds * 1000; - - // Create CloudFileSystem - std::shared_ptr cloud_fs; - ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::CloudFileSystem::NewAwsFileSystem( - ROCKSDB_NAMESPACE::FileSystem::Default(), cfs_options, &cloud_fs); - - if (!s.ok()) { - LOG(FATAL) << "Failed to create CloudFileSystem: " << s.ToString(); - return 1; - } - - auto cfs_impl = std::dynamic_pointer_cast(cloud_fs); - if (!cfs_impl) { - LOG(FATAL) << "Failed to cast to CloudFileSystemImpl"; - return 1; - } - - // Create and run improved purger - ROCKSDB_NAMESPACE::ImprovedPurger purger( - cfs_impl, bucket_name, object_path, - FLAGS_dry_run, FLAGS_file_number_grace_minutes); - - // Run purge cycles - while (true) { - auto start_time = std::chrono::steady_clock::now(); - - purger.RunSinglePurgeCycle(); - - auto end_time = std::chrono::steady_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); - LOG(INFO) << "Purge cycle completed in " << duration.count() << "ms"; - - // Sleep until next cycle - std::this_thread::sleep_for(std::chrono::seconds(FLAGS_purge_interval_seconds)); - } - - } catch (const std::exception& e) { - LOG(FATAL) << "Exception: " << e.what(); - return 1; - } - - return 0; -} - diff --git a/eloq_data_store_service/purger_event_listener.cpp b/eloq_data_store_service/purger_event_listener.cpp index 6452a65..fe44e74 100644 --- a/eloq_data_store_service/purger_event_listener.cpp +++ b/eloq_data_store_service/purger_event_listener.cpp @@ -81,7 
+81,7 @@ void PurgerEventListener::OnFlushBegin( } // Update sliding window with current max file number - UpdateSlidingWindow(db); + UpdateSlidingWindow(db, flush_job_info.thread_id, flush_job_info.job_id); DLOG(INFO) << "[PurgerEventListener] OnFlushBegin processed for epoch " << epoch_ << ", job_id: " << flush_job_info.job_id; @@ -110,6 +110,12 @@ void PurgerEventListener::OnFlushCompleted( << ", epoch: " << epoch_; } + // Remove the entry from sliding window + if (sliding_window_) + { + sliding_window_->RemoveFileNumber(flush_job_info.thread_id, flush_job_info.job_id); + } + DLOG(INFO) << "[PurgerEventListener] OnFlushCompleted processed for epoch " << epoch_ << ", job_id: " << flush_job_info.job_id; } @@ -126,7 +132,7 @@ void PurgerEventListener::OnCompactionBegin( << ", epoch: " << epoch_; // Update sliding window with current max file number - UpdateSlidingWindow(db); + UpdateSlidingWindow(db, ci.thread_id, ci.job_id); DLOG(INFO) << "[PurgerEventListener] OnCompactionBegin processed for epoch " << epoch_ << ", job_id: " << ci.job_id; @@ -144,6 +150,12 @@ void PurgerEventListener::OnCompactionCompleted( << static_cast(ci.compaction_reason) << ", epoch: " << epoch_; + // Remove the entry from sliding window + if (sliding_window_) + { + sliding_window_->RemoveFileNumber(ci.thread_id, ci.job_id); + } + DLOG(INFO) << "[PurgerEventListener] OnCompactionCompleted processed for epoch " << epoch_ << ", job_id: " << ci.job_id; @@ -194,7 +206,7 @@ std::string PurgerEventListener::GetFlushReason( } } -void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db) +void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db, int thread_id, uint64_t job_id) { if (!db) { @@ -208,11 +220,12 @@ void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db) if (sliding_window_) { - sliding_window_->AddFileNumber(max_file_number); + sliding_window_->AddFileNumber(max_file_number, thread_id, job_id); } DLOG(INFO) << "[PurgerEventListener] Added file number to sliding window: 
" - << max_file_number << ", epoch: " << epoch_; + << max_file_number << ", thread_id: " << thread_id + << ", job_id: " << job_id << ", epoch: " << epoch_; } } // namespace EloqDS diff --git a/eloq_data_store_service/purger_event_listener.h b/eloq_data_store_service/purger_event_listener.h index d2bec63..900a1b7 100644 --- a/eloq_data_store_service/purger_event_listener.h +++ b/eloq_data_store_service/purger_event_listener.h @@ -121,8 +121,10 @@ class PurgerEventListener : public rocksdb::EventListener /** * @brief Update sliding window with current max file number from DB * @param db Pointer to the database instance + * @param thread_id The thread ID of the operation (default: 0) + * @param job_id The job ID of the operation (default: 0) */ - void UpdateSlidingWindow(rocksdb::DB* db); + void UpdateSlidingWindow(rocksdb::DB* db, int thread_id = 0, uint64_t job_id = 0); }; } // namespace EloqDS diff --git a/eloq_data_store_service/purger_prompt.md b/eloq_data_store_service/purger_prompt.md deleted file mode 100644 index 242fae4..0000000 --- a/eloq_data_store_service/purger_prompt.md +++ /dev/null @@ -1,65 +0,0 @@ -# This is a prompt file for LLM to generate a purger for RocksDBCloud S3 files - -## Background - -* The RocksDB has the Manifest file as the entry point of each DB instance. - - It contains the meta info of the DB files. - - It also contains a set of SST files. - -* The RocksDBCloud extends the RocksDB with the capability of store DB files in S3 storage. - - It also supports forking a DB instance from an existing DB instance. - - Each RocksDBCloud DB instance has a CLOUDMANIFEST file as the entry point of DB instance. - - The CLOUDMANIFEST file contains a list of epoch postfixed Manifest file which appeared in its forking history. - - The DB forking process can be describe as steps (assuming the both DB instances are in same S3 object path, e.g. s3://bucket_name/object_path/) - - Copy the CLOUDMANIFEST file from old CLOUDMANIFEST (e.g. 
CLOUDMANIFEST-0), by given a suffix to make distinguish between the two files, e.g. CLOUDMANIFEST-1. - - Generate a new epoch (e.g. 582107b0a928437c) for an new RocksDB Mainfest file as postfix (e.g. Manifest-${epoch}). - - Add the new Manifest file and the epoch into the CLOUDMANIFEST file as current epoch. - - Then, an new DB instance can be start from the new CLOUDMANIFEST file. - - All new SST file created by this new DB instance will be postfixed with the same epoch (e.g. 002434.sst-${epoch}). - -## Problem - -* Just like RocksDB, the SST files in RocksDBCloud can be deleted after compaction or the DB instance is removed. -* But, when the forking relation between RocksDBCloud DB instances are exists, we can not simplely delete those files, since they can be referenced by other DB instances. -* So, we have to prevent the ordinary SST file deletion happen in normal RocksDB, but introduce a purger program to perform the clean job. - -## The Purger - -* Existing algorithm already been implemented - - List all SST files under the object_path. - - List all CLOUDMANIFEST files under the object_path. - - Load all current Manifest files. - - Create a live file list by listing all live file number in each Manifest and find out - the correct epoch to form the complete SST file name (e.g. 002442.sst-582107b0a928437c). - - Create a candidate obsolete file list by find out all SST files which are under the - object_path but not in the live files list. - - There can be some newly created SST file not in the Manifest file, so we only treat - the candidate file older than the Manifest file as obsolete file. - - Above algorithm are implemented in the single_object_path_purger.cc, - please refer it for details. - -* The problem of the current implemented purger - - In RocksDB, the SST files are generated by memory table flush and SST compaction. - * They are executed in parallel by a group of background threads, every one can update the Manifest file. 
- * The Manifest file will be updated each time after a memory table flushed. - * The Manifest file will be updated at the end of each compaction round, and several SST files can be generated during compaction. - - So, the Manifest file can be updated before an new SST file were enlisted in the Manifest file, and the new SST file will be treated as obsolete. - -## The improvement of the purger algorithm - -* Base on the observation that all file numbers are monotone increased. -* We implement a rockdb::EventListener and subscribe the FlushBegin, CompactionBegin events -* When Flush and Compaction happens, we get the max file number from the db, and add to a time based slide window state -* At the end of the slide window, we find out the smaller file number in the state and update to a file in S3 object path. - e.g. smallest_new_file_number-${epoch} -* If there are no flush or compaction happens during the time based slide window, the update smallest_new_file_number-${epoch} file with UINT64_MAX -* When purger check the candidate obsolete SST file, if its file number is small than the file number in smallest_new_file_number-${epoch}, then - it can be treat as obsolete. - - -## Your tasks - -* Implement the EventListener. - - Reference the RocksDBEventLisener found in rocksdb_data_store_common.h -* Implement the time based slide window to update the smallest_new_file_number-${epoch} file. 
-* Implement a standalone purger program by referencing the single_object_path_purger.cc diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index b428c11..fbc1e40 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -134,16 +134,39 @@ SlidingWindow::~SlidingWindow() Stop(); } -void SlidingWindow::AddFileNumber(uint64_t file_number) +void SlidingWindow::AddFileNumber(uint64_t file_number, int thread_id, uint64_t job_id) { std::lock_guard lock(window_mutex_); - window_entries_.emplace_back(file_number); + std::string key = GenerateKey(thread_id, job_id); + window_entries_.emplace(key, WindowEntry(file_number)); DLOG(INFO) << "Added file number to sliding window: " << file_number + << ", thread_id: " << thread_id << ", job_id: " << job_id << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); } +void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) +{ + std::lock_guard lock(window_mutex_); + + std::string key = GenerateKey(thread_id, job_id); + auto it = window_entries_.find(key); + + if (it != window_entries_.end()) { + uint64_t removed_file_number = it->second.file_number; + window_entries_.erase(it); + + DLOG(INFO) << "Removed file number from sliding window: " << removed_file_number + << ", thread_id: " << thread_id << ", job_id: " << job_id + << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); + } else { + DLOG(WARNING) << "Attempted to remove non-existent entry from sliding window: " + << "thread_id: " << thread_id << ", job_id: " << job_id + << ", epoch: " << epoch_; + } +} + uint64_t SlidingWindow::GetSmallestFileNumber() { std::lock_guard lock(window_mutex_); @@ -152,10 +175,10 @@ uint64_t SlidingWindow::GetSmallestFileNumber() return std::numeric_limits::max(); } - uint64_t smallest = window_entries_[0].file_number; + uint64_t smallest = std::numeric_limits::max(); for 
(const auto& entry : window_entries_) { - if (entry.file_number < smallest) { - smallest = entry.file_number; + if (entry.second.file_number < smallest) { + smallest = entry.second.file_number; } } @@ -198,9 +221,6 @@ void SlidingWindow::TimerWorker() break; } - // Perform periodic tasks - CleanupExpiredEntries(); - // Release lock during S3 operation to avoid blocking AddFileNumber lock.unlock(); FlushToS3(); @@ -225,26 +245,11 @@ void SlidingWindow::FlushToS3() } } -void SlidingWindow::CleanupExpiredEntries() +std::string SlidingWindow::GenerateKey(int thread_id, uint64_t job_id) const { - // This function is called with window_mutex_ already locked - - auto now = std::chrono::steady_clock::now(); - auto cutoff_time = now - window_duration_; - - size_t original_size = window_entries_.size(); - - // Remove expired entries from the front of the deque - while (!window_entries_.empty() && - window_entries_.front().timestamp < cutoff_time) { - window_entries_.pop_front(); - } - - size_t removed = original_size - window_entries_.size(); - if (removed > 0) { - DLOG(INFO) << "Cleaned up " << removed << " expired entries from sliding window" - << ", epoch: " << epoch_ << ", remaining: " << window_entries_.size(); - } + std::ostringstream oss; + oss << thread_id << "-" << job_id; + return oss.str(); } } // namespace EloqDS diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h index d641d25..4dd2790 100644 --- a/eloq_data_store_service/purger_sliding_window.h +++ b/eloq_data_store_service/purger_sliding_window.h @@ -22,16 +22,16 @@ #pragma once +#include +#include + #include #include -#include #include #include #include #include - -#include -#include +#include namespace EloqDS { @@ -42,10 +42,11 @@ namespace EloqDS class S3FileNumberUpdater { public: - S3FileNumberUpdater(const std::string& bucket_name, - const std::string& s3_object_path, - const std::string& epoch, - std::shared_ptr storage_provider); + 
S3FileNumberUpdater( + const std::string &bucket_name, + const std::string &s3_object_path, + const std::string &epoch, + std::shared_ptr storage_provider); ~S3FileNumberUpdater() = default; @@ -76,7 +77,8 @@ class S3FileNumberUpdater }; /** - * @brief Time-based sliding window for tracking file numbers with automatic S3 updates + * @brief Time-based sliding window for tracking file numbers with automatic S3 + * updates */ class SlidingWindow { @@ -90,12 +92,13 @@ class SlidingWindow * @param s3_object_path S3 object path * @param storage_provider Cloud storage provider for S3 operations */ - SlidingWindow(std::chrono::milliseconds window_duration, - std::chrono::milliseconds s3_update_interval, - const std::string& epoch, - const std::string& bucket_name, - const std::string& s3_object_path, - std::shared_ptr storage_provider); + SlidingWindow( + std::chrono::milliseconds window_duration, + std::chrono::milliseconds s3_update_interval, + const std::string &epoch, + const std::string &bucket_name, + const std::string &s3_object_path, + std::shared_ptr storage_provider); /** * @brief Destructor - stops the timer thread @@ -105,8 +108,17 @@ class SlidingWindow /** * @brief Add a file number to the sliding window * @param file_number The file number to add + * @param thread_id The thread ID of the operation + * @param job_id The job ID of the operation */ - void AddFileNumber(uint64_t file_number); + void AddFileNumber(uint64_t file_number, int thread_id, uint64_t job_id); + + /** + * @brief Remove a file number entry from the sliding window + * @param thread_id The thread ID of the operation + * @param job_id The job ID of the operation + */ + void RemoveFileNumber(int thread_id, uint64_t job_id); /** * @brief Get the smallest file number in the current window @@ -126,10 +138,14 @@ class SlidingWindow std::chrono::steady_clock::time_point timestamp; WindowEntry(uint64_t num) - : file_number(num), timestamp(std::chrono::steady_clock::now()) {} + : file_number(num), + 
timestamp(std::chrono::steady_clock::now()) + { + } }; - std::deque window_entries_; + // Map of (thread_id + job_id) -> WindowEntry + std::unordered_map window_entries_; std::chrono::milliseconds window_duration_; std::chrono::milliseconds s3_update_interval_; std::string epoch_; @@ -153,10 +169,12 @@ class SlidingWindow void FlushToS3(); /** - * @brief Remove expired entries from the window + * @brief Generate a key string for the window_entries_ map + * @param thread_id The thread ID of the operation + * @param job_id The job ID of the operation + * @return A string key combining thread_id and job_id */ - void CleanupExpiredEntries(); + std::string GenerateKey(int thread_id, uint64_t job_id) const; }; -} // namespace EloqDS - +} // namespace EloqDS From 514863eff8b5c152c05a715de788de1584cf3da2 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Mon, 15 Sep 2025 19:41:55 +0800 Subject: [PATCH 04/15] Add event_listener --- .../purger_event_listener.cpp | 63 +++--- .../purger_event_listener.h | 7 +- .../purger_sliding_window.cpp | 203 +++++++++++++----- .../purger_sliding_window.h | 18 +- .../rocksdb_cloud_data_store.cpp | 33 ++- 5 files changed, 221 insertions(+), 103 deletions(-) diff --git a/eloq_data_store_service/purger_event_listener.cpp b/eloq_data_store_service/purger_event_listener.cpp index fe44e74..9ad283a 100644 --- a/eloq_data_store_service/purger_event_listener.cpp +++ b/eloq_data_store_service/purger_event_listener.cpp @@ -20,12 +20,12 @@ * */ -#include "purger_event_listener.h" - #include #include +#include "purger_event_listener.h" + namespace EloqDS { @@ -36,16 +36,16 @@ PurgerEventListener::PurgerEventListener( std::shared_ptr storage_provider, std::chrono::milliseconds window_duration, std::chrono::milliseconds s3_update_interval) - : epoch_(epoch), bucket_name_(bucket_name), s3_object_path_(s3_object_path) + : bucket_name_(bucket_name), s3_object_path_(s3_object_path) { sliding_window_ = std::make_unique(window_duration, s3_update_interval, - 
epoch_, + epoch, bucket_name_, s3_object_path_, storage_provider); - LOG(INFO) << "PurgerEventListener created for epoch " << epoch_ + LOG(INFO) << "PurgerEventListener created for epoch " << epoch << ", bucket: " << bucket_name_ << ", object_path: " << s3_object_path_ << ", window_duration: " << window_duration.count() << "ms" @@ -57,6 +57,19 @@ PurgerEventListener::~PurgerEventListener() Stop(); } +void PurgerEventListener::SetEpoch(const std::string &epoch) +{ + if (sliding_window_) + { + LOG(INFO) << "PurgerEventListener epoch updated from " + << (sliding_window_->GetEpoch().empty() + ? "empty" + : sliding_window_->GetEpoch()) + << " to " << epoch; + sliding_window_->SetEpoch(epoch); + } +} + void PurgerEventListener::OnFlushBegin( rocksdb::DB *db, const rocksdb::FlushJobInfo &flush_job_info) { @@ -76,15 +89,11 @@ void PurgerEventListener::OnFlushBegin( << ", smallest_seqno: " << flush_job_info.smallest_seqno << ", largest_seqno: " << flush_job_info.largest_seqno << ", flush_reason: " - << GetFlushReason(flush_job_info.flush_reason) - << ", epoch: " << epoch_; + << GetFlushReason(flush_job_info.flush_reason); } // Update sliding window with current max file number UpdateSlidingWindow(db, flush_job_info.thread_id, flush_job_info.job_id); - - DLOG(INFO) << "[PurgerEventListener] OnFlushBegin processed for epoch " - << epoch_ << ", job_id: " << flush_job_info.job_id; } void PurgerEventListener::OnFlushCompleted( @@ -106,18 +115,15 @@ void PurgerEventListener::OnFlushCompleted( << ", smallest_seqno: " << flush_job_info.smallest_seqno << ", largest_seqno: " << flush_job_info.largest_seqno << ", flush_reason: " - << GetFlushReason(flush_job_info.flush_reason) - << ", epoch: " << epoch_; + << GetFlushReason(flush_job_info.flush_reason); } // Remove the entry from sliding window if (sliding_window_) { - sliding_window_->RemoveFileNumber(flush_job_info.thread_id, flush_job_info.job_id); + sliding_window_->RemoveFileNumber(flush_job_info.thread_id, + 
flush_job_info.job_id); } - - DLOG(INFO) << "[PurgerEventListener] OnFlushCompleted processed for epoch " - << epoch_ << ", job_id: " << flush_job_info.job_id; } void PurgerEventListener::OnCompactionBegin( @@ -128,14 +134,10 @@ void PurgerEventListener::OnCompactionBegin( << ", output_level: " << ci.output_level << ", input_files_size: " << ci.input_files.size() << ", compaction_reason: " - << static_cast(ci.compaction_reason) - << ", epoch: " << epoch_; + << static_cast(ci.compaction_reason); // Update sliding window with current max file number UpdateSlidingWindow(db, ci.thread_id, ci.job_id); - - DLOG(INFO) << "[PurgerEventListener] OnCompactionBegin processed for epoch " - << epoch_ << ", job_id: " << ci.job_id; } void PurgerEventListener::OnCompactionCompleted( @@ -148,17 +150,13 @@ void PurgerEventListener::OnCompactionCompleted( << ", output_files_size: " << ci.output_files.size() << ", compaction_reason: " << static_cast(ci.compaction_reason) - << ", epoch: " << epoch_; + << ", epoch: " << sliding_window_->GetEpoch(); // Remove the entry from sliding window if (sliding_window_) { sliding_window_->RemoveFileNumber(ci.thread_id, ci.job_id); } - - DLOG(INFO) - << "[PurgerEventListener] OnCompactionCompleted processed for epoch " - << epoch_ << ", job_id: " << ci.job_id; } void PurgerEventListener::Stop() @@ -168,8 +166,6 @@ void PurgerEventListener::Stop() sliding_window_->Stop(); sliding_window_.reset(); } - - LOG(INFO) << "PurgerEventListener stopped for epoch " << epoch_; } std::string PurgerEventListener::GetFlushReason( @@ -206,12 +202,13 @@ std::string PurgerEventListener::GetFlushReason( } } -void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db, int thread_id, uint64_t job_id) +void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db, + int thread_id, + uint64_t job_id) { if (!db) { - LOG(ERROR) << "[PurgerEventListener] DB pointer is null for epoch " - << epoch_; + LOG(ERROR) << "[PurgerEventListener] DB pointer is null"; return; } @@ 
-222,10 +219,6 @@ void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db, int thread_id, ui { sliding_window_->AddFileNumber(max_file_number, thread_id, job_id); } - - DLOG(INFO) << "[PurgerEventListener] Added file number to sliding window: " - << max_file_number << ", thread_id: " << thread_id - << ", job_id: " << job_id << ", epoch: " << epoch_; } } // namespace EloqDS diff --git a/eloq_data_store_service/purger_event_listener.h b/eloq_data_store_service/purger_event_listener.h index 900a1b7..7fec5bf 100644 --- a/eloq_data_store_service/purger_event_listener.h +++ b/eloq_data_store_service/purger_event_listener.h @@ -67,6 +67,12 @@ class PurgerEventListener : public rocksdb::EventListener */ ~PurgerEventListener(); + /** + * @brief Update the epoch string + * @param epoch The new epoch string + */ + void SetEpoch(const std::string& epoch); + /** * @brief Called when a flush operation begins * @param db Pointer to the database instance @@ -112,7 +118,6 @@ class PurgerEventListener : public rocksdb::EventListener std::string GetFlushReason(rocksdb::FlushReason flush_reason); private: - std::string epoch_; std::string bucket_name_; std::string s3_object_path_; diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index fbc1e40..df5a723 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -20,67 +20,114 @@ * */ -#include "purger_sliding_window.h" - +#include +#include +#include +#include +#include + +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include + +#include "purger_sliding_window.h" namespace EloqDS { +using std::make_unique; // S3FileNumberUpdater implementation S3FileNumberUpdater::S3FileNumberUpdater( - const std::string& bucket_name, - const std::string& s3_object_path, - const std::string& epoch, + const std::string &bucket_name, + const std::string &s3_object_path, 
std::shared_ptr storage_provider) : bucket_name_(bucket_name), s3_object_path_(s3_object_path), - epoch_(epoch), storage_provider_(storage_provider) { } -void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number) +void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number, + const std::string &epoch) { std::string content = std::to_string(file_number); - std::string object_key = GetS3ObjectKey(); + std::string object_key = GetS3ObjectKey(epoch); + + // Write to temp local file at first, then upload to S3 + std::string time_id = std::to_string( + std::chrono::steady_clock::now().time_since_epoch().count()); + std::string temp_file_path = + "/tmp/smallest_file_number_" + epoch + "_" + time_id + ".txt"; + std::ofstream temp_file(temp_file_path); + if (!temp_file.is_open()) + { + LOG(ERROR) << "Failed to open temp file for writing: " + << temp_file_path; + return; + } + temp_file << content; + temp_file.close(); + // Now upload the temp file to S3 rocksdb::IOStatus s = storage_provider_->PutCloudObject( - bucket_name_, object_key, content); + temp_file_path, bucket_name_, object_key); - if (!s.ok()) { + if (!s.ok()) + { LOG(ERROR) << "Failed to update smallest file number to S3: " - << s.ToString() << ", object_key: " << object_key + << s.ToString() << " ,bucket_name: " << bucket_name_ + << ", object_key: " << object_key << ", file_number: " << file_number; - } else { - DLOG(INFO) << "Updated smallest file number in S3: " << file_number + } + else + { + DLOG(INFO) << "Updated smallest file number in S3: " + << " bucket_name: " << bucket_name_ + << ", file_number: " << file_number << ", object_key: " << object_key; } + + // Remove the temp file + if (std::remove(temp_file_path.c_str()) != 0) + { + LOG(WARNING) << "Failed to remove temp file: " << temp_file_path; + } } -uint64_t S3FileNumberUpdater::ReadSmallestFileNumber() +uint64_t S3FileNumberUpdater::ReadSmallestFileNumber(const std::string &epoch) { - std::string object_key = 
GetS3ObjectKey(); + std::string object_key = GetS3ObjectKey(epoch); std::string content; - rocksdb::IOStatus s = storage_provider_->GetCloudObject( - bucket_name_, object_key, content); + rocksdb::IOStatus s = + storage_provider_->GetCloudObject(bucket_name_, object_key, content); - if (!s.ok()) { + if (!s.ok()) + { DLOG(INFO) << "Failed to read smallest file number from S3: " << s.ToString() << ", object_key: " << object_key << ", returning UINT64_MIN"; return std::numeric_limits::min(); } - try { + try + { uint64_t file_number = std::stoull(content); DLOG(INFO) << "Read smallest file number from S3: " << file_number << ", object_key: " << object_key; return file_number; - } catch (const std::exception& e) { + } + catch (const std::exception &e) + { LOG(ERROR) << "Failed to parse smallest file number from S3 content: '" << content << "', error: " << e.what() << ", object_key: " << object_key; @@ -88,19 +135,20 @@ uint64_t S3FileNumberUpdater::ReadSmallestFileNumber() } } -void S3FileNumberUpdater::WriteNoActivityMarker() +void S3FileNumberUpdater::WriteNoActivityMarker(const std::string &epoch) { - UpdateSmallestFileNumber(std::numeric_limits::max()); + UpdateSmallestFileNumber(std::numeric_limits::max(), epoch); } -std::string S3FileNumberUpdater::GetS3ObjectKey() const +std::string S3FileNumberUpdater::GetS3ObjectKey(const std::string &epoch) const { std::ostringstream oss; oss << s3_object_path_; - if (!s3_object_path_.empty() && s3_object_path_.back() != '/') { + if (!s3_object_path_.empty() && s3_object_path_.back() != '/') + { oss << "/"; } - oss << "smallest_new_file_number-" << epoch_; + oss << "smallest_new_file_number-" << epoch; return oss.str(); } @@ -109,9 +157,9 @@ std::string S3FileNumberUpdater::GetS3ObjectKey() const SlidingWindow::SlidingWindow( std::chrono::milliseconds window_duration, std::chrono::milliseconds s3_update_interval, - const std::string& epoch, - const std::string& bucket_name, - const std::string& s3_object_path, + const 
std::string &epoch, + const std::string &bucket_name, + const std::string &s3_object_path, std::shared_ptr storage_provider) : window_duration_(window_duration), s3_update_interval_(s3_update_interval), @@ -119,14 +167,15 @@ SlidingWindow::SlidingWindow( should_stop_(false) { s3_updater_ = std::make_unique( - bucket_name, s3_object_path, epoch, storage_provider); + bucket_name, s3_object_path, storage_provider); // Start the timer thread - timer_thread_ = std::make_unique(&SlidingWindow::TimerWorker, this); + timer_thread_ = make_unique(&SlidingWindow::TimerWorker, this); DLOG(INFO) << "SlidingWindow started for epoch " << epoch_ << ", window_duration: " << window_duration_.count() << "ms" - << ", s3_update_interval: " << s3_update_interval_.count() << "ms"; + << ", s3_update_interval: " << s3_update_interval_.count() + << "ms"; } SlidingWindow::~SlidingWindow() @@ -134,7 +183,21 @@ SlidingWindow::~SlidingWindow() Stop(); } -void SlidingWindow::AddFileNumber(uint64_t file_number, int thread_id, uint64_t job_id) +void SlidingWindow::SetEpoch(const std::string &epoch) +{ + std::lock_guard lock(window_mutex_); + epoch_ = epoch; +} + +std::string SlidingWindow::GetEpoch() +{ + std::lock_guard lock(window_mutex_); + return epoch_; +} + +void SlidingWindow::AddFileNumber(uint64_t file_number, + int thread_id, + uint64_t job_id) { std::lock_guard lock(window_mutex_); @@ -143,7 +206,8 @@ void SlidingWindow::AddFileNumber(uint64_t file_number, int thread_id, uint64_t DLOG(INFO) << "Added file number to sliding window: " << file_number << ", thread_id: " << thread_id << ", job_id: " << job_id - << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); + << ", epoch: " << epoch_ + << ", window size: " << window_entries_.size(); } void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) @@ -153,17 +217,22 @@ void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) std::string key = GenerateKey(thread_id, job_id); auto it = 
window_entries_.find(key); - if (it != window_entries_.end()) { + if (it != window_entries_.end()) + { uint64_t removed_file_number = it->second.file_number; window_entries_.erase(it); - DLOG(INFO) << "Removed file number from sliding window: " << removed_file_number - << ", thread_id: " << thread_id << ", job_id: " << job_id - << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); - } else { - DLOG(WARNING) << "Attempted to remove non-existent entry from sliding window: " - << "thread_id: " << thread_id << ", job_id: " << job_id - << ", epoch: " << epoch_; + DLOG(INFO) << "Removed file number from sliding window: " + << removed_file_number << ", thread_id: " << thread_id + << ", job_id: " << job_id << ", epoch: " << epoch_ + << ", window size: " << window_entries_.size(); + } + else + { + DLOG(WARNING) + << "Attempted to remove non-existent entry from sliding window: " + << "thread_id: " << thread_id << ", job_id: " << job_id + << ", epoch: " << epoch_; } } @@ -171,19 +240,23 @@ uint64_t SlidingWindow::GetSmallestFileNumber() { std::lock_guard lock(window_mutex_); - if (window_entries_.empty()) { + if (window_entries_.empty()) + { return std::numeric_limits::max(); } uint64_t smallest = std::numeric_limits::max(); - for (const auto& entry : window_entries_) { - if (entry.second.file_number < smallest) { + for (const auto &entry : window_entries_) + { + if (entry.second.file_number < smallest) + { smallest = entry.second.file_number; } } DLOG(INFO) << "Current smallest file number: " << smallest - << ", epoch: " << epoch_ << ", window size: " << window_entries_.size(); + << ", epoch: " << epoch_ + << ", window size: " << window_entries_.size(); return smallest; } @@ -193,15 +266,17 @@ void SlidingWindow::Stop() // Signal the timer thread to stop { std::lock_guard lock(window_mutex_); - if (should_stop_) { - return; // Already stopped + if (should_stop_) + { + return; // Already stopped } should_stop_ = true; } cv_.notify_all(); // Wait for the 
timer thread to finish - if (timer_thread_ && timer_thread_->joinable()) { + if (timer_thread_ && timer_thread_->joinable()) + { timer_thread_->join(); timer_thread_.reset(); } @@ -213,14 +288,26 @@ void SlidingWindow::TimerWorker() { std::unique_lock lock(window_mutex_); - while (!should_stop_) { + while (!should_stop_) + { + DLOG(INFO) << "SlidingWindow timer tick for epoch " << epoch_; // Wait for the specified interval or stop signal - cv_.wait_for(lock, s3_update_interval_, [this] { return should_stop_; }); + cv_.wait_for( + lock, s3_update_interval_, [this] { return should_stop_; }); - if (should_stop_) { + if (should_stop_) + { break; } + // do not attempt S3 update if epoch is empty (indicates epoch is still + // being set) + if (epoch_.empty()) + { + continue; + } + DLOG(INFO) << "SlidingWindow timer processing for epoch " << epoch_; + // Release lock during S3 operation to avoid blocking AddFileNumber lock.unlock(); FlushToS3(); @@ -234,12 +321,15 @@ void SlidingWindow::FlushToS3() { uint64_t smallest = GetSmallestFileNumber(); - if (smallest == std::numeric_limits::max()) { + if (smallest == std::numeric_limits::max()) + { // No activity marker - s3_updater_->WriteNoActivityMarker(); + s3_updater_->WriteNoActivityMarker(epoch_); DLOG(INFO) << "Wrote no activity marker to S3 for epoch " << epoch_; - } else { - s3_updater_->UpdateSmallestFileNumber(smallest); + } + else + { + s3_updater_->UpdateSmallestFileNumber(smallest, epoch_); DLOG(INFO) << "Updated S3 with smallest file number: " << smallest << ", epoch: " << epoch_; } @@ -252,5 +342,4 @@ std::string SlidingWindow::GenerateKey(int thread_id, uint64_t job_id) const return oss.str(); } -} // namespace EloqDS - +} // namespace EloqDS diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h index 4dd2790..e9ce290 100644 --- a/eloq_data_store_service/purger_sliding_window.h +++ b/eloq_data_store_service/purger_sliding_window.h @@ -45,7 +45,6 @@ class 
S3FileNumberUpdater S3FileNumberUpdater( const std::string &bucket_name, const std::string &s3_object_path, - const std::string &epoch, std::shared_ptr storage_provider); ~S3FileNumberUpdater() = default; @@ -54,26 +53,26 @@ class S3FileNumberUpdater * @brief Update the smallest file number in S3 * @param file_number The smallest file number to write */ - void UpdateSmallestFileNumber(uint64_t file_number); + void UpdateSmallestFileNumber(uint64_t file_number, + const std::string &epoch); /** * @brief Read the smallest file number from S3 * @return The smallest file number, or UINT64_MAX if not found */ - uint64_t ReadSmallestFileNumber(); + uint64_t ReadSmallestFileNumber(const std::string &epoch); /** * @brief Write no activity marker (UINT64_MAX) to S3 */ - void WriteNoActivityMarker(); + void WriteNoActivityMarker(const std::string &epoch); private: std::string bucket_name_; std::string s3_object_path_; - std::string epoch_; std::shared_ptr storage_provider_; - std::string GetS3ObjectKey() const; + std::string GetS3ObjectKey(const std::string &epoch) const; }; /** @@ -105,6 +104,10 @@ class SlidingWindow */ ~SlidingWindow(); + void SetEpoch(const std::string &epoch); + + std::string GetEpoch(); + /** * @brief Add a file number to the sliding window * @param file_number The file number to add @@ -138,8 +141,7 @@ class SlidingWindow std::chrono::steady_clock::time_point timestamp; WindowEntry(uint64_t num) - : file_number(num), - timestamp(std::chrono::steady_clock::now()) + : file_number(num), timestamp(std::chrono::steady_clock::now()) { } }; diff --git a/eloq_data_store_service/rocksdb_cloud_data_store.cpp b/eloq_data_store_service/rocksdb_cloud_data_store.cpp index 263efb5..470056d 100644 --- a/eloq_data_store_service/rocksdb_cloud_data_store.cpp +++ b/eloq_data_store_service/rocksdb_cloud_data_store.cpp @@ -44,6 +44,8 @@ #include "data_store_service.h" #include "ds_request.pb.h" #include "internal_request.h" +#include "purger_event_listener.h" +#include
"rocksdb/cloud/cloud_file_system_impl.h" #include "rocksdb/cloud/cloud_storage_provider.h" #include "rocksdb_cloud_data_store.h" @@ -495,8 +497,6 @@ bool RocksDBCloudDataStore::OpenCloudDB( options.skip_stats_update_on_db_open = true; // Important! keep atomic_flush true, since we disabled WAL options.atomic_flush = true; - auto db_event_listener = std::make_shared(); - options.listeners.emplace_back(db_event_listener); // The following two configuration items are setup for purpose of removing // expired kv data items according to their ttl Rocksdb will compact all sst @@ -567,6 +567,23 @@ bool RocksDBCloudDataStore::OpenCloudDB( max_bytes_for_level_multiplier_; } + // Add event listener for purger + rocksdb::CloudFileSystemImpl *cfs_impl = + dynamic_cast(cloud_fs_.get()); + if (cfs_impl == nullptr) + { + LOG(ERROR) << "Fail to get CloudFileSystemImpl from cloud_fs_"; + return false; + } + std::string bucket_name = + cloud_config_.bucket_prefix_ + cloud_config_.bucket_name_; + auto db_event_listener = + std::make_shared("", + bucket_name, + cloud_config_.object_path_, + cfs_impl->GetStorageProvider()); + options.listeners.emplace_back(db_event_listener); + // set ttl compaction filter assert(ttl_compaction_filter_ == nullptr); ttl_compaction_filter_ = std::make_unique(); @@ -618,6 +635,18 @@ bool RocksDBCloudDataStore::OpenCloudDB( auto &cfs_options_ref = cfs->GetMutableCloudFileSystemOptions(); cfs_options_ref.skip_cloud_files_in_getchildren = false; + // set epoch for purger event listener + std::string current_epoch; + status = db_->GetCurrentEpoch(&current_epoch); + if (!status.ok()) + { + LOG(ERROR) << "Fail to get current epoch from db, error: " + << status.ToString(); + return false; + } + assert(!current_epoch.empty()); + db_event_listener->SetEpoch(current_epoch); + if (cloud_config_.warm_up_thread_num_ != 0) { db_->WarmUp(cloud_config_.warm_up_thread_num_); From bb08ed1ab95d1fd0ec9abbfbe0ab73db2b8b727b Mon Sep 17 00:00:00 2001 From: githubzilla Date: Tue, 16
Sep 2025 13:22:11 +0800 Subject: [PATCH 05/15] Rename smallest file number tmp file name --- eloq_data_store_service/purger_sliding_window.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index df5a723..f4728a7 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -66,7 +66,7 @@ void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number, std::string time_id = std::to_string( std::chrono::steady_clock::now().time_since_epoch().count()); std::string temp_file_path = - "/tmp/smallest_file_number_" + epoch + "_" + time_id + ".txt"; + "/tmp/smallest_file_number_" + epoch + "_" + time_id + "_upload.txt"; std::ofstream temp_file(temp_file_path); if (!temp_file.is_open()) { From e2354e367860837a738642a2d7e0bdff0415eb13 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Tue, 16 Sep 2025 17:31:15 +0800 Subject: [PATCH 06/15] Improve smallest file number update frequence --- .../purger_event_listener.cpp | 14 +- .../purger_event_listener.h | 60 +++++---- .../purger_sliding_window.cpp | 126 +++++++++--------- .../purger_sliding_window.h | 30 +++-- .../rocksdb_cloud_data_store.cpp | 8 ++ 5 files changed, 136 insertions(+), 102 deletions(-) diff --git a/eloq_data_store_service/purger_event_listener.cpp b/eloq_data_store_service/purger_event_listener.cpp index 9ad283a..cfaef2f 100644 --- a/eloq_data_store_service/purger_event_listener.cpp +++ b/eloq_data_store_service/purger_event_listener.cpp @@ -34,11 +34,11 @@ PurgerEventListener::PurgerEventListener( const std::string &bucket_name, const std::string &s3_object_path, std::shared_ptr storage_provider, - std::chrono::milliseconds window_duration, + std::chrono::milliseconds entry_duration, std::chrono::milliseconds s3_update_interval) : bucket_name_(bucket_name), s3_object_path_(s3_object_path) { - sliding_window_ = 
std::make_unique(window_duration, + sliding_window_ = std::make_unique(entry_duration, s3_update_interval, epoch, bucket_name_, @@ -48,7 +48,7 @@ PurgerEventListener::PurgerEventListener( LOG(INFO) << "PurgerEventListener created for epoch " << epoch << ", bucket: " << bucket_name_ << ", object_path: " << s3_object_path_ - << ", window_duration: " << window_duration.count() << "ms" + << ", window_duration: " << entry_duration.count() << "ms" << ", s3_update_interval: " << s3_update_interval.count() << "ms"; } @@ -70,6 +70,14 @@ void PurgerEventListener::SetEpoch(const std::string &epoch) } } +void PurgerEventListener::BlockPurger() +{ + if (sliding_window_) + { + sliding_window_->BlockPurger(); + } +} + void PurgerEventListener::OnFlushBegin( rocksdb::DB *db, const rocksdb::FlushJobInfo &flush_job_info) { diff --git a/eloq_data_store_service/purger_event_listener.h b/eloq_data_store_service/purger_event_listener.h index 7fec5bf..cc209f5 100644 --- a/eloq_data_store_service/purger_event_listener.h +++ b/eloq_data_store_service/purger_event_listener.h @@ -22,21 +22,22 @@ #pragma once +#include +#include +#include + #include #include #include -#include -#include -#include - #include "purger_sliding_window.h" namespace EloqDS { /** - * @brief Enhanced EventListener for tracking file numbers to improve purger safety + * @brief Enhanced EventListener for tracking file numbers to improve purger + * safety * * This listener subscribes to FlushBegin and CompactionBegin events to capture * the maximum file number at the time of these operations. 
The file numbers are @@ -52,15 +53,21 @@ class PurgerEventListener : public rocksdb::EventListener * @param bucket_name S3 bucket name * @param s3_object_path S3 object path * @param storage_provider Cloud storage provider for S3 operations - * @param window_duration Duration to keep entries in sliding window (default: 5 minutes) - * @param s3_update_interval Interval for updating S3 file (default: 1 minute) + * @param entry_duration Duration to keep entries in sliding window even it + * is deleted (default: 15 seconds, should be less than purger interval, + * indicating the minimum update frequency, it prevents too frequent + * updates) + * @param s3_update_interval Interval for updating S3 file (default: 30 + * seconds, indicating the maximum update frequency) */ - PurgerEventListener(const std::string& epoch, - const std::string& bucket_name, - const std::string& s3_object_path, - std::shared_ptr storage_provider, - std::chrono::milliseconds window_duration = std::chrono::minutes(5), - std::chrono::milliseconds s3_update_interval = std::chrono::minutes(1)); + PurgerEventListener( + const std::string &epoch, + const std::string &bucket_name, + const std::string &s3_object_path, + std::shared_ptr storage_provider, + std::chrono::milliseconds entry_duration = std::chrono::seconds(15), + std::chrono::milliseconds s3_update_interval = + std::chrono::seconds(30)); /** * @brief Destructor - stops the sliding window @@ -71,39 +78,41 @@ class PurgerEventListener : public rocksdb::EventListener * @brief Update the epoch string * @param epoch The new epoch string */ - void SetEpoch(const std::string& epoch); + void SetEpoch(const std::string &epoch); + + void BlockPurger(); /** * @brief Called when a flush operation begins * @param db Pointer to the database instance * @param flush_job_info Information about the flush operation */ - void OnFlushBegin(rocksdb::DB* db, - const rocksdb::FlushJobInfo& flush_job_info) override; + void OnFlushBegin(rocksdb::DB *db, + const 
rocksdb::FlushJobInfo &flush_job_info) override; /** * @brief Called when a flush operation completes * @param db Pointer to the database instance * @param flush_job_info Information about the flush operation */ - void OnFlushCompleted(rocksdb::DB* db, - const rocksdb::FlushJobInfo& flush_job_info) override; + void OnFlushCompleted(rocksdb::DB *db, + const rocksdb::FlushJobInfo &flush_job_info) override; /** * @brief Called when a compaction operation begins * @param db Pointer to the database instance * @param ci Information about the compaction operation */ - void OnCompactionBegin(rocksdb::DB* db, - const rocksdb::CompactionJobInfo& ci) override; + void OnCompactionBegin(rocksdb::DB *db, + const rocksdb::CompactionJobInfo &ci) override; /** * @brief Called when a compaction operation completes * @param db Pointer to the database instance * @param ci Information about the compaction operation */ - void OnCompactionCompleted(rocksdb::DB* db, - const rocksdb::CompactionJobInfo& ci) override; + void OnCompactionCompleted(rocksdb::DB *db, + const rocksdb::CompactionJobInfo &ci) override; /** * @brief Stop the event listener and cleanup resources @@ -129,8 +138,9 @@ class PurgerEventListener : public rocksdb::EventListener * @param thread_id The thread ID of the operation (default: 0) * @param job_id The job ID of the operation (default: 0) */ - void UpdateSlidingWindow(rocksdb::DB* db, int thread_id = 0, uint64_t job_id = 0); + void UpdateSlidingWindow(rocksdb::DB *db, + int thread_id = 0, + uint64_t job_id = 0); }; -} // namespace EloqDS - +} // namespace EloqDS diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index f4728a7..8df1e84 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -103,41 +103,10 @@ void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number, } } -uint64_t 
S3FileNumberUpdater::ReadSmallestFileNumber(const std::string &epoch) +void S3FileNumberUpdater::BlockPurger(const std::string &epoch) { - std::string object_key = GetS3ObjectKey(epoch); - std::string content; - - rocksdb::IOStatus s = - storage_provider_->GetCloudObject(bucket_name_, object_key, content); - - if (!s.ok()) - { - DLOG(INFO) << "Failed to read smallest file number from S3: " - << s.ToString() << ", object_key: " << object_key - << ", returning UINT64_MIN"; - return std::numeric_limits::min(); - } - - try - { - uint64_t file_number = std::stoull(content); - DLOG(INFO) << "Read smallest file number from S3: " << file_number - << ", object_key: " << object_key; - return file_number; - } - catch (const std::exception &e) - { - LOG(ERROR) << "Failed to parse smallest file number from S3 content: '" - << content << "', error: " << e.what() - << ", object_key: " << object_key; - return std::numeric_limits::min(); - } -} - -void S3FileNumberUpdater::WriteNoActivityMarker(const std::string &epoch) -{ - UpdateSmallestFileNumber(std::numeric_limits::max(), epoch); + DLOG(INFO) << "Wrote 0 as file number to S3 to block purger"; + UpdateSmallestFileNumber(std::numeric_limits::min(), epoch); } std::string S3FileNumberUpdater::GetS3ObjectKey(const std::string &epoch) const @@ -155,13 +124,13 @@ std::string S3FileNumberUpdater::GetS3ObjectKey(const std::string &epoch) const // SlidingWindow implementation SlidingWindow::SlidingWindow( - std::chrono::milliseconds window_duration, + std::chrono::milliseconds entry_duration, std::chrono::milliseconds s3_update_interval, const std::string &epoch, const std::string &bucket_name, const std::string &s3_object_path, std::shared_ptr storage_provider) - : window_duration_(window_duration), + : entry_duration_(entry_duration), s3_update_interval_(s3_update_interval), epoch_(epoch), should_stop_(false) @@ -173,7 +142,7 @@ SlidingWindow::SlidingWindow( timer_thread_ = make_unique(&SlidingWindow::TimerWorker, this); DLOG(INFO) << 
"SlidingWindow started for epoch " << epoch_ - << ", window_duration: " << window_duration_.count() << "ms" + << ", window_duration: " << entry_duration_.count() << "ms" << ", s3_update_interval: " << s3_update_interval_.count() << "ms"; } @@ -201,6 +170,18 @@ void SlidingWindow::AddFileNumber(uint64_t file_number, { std::lock_guard lock(window_mutex_); + uint64_t smallest = GetSmallestFileNumber(); + if (file_number < smallest) + { + DLOG(WARNING) << "New file number " << file_number + << " is smaller than current smallest " << smallest + << " update smallest file number in S3 immediately" + << ", thread_id: " << thread_id << ", job_id: " << job_id + << ", epoch: " << epoch_; + // The purger must seem the smallest file number before seeing + // any larger file number, so update S3 immediately + s3_updater_->UpdateSmallestFileNumber(file_number, epoch_); + } std::string key = GenerateKey(thread_id, job_id); window_entries_.emplace(key, WindowEntry(file_number)); @@ -219,8 +200,18 @@ void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) if (it != window_entries_.end()) { - uint64_t removed_file_number = it->second.file_number; - window_entries_.erase(it); + uint64_t removed_file_number = it->second.file_number_; + auto now = std::chrono::steady_clock::now(); + if (now - it->second.timestamp_ < entry_duration_) + { + // Entry is still within the duration window, only mark as deleted + it->second.deleted_ = true; + } + else + { + // Entry is expired, remove it + window_entries_.erase(it); + } DLOG(INFO) << "Removed file number from sliding window: " << removed_file_number << ", thread_id: " << thread_id @@ -238,20 +229,33 @@ void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) uint64_t SlidingWindow::GetSmallestFileNumber() { - std::lock_guard lock(window_mutex_); - if (window_entries_.empty()) { return std::numeric_limits::max(); } uint64_t smallest = std::numeric_limits::max(); - for (const auto &entry : window_entries_) + auto now = 
std::chrono::steady_clock::now(); + for (auto it = window_entries_.begin(); it != window_entries_.end();) { - if (entry.second.file_number < smallest) + // To avoid frequent S3 updates, do not remove entries that are + // marked deleted until they expire + if (it->second.deleted_) + { + if (now - it->second.timestamp_ >= entry_duration_) + { + // Entry is expired, remove it + it = window_entries_.erase(it); + continue; + } + } + + if (it->second.file_number_ < smallest) { - smallest = entry.second.file_number; + smallest = it->second.file_number_; } + + it++; } DLOG(INFO) << "Current smallest file number: " << smallest @@ -261,6 +265,18 @@ uint64_t SlidingWindow::GetSmallestFileNumber() return smallest; } +void SlidingWindow::BlockPurger() +{ + std::lock_guard lock(window_mutex_); + if (epoch_.empty()) + { + LOG(WARNING) + << "Cannot block purger, epoch is not set in sliding window"; + return; + } + s3_updater_->BlockPurger(epoch_); +} + void SlidingWindow::Stop() { // Signal the timer thread to stop @@ -308,31 +324,21 @@ void SlidingWindow::TimerWorker() } DLOG(INFO) << "SlidingWindow timer processing for epoch " << epoch_; + uint64_t smallest = GetSmallestFileNumber(); // Release lock during S3 operation to avoid blocking AddFileNumber lock.unlock(); - FlushToS3(); + FlushToS3(smallest); lock.lock(); } DLOG(INFO) << "SlidingWindow timer thread exiting for epoch " << epoch_; } -void SlidingWindow::FlushToS3() +void SlidingWindow::FlushToS3(uint64_t smallest) { - uint64_t smallest = GetSmallestFileNumber(); - - if (smallest == std::numeric_limits::max()) - { - // No activity marker - s3_updater_->WriteNoActivityMarker(epoch_); - DLOG(INFO) << "Wrote no activity marker to S3 for epoch " << epoch_; - } - else - { - s3_updater_->UpdateSmallestFileNumber(smallest, epoch_); - DLOG(INFO) << "Updated S3 with smallest file number: " << smallest - << ", epoch: " << epoch_; - } + s3_updater_->UpdateSmallestFileNumber(smallest, epoch_); + DLOG(INFO) << "Updated S3 with 
smallest file number: " << smallest + << ", epoch: " << epoch_; } std::string SlidingWindow::GenerateKey(int thread_id, uint64_t job_id) const diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h index e9ce290..d90ba45 100644 --- a/eloq_data_store_service/purger_sliding_window.h +++ b/eloq_data_store_service/purger_sliding_window.h @@ -57,15 +57,9 @@ class S3FileNumberUpdater const std::string &epoch); /** - * @brief Read the smallest file number from S3 - * @return The smallest file number, or UINT64_MAX if not found + * @brief Block purger temporarily */ - uint64_t ReadSmallestFileNumber(const std::string &epoch); - - /** - * @brief Write no activity marker (UINT64_MAX) to S3 - */ - void WriteNoActivityMarker(const std::string &epoch); + void BlockPurger(const std::string &epoch); private: std::string bucket_name_; @@ -92,7 +86,7 @@ class SlidingWindow * @param storage_provider Cloud storage provider for S3 operations */ SlidingWindow( - std::chrono::milliseconds window_duration, + std::chrono::milliseconds entry_duration, std::chrono::milliseconds s3_update_interval, const std::string &epoch, const std::string &bucket_name, @@ -129,6 +123,11 @@ class SlidingWindow */ uint64_t GetSmallestFileNumber(); + /** + * @brief Block purger temporarily + */ + void BlockPurger(); + /** * @brief Stop the sliding window and cleanup */ @@ -137,18 +136,21 @@ class SlidingWindow private: struct WindowEntry { - uint64_t file_number; - std::chrono::steady_clock::time_point timestamp; + uint64_t file_number_; + std::chrono::steady_clock::time_point timestamp_; + bool deleted_; WindowEntry(uint64_t num) - : file_number(num), timestamp(std::chrono::steady_clock::now()) + : file_number_(num), + timestamp_(std::chrono::steady_clock::now()), + deleted_(false) { } }; // Map of (thread_id + job_id) -> WindowEntry std::unordered_map window_entries_; - std::chrono::milliseconds window_duration_; + std::chrono::milliseconds 
entry_duration_; std::chrono::milliseconds s3_update_interval_; std::string epoch_; @@ -168,7 +170,7 @@ class SlidingWindow /** * @brief Flush current minimum file number to S3 */ - void FlushToS3(); + void FlushToS3(uint64_t smallest); /** * @brief Generate a key string for the window_entries_ map diff --git a/eloq_data_store_service/rocksdb_cloud_data_store.cpp b/eloq_data_store_service/rocksdb_cloud_data_store.cpp index 470056d..c09856f 100644 --- a/eloq_data_store_service/rocksdb_cloud_data_store.cpp +++ b/eloq_data_store_service/rocksdb_cloud_data_store.cpp @@ -635,6 +635,10 @@ bool RocksDBCloudDataStore::OpenCloudDB( auto &cfs_options_ref = cfs->GetMutableCloudFileSystemOptions(); cfs_options_ref.skip_cloud_files_in_getchildren = false; + // Stop background work - memtable flush and compaction + // before blocking purger + db_->PauseBackgroundWork(); + // set epoch for purger event listener std::string current_epoch; status = db_->GetCurrentEpoch(&current_epoch); if (!status.ok()) @@ -646,6 +650,10 @@ bool RocksDBCloudDataStore::OpenCloudDB( } assert(!current_epoch.empty()); db_event_listener->SetEpoch(current_epoch); + db_event_listener->BlockPurger(); + + // Resume background work + db_->ContinueBackgroundWork(); if (cloud_config_.warm_up_thread_num_ != 0) { From 7d787f63aa15639e57dec29473ac612271a5aab3 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Tue, 16 Sep 2025 17:38:47 +0800 Subject: [PATCH 07/15] Add explain about the empty epoch --- eloq_data_store_service/rocksdb_cloud_data_store.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eloq_data_store_service/rocksdb_cloud_data_store.cpp b/eloq_data_store_service/rocksdb_cloud_data_store.cpp index c09856f..1b656ff 100644 --- a/eloq_data_store_service/rocksdb_cloud_data_store.cpp +++ b/eloq_data_store_service/rocksdb_cloud_data_store.cpp @@ -578,7 +578,7 @@ bool RocksDBCloudDataStore::OpenCloudDB( std::string bucket_name = cloud_config_.bucket_prefix_ + cloud_config_.bucket_name_; auto db_event_listener = -
std::make_shared("", + std::make_shared("", /*We still don't know the epoch now*/ bucket_name, cloud_config_.object_path_, cfs_impl->GetStorageProvider()); From d342789c5e7a0708fcb6b1ff35a712d0073ddeba Mon Sep 17 00:00:00 2001 From: githubzilla Date: Tue, 16 Sep 2025 17:40:30 +0800 Subject: [PATCH 08/15] Remove single_object_path_purge.cc --- .../single_object_path_purge.cc | 382 ------------------ 1 file changed, 382 deletions(-) delete mode 100644 eloq_data_store_service/single_object_path_purge.cc diff --git a/eloq_data_store_service/single_object_path_purge.cc b/eloq_data_store_service/single_object_path_purge.cc deleted file mode 100644 index 0f45c99..0000000 --- a/eloq_data_store_service/single_object_path_purge.cc +++ /dev/null @@ -1,382 +0,0 @@ -// Copyright (c) 2017 Rockset. -#ifndef ROCKSDB_LITE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cloud/cloud_manifest.h" -#include "cloud/filename.h" -#include "cloud/manifest_reader.h" -#include "cloud/purge.h" -#include "file/filename.h" -#include "rocksdb/cloud/cloud_file_system_impl.h" -#include "rocksdb/cloud/cloud_storage_provider.h" - -/** - * Purger implementation for the single object path case. This refactored - * version separates the main loop (Purger) from a single purge cycle - * (RunSinglePurgeCycle) and uses helper methods for each logical step. - * - * Prerequisites for running the purger remain unchanged: it only runs when - * the source and destination buckets are either the same or not both valid - * with non-empty object paths that differ. If both are valid and differ, - * we skip running. 
- */ -namespace ROCKSDB_NAMESPACE { - -namespace { // anonymous namespace for refactored free helper utilities - -// Type aliases local to this translation unit (were previously in header) -using PurgerAllFiles = - std::vector>; -using PurgerCloudManifestMap = - std::unordered_map>; -using PurgerLiveFileSet = std::unordered_set; -using PurgerEpochManifestMap = - std::unordered_map; - -struct PurgerCycleState { - PurgerAllFiles all_files; // (name, metadata) - std::vector - cloud_manifest_files; // names of CLOUDMANIFEST* objects - PurgerCloudManifestMap cloudmanifests; // loaded cloud manifest objects - PurgerLiveFileSet live_file_names; // logical filenames considered live - PurgerEpochManifestMap - current_epoch_manifest_files; // epoch -> manifest file metadata - std::vector obsolete_files; // files selected for deletion -}; - -static bool PrerequisitesMet(const CloudFileSystemImpl &cfs) { - const CloudFileSystemOptions &cfs_opts = cfs.GetCloudFileSystemOptions(); - if (cfs_opts.src_bucket.IsValid() && - !cfs_opts.src_bucket.GetObjectPath().empty() && - cfs_opts.dest_bucket.IsValid() && - !cfs_opts.dest_bucket.GetObjectPath().empty() && - cfs_opts.src_bucket != cfs_opts.dest_bucket) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Single Object Path Purger is not running because the " - "prerequisites are not met."); - return false; - } - return true; -} - -static IOStatus ListAllFiles(CloudFileSystemImpl &cfs, - PurgerAllFiles *all_files) { - const std::string &dest_object_path = cfs.GetDestObjectPath(); - IOStatus s = cfs.GetStorageProvider()->ListCloudObjects( - cfs.GetDestBucketName(), dest_object_path, all_files); - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to list files in destination object path %s: %s", - dest_object_path.c_str(), s.ToString().c_str()); - } - - return s; -} - -static IOStatus ListCloudManifests( - CloudFileSystemImpl &cfs, std::vector *cloud_manifest_files) { - IOStatus s = 
cfs.GetStorageProvider()->ListCloudObjectsWithPrefix( - cfs.GetDestBucketName(), cfs.GetDestObjectPath(), "CLOUDMANIFEST", - cloud_manifest_files); - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to list cloud manifest files in bucket %s: %s", - cfs.GetDestBucketName().c_str(), s.ToString().c_str()); - } else { - for (const auto &f : *cloud_manifest_files) { - Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, - "[pg] Found cloud manifest file %s", f.c_str()); - } - } - return s; -} - -static IOStatus LoadCloudManifests( - CloudFileSystemImpl &cfs, - const std::vector &cloud_manifest_files, - PurgerCloudManifestMap *manifests) { - const FileOptions file_opts; - IODebugContext *dbg = nullptr; - IOStatus overall = IOStatus::OK(); - - for (const auto &cloud_manifest_file : cloud_manifest_files) { - std::string cloud_manifest_file_path = - cfs.GetDestObjectPath() + pathsep + cloud_manifest_file; - std::unique_ptr file; - IOStatus s = cfs.NewSequentialFileCloud(cfs.GetDestBucketName(), - cloud_manifest_file_path, file_opts, - &file, dbg); - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to open cloud manifest file %s: %s", - cloud_manifest_file.c_str(), s.ToString().c_str()); - if (overall.ok()) overall = s; - continue; - } - - std::unique_ptr cloud_manifest; - s = CloudManifest::LoadFromLog( - std::unique_ptr( - new SequentialFileReader(std::move(file), cloud_manifest_file)), - &cloud_manifest); - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to load cloud manifest from file %s: %s", - cloud_manifest_file.c_str(), s.ToString().c_str()); - if (overall.ok()) overall = s; - continue; - } - - Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, - "[pg] Loaded cloud manifest file %s with current epoch %s", - cloud_manifest_file.c_str(), cloud_manifest->GetCurrentEpoch().c_str()); - - (*manifests)[cloud_manifest_file] = std::move(cloud_manifest); - } - return overall; -} - -static IOStatus 
CollectLiveFiles(CloudFileSystemImpl &cfs, - const PurgerCloudManifestMap &cloudmanifests, - PurgerLiveFileSet *live_files, - PurgerEpochManifestMap *epoch_manifest_infos) { - const CloudFileSystemOptions &cfs_opts = cfs.GetCloudFileSystemOptions(); - const std::string &dest_object_path = cfs_opts.dest_bucket.GetObjectPath(); - IOStatus overall = IOStatus::OK(); - - std::unique_ptr manifest_reader(new ManifestReader( - cfs.info_log_, &cfs, cfs_opts.dest_bucket.GetBucketName())); - - std::set - live_file_numbers; // temporary container reused per manifest - - for (auto &entry : cloudmanifests) { - const auto &cloud_manifest_name = entry.first; - CloudManifest *cloud_manifest_ptr = entry.second.get(); - - live_file_numbers.clear(); - - std::string current_epoch = cloud_manifest_ptr->GetCurrentEpoch(); - auto manifest_file = ManifestFileWithEpoch(dest_object_path, current_epoch); - - CloudObjectInformation manifest_file_info; - IOStatus s = cfs.GetStorageProvider()->GetCloudObjectMetadata( - cfs.GetDestBucketName(), manifest_file, &manifest_file_info); - - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to get metadata for manifest file %s: %s", - manifest_file.c_str(), s.ToString().c_str()); - if (overall.ok()) overall = s; - continue; - } - - Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, - "[pg] Current epoch Manifest file %s of CloudManifest %s has size %lu " - "and content hash %s and timestamp %lu", - manifest_file.c_str(), cloud_manifest_name.c_str(), - manifest_file_info.size, manifest_file_info.content_hash.c_str(), - manifest_file_info.modification_time); - - (*epoch_manifest_infos)[current_epoch] = manifest_file_info; - - s = manifest_reader->GetLiveFiles(dest_object_path, current_epoch, - &live_file_numbers); - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to get live files from cloud manifest file %s: %s", - cloud_manifest_name.c_str(), s.ToString().c_str()); - if (overall.ok()) overall = s; - 
continue; - } - - for (const auto &num : live_file_numbers) { - std::string file_name = MakeTableFileName(num); - file_name = - cfs.RemapFilenameWithCloudManifest(file_name, cloud_manifest_ptr); - live_files->insert(file_name); - Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, - "[pg] Live file %s found in cloud manifest %s", file_name.c_str(), - cloud_manifest_name.c_str()); - } - } - return overall; -} - -static void SelectObsoleteFiles( - CloudFileSystemImpl &cfs, const PurgerAllFiles &all_files, - const PurgerLiveFileSet &live_files, - const PurgerEpochManifestMap &epoch_manifest_infos, - std::vector *obsolete_files) { - for (const auto &candidate : all_files) { - Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, - "[pg] Checking candidate file %s", candidate.first.c_str()); - const std::string &candidate_file_path = candidate.first; - - // Skip files that are not SST files - if (!ends_with(RemoveEpoch(candidate_file_path), ".sst")) { - continue; - } - - const std::string candidate_file_epoch = GetEpoch(candidate_file_path); - const CloudObjectInformation &candidate_file_info = candidate.second; - uint64_t candidate_modification_time = - candidate_file_info.modification_time; - - // Give max value to manifest modification time - // if the candidate file epoch is not current epoch - uint64_t manifest_modification_time = std::numeric_limits::max(); - auto it_epoch = epoch_manifest_infos.find(candidate_file_epoch); - if (it_epoch != epoch_manifest_infos.end()) { - manifest_modification_time = it_epoch->second.modification_time; - } - - if (live_files.find(candidate_file_path) != live_files.end()) { - continue; - } - - if (candidate_modification_time < manifest_modification_time) { - obsolete_files->push_back(candidate_file_path); - Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, - "[pg] Candidate file %s is obsolete and will be deleted", - candidate_file_path.c_str()); - } else { - Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, - "[pg] Candidate file %s is not obsolete 
because its modification " - "time %lu is later than the current epoch manifest file's " - "modification time %lu", - candidate_file_path.c_str(), candidate_modification_time, - manifest_modification_time); - } - } -} - -static void DeleteObsoleteFiles( - CloudFileSystemImpl &cfs, const std::vector &obsolete_files) { - const std::string &dest_object_path = cfs.GetDestObjectPath(); - size_t deleted = 0; - size_t failures = 0; - for (const auto &file_to_delete : obsolete_files) { - std::string file_path = dest_object_path + pathsep + file_to_delete; - Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, - "[pg] Deleting obsolete file %s from destination bucket", - file_to_delete.c_str()); - IOStatus s = cfs.GetStorageProvider()->DeleteCloudObject( - cfs.GetDestBucketName(), file_path); - if (!s.ok()) { - ++failures; - Log(InfoLogLevel::ERROR_LEVEL, cfs.info_log_, - "[pg] Failed to delete obsolete file %s: %s", file_path.c_str(), - s.ToString().c_str()); - } else { - ++deleted; - } - } - Log(InfoLogLevel::DEBUG_LEVEL, cfs.info_log_, - "[pg] Obsolete deletion summary: requested=%zu deleted=%zu failures=%zu", - obsolete_files.size(), deleted, failures); -} - -static void RunSinglePurgeCycle(CloudFileSystemImpl &cfs) { - PurgerCycleState state; // fresh state each cycle - - Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, - "[pg] Single Object Path Purger started a new cycle"); - - if (!ListAllFiles(cfs, &state.all_files).ok()) { - return; - } - - if (!ListCloudManifests(cfs, &state.cloud_manifest_files).ok()) { - return; - } - - if (!LoadCloudManifests(cfs, state.cloud_manifest_files, - &state.cloudmanifests) - .ok()) { - return; - } - - if (!CollectLiveFiles(cfs, state.cloudmanifests, &state.live_file_names, - &state.current_epoch_manifest_files) - .ok()) { - return; - } - - SelectObsoleteFiles(cfs, state.all_files, state.live_file_names, - state.current_epoch_manifest_files, - &state.obsolete_files); - - DeleteObsoleteFiles(cfs, state.obsolete_files); - - 
Log(InfoLogLevel::INFO_LEVEL, cfs.info_log_, - "[pg] Purge cycle summary: total_listed=%zu manifests_listed=%zu " - "manifests_loaded=%zu live_files=%zu obsolete_selected=%zu", - state.all_files.size(), state.cloud_manifest_files.size(), - state.cloudmanifests.size(), state.live_file_names.size(), - state.obsolete_files.size()); -} - -} // anonymous namespace - -// ------------- Main purger thread ------------- // - -void CloudFileSystemImpl::Purger() { - Log(InfoLogLevel::INFO_LEVEL, info_log_, - "[pg] Single Object Path Purger thread started"); - - if (!PrerequisitesMet(*this)) { - return; - } - - const auto periodicity_ms = - GetCloudFileSystemOptions().purger_periodicity_millis; - - while (true) { - // Wait for next cycle or termination request - std::unique_lock lk(purger_lock_); - purger_cv_.wait_for(lk, std::chrono::milliseconds(periodicity_ms), - [&]() { return !purger_is_running_; }); - if (!purger_is_running_) { - break; // shutdown requested - } - lk.unlock(); // release lock during IO work - - RunSinglePurgeCycle(*this); - } - - Log(InfoLogLevel::INFO_LEVEL, info_log_, - "[pg] Single Object Path Purger thread exiting"); -} - -IOStatus CloudFileSystemImpl::FindObsoleteFiles( - const std::string & /*bucket_name_prefix*/, - std::vector * /*pathnames*/) { - return IOStatus::NotSupported( - "Single Object Path Purger does not support FindObsoleteFiles"); -} -IOStatus CloudFileSystemImpl::FindObsoleteDbid( - const std::string & /*bucket_name_prefix*/, - std::vector * /*to_delete_list*/) { - return IOStatus::NotSupported( - "Single Object Path Purger does not support FindObsoleteDbid"); -} - -IOStatus CloudFileSystemImpl::extractParents( - const std::string & /*bucket_name_prefix*/, const DbidList & /*dbid_list*/, - DbidParents * /*parents*/) { - return IOStatus::NotSupported( - "Single Object Path Purger does not support extractParents"); -} - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE From 7a1125424c03abdd2926d38541702dce973945b9 Mon Sep 17 
00:00:00 2001 From: githubzilla Date: Wed, 17 Sep 2025 15:31:56 +0800 Subject: [PATCH 09/15] Using mktmp --- .../purger_sliding_window.cpp | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index 8df1e84..c19c542 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -62,21 +62,31 @@ void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number, std::string content = std::to_string(file_number); std::string object_key = GetS3ObjectKey(epoch); - // Write to temp local file at first, then upload to S3 - std::string time_id = std::to_string( - std::chrono::steady_clock::now().time_since_epoch().count()); - std::string temp_file_path = - "/tmp/smallest_file_number_" + epoch + "_" + time_id + "_upload.txt"; - std::ofstream temp_file(temp_file_path); - if (!temp_file.is_open()) + // Write to temp local file at first + char tmp_template[] = + "/tmp/smallest_file_number_upload_XXXXXX"; // Xs will be replaced + int fd = mkstemp(tmp_template); + if (fd == -1) { - LOG(ERROR) << "Failed to open temp file for writing: " - << temp_file_path; + LOG(ERROR) << "Failed to open temp file for writing: " << tmp_template; return; } - temp_file << content; - temp_file.close(); - // Now upload the temp file to S3 + + std::string temp_file_path = tmp_template; + + // write content to the temp file + if (write(fd, content.c_str(), content.size()) == -1) + { + LOG(ERROR) << "Failed to write to temp file: " << temp_file_path; + close(fd); + // Remove the temp file + if (std::remove(temp_file_path.c_str()) != 0) + { + LOG(WARNING) << "Failed to remove temp file: " << temp_file_path; + } + return; + } + close(fd); // We will open it later for reading rocksdb::IOStatus s = storage_provider_->PutCloudObject( temp_file_path, bucket_name_, object_key); From 
b810431498c6d52973968b4573366d0bd8657cc8 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Wed, 17 Sep 2025 16:38:44 +0800 Subject: [PATCH 10/15] Reflect review comments --- .../build_eloq_store.cmake | 4 +- .../purger_event_listener.cpp | 2 +- .../purger_event_listener.h | 2 +- .../purger_sliding_window.cpp | 58 ++++++++++++------- .../purger_sliding_window.h | 8 +++ .../rocksdb_cloud_data_store.cpp | 27 ++++++--- 6 files changed, 69 insertions(+), 32 deletions(-) diff --git a/eloq_data_store_service/build_eloq_store.cmake b/eloq_data_store_service/build_eloq_store.cmake index 8eb63d7..2854cfe 100644 --- a/eloq_data_store_service/build_eloq_store.cmake +++ b/eloq_data_store_service/build_eloq_store.cmake @@ -97,8 +97,8 @@ set(ELOQ_STORE_SOURCES ${ELOQ_STORE_SOURCE_DIR}/types.cpp ${ELOQ_STORE_SOURCE_DIR}/kv_options.cpp ${ELOQ_STORE_SOURCE_DIR}/eloqstore_module.cpp - purger_event_listener.cpp - purger_sliding_window.cpp) + ${ELOQ_STORE_SOURCE_DIR}/purger_event_listener.cpp + ${ELOQ_STORE_SOURCE_DIR}/purger_sliding_window.cpp) add_library(eloqstore STATIC ${ELOQ_STORE_SOURCES} ${INI_SOURCES}) diff --git a/eloq_data_store_service/purger_event_listener.cpp b/eloq_data_store_service/purger_event_listener.cpp index cfaef2f..b9154b6 100644 --- a/eloq_data_store_service/purger_event_listener.cpp +++ b/eloq_data_store_service/purger_event_listener.cpp @@ -211,7 +211,7 @@ std::string PurgerEventListener::GetFlushReason( } void PurgerEventListener::UpdateSlidingWindow(rocksdb::DB *db, - int thread_id, + uint64_t thread_id, uint64_t job_id) { if (!db) diff --git a/eloq_data_store_service/purger_event_listener.h b/eloq_data_store_service/purger_event_listener.h index cc209f5..0ca5c31 100644 --- a/eloq_data_store_service/purger_event_listener.h +++ b/eloq_data_store_service/purger_event_listener.h @@ -139,7 +139,7 @@ class PurgerEventListener : public rocksdb::EventListener * @param job_id The job ID of the operation (default: 0) */ void UpdateSlidingWindow(rocksdb::DB *db, - int 
thread_id = 0, + uint64_t thread_id = 0, uint64_t job_id = 0); }; diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index c19c542..fbd243c 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -88,6 +88,17 @@ void S3FileNumberUpdater::UpdateSmallestFileNumber(uint64_t file_number, } close(fd); // We will open it later for reading + if (!storage_provider_) + { + LOG(ERROR) << "Cloud storage provider is not initialized"; + // Remove the temp file + if (std::remove(temp_file_path.c_str()) != 0) + { + LOG(WARNING) << "Failed to remove temp file: " << temp_file_path; + } + return; + } + rocksdb::IOStatus s = storage_provider_->PutCloudObject( temp_file_path, bucket_name_, object_key); @@ -117,6 +128,8 @@ void S3FileNumberUpdater::BlockPurger(const std::string &epoch) { DLOG(INFO) << "Wrote 0 as file number to S3 to block purger"; UpdateSmallestFileNumber(std::numeric_limits::min(), epoch); + // Don't update last_published_smallest_ in sliding window, + // so that future smaller file number can still be updated } std::string S3FileNumberUpdater::GetS3ObjectKey(const std::string &epoch) const @@ -143,6 +156,7 @@ SlidingWindow::SlidingWindow( : entry_duration_(entry_duration), s3_update_interval_(s3_update_interval), epoch_(epoch), + last_published_smallest_(std::numeric_limits::max()), should_stop_(false) { s3_updater_ = std::make_unique( @@ -178,27 +192,31 @@ void SlidingWindow::AddFileNumber(uint64_t file_number, int thread_id, uint64_t job_id) { - std::lock_guard lock(window_mutex_); - - uint64_t smallest = GetSmallestFileNumber(); - if (file_number < smallest) + std::string epoch_copy; + bool do_immediate_update = false; { - DLOG(WARNING) << "New file number " << file_number - << " is smaller than current smallest " << smallest - << " update smallest file number in S3 immediately" - << ", thread_id: " << thread_id << ", job_id: " << 
job_id - << ", epoch: " << epoch_; - // The purger must seem the smallest file number before seeing - // any larger file number, so update S3 immediately - s3_updater_->UpdateSmallestFileNumber(file_number, epoch_); + std::lock_guard lock(window_mutex_); + do_immediate_update = + (file_number < last_published_smallest_) && !epoch_.empty(); + epoch_copy = epoch_; + std::string key = GenerateKey(thread_id, job_id); + window_entries_.emplace(key, WindowEntry(file_number)); + DLOG(INFO) << "Added file number to sliding window: " << file_number + << ", thread_id: " << thread_id << ", job_id: " << job_id + << ", epoch: " << epoch_ + << ", window size: " << window_entries_.size(); } - std::string key = GenerateKey(thread_id, job_id); - window_entries_.emplace(key, WindowEntry(file_number)); - DLOG(INFO) << "Added file number to sliding window: " << file_number - << ", thread_id: " << thread_id << ", job_id: " << job_id - << ", epoch: " << epoch_ - << ", window size: " << window_entries_.size(); + if (do_immediate_update) + { + DLOG(INFO) << "Immediate S3 update with smaller file number: " + << file_number << ", thread_id: " << thread_id + << ", job_id: " << job_id << ", epoch: " << epoch_copy; + std::lock_guard g(window_mutex_); + s3_updater_->UpdateSmallestFileNumber(file_number, epoch_copy); + last_published_smallest_ = + std::min(last_published_smallest_, file_number); + } } void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) @@ -335,10 +353,7 @@ void SlidingWindow::TimerWorker() DLOG(INFO) << "SlidingWindow timer processing for epoch " << epoch_; uint64_t smallest = GetSmallestFileNumber(); - // Release lock during S3 operation to avoid blocking AddFileNumber - lock.unlock(); FlushToS3(smallest); - lock.lock(); } DLOG(INFO) << "SlidingWindow timer thread exiting for epoch " << epoch_; @@ -347,6 +362,7 @@ void SlidingWindow::TimerWorker() void SlidingWindow::FlushToS3(uint64_t smallest) { s3_updater_->UpdateSmallestFileNumber(smallest, epoch_); + 
last_published_smallest_ = smallest; DLOG(INFO) << "Updated S3 with smallest file number: " << smallest << ", epoch: " << epoch_; } diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h index d90ba45..a5569d7 100644 --- a/eloq_data_store_service/purger_sliding_window.h +++ b/eloq_data_store_service/purger_sliding_window.h @@ -98,6 +98,11 @@ class SlidingWindow */ ~SlidingWindow(); + SlidingWindow(const SlidingWindow&) = delete; + SlidingWindow& operator=(const SlidingWindow&) = delete; + SlidingWindow(SlidingWindow&&) = delete; + SlidingWindow& operator=(SlidingWindow&&) = delete; + void SetEpoch(const std::string &epoch); std::string GetEpoch(); @@ -154,6 +159,9 @@ class SlidingWindow std::chrono::milliseconds s3_update_interval_; std::string epoch_; + // Last published smallest file number to avoid conflicting updates + uint64_t last_published_smallest_{std::numeric_limits::max()}; + std::unique_ptr s3_updater_; // Threading diff --git a/eloq_data_store_service/rocksdb_cloud_data_store.cpp b/eloq_data_store_service/rocksdb_cloud_data_store.cpp index 1b656ff..e9bc02d 100644 --- a/eloq_data_store_service/rocksdb_cloud_data_store.cpp +++ b/eloq_data_store_service/rocksdb_cloud_data_store.cpp @@ -577,11 +577,11 @@ bool RocksDBCloudDataStore::OpenCloudDB( } std::string bucket_name = cloud_config_.bucket_prefix_ + cloud_config_.bucket_name_; - auto db_event_listener = - std::make_shared("", /*We still don't know the epoch now*/ - bucket_name, - cloud_config_.object_path_, - cfs_impl->GetStorageProvider()); + auto db_event_listener = std::make_shared( + "", /*We still don't know the epoch now*/ + bucket_name, + cloud_config_.object_path_, + cfs_impl->GetStorageProvider()); options.listeners.emplace_back(db_event_listener); // set ttl compaction filter @@ -637,7 +637,14 @@ bool RocksDBCloudDataStore::OpenCloudDB( // Stop background work - memtable flush and compaction // before blocking purger - 
db_->PauseBackgroundWork(); + status = db_->PauseBackgroundWork(); + if (!status.ok()) + { + LOG(ERROR) << "Fail to pause background work, error: " + << status.ToString(); + db_->ContinueBackgroundWork(); + return false; + } // set epoch for purger event listener std::string current_epoch; @@ -646,9 +653,15 @@ bool RocksDBCloudDataStore::OpenCloudDB( { LOG(ERROR) << "Fail to get current epoch from db, error: " << status.ToString(); + db_->ContinueBackgroundWork(); + return false; + } + if (current_epoch.empty()) + { + LOG(ERROR) << "Current epoch from db is empty"; + db_->ContinueBackgroundWork(); return false; } - assert(!current_epoch.empty()); db_event_listener->SetEpoch(current_epoch); db_event_listener->BlockPurger(); From d399a2b8ec7baec931212d5df80aef1d3811ed34 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Wed, 17 Sep 2025 16:40:22 +0800 Subject: [PATCH 11/15] Update last_published_smallest_ --- eloq_data_store_service/purger_sliding_window.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index fbd243c..a112e6d 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -214,8 +214,7 @@ void SlidingWindow::AddFileNumber(uint64_t file_number, << ", job_id: " << job_id << ", epoch: " << epoch_copy; std::lock_guard g(window_mutex_); s3_updater_->UpdateSmallestFileNumber(file_number, epoch_copy); - last_published_smallest_ = - std::min(last_published_smallest_, file_number); + last_published_smallest_ = file_number; } } From 23d2af6fe7bc2fa28b87add7c82bf49521503304 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Wed, 17 Sep 2025 16:45:55 +0800 Subject: [PATCH 12/15] Update AddFileNumber --- .../purger_sliding_window.cpp | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/eloq_data_store_service/purger_sliding_window.cpp 
b/eloq_data_store_service/purger_sliding_window.cpp index a112e6d..c8e1d4b 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -192,30 +192,21 @@ void SlidingWindow::AddFileNumber(uint64_t file_number, int thread_id, uint64_t job_id) { - std::string epoch_copy; - bool do_immediate_update = false; - { - std::lock_guard lock(window_mutex_); - do_immediate_update = - (file_number < last_published_smallest_) && !epoch_.empty(); - epoch_copy = epoch_; - std::string key = GenerateKey(thread_id, job_id); - window_entries_.emplace(key, WindowEntry(file_number)); - DLOG(INFO) << "Added file number to sliding window: " << file_number - << ", thread_id: " << thread_id << ", job_id: " << job_id - << ", epoch: " << epoch_ - << ", window size: " << window_entries_.size(); - } - - if (do_immediate_update) + std::lock_guard lock(window_mutex_); + if (file_number < last_published_smallest_) { DLOG(INFO) << "Immediate S3 update with smaller file number: " << file_number << ", thread_id: " << thread_id - << ", job_id: " << job_id << ", epoch: " << epoch_copy; - std::lock_guard g(window_mutex_); - s3_updater_->UpdateSmallestFileNumber(file_number, epoch_copy); + << ", job_id: " << job_id << ", epoch: " << epoch_; + s3_updater_->UpdateSmallestFileNumber(file_number, epoch_); last_published_smallest_ = file_number; } + std::string key = GenerateKey(thread_id, job_id); + window_entries_.emplace(key, WindowEntry(file_number)); + DLOG(INFO) << "Added file number to sliding window: " << file_number + << ", thread_id: " << thread_id << ", job_id: " << job_id + << ", epoch: " << epoch_ + << ", window size: " << window_entries_.size(); } void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) From 3b245e19dc464a17ba5ca5363dd606c6ff4063b0 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Wed, 17 Sep 2025 17:05:09 +0800 Subject: [PATCH 13/15] Update thread_id type --- 
eloq_data_store_service/purger_sliding_window.cpp | 5 +++-- eloq_data_store_service/purger_sliding_window.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index c8e1d4b..c345e2b 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -189,7 +189,7 @@ std::string SlidingWindow::GetEpoch() } void SlidingWindow::AddFileNumber(uint64_t file_number, - int thread_id, + uint64_t thread_id, uint64_t job_id) { std::lock_guard lock(window_mutex_); @@ -357,7 +357,8 @@ void SlidingWindow::FlushToS3(uint64_t smallest) << ", epoch: " << epoch_; } -std::string SlidingWindow::GenerateKey(int thread_id, uint64_t job_id) const +std::string SlidingWindow::GenerateKey(uint64_t thread_id, + uint64_t job_id) const { std::ostringstream oss; oss << thread_id << "-" << job_id; diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h index a5569d7..926679c 100644 --- a/eloq_data_store_service/purger_sliding_window.h +++ b/eloq_data_store_service/purger_sliding_window.h @@ -113,7 +113,7 @@ class SlidingWindow * @param thread_id The thread ID of the operation * @param job_id The job ID of the operation */ - void AddFileNumber(uint64_t file_number, int thread_id, uint64_t job_id); + void AddFileNumber(uint64_t file_number, uint64_t thread_id, uint64_t job_id); /** * @brief Remove a file number entry from the sliding window @@ -186,7 +186,7 @@ class SlidingWindow * @param job_id The job ID of the operation * @return A string key combining thread_id and job_id */ - std::string GenerateKey(int thread_id, uint64_t job_id) const; + std::string GenerateKey(uint64_t thread_id, uint64_t job_id) const; }; } // namespace EloqDS From d0410a8a26912e257ca656ee676c4595402d231e Mon Sep 17 00:00:00 2001 From: githubzilla Date: Wed, 17 Sep 2025 17:31:38 +0800 Subject: 
[PATCH 14/15] Reflect comment --- eloq_data_store_service/purger_event_listener.cpp | 3 +-- eloq_data_store_service/purger_sliding_window.cpp | 6 ++++-- eloq_data_store_service/purger_sliding_window.h | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/eloq_data_store_service/purger_event_listener.cpp b/eloq_data_store_service/purger_event_listener.cpp index b9154b6..355cf9a 100644 --- a/eloq_data_store_service/purger_event_listener.cpp +++ b/eloq_data_store_service/purger_event_listener.cpp @@ -157,8 +157,7 @@ void PurgerEventListener::OnCompactionCompleted( << ", input_files_size: " << ci.input_files.size() << ", output_files_size: " << ci.output_files.size() << ", compaction_reason: " - << static_cast(ci.compaction_reason) - << ", epoch: " << sliding_window_->GetEpoch(); + << static_cast(ci.compaction_reason); // Remove the entry from sliding window if (sliding_window_) diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index c345e2b..b757b3c 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -28,7 +28,8 @@ #include #include -#include +#include +#include #include #include #include @@ -37,6 +38,7 @@ #include #include #include +#include #include "purger_sliding_window.h" @@ -209,7 +211,7 @@ void SlidingWindow::AddFileNumber(uint64_t file_number, << ", window size: " << window_entries_.size(); } -void SlidingWindow::RemoveFileNumber(int thread_id, uint64_t job_id) +void SlidingWindow::RemoveFileNumber(uint64_t thread_id, uint64_t job_id) { std::lock_guard lock(window_mutex_); diff --git a/eloq_data_store_service/purger_sliding_window.h b/eloq_data_store_service/purger_sliding_window.h index 926679c..fcbc2f6 100644 --- a/eloq_data_store_service/purger_sliding_window.h +++ b/eloq_data_store_service/purger_sliding_window.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -120,7 +121,7 @@ 
class SlidingWindow * @param thread_id The thread ID of the operation * @param job_id The job ID of the operation */ - void RemoveFileNumber(int thread_id, uint64_t job_id); + void RemoveFileNumber(uint64_t thread_id, uint64_t job_id); /** * @brief Get the smallest file number in the current window From 1b01a1eea2be574f466ec45529bf45fe6db475a5 Mon Sep 17 00:00:00 2001 From: githubzilla Date: Thu, 18 Sep 2025 12:29:58 +0800 Subject: [PATCH 15/15] Avoid immediately update smallest file number file --- .../purger_sliding_window.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/eloq_data_store_service/purger_sliding_window.cpp b/eloq_data_store_service/purger_sliding_window.cpp index b757b3c..6396b39 100644 --- a/eloq_data_store_service/purger_sliding_window.cpp +++ b/eloq_data_store_service/purger_sliding_window.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #include "purger_sliding_window.h" @@ -222,16 +222,13 @@ void SlidingWindow::RemoveFileNumber(uint64_t thread_id, uint64_t job_id) { uint64_t removed_file_number = it->second.file_number_; auto now = std::chrono::steady_clock::now(); - if (now - it->second.timestamp_ < entry_duration_) - { - // Entry is still within the duration window, only mark as deleted - it->second.deleted_ = true; - } - else - { - // Entry is expired, remove it - window_entries_.erase(it); - } + // Mark the entry as deleted, but do not remove it immediately, + // to avoid frequent S3 updates and to give the Manifest file + // some time to get updated. + // + // The entry will be removed when it expires in GetSmallestFileNumber() + it->second.deleted_ = true; + it->second.timestamp_ = now; DLOG(INFO) << "Removed file number from sliding window: " << removed_file_number << ", thread_id: " << thread_id