From 797284cdf02d2aec5c86ef38c72aeed4e770eb6e Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 08:13:14 +0000 Subject: [PATCH 1/9] Send out batchwrite req concurrently and wait all at once --- data_store_service_client.cpp | 112 ++++++++++++++++------------ data_store_service_client_closure.h | 5 +- 2 files changed, 68 insertions(+), 49 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index cffc0fe..e91f310 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -280,11 +280,13 @@ bool DataStoreServiceClient::PutAll( flush_task_entry_idx++; } - SyncCallbackData *sync_putall = sync_callback_data_pool_.NextObject(); + SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject(); PoolableGuard sync_putall_guard(sync_putall); + sync_putall->Reset(); uint16_t parts_cnt_per_key = 1; uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5; + uint32_t batch_cnt = 0; // Write data for hash_partitioned table for (auto part_it = hash_partitions_map.begin(); @@ -305,9 +307,18 @@ bool DataStoreServiceClient::PutAll( txservice::TxKey tx_key = ckpt_rec.Key(); // Start a new batch if done with current partition. 
- if (write_batch_size >= MAX_WRITE_BATCH_SIZE) + if (write_batch_size >= SyncPutAllData::max_flying_write_count) { - sync_putall->Reset(); + // Wait for in-flight requests to decrease if limit reached + { + std::unique_lock lk(sync_putall->mux_); + while (sync_putall->unfinished_request_cnt_ >= + SyncPutAllData::max_flying_write_count) + { + sync_putall->cv_.wait(lk); + } + } + BatchWriteRecords(kv_table_name, part_it->first, std::move(key_parts), @@ -317,19 +328,9 @@ bool DataStoreServiceClient::PutAll( std::move(op_types), true, sync_putall, - SyncCallback, + SyncPutAllCallback, parts_cnt_per_key, parts_cnt_per_record); - sync_putall->Wait(); - - if (sync_putall->Result().error_code() != - EloqDS::remote::DataStoreError::NO_ERROR) - { - LOG(WARNING) - << "DataStoreHandler: Failed to write batch."; - - return false; - } key_parts.clear(); record_parts.clear(); records_ts.clear(); @@ -341,6 +342,7 @@ bool DataStoreServiceClient::PutAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); write_batch_size = 0; + ++batch_cnt; } assert(ckpt_rec.payload_status_ == @@ -360,7 +362,6 @@ bool DataStoreServiceClient::PutAll( // Send out the last batch if (key_parts.size() > 0) { - sync_putall->Reset(); BatchWriteRecords(kv_table_name, part_it->first, std::move(key_parts), @@ -370,23 +371,16 @@ bool DataStoreServiceClient::PutAll( std::move(op_types), true, sync_putall, - SyncCallback, + SyncPutAllCallback, parts_cnt_per_key, parts_cnt_per_record); - sync_putall->Wait(); key_parts.clear(); record_parts.clear(); records_ts.clear(); records_ttl.clear(); op_types.clear(); write_batch_size = 0; - if (sync_putall->Result().error_code() != - EloqDS::remote::DataStoreError::NO_ERROR) - { - LOG(WARNING) << "DataStoreHandler: Failed to write batch."; - - return false; - } + ++batch_cnt; } } @@ -409,9 +403,21 @@ bool DataStoreServiceClient::PutAll( txservice::TxKey tx_key = ckpt_rec.Key(); // Start a new batch if done with current partition. 
- if (write_batch_size >= MAX_WRITE_BATCH_SIZE) + if (write_batch_size >= + SyncPutAllData::max_flying_write_count) { - sync_putall->Reset(); + // Wait for in-flight requests to decrease if limit + // reached + { + std::unique_lock lk( + sync_putall->mux_); + while (sync_putall->unfinished_request_cnt_ >= + SyncPutAllData::max_flying_write_count) + { + sync_putall->cv_.wait(lk); + } + } + BatchWriteRecords(kv_table_name, part_it->first, std::move(key_parts), @@ -421,19 +427,9 @@ bool DataStoreServiceClient::PutAll( std::move(op_types), true, sync_putall, - SyncCallback, + SyncPutAllCallback, parts_cnt_per_key, parts_cnt_per_record); - sync_putall->Wait(); - - if (sync_putall->Result().error_code() != - EloqDS::remote::DataStoreError::NO_ERROR) - { - LOG(WARNING) - << "DataStoreHandler: Failed to write batch."; - - return false; - } record_tmp_mem_area.clear(); key_parts.clear(); record_parts.clear(); @@ -446,6 +442,7 @@ bool DataStoreServiceClient::PutAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); write_batch_size = 0; + ++batch_cnt; } assert(ckpt_rec.payload_status_ == @@ -460,7 +457,6 @@ bool DataStoreServiceClient::PutAll( // Send out the last batch if (key_parts.size() > 0) { - sync_putall->Reset(); BatchWriteRecords(kv_table_name, part_it->first, std::move(key_parts), @@ -470,10 +466,9 @@ bool DataStoreServiceClient::PutAll( std::move(op_types), true, sync_putall, - SyncCallback, + SyncPutAllCallback, parts_cnt_per_key, parts_cnt_per_record); - sync_putall->Wait(); record_tmp_mem_area.clear(); key_parts.clear(); record_parts.clear(); @@ -481,17 +476,29 @@ bool DataStoreServiceClient::PutAll( records_ttl.clear(); op_types.clear(); write_batch_size = 0; - if (sync_putall->Result().error_code() != - EloqDS::remote::DataStoreError::NO_ERROR) - { - LOG(WARNING) - << "DataStoreHandler: Failed to write batch."; - - return false; - } + ++batch_cnt; } } } + + // Wait for all requests to complete + { + std::unique_lock lk(sync_putall->mux_); + 
sync_putall->unfinished_request_cnt_ += batch_cnt; + sync_putall->all_request_started_ = true; + while (sync_putall->unfinished_request_cnt_ != 0) + { + sync_putall->cv_.wait(lk); + } + } + + if (sync_putall->result_.error_code() != + remote::DataStoreError::NO_ERROR) + { + LOG(ERROR) << "PutAll failed for error: " + << sync_putall->result_.error_msg(); + return false; + } } return true; } @@ -1939,6 +1946,15 @@ bool DataStoreServiceClient::PutArchivesAll( // Start a new batch if done with current partition. if (write_batch_size >= MAX_WRITE_BATCH_SIZE) { + // Wait for in-flight requests to decrease if limit reached + { + std::unique_lock lk(sync_putall->mux_); + while (sync_putall->unfinished_request_cnt_ >= + SyncPutAllData::max_flying_write_count) + { + sync_putall->cv_.wait(lk); + } + } BatchWriteRecords(kv_mvcc_archive_name, partition_id, std::move(keys), diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index 9f8c203..d0d8ddd 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -99,6 +99,8 @@ struct SyncCallbackData : public Poolable struct SyncPutAllData : public Poolable { + static constexpr int32_t max_flying_write_count = 32; + void Reset() { unfinished_request_cnt_ = 0; @@ -123,7 +125,8 @@ struct SyncPutAllData : public Poolable } --unfinished_request_cnt_; - if (all_request_started_ && unfinished_request_cnt_ == 0) + if ((all_request_started_ && unfinished_request_cnt_ == 0) || + unfinished_request_cnt_ == max_flying_write_count - 1) { cv_.notify_one(); } From ea30ac22799ac163e2c3f57def6724ab7b49d829 Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 08:32:17 +0000 Subject: [PATCH 2/9] resolve comment --- data_store_service_client.cpp | 72 ++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index e91f310..94bf1e4 100644 --- a/data_store_service_client.cpp +++ 
b/data_store_service_client.cpp @@ -286,7 +286,6 @@ bool DataStoreServiceClient::PutAll( uint16_t parts_cnt_per_key = 1; uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5; - uint32_t batch_cnt = 0; // Write data for hash_partitioned table for (auto part_it = hash_partitions_map.begin(); @@ -307,18 +306,8 @@ bool DataStoreServiceClient::PutAll( txservice::TxKey tx_key = ckpt_rec.Key(); // Start a new batch if done with current partition. - if (write_batch_size >= SyncPutAllData::max_flying_write_count) + if (write_batch_size >= MAX_WRITE_BATCH_SIZE) { - // Wait for in-flight requests to decrease if limit reached - { - std::unique_lock lk(sync_putall->mux_); - while (sync_putall->unfinished_request_cnt_ >= - SyncPutAllData::max_flying_write_count) - { - sync_putall->cv_.wait(lk); - } - } - BatchWriteRecords(kv_table_name, part_it->first, std::move(key_parts), @@ -331,6 +320,16 @@ bool DataStoreServiceClient::PutAll( SyncPutAllCallback, parts_cnt_per_key, parts_cnt_per_record); + // Wait for in-flight requests to decrease if limit reached + { + std::unique_lock lk(sync_putall->mux_); + sync_putall->unfinished_request_cnt_++; + while (sync_putall->unfinished_request_cnt_ >= + SyncPutAllData::max_flying_write_count) + { + sync_putall->cv_.wait(lk); + } + } key_parts.clear(); record_parts.clear(); records_ts.clear(); @@ -342,7 +341,6 @@ bool DataStoreServiceClient::PutAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); write_batch_size = 0; - ++batch_cnt; } assert(ckpt_rec.payload_status_ == @@ -374,13 +372,16 @@ bool DataStoreServiceClient::PutAll( SyncPutAllCallback, parts_cnt_per_key, parts_cnt_per_record); + { + std::unique_lock lk(sync_putall->mux_); + sync_putall->unfinished_request_cnt_++; + } key_parts.clear(); record_parts.clear(); records_ts.clear(); records_ttl.clear(); op_types.clear(); write_batch_size = 0; - ++batch_cnt; } } @@ -403,21 +404,8 @@ bool DataStoreServiceClient::PutAll( txservice::TxKey tx_key = ckpt_rec.Key(); // 
Start a new batch if done with current partition. - if (write_batch_size >= - SyncPutAllData::max_flying_write_count) + if (write_batch_size >= MAX_WRITE_BATCH_SIZE) { - // Wait for in-flight requests to decrease if limit - // reached - { - std::unique_lock lk( - sync_putall->mux_); - while (sync_putall->unfinished_request_cnt_ >= - SyncPutAllData::max_flying_write_count) - { - sync_putall->cv_.wait(lk); - } - } - BatchWriteRecords(kv_table_name, part_it->first, std::move(key_parts), @@ -442,7 +430,18 @@ bool DataStoreServiceClient::PutAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); write_batch_size = 0; - ++batch_cnt; + // Wait for in-flight requests to decrease if limit + // reached + { + std::unique_lock lk( + sync_putall->mux_); + sync_putall->unfinished_request_cnt_++; + while (sync_putall->unfinished_request_cnt_ >= + SyncPutAllData::max_flying_write_count) + { + sync_putall->cv_.wait(lk); + } + } } assert(ckpt_rec.payload_status_ == @@ -476,7 +475,10 @@ bool DataStoreServiceClient::PutAll( records_ttl.clear(); op_types.clear(); write_batch_size = 0; - ++batch_cnt; + { + std::unique_lock lk(sync_putall->mux_); + sync_putall->unfinished_request_cnt_++; + } } } } @@ -484,7 +486,6 @@ bool DataStoreServiceClient::PutAll( // Wait for all requests to complete { std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_ += batch_cnt; sync_putall->all_request_started_ = true; while (sync_putall->unfinished_request_cnt_ != 0) { @@ -1932,7 +1933,6 @@ bool DataStoreServiceClient::PutArchivesAll( SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject(); PoolableGuard guard(sync_putall); sync_putall->Reset(); - uint32_t batch_cnt = 0; size_t recs_cnt = archive_ptrs.size(); keys.reserve(recs_cnt * parts_cnt_per_key); @@ -1949,6 +1949,7 @@ bool DataStoreServiceClient::PutArchivesAll( // Wait for in-flight requests to decrease if limit reached { std::unique_lock lk(sync_putall->mux_); + sync_putall->unfinished_request_cnt_++; 
while (sync_putall->unfinished_request_cnt_ >= SyncPutAllData::max_flying_write_count) { @@ -1979,7 +1980,6 @@ bool DataStoreServiceClient::PutArchivesAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); write_batch_size = 0; - ++batch_cnt; } txservice::FlushRecord &ckpt_rec = *archive_ptrs[i].second; @@ -2056,13 +2056,15 @@ bool DataStoreServiceClient::PutArchivesAll( records_ttl.reserve(recs_cnt); op_types.reserve(recs_cnt); write_batch_size = 0; - ++batch_cnt; + { + std::unique_lock lk(sync_putall->mux_); + sync_putall->unfinished_request_cnt_++; + } } // Wait the result. { std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_ += batch_cnt; sync_putall->all_request_started_ = true; while (sync_putall->unfinished_request_cnt_ != 0) { From 09a5b8bb1ba77e105ffcdf6b5e222988810e142d Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Wed, 17 Sep 2025 16:32:47 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`bat?= =?UTF-8?q?ch=5Fwrite`=20(#84)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @liunyl. * https://github.com/eloqdata/store_handler/pull/83#issuecomment-3301872873 The following files were modified: * `data_store_service_client.cpp` * `data_store_service_client_closure.h` Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- data_store_service_client.cpp | 43 +++++++++++++ data_store_service_client_closure.h | 93 ++++++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 2 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index 94bf1e4..b030d18 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -130,6 +130,29 @@ void DataStoreServiceClient::ScheduleTimerTasks() assert(false); } +/** + * @brief Batch-writes a set of flush tasks into KV tables. 
+ * + * Processes the provided flush tasks grouped by table and partition, serializes + * each record (object tables use raw encoded blobs; non-object tables encode + * tx-records with unpack info), and issues batched PUT/DELETE operations via + * BatchWriteRecords. Batches are emitted per KV-partition and sized according + * to MAX_WRITE_BATCH_SIZE; the method blocks as necessary to keep in-flight + * requests below SyncPutAllData::max_flying_write_count and waits for all + * dispatched requests to complete before returning. + * + * The function distinguishes hash- and range-partitioned tables, computes + * per-partition batches, and updates per-record timestamps/TTLs and operation + * types. Partial batches are flushed at partition boundaries. On any remote or + * batch-level error the function logs the failure and returns false. + * + * @param flush_task Mapping from KV table name to a vector of flush task + * entries containing the records to write. Each entry's + * data_sync_vec_ provides the sequence of records for that + * flush task. + * @return true if all batches completed successfully; false if any batch + * reported an error. + */ bool DataStoreServiceClient::PutAll( std::unordered_map>> @@ -1869,6 +1892,26 @@ void DataStoreServiceClient::DecodeArchiveValue( value_offset = pos; } +/** + * @brief Writes multiple MVCC archive records to the MVCC archive KV table in partitioned batches. + * + * Groups archive entries from the provided flush tasks by archive partition, serializes keys + * and values into batch write requests, and dispatches those requests (possibly concurrently) + * to the KV layer. Batches are split to respect MAX_WRITE_BATCH_SIZE and an internal limit on + * in-flight write requests; the method waits for all dispatched batches for each partition to + * complete before returning. + * + * Side effects: + * - Commits serialized archive records to kv_mvcc_archive_name with a default TTL of 1 day. 
+ * - Converts per-record commit timestamps to big-endian form as part of key encoding (the + * in-memory commit_ts field of those records is mutated during processing). + * + * @param flush_task Map from KV table name to a vector of FlushTaskEntry pointers whose + * archive vectors contain the FlushRecord entries to write. Only entries + * with non-empty archive vectors are processed. + * @return true if all batches for all partitions completed successfully; false if any batch + * failed (an error will be logged). + */ bool DataStoreServiceClient::PutArchivesAll( std::unordered_map>> diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index d0d8ddd..77af7b5 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -1,4 +1,3 @@ - /** * Copyright (C) 2025 EloqData Inc. * @@ -35,7 +34,97 @@ #include "data_store_service_scanner.h" #include "eloq_data_store_service/object_pool.h" -namespace EloqDS +/** + * Callback type invoked on completion of a datastore operation. + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). + * - client: reference to the DataStoreServiceClient that executed the operation. + * - result: operation result detail (error code/message and any operation-specific fields). + */ + + /** + * Synchronization helper used to wait for an asynchronous datastore operation to complete. + * + * Provides a mutex/condition variable pair and a CommonResult to store the outcome. + * Typical usage: Reset() before issuing the async operation, Notify() from the async + * completion callback, and Wait() from the waiting thread. HasError() reports whether + * the stored result represents an error other than NO_ERROR or KEY_NOT_FOUND. + */ + + /** + * Aggregation and flow-control helper for coordinating many concurrent put-all writes. 
+ * + * - unfinished_request_cnt_: signed count of outstanding write requests (must be signed). + * - all_request_started_: set to true once all requests have been launched. + * - max_flying_write_count: upper bound on concurrent in-flight writes (32). + * + * Finish(res) will merge the first non-NO_ERROR result into `result_`, decrement the + * unfinished request count, and notify a waiter when either: + * - all requests have been started and the unfinished count reaches zero, or + * - the unfinished count falls to (max_flying_write_count - 1), enabling flow control + * to allow launching further requests while keeping in-flight writes bounded. + */ + + /** + * Generic synchronous callback adapter invoked by closures to signal completion. + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). + * - client: reference to the DataStoreServiceClient that executed the operation. + * - result: operation result detail (error code/message and any operation-specific fields). + */ + + /** + * Shared helper used when reading archived records concurrently. + * + * Holds references to external synchronization primitives and counters: + * - mtx_, cv_: external mutex and condition variable used to guard flying_read_cnt_. + * - flying_read_cnt_: reference to the shared in-flight read counter. + * - error_code_: reference to an integer used to capture the first observed error. + * + * Also stores the most recent read result (partition_id_, key_str_, value_str_, ts_, ttl_). + * Thread-safe: methods that mutate or read shared resources acquire the provided mutex. + */ + + /** + * Callback invoked for batch archive reads to aggregate or forward results. + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). 
+ * - client: reference to the DataStoreServiceClient that executed the operation. + * - result: operation result detail (error code/message and any operation-specific fields). + */ + + /** + * Callback invoked to load a range slice (archive or otherwise). + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). + * - client: reference to the DataStoreServiceClient that executed the operation. + * - result: operation result detail (error code/message and any operation-specific fields). + */ + + /** + * Closure implementing a datastore Read operation supporting both local and remote paths. + * + * Use Reset(...) to configure a read (table, partition, key, client, and callback), then: + * - PrepareRequest(is_local): prepare an RPC request if remote, or clear local result for local reads. + * - Run(): executed when an RPC completes (or when local processing is finished). Run() + * handles RPC failures with retry logic, translates NOT_OWNER into sharding handling + * and potential retry, and finally invokes the user callback with the CommonResult. + * + * Accessors provide access to the brpc::Controller, request/response objects, table/partition/key, + * and local-result fields (value, ts, ttl, result). Value accessors return either the local + * in-memory values or the response's values depending on the request mode. + * + * Note: retry behavior is governed by the associated DataStoreServiceClient retry_limit_. 
+ */ + namespace EloqDS { typedef void (*DataStoreCallback)(void *data, ::google::protobuf::Closure *closure, From 13cae07e402651da24d1c61167ca715fff7be9f8 Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 08:52:28 +0000 Subject: [PATCH 4/9] add docstring --- data_store_service_client.cpp | 461 +++++++++++++++++++++++++++- data_store_service_client_closure.h | 339 ++++++++++++++------ 2 files changed, 702 insertions(+), 98 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index b030d18..471e2ba 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -92,6 +92,15 @@ DataStoreServiceClient::~DataStoreServiceClient() upsert_table_worker_.Shutdown(); } +/** + * @brief Configures the data store service client with cluster manager information. + * + * Initializes the client with cluster configuration including node hostnames and ports. + * Logs all node information for debugging purposes and stores the cluster manager + * reference for future use. + * + * @param cluster_manager Reference to the cluster manager containing shard and node information. + */ void DataStoreServiceClient::SetupConfig( const DataStoreServiceClusterManager &cluster_manager) { @@ -106,6 +115,15 @@ void DataStoreServiceClient::SetupConfig( cluster_manager_ = cluster_manager; } +/** + * @brief Establishes connection to the data store service. + * + * Attempts to connect to the data store service with retry logic. Initializes + * pre-built tables and retries up to 5 times with 1-second delays between attempts. + * Returns true if connection succeeds, false otherwise. + * + * @return true if connection is successful, false if all retry attempts fail. + */ bool DataStoreServiceClient::Connect() { bool succeed = false; @@ -124,6 +142,13 @@ bool DataStoreServiceClient::Connect() return succeed; } +/** + * @brief Schedules timer-based tasks for the data store service. + * + * Currently not implemented. 
This method is a placeholder for future timer-based + * functionality such as periodic cleanup, health checks, or maintenance tasks. + * Will assert and log an error if called. + */ void DataStoreServiceClient::ScheduleTimerTasks() { LOG(ERROR) << "ScheduleTimerTasks not implemented"; @@ -527,6 +552,16 @@ bool DataStoreServiceClient::PutAll( return true; } +/** + * @brief Persists data from specified KV tables to storage. + * + * Flushes data from the provided KV table names to persistent storage using + * asynchronous flush operations. Waits for completion and returns success/failure + * status. Logs warnings on failure and debug info on success. + * + * @param kv_table_names Vector of KV table names to persist. + * @return true if all tables are persisted successfully, false if any operation fails. + */ bool DataStoreServiceClient::PersistKV( const std::vector &kv_table_names) { @@ -548,6 +583,26 @@ bool DataStoreServiceClient::PersistKV( return true; } +/** + * @brief Upserts table schema information to the data store. + * + * Handles table creation, modification, and deletion operations by updating + * table schema information in the data store. Validates leadership, processes + * the operation asynchronously, and sets appropriate error codes on failure. + * Supports various operation types including CREATE, DROP, and ALTER operations. + * + * @param old_table_schema Pointer to the existing table schema (nullptr for CREATE). + * @param new_table_schema Pointer to the new table schema. + * @param op_type Type of operation (CREATE, DROP, ALTER, etc.). + * @param commit_ts Commit timestamp for the operation. + * @param ng_id Node group ID for the operation. + * @param tx_term Transaction term for consistency. + * @param hd_res Handler result object to store operation outcome. + * @param alter_table_info Information about table alterations (nullptr if not applicable). + * @param cc_req CC request base object. + * @param ccs CC shard reference. 
+ * @param err_code Error code output parameter. + */ void DataStoreServiceClient::UpsertTable( const txservice::TableSchema *old_table_schema, const txservice::TableSchema *new_table_schema, @@ -598,6 +653,16 @@ void DataStoreServiceClient::UpsertTable( { this->UpsertTable(table_data); }); } +/** + * @brief Fetches table catalog information from the data store. + * + * Retrieves catalog information for the specified table by reading from the + * KV table catalogs storage. Uses partition ID 0 and the catalog name as the key. + * The operation is performed asynchronously with a callback for completion handling. + * + * @param ccm_table_name The table name to fetch catalog information for. + * @param fetch_cc Fetch catalog CC object to store the result and handle completion. + */ void DataStoreServiceClient::FetchTableCatalog( const txservice::TableName &ccm_table_name, txservice::FetchCatalogCc *fetch_cc) @@ -611,6 +676,16 @@ void DataStoreServiceClient::FetchTableCatalog( &FetchTableCatalogCallback); } +/** + * @brief Fetches current table statistics from the data store. + * + * Retrieves the current version of table statistics for the specified table. + * Determines the appropriate KV partition ID and reads from the table statistics + * version storage. The operation is performed asynchronously with callback handling. + * + * @param ccm_table_name The table name to fetch statistics for. + * @param fetch_cc Fetch table statistics CC object to store the result and handle completion. + */ void DataStoreServiceClient::FetchCurrentTableStatistics( const txservice::TableName &ccm_table_name, txservice::FetchTableStatisticsCc *fetch_cc) @@ -626,6 +701,17 @@ void DataStoreServiceClient::FetchCurrentTableStatistics( &FetchCurrentTableStatsCallback); } +/** + * @brief Fetches table statistics for a specific version from the data store. + * + * Retrieves table statistics for a specific version by constructing key ranges + * based on the table name and version number. 
Clears previous key ranges and + * session information, then constructs start and end keys for the version-specific + * statistics. The operation is performed asynchronously with callback handling. + * + * @param ccm_table_name The table name to fetch statistics for. + * @param fetch_cc Fetch table statistics CC object containing version information and result storage. + */ void DataStoreServiceClient::FetchTableStatistics( const txservice::TableName &ccm_table_name, txservice::FetchTableStatisticsCc *fetch_cc) @@ -698,6 +784,19 @@ std::string EncodeTableStatsKey(const txservice::TableName &base_table_name, return key; } +/** + * @brief Upserts table statistics to the data store. + * + * Stores table statistics by splitting sample keys into segments and writing them + * to the KV storage. Each segment contains index type, record count, and sample keys. + * Also updates the checkpoint version for the table statistics. Uses batch write + * operations for efficiency and handles both local and remote storage paths. + * + * @param ccm_table_name The table name to store statistics for. + * @param sample_pool_map Map of index names to sample pools containing record counts and sample keys. + * @param version The version number for the statistics. + * @return true if all statistics are stored successfully, false if any operation fails. + */ bool DataStoreServiceClient::UpsertTableStatistics( const txservice::TableName &ccm_table_name, const std::unordered_map DataStoreServiceClient::ScanForward( const txservice::TableName &table_name, @@ -1139,6 +1305,19 @@ std::string DataStoreServiceClient::EncodeRangeKey( return key; } +/** + * @brief Encodes range information into a binary value format. + * + * Serializes range metadata including range ID, range version, general version, + * and segment count into a binary string format for storage in the KV system. + * Uses little-endian encoding for all numeric values. + * + * @param range_id The range identifier. 
+ * @param range_version The version of the range. + * @param version The general version number. + * @param segment_cnt The number of segments in the range. + * @return Binary string containing the encoded range value. + */ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, uint64_t range_version, uint64_t version, @@ -1159,6 +1338,18 @@ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, return kv_range_record; } +/** + * @brief Encodes a range slice key for storage in the KV system. + * + * Creates a composite key by combining table name, range ID, and segment ID. + * Uses little-endian encoding for numeric values since range slice operations + * are point reads rather than scans, optimizing for direct key lookup performance. + * + * @param table_name The table name for the range slice. + * @param range_id The range identifier. + * @param segment_id The segment identifier within the range. + * @return Binary string containing the encoded range slice key. + */ std::string DataStoreServiceClient::EncodeRangeSliceKey( const txservice::TableName &table_name, int32_t range_id, @@ -1176,7 +1367,16 @@ std::string DataStoreServiceClient::EncodeRangeSliceKey( return key; } -// Replace the segment_id in range_slice_key +/** + * @brief Updates the segment ID in an encoded range slice key. + * + * Modifies an existing range slice key by replacing the segment ID portion + * with a new segment ID value. This is used for updating range slice keys + * without recreating the entire key structure. + * + * @param range_slice_key The range slice key to update (modified in place). + * @param new_segment_id The new segment ID to use. + */ void DataStoreServiceClient::UpdateEncodedRangeSliceKey( std::string &range_slice_key, uint32_t new_segment_id) { @@ -1186,6 +1386,23 @@ void DataStoreServiceClient::UpdateEncodedRangeSliceKey( sizeof(new_segment_id)); } +/** + * @brief Updates range slices for a table partition. 
+ * + * Stores range slice information by segmenting the slices into manageable chunks + * and writing them to the KV storage system. Handles slice serialization with + * proper key encoding and batch size management. Also updates the range information + * with the new version and segment count. Uses both local and remote storage paths + * based on configuration. + * + * @param table_name The table name for the range slices. + * @param version The version number for the slices. + * @param range_start_key The starting key for the range. + * @param slices Vector of store slices to update. + * @param partition_id The partition ID for the range. + * @param range_version The version of the range. + * @return true if all slices are updated successfully, false if any operation fails. + */ bool DataStoreServiceClient::UpdateRangeSlices( const txservice::TableName &table_name, uint64_t version, @@ -1336,6 +1553,19 @@ bool DataStoreServiceClient::UpdateRangeSlices( return true; } +/** + * @brief Upserts range information for a table. + * + * Updates range slices for multiple ranges by calling UpdateRangeSlices for each + * range in the provided vector. After updating all ranges, flushes the range table + * data to ensure persistence. Validates that the table name is not empty and + * handles errors from individual range updates. + * + * @param table_name The table name for the ranges. + * @param range_info Vector of split range information to upsert. + * @param version The version number for the ranges. + * @return true if all ranges are updated and flushed successfully, false if any operation fails. + */ bool DataStoreServiceClient::UpsertRanges( const txservice::TableName &table_name, std::vector range_info, @@ -1374,6 +1604,20 @@ bool DataStoreServiceClient::UpsertRanges( return true; } +/** + * @brief Fetches table schema information synchronously. + * + * Retrieves table schema information from the data store using asynchronous + * operations with synchronous waiting. 
Uses FetchTableCatalog internally and + * waits for completion before returning the result. Provides schema image, + * found status, and version timestamp. + * + * @param table_name The table name to fetch schema for. + * @param schema_image Output parameter for the schema image data. + * @param found Output parameter indicating if the table was found. + * @param version_ts Output parameter for the version timestamp. + * @return true if the fetch operation completes successfully, false otherwise. + */ bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name, std::string &schema_image, bool &found, @@ -1399,6 +1643,18 @@ bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name, return !callback_data->HasError(); } +/** + * @brief Discovers all table names in the data store. + * + * Scans the table catalogs to discover all available table names. Uses pagination + * with session management and supports cooperative scheduling through yield/resume + * function pointers. Performs the scan asynchronously and waits for completion. + * + * @param norm_name_vec Output vector to store the discovered table names. + * @param yield_fptr Optional function pointer for yielding control during pagination. + * @param resume_fptr Optional function pointer for resuming after yielding. + * @return true if the discovery operation completes successfully, false if any error occurs. + */ bool DataStoreServiceClient::DiscoverAllTableNames( std::vector &norm_name_vec, const std::function *yield_fptr, @@ -1426,10 +1682,18 @@ bool DataStoreServiceClient::DiscoverAllTableNames( return !callback_data->HasError(); } -// The store format of database catalog in kvstore is as follows: -// -// key: dbname -// value: db_definition +/** + * @brief Upserts database definition to the data store. + * + * Stores database definition information in the KV storage system. 
The storage + * format uses the database name as the key and the database definition as the value. + * Uses current timestamp for versioning and performs the operation asynchronously + * with synchronous waiting for completion. + * + * @param db The database name to upsert. + * @param definition The database definition to store. + * @return true if the database is upserted successfully, false if any operation fails. + */ bool DataStoreServiceClient::UpsertDatabase(std::string_view db, std::string_view definition) { @@ -1475,6 +1739,16 @@ bool DataStoreServiceClient::UpsertDatabase(std::string_view db, return true; } +/** + * @brief Drops a database from the data store. + * + * Removes a database definition from the KV storage system by performing a DELETE + * operation on the database catalog. Uses current timestamp for versioning and + * performs the operation asynchronously with synchronous waiting for completion. + * + * @param db The database name to drop. + * @return true if the database is dropped successfully, false if any operation fails. + */ bool DataStoreServiceClient::DropDatabase(std::string_view db) { std::vector keys; @@ -1519,6 +1793,20 @@ bool DataStoreServiceClient::DropDatabase(std::string_view db) return true; } +/** + * @brief Fetches database definition from the data store. + * + * Retrieves database definition information from the KV storage system. + * Supports cooperative scheduling through yield/resume function pointers + * and performs the operation asynchronously with synchronous waiting. + * + * @param db The database name to fetch. + * @param definition Output parameter for the database definition. + * @param found Output parameter indicating if the database was found. + * @param yield_fptr Optional function pointer for yielding control. + * @param resume_fptr Optional function pointer for resuming after yielding. + * @return true if the fetch operation completes successfully, false if any error occurs. 
+ */ bool DataStoreServiceClient::FetchDatabase( std::string_view db, std::string &definition, @@ -1793,6 +2081,19 @@ void DataStoreServiceClient::EncodeArchiveKey( write_batch_size += sizeof(uint64_t); } +/** + * @brief Decodes an archive key to extract its components. + * + * Parses an archive key string to extract the table name, transaction key, + * and commit timestamp. The archive key format is: "log:item:{table_name}:{key}:{commit_ts}". + * Validates the key format and extracts each component using string separators. + * + * @param archive_key The archive key string to decode. + * @param table_name Output parameter for the extracted table name. + * @param key Output parameter for the extracted transaction key. + * @param be_commit_ts Output parameter for the extracted commit timestamp (big-endian). + * @return true if the key is successfully decoded, false if the format is invalid. + */ bool DataStoreServiceClient::DecodeArchiveKey(const std::string &archive_key, std::string &table_name, txservice::TxKey &key, @@ -1834,6 +2135,20 @@ bool DataStoreServiceClient::DecodeArchiveKey(const std::string &archive_key, return true; } +/** + * @brief Encodes archive value data for storage. + * + * Serializes archive value information including deletion status, unpack info, + * and encoded blob data into record parts for batch writing. Handles both + * deleted and non-deleted records with appropriate data encoding. + * + * @param is_deleted Whether the record is marked as deleted. + * @param value Pointer to the transaction record (nullptr for deleted records). + * @param unpack_info_size Size of the unpack info data. + * @param encoded_blob_size Size of the encoded blob data. + * @param record_parts Vector to store the encoded record parts. + * @param write_batch_size Running total of batch size (updated in place). 
+ */ void DataStoreServiceClient::EncodeArchiveValue( bool is_deleted, const txservice::TxRecord *value, @@ -2127,6 +2442,17 @@ bool DataStoreServiceClient::PutArchivesAll( return true; } +/** + * @brief Copies base table data to archive storage. + * + * Reads base table records and copies them to archive storage with concurrent + * read operations. Manages in-flight read count to control concurrency and + * handles both hash and range partitioned tables. Uses archive-specific + * encoding and TTL settings for the copied data. + * + * @param flush_task Map of table names to flush task entries containing base records to copy. + * @return true if all records are successfully copied to archive, false if any operation fails. + */ bool DataStoreServiceClient::CopyBaseToArchive( std::unordered_map>> @@ -2293,6 +2619,20 @@ bool DataStoreServiceClient::CopyBaseToArchive( return true; } +/** + * @brief Fetches archive records for a specific key from a given timestamp. + * + * Retrieves archived versions of a record from the MVCC archive storage. + * Scans the archive table for records matching the specified key and timestamp range. + * Currently asserts false as this functionality is not fully implemented. + * + * @param table_name The table name to fetch archives for. + * @param kv_info KV catalog information for the table. + * @param key The key to fetch archive records for. + * @param archives Output vector to store the fetched archive records. + * @param from_ts Starting timestamp for the archive fetch. + * @return true when the function runs to completion (its final statement is `return true;`); NOTE(review): confirm behavior on failure paths. + */ bool DataStoreServiceClient::FetchArchives( const txservice::TableName &table_name, const txservice::KVCatalogInfo *kv_info, @@ -2377,6 +2717,23 @@ bool DataStoreServiceClient::FetchArchives( return true; } +/** + * @brief Fetches the visible archive record for a key at a specific timestamp.
+ * + * Retrieves the most recent archive record for a given key that is visible + * at the specified upper bound timestamp. Scans the archive table in reverse + * order to find the latest visible version. Currently asserts false as this + * functionality is not fully implemented. + * + * @param table_name The table name to fetch archive for. + * @param kv_info KV catalog information for the table. + * @param key The key to fetch archive record for. + * @param upper_bound_ts The upper bound timestamp for visibility. + * @param rec Output parameter for the fetched record. + * @param rec_status Output parameter for the record status. + * @param commit_ts Output parameter for the commit timestamp. + * @return true when the function runs to completion (its final statement is `return true;`); NOTE(review): confirm behavior on failure paths. + */ bool DataStoreServiceClient::FetchVisibleArchive( const txservice::TableName &table_name, const txservice::KVCatalogInfo *kv_info, @@ -2461,6 +2818,17 @@ bool DataStoreServiceClient::FetchVisibleArchive( return true; } +/** + * @brief Fetches archive records for a fetch record CC operation. + * + * Retrieves archive records for a specific key and snapshot read timestamp. + * Encodes the appropriate key range for scanning the archive table and + * initiates a scan operation to fetch all relevant archive versions. + * Sets up the fetch CC object with the necessary scan parameters. + * + * @param fetch_cc Fetch record CC object containing key, timestamp, and result storage. + * @return DataStoreOpStatus indicating the operation status. + */ txservice::store::DataStoreHandler::DataStoreOpStatus DataStoreServiceClient::FetchArchives(txservice::FetchRecordCc *fetch_cc) { @@ -2530,6 +2898,19 @@ DataStoreServiceClient::FetchVisibleArchive( return txservice::store::DataStoreHandler::DataStoreOpStatus::Success; } +/** + * @brief Creates a snapshot for backup operations. + * + * Initiates a snapshot creation process across all shards in the cluster.
+ * Collects shard IDs from the cluster manager and coordinates snapshot creation + * for both local and remote shards. Waits for completion and returns the + * backup files generated during the process. + * + * @param backup_name The name for the backup snapshot. + * @param backup_files Output vector to store the generated backup file paths. + * @param backup_ts The timestamp for the backup. + * @return true if the snapshot is created successfully, false if any operation fails. + */ bool DataStoreServiceClient::CreateSnapshotForBackup( const std::string &backup_name, std::vector &backup_files, @@ -2562,6 +2943,16 @@ bool DataStoreServiceClient::CreateSnapshotForBackup( return !callback_data->HasError(); } +/** + * @brief Internal method for creating snapshots for backup operations. + * + * Processes snapshot creation for individual shards, handling both local and + * remote shards differently. For local shards, prepares local requests; for + * remote shards, prepares RPC requests. Manages the closure lifecycle and + * coordinates completion when all shards are processed. + * + * @param closure The closure object managing the snapshot creation process. + */ void DataStoreServiceClient::CreateSnapshotForBackupInternal( CreateSnapshotForBackupClosure *closure) { @@ -2610,11 +3001,30 @@ void DataStoreServiceClient::CreateSnapshotForBackupInternal( } } +/** + * @brief Determines if range copying is needed. + * + * Currently always returns true, indicating that range copying is always required. + * This method is used to determine whether range data needs to be copied during + * certain operations. + * + * @return Always returns true. + */ bool DataStoreServiceClient::NeedCopyRange() const { return true; } +/** + * @brief Restores transaction cache for a node group. + * + * Currently not implemented. This method is a placeholder for restoring + * transaction cache state for a specific node group and term. + * Will log an error and assert false if called. 
+ * + * @param cc_ng_id The node group ID to restore cache for. + * @param cc_ng_term The term for the node group. + */ void DataStoreServiceClient::RestoreTxCache(txservice::NodeGroupId cc_ng_id, int64_t cc_ng_term) { @@ -2622,19 +3032,50 @@ void DataStoreServiceClient::RestoreTxCache(txservice::NodeGroupId cc_ng_id, assert(false); } +/** + * @brief Handles leader start event. + * + * Currently always returns true. This method is called when the node becomes + * a leader and can be used to perform leader-specific initialization. + * + * @param next_leader_node Pointer to store the next leader node ID (unused). + * @return Always returns true. + */ bool DataStoreServiceClient::OnLeaderStart(uint32_t *next_leader_node) { return true; } +/** + * @brief Handles start following event. + * + * Currently empty implementation. This method is called when the node starts + * following another leader and can be used to perform follower-specific initialization. + */ void DataStoreServiceClient::OnStartFollowing() { } +/** + * @brief Handles shutdown event. + * + * Currently empty implementation. This method is called when the node is shutting + * down and can be used to perform cleanup operations. + */ void DataStoreServiceClient::OnShutdown() { } +/** + * @brief Checks if a shard is local to this node. + * + * Determines whether the specified shard is owned by this node using the + * cluster manager. This is used for scale-up scenarios where data needs to be + * migrated from smaller to larger nodes. + * + * @param shard_id The shard ID to check. + * @return true if the shard is local to this node, false otherwise. + */ bool DataStoreServiceClient::IsLocalShard(uint32_t shard_id) { // this is a temporary solution for scale up scenario (from one smaller @@ -2642,6 +3083,16 @@ bool DataStoreServiceClient::IsLocalShard(uint32_t shard_id) return cluster_manager_.IsOwnerOfShard(shard_id); } +/** + * @brief Checks if a partition is local to this node. 
+ * + * Determines whether the specified partition is owned by this node using the + * cluster manager. Used for determining whether operations should be performed + * locally or remotely. + * + * @param partition_id The partition ID to check. + * @return true if the partition is local to this node, false otherwise. + */ bool DataStoreServiceClient::IsLocalPartition(int32_t partition_id) { return cluster_manager_.IsOwnerOfPartition(partition_id); diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index 77af7b5..16836a5 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -34,103 +34,35 @@ #include "data_store_service_scanner.h" #include "eloq_data_store_service/object_pool.h" -/** - * Callback type invoked on completion of a datastore operation. - * - * Parameters: - * - data: user-provided context pointer passed through the async call. - * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). - * - client: reference to the DataStoreServiceClient that executed the operation. - * - result: operation result detail (error code/message and any operation-specific fields). - */ - - /** - * Synchronization helper used to wait for an asynchronous datastore operation to complete. - * - * Provides a mutex/condition variable pair and a CommonResult to store the outcome. - * Typical usage: Reset() before issuing the async operation, Notify() from the async - * completion callback, and Wait() from the waiting thread. HasError() reports whether - * the stored result represents an error other than NO_ERROR or KEY_NOT_FOUND. - */ - - /** - * Aggregation and flow-control helper for coordinating many concurrent put-all writes. - * - * - unfinished_request_cnt_: signed count of outstanding write requests (must be signed). - * - all_request_started_: set to true once all requests have been launched. - * - max_flying_write_count: upper bound on concurrent in-flight writes (32). 
- * - * Finish(res) will merge the first non-NO_ERROR result into `result_`, decrement the - * unfinished request count, and notify a waiter when either: - * - all requests have been started and the unfinished count reaches zero, or - * - the unfinished count falls to (max_flying_write_count - 1), enabling flow control - * to allow launching further requests while keeping in-flight writes bounded. - */ - - /** - * Generic synchronous callback adapter invoked by closures to signal completion. - * - * Parameters: - * - data: user-provided context pointer passed through the async call. - * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). - * - client: reference to the DataStoreServiceClient that executed the operation. - * - result: operation result detail (error code/message and any operation-specific fields). - */ - - /** - * Shared helper used when reading archived records concurrently. - * - * Holds references to external synchronization primitives and counters: - * - mtx_, cv_: external mutex and condition variable used to guard flying_read_cnt_. - * - flying_read_cnt_: reference to the shared in-flight read counter. - * - error_code_: reference to an integer used to capture the first observed error. - * - * Also stores the most recent read result (partition_id_, key_str_, value_str_, ts_, ttl_). - * Thread-safe: methods that mutate or read shared resources acquire the provided mutex. - */ - - /** - * Callback invoked for batch archive reads to aggregate or forward results. - * - * Parameters: - * - data: user-provided context pointer passed through the async call. - * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). - * - client: reference to the DataStoreServiceClient that executed the operation. - * - result: operation result detail (error code/message and any operation-specific fields). - */ - - /** - * Callback invoked to load a range slice (archive or otherwise). 
- * - * Parameters: - * - data: user-provided context pointer passed through the async call. - * - closure: protobuf closure associated with the RPC (may be nullptr for local paths). - * - client: reference to the DataStoreServiceClient that executed the operation. - * - result: operation result detail (error code/message and any operation-specific fields). - */ - - /** - * Closure implementing a datastore Read operation supporting both local and remote paths. - * - * Use Reset(...) to configure a read (table, partition, key, client, and callback), then: - * - PrepareRequest(is_local): prepare an RPC request if remote, or clear local result for local reads. - * - Run(): executed when an RPC completes (or when local processing is finished). Run() - * handles RPC failures with retry logic, translates NOT_OWNER into sharding handling - * and potential retry, and finally invokes the user callback with the CommonResult. - * - * Accessors provide access to the brpc::Controller, request/response objects, table/partition/key, - * and local-result fields (value, ts, ttl, result). Value accessors return either the local - * in-memory values or the response's values depending on the request mode. - * - * Note: retry behavior is governed by the associated DataStoreServiceClient retry_limit_. - */ - namespace EloqDS +namespace EloqDS { +/** + * Callback type invoked on completion of a datastore operation. + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for + * local paths). + * - client: reference to the DataStoreServiceClient that executed the + * operation. + * - result: operation result detail (error code/message and any + * operation-specific fields). 
+ */ typedef void (*DataStoreCallback)(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Synchronization helper used to wait for an asynchronous datastore operation + * to complete. + * + * Provides a mutex/condition variable pair and a CommonResult to store the + * outcome. Typical usage: Reset() before issuing the async operation, Notify() + * from the async completion callback, and Wait() from the waiting thread. + * HasError() reports whether the stored result represents an error other than + * NO_ERROR or KEY_NOT_FOUND. + */ struct SyncCallbackData : public Poolable { SyncCallbackData() : mtx_(), cv_(), finished_(false) @@ -185,6 +117,22 @@ struct SyncCallbackData : public Poolable remote::CommonResult result_; }; +/** + * Aggregation and flow-control helper for coordinating many concurrent put-all + * writes. + * + * - unfinished_request_cnt_: signed count of outstanding write requests (must + * be signed). + * - all_request_started_: set to true once all requests have been launched. + * - max_flying_write_count: upper bound on concurrent in-flight writes (32). + * + * Finish(res) will merge the first non-NO_ERROR result into `result_`, + * decrement the unfinished request count, and notify a waiter when either: + * - all requests have been started and the unfinished count reaches zero, or + * - the unfinished count falls to (max_flying_write_count - 1), enabling flow + * control to allow launching further requests while keeping in-flight writes + * bounded. + */ struct SyncPutAllData : public Poolable { @@ -228,12 +176,44 @@ struct SyncPutAllData : public Poolable bthread::Mutex mux_; bthread::ConditionVariable cv_; }; +/** + * Generic synchronous callback adapter invoked by closures to signal + * completion. + * + * Parameters: + * - data: user-provided context pointer passed through the async call. 
+ * - closure: protobuf closure associated with the RPC (may be nullptr for + * local paths). + * - client: reference to the DataStoreServiceClient that executed the + * operation. + * - result: operation result detail (error code/message and any + * operation-specific fields). + */ void SyncCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data structure for concurrent archive record reading operations. + * + * Manages synchronization and flow control for reading base records that will + * be copied to archive storage. Tracks flying read count and provides mutex + * synchronization for concurrent access. + * + * Holds references to external synchronization primitives and counters: + * - mtx_, cv_: external mutex and condition variable used to guard + * flying_read_cnt_. + * - flying_read_cnt_: reference to the shared in-flight read counter. + * - error_code_: reference to an integer used to capture the first observed + * error. + * + * Also stores the most recent read result (partition_id_, key_str_, value_str_, + * ts_, ttl_). Thread-safe: methods that mutate or read shared resources acquire + * the provided mutex. + */ + struct ReadBaseForArchiveCallbackData { ReadBaseForArchiveCallbackData(bthread::Mutex &mtx, @@ -333,17 +313,60 @@ struct ReadBaseForArchiveCallbackData uint64_t ts_; uint64_t ttl_; }; - +/** + * Callback invoked for batch archive reads to aggregate or forward results. + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for + * local paths). + * - client: reference to the DataStoreServiceClient that executed the + * operation. + * - result: operation result detail (error code/message and any + * operation-specific fields). 
+ */ void SyncBatchReadForArchiveCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback invoked to load a range slice (archive or otherwise). + * + * Parameters: + * - data: user-provided context pointer passed through the async call. + * - closure: protobuf closure associated with the RPC (may be nullptr for + * local paths). + * - client: reference to the DataStoreServiceClient that executed the + * operation. + * - result: operation result detail (error code/message and any + * operation-specific fields). + */ void LoadRangeSliceCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); - +/** + * Closure implementing a datastore Read operation supporting both local and + * remote paths. + * + * Use Reset(...) to configure a read (table, partition, key, client, and + * callback), then: + * - PrepareRequest(is_local): prepare an RPC request if remote, or clear local + * result for local reads. + * - Run(): executed when an RPC completes (or when local processing is + * finished). Run() handles RPC failures with retry logic, translates NOT_OWNER + * into sharding handling and potential retry, and finally invokes the user + * callback with the CommonResult. + * + * Accessors provide access to the brpc::Controller, request/response objects, + * table/partition/key, and local-result fields (value, ts, ttl, result). Value + * accessors return either the local in-memory values or the response's values + * depending on the request mode. + * + * Note: retry behavior is governed by the associated DataStoreServiceClient + * retry_limit_. + */ class ReadClosure : public ::google::protobuf::Closure, public Poolable { public: @@ -637,6 +660,13 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable void *callback_data_; }; +/** + * Closure for asynchronous data flushing operations to KV storage. 
+ * + * Manages the lifecycle of flush operations, including RPC communication, + * retry logic, and callback invocation. Supports both local and remote + * flush operations with configurable retry behavior. + */ class FlushDataClosure : public ::google::protobuf::Closure, public Poolable { public: @@ -2163,31 +2193,62 @@ class CreateSnapshotForBackupClosure : public ::google::protobuf::Closure, void *callback_data_; }; +/** + * Callback for fetching individual records from the data store. + * + * Handles the completion of record fetch operations and processes the result. + */ void FetchRecordCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching snapshot data from the data store. + * + * Handles the completion of snapshot fetch operations and processes the result. + */ void FetchSnapshotCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for asynchronous table drop operations. + * + * Contains the KV table name that is being dropped. + */ struct AsyncDropTableCallbackData { std::string kv_table_name_; }; +/** + * Callback for asynchronous table drop operations. + * + * Handles the completion of table drop operations and processes the result. + */ void AsyncDropTableCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching table catalog information. + * + * Handles the completion of table catalog fetch operations and processes the result. + */ void FetchTableCatalogCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for fetching table information. 
+ * + * Extends SyncCallbackData to include table-specific information like + * schema image, version timestamp, and found status. + */ struct FetchTableCallbackData : public SyncCallbackData { FetchTableCallbackData() = default; @@ -2219,11 +2280,24 @@ void FetchTableCallback(void *data, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for synchronous put-all operations. + * + * Handles the completion of batch put operations and updates the + * SyncPutAllData structure with the result. + */ void SyncPutAllCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for fetching database information. + * + * Extends SyncCallbackData to include database-specific information like + * database definition, found status, and yield/resume function pointers + * for cooperative scheduling. + */ struct FetchDatabaseCallbackData : public SyncCallbackData { FetchDatabaseCallbackData() = default; @@ -2280,11 +2354,22 @@ struct FetchDatabaseCallbackData : public SyncCallbackData const std::function *resume_fptr_; }; +/** + * Callback for fetching database information. + * + * Handles the completion of database fetch operations and processes the result. + */ void FetchDatabaseCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for fetching all database names. + * + * Extends SyncCallbackData to include database names list and yield/resume + * function pointers for cooperative scheduling during pagination. + */ struct FetchAllDatabaseCallbackData : public SyncCallbackData { FetchAllDatabaseCallbackData() = default; @@ -2347,11 +2432,22 @@ struct FetchAllDatabaseCallbackData : public SyncCallbackData std::string end_key_; }; +/** + * Callback for fetching all database names. 
+ * + * Handles the completion of all database names fetch operations and processes the result. + */ void FetchAllDatabaseCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for discovering all table names. + * + * Extends SyncCallbackData to include table names list and yield/resume + * function pointers for cooperative scheduling during pagination. + */ struct DiscoverAllTableNamesCallbackData : public SyncCallbackData { DiscoverAllTableNamesCallbackData() = default; @@ -2408,30 +2504,61 @@ struct DiscoverAllTableNamesCallbackData : public SyncCallbackData std::string session_id_; }; +/** + * Callback for discovering all table names. + * + * Handles the completion of table name discovery operations and processes the result. + */ void DiscoverAllTableNamesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching table ranges. + * + * Handles the completion of table range fetch operations and processes the result. + */ void FetchTableRangesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching range slices. + * + * Handles the completion of range slice fetch operations and processes the result. + */ void FetchRangeSlicesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching current table statistics. + * + * Handles the completion of current table statistics fetch operations and processes the result. + */ void FetchCurrentTableStatsCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching table statistics. 
+ * + * Handles the completion of table statistics fetch operations and processes the result. + */ void FetchTableStatsCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for fetching archive records. + * + * Extends SyncCallbackData to include archive-specific information like + * table name, partition ID, key ranges, batch size, and scan direction. + */ struct FetchArchivesCallbackData : public SyncCallbackData { FetchArchivesCallbackData(const std::string_view kv_table_name, @@ -2464,21 +2591,42 @@ struct FetchArchivesCallbackData : public SyncCallbackData std::vector archive_commit_ts_; }; +/** + * Callback for fetching archive records. + * + * Handles the completion of archive record fetch operations and processes the result. + */ void FetchArchivesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching record archives. + * + * Handles the completion of record archive fetch operations and processes the result. + */ void FetchRecordArchivesCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for fetching snapshot archives. + * + * Handles the completion of snapshot archive fetch operations and processes the result. + */ void FetchSnapshotArchiveCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback data for creating snapshots for backup operations. + * + * Extends SyncCallbackData to include backup-specific information like + * backup name, timestamp, and backup files list. 
+ */ struct CreateSnapshotForBackupCallbackData : public SyncCallbackData { CreateSnapshotForBackupCallbackData() = default; @@ -2505,6 +2653,11 @@ struct CreateSnapshotForBackupCallbackData : public SyncCallbackData std::vector *backup_files_; }; +/** + * Callback for creating snapshots for backup operations. + * + * Handles the completion of snapshot creation for backup operations and processes the result. + */ void CreateSnapshotForBackupCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, From 383e3925cbc89017ff3b7da614b4316ac3d39de5 Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 10:22:21 +0000 Subject: [PATCH 5/9] refactor so that putall flush different partitions concurrently --- data_store_service_client.cpp | 618 ++++++++++++++------------ data_store_service_client.h | 29 ++ data_store_service_client_closure.cpp | 42 ++ data_store_service_client_closure.h | 154 ++++++- 4 files changed, 551 insertions(+), 292 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index 471e2ba..c241167 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -20,6 +20,7 @@ * */ #include "data_store_service_client.h" +#include "data_store_service_client_closure.h" #include @@ -33,7 +34,6 @@ #include #include -#include "data_store_service_client_closure.h" #include "data_store_service_scanner.h" #include "eloq_data_store_service/object_pool.h" // ObjectPool #include "eloq_data_store_service/thread_worker_pool.h" @@ -183,107 +183,19 @@ bool DataStoreServiceClient::PutAll( std::vector>> &flush_task) { - std::vector key_parts; - std::vector record_parts; - std::vector records_ts; - std::vector records_ttl; - std::vector op_types; - std::vector record_tmp_mem_area; uint64_t now = txservice::LocalCcShards::ClockTsInMillseconds(); - auto PrepareObjectData = - [&](txservice::FlushRecord &ckpt_rec, size_t &write_batch_size) - { - txservice::TxKey tx_key = ckpt_rec.Key(); - 
uint64_t ttl = - ckpt_rec.payload_status_ == txservice::RecordStatus::Normal - ? ckpt_rec.Payload()->GetTTL() - : 0; - if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal && - (!ckpt_rec.Payload()->HasTTL() || ttl > now)) - { - key_parts.emplace_back( - std::string_view(tx_key.Data(), tx_key.Size())); - write_batch_size += tx_key.Size(); - - const txservice::TxRecord *rec = ckpt_rec.Payload(); - // Upserts a key to the k-v store - record_parts.emplace_back(std::string_view(rec->EncodedBlobData(), - rec->EncodedBlobSize())); - write_batch_size += rec->EncodedBlobSize(); - - records_ts.push_back(ckpt_rec.commit_ts_); - write_batch_size += sizeof(uint64_t); // commit_ts - // - records_ttl.push_back(ttl); - write_batch_size += sizeof(uint64_t); // ttl - - op_types.push_back(WriteOpType::PUT); - write_batch_size += sizeof(WriteOpType); - } - else - { - key_parts.emplace_back( - std::string_view(tx_key.Data(), tx_key.Size())); - write_batch_size += tx_key.Size(); - - record_parts.emplace_back(std::string_view()); - - records_ts.push_back(ckpt_rec.commit_ts_); - write_batch_size += sizeof(uint64_t); // commit_ts - - records_ttl.push_back(0); // no ttl - write_batch_size += sizeof(uint64_t); // ttl - - op_types.push_back(WriteOpType::DELETE); - write_batch_size += sizeof(WriteOpType); - } - }; - - auto PrepareRecordData = - [&](txservice::FlushRecord &ckpt_rec, size_t &write_batch_size) - { - uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000; - txservice::TxKey tx_key = ckpt_rec.Key(); - bool is_deleted = - !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal); - key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size())); - write_batch_size += tx_key.Size(); - - const txservice::TxRecord *rec = ckpt_rec.Payload(); - // encode is_delete, encoded_blob_data and unpack_info - if (is_deleted) - { - records_ttl.push_back(retired_ttl_for_deleted); - } - else - { - records_ttl.push_back(0); // no ttl - } - write_batch_size += 
sizeof(uint64_t); // ttl - - op_types.push_back(WriteOpType::PUT); - write_batch_size += sizeof(WriteOpType); - - SerializeTxRecord(is_deleted, - rec, - record_tmp_mem_area, - record_parts, - write_batch_size); - - records_ts.push_back(ckpt_rec.commit_ts_); - write_batch_size += sizeof(uint64_t); - }; - // map from (table_name, partition_id) to the index of the records in the - // batch + // Process each table for (auto &[kv_table_name, entries] : flush_task) { auto &table_name = entries.front()->data_sync_task_->table_name_; + + // Group records by partition std::unordered_map>> hash_partitions_map; std::unordered_map> range_partitions_map; std::unordered_map partition_record_cnt; - size_t write_batch_size = 0; + size_t flush_task_entry_idx = 0; for (auto &entry : entries) { @@ -315,8 +227,7 @@ bool DataStoreServiceClient::PutAll( } else { - // All records in the batch are in the same partition for range - // table. + // All records in the batch are in the same partition for range table uint32_t parition_id = KvPartitionIdOf(batch[0].partition_id_, true); auto [it, inserted] = @@ -328,6 +239,7 @@ bool DataStoreServiceClient::PutAll( flush_task_entry_idx++; } + // Create global coordinator SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject(); PoolableGuard sync_putall_guard(sync_putall); sync_putall->Reset(); @@ -335,218 +247,91 @@ bool DataStoreServiceClient::PutAll( uint16_t parts_cnt_per_key = 1; uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 
1 : 5; - // Write data for hash_partitioned table - for (auto part_it = hash_partitions_map.begin(); - part_it != hash_partitions_map.end(); - ++part_it) + // Create partition states and prepare batches + std::vector> partition_states; + std::vector> callback_data_list; + + // Process hash partitions + for (auto &[partition_id, flush_recs] : hash_partitions_map) { - auto &flush_recs = part_it->second; - size_t recs_cnt = partition_record_cnt[part_it->first]; - key_parts.reserve(recs_cnt * parts_cnt_per_key); - record_parts.reserve(recs_cnt * parts_cnt_per_record); - records_ts.reserve(recs_cnt); - records_ttl.reserve(recs_cnt); - op_types.reserve(recs_cnt); - for (auto idx : flush_recs) - { - txservice::FlushRecord &ckpt_rec = - entries.at(idx.first)->data_sync_vec_->at(idx.second); - txservice::TxKey tx_key = ckpt_rec.Key(); - - // Start a new batch if done with current partition. - if (write_batch_size >= MAX_WRITE_BATCH_SIZE) - { - BatchWriteRecords(kv_table_name, - part_it->first, - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - true, - sync_putall, - SyncPutAllCallback, - parts_cnt_per_key, - parts_cnt_per_record); - // Wait for in-flight requests to decrease if limit reached - { - std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_++; - while (sync_putall->unfinished_request_cnt_ >= - SyncPutAllData::max_flying_write_count) - { - sync_putall->cv_.wait(lk); - } - } - key_parts.clear(); - record_parts.clear(); - records_ts.clear(); - records_ttl.clear(); - op_types.clear(); - key_parts.reserve(recs_cnt * parts_cnt_per_key); - record_parts.reserve(recs_cnt * parts_cnt_per_record); - records_ts.reserve(recs_cnt); - records_ttl.reserve(recs_cnt); - op_types.reserve(recs_cnt); - write_batch_size = 0; - } - - assert(ckpt_rec.payload_status_ == - txservice::RecordStatus::Normal || - ckpt_rec.payload_status_ == - txservice::RecordStatus::Deleted); - - if 
(table_name.IsObjectTable()) - { - PrepareObjectData(ckpt_rec, write_batch_size); - } - else - { - PrepareRecordData(ckpt_rec, write_batch_size); - } - } - // Send out the last batch - if (key_parts.size() > 0) - { - BatchWriteRecords(kv_table_name, - part_it->first, - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - true, - sync_putall, - SyncPutAllCallback, - parts_cnt_per_key, - parts_cnt_per_record); - { - std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_++; - } - key_parts.clear(); - record_parts.clear(); - records_ts.clear(); - records_ttl.clear(); - op_types.clear(); - write_batch_size = 0; - } + auto partition_state = std::make_unique(partition_id); + auto callback_data = std::make_unique( + partition_state.get(), sync_putall, std::string(kv_table_name)); + + // Prepare batches for this partition + PreparePartitionBatches(*partition_state, flush_recs, entries, + table_name, parts_cnt_per_key, parts_cnt_per_record, now); + + partition_states.push_back(std::move(partition_state)); + callback_data_list.push_back(std::move(callback_data)); } - - // Write data for range_partitioned table - for (auto part_it = range_partitions_map.begin(); - part_it != range_partitions_map.end(); - ++part_it) + + // Process range partitions + for (auto &[partition_id, flush_recs] : range_partitions_map) { - size_t recs_cnt = partition_record_cnt[part_it->first]; - key_parts.reserve(recs_cnt * parts_cnt_per_key); - record_parts.reserve(recs_cnt * parts_cnt_per_record); - records_ts.reserve(recs_cnt); - records_ttl.reserve(recs_cnt); - op_types.reserve(recs_cnt); - record_tmp_mem_area.reserve(recs_cnt * 2); - for (auto idx : part_it->second) - { - for (auto &ckpt_rec : *entries.at(idx)->data_sync_vec_) - { - txservice::TxKey tx_key = ckpt_rec.Key(); - - // Start a new batch if done with current partition. 
- if (write_batch_size >= MAX_WRITE_BATCH_SIZE) - { - BatchWriteRecords(kv_table_name, - part_it->first, - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - true, - sync_putall, - SyncPutAllCallback, - parts_cnt_per_key, - parts_cnt_per_record); - record_tmp_mem_area.clear(); - key_parts.clear(); - record_parts.clear(); - records_ts.clear(); - records_ttl.clear(); - op_types.clear(); - key_parts.reserve(recs_cnt * parts_cnt_per_key); - record_parts.reserve(recs_cnt * parts_cnt_per_record); - records_ts.reserve(recs_cnt); - records_ttl.reserve(recs_cnt); - op_types.reserve(recs_cnt); - write_batch_size = 0; - // Wait for in-flight requests to decrease if limit - // reached - { - std::unique_lock lk( - sync_putall->mux_); - sync_putall->unfinished_request_cnt_++; - while (sync_putall->unfinished_request_cnt_ >= - SyncPutAllData::max_flying_write_count) - { - sync_putall->cv_.wait(lk); - } - } - } + auto partition_state = std::make_unique(partition_id); + auto callback_data = std::make_unique( + partition_state.get(), sync_putall, std::string(kv_table_name)); + + // Prepare batches for this partition + PrepareRangePartitionBatches(*partition_state, flush_recs, entries, + table_name, parts_cnt_per_key, parts_cnt_per_record, now); + + partition_states.push_back(std::move(partition_state)); + callback_data_list.push_back(std::move(callback_data)); + } - assert(ckpt_rec.payload_status_ == - txservice::RecordStatus::Normal || - ckpt_rec.payload_status_ == - txservice::RecordStatus::Deleted); + // Set up global coordinator + sync_putall->total_partitions_ = partition_states.size(); + sync_putall->partition_states_ = std::move(partition_states); - // currently there is no object table in range partitioned - // table - PrepareRecordData(ckpt_rec, write_batch_size); - } - // Send out the last batch - if (key_parts.size() > 0) - { - BatchWriteRecords(kv_table_name, - part_it->first, - 
std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - true, - sync_putall, - SyncPutAllCallback, - parts_cnt_per_key, - parts_cnt_per_record); - record_tmp_mem_area.clear(); - key_parts.clear(); - record_parts.clear(); - records_ts.clear(); - records_ttl.clear(); - op_types.clear(); - write_batch_size = 0; - { - std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_++; - } - } + // Start concurrent processing for each partition + for (size_t i = 0; i < callback_data_list.size(); ++i) + { + auto* partition_state = sync_putall->partition_states_[i].get(); + auto* callback_data = callback_data_list[i].get(); + + // Start the first batch for this partition + PartitionBatchRequest first_batch; + if (partition_state->GetNextBatch(first_batch)) { + BatchWriteRecords( + callback_data->table_name, + partition_state->partition_id, + std::move(first_batch.key_parts), + std::move(first_batch.record_parts), + std::move(first_batch.records_ts), + std::move(first_batch.records_ttl), + std::move(first_batch.op_types), + true, // skip_wal + callback_data, + PartitionBatchCallback, + first_batch.parts_cnt_per_key, + first_batch.parts_cnt_per_record); + } else { + // No batches for this partition, mark as completed + partition_state->MarkCompleted(); + sync_putall->OnPartitionCompleted(); } } - // Wait for all requests to complete + // Wait for all partitions to complete { std::unique_lock lk(sync_putall->mux_); - sync_putall->all_request_started_ = true; - while (sync_putall->unfinished_request_cnt_ != 0) + while (sync_putall->completed_partitions_ < sync_putall->total_partitions_) { sync_putall->cv_.wait(lk); } } - if (sync_putall->result_.error_code() != - remote::DataStoreError::NO_ERROR) + // Check for errors + for (auto& partition_state : sync_putall->partition_states_) { - LOG(ERROR) << "PutAll failed for error: " - << sync_putall->result_.error_msg(); + if 
(partition_state->IsFailed()) + { + LOG(ERROR) << "PutAll failed for partition " << partition_state->partition_id + << " with error: " << partition_state->result.error_msg(); return false; + } } } return true; @@ -4325,4 +4110,257 @@ bool DataStoreServiceClient::DeleteCatalog( return true; } +void DataStoreServiceClient::PreparePartitionBatches( + EloqDS::PartitionFlushState& partition_state, + const std::vector>& flush_recs, + const std::vector>& entries, + const txservice::TableName& table_name, + uint16_t parts_cnt_per_key, + uint16_t parts_cnt_per_record, + uint64_t now) +{ + std::vector key_parts; + std::vector record_parts; + std::vector records_ts; + std::vector records_ttl; + std::vector op_types; + std::vector record_tmp_mem_area; + size_t write_batch_size = 0; + + auto PrepareObjectData = + [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size) + { + txservice::TxKey tx_key = ckpt_rec.Key(); + uint64_t ttl = + ckpt_rec.payload_status_ == txservice::RecordStatus::Normal + ? ckpt_rec.Payload()->GetTTL() + : 0; + if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal && + (!ckpt_rec.Payload()->HasTTL() || ttl > now)) + { + key_parts.emplace_back( + std::string_view(tx_key.Data(), tx_key.Size())); + batch_size += tx_key.Size(); + + const txservice::TxRecord *rec = ckpt_rec.Payload(); + record_parts.emplace_back(std::string_view(rec->EncodedBlobData(), + rec->EncodedBlobSize())); + batch_size += rec->EncodedBlobSize(); + + records_ts.push_back(ckpt_rec.commit_ts_); + batch_size += sizeof(uint64_t); + + records_ttl.push_back(ttl); + batch_size += sizeof(uint64_t); + + op_types.push_back(WriteOpType::PUT); + batch_size += sizeof(WriteOpType); + } + else + { + key_parts.emplace_back( + std::string_view(tx_key.Data(), tx_key.Size())); + batch_size += tx_key.Size(); + + record_parts.emplace_back(std::string_view()); + batch_size += 0; + + records_ts.push_back(ckpt_rec.commit_ts_); + batch_size += sizeof(uint64_t); + + records_ttl.push_back(0); + 
batch_size += sizeof(uint64_t); + + op_types.push_back(WriteOpType::DELETE); + batch_size += sizeof(WriteOpType); + } + }; + + auto PrepareRecordData = + [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size) + { + uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000; + txservice::TxKey tx_key = ckpt_rec.Key(); + bool is_deleted = + !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal); + key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size())); + batch_size += tx_key.Size(); + + const txservice::TxRecord *rec = ckpt_rec.Payload(); + if (is_deleted) + { + records_ttl.push_back(retired_ttl_for_deleted); + } + else + { + records_ttl.push_back(0); + } + batch_size += sizeof(uint64_t); + + op_types.push_back(WriteOpType::PUT); + batch_size += sizeof(WriteOpType); + + SerializeTxRecord(is_deleted, + rec, + record_tmp_mem_area, + record_parts, + batch_size); + + records_ts.push_back(ckpt_rec.commit_ts_); + batch_size += sizeof(uint64_t); + }; + + // Process records and create batches + for (auto idx : flush_recs) + { + txservice::FlushRecord &ckpt_rec = + entries.at(idx.first)->data_sync_vec_->at(idx.second); + + // Start a new batch if size limit reached + if (write_batch_size >= MAX_WRITE_BATCH_SIZE) + { + partition_state.AddBatch(PartitionBatchRequest( + std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); + + key_parts.clear(); + record_parts.clear(); + records_ts.clear(); + records_ttl.clear(); + op_types.clear(); + record_tmp_mem_area.clear(); + write_batch_size = 0; + } + + assert(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal || + ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted); + + if (table_name.IsObjectTable()) + { + PrepareObjectData(ckpt_rec, write_batch_size); + } + else + { + PrepareRecordData(ckpt_rec, write_batch_size); + } + } + + // Add the last batch if it has data + if 
(key_parts.size() > 0) + { + partition_state.AddBatch(PartitionBatchRequest( + std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); + } +} + +void DataStoreServiceClient::PrepareRangePartitionBatches( + EloqDS::PartitionFlushState& partition_state, + const std::vector& flush_recs, + const std::vector>& entries, + const txservice::TableName& table_name, + uint16_t parts_cnt_per_key, + uint16_t parts_cnt_per_record, + uint64_t now) +{ + std::vector key_parts; + std::vector record_parts; + std::vector records_ts; + std::vector records_ttl; + std::vector op_types; + std::vector record_tmp_mem_area; + size_t write_batch_size = 0; + + auto PrepareRecordData = + [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size) + { + uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000; + txservice::TxKey tx_key = ckpt_rec.Key(); + bool is_deleted = + !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal); + key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size())); + batch_size += tx_key.Size(); + + const txservice::TxRecord *rec = ckpt_rec.Payload(); + if (is_deleted) + { + records_ttl.push_back(retired_ttl_for_deleted); + } + else + { + records_ttl.push_back(0); + } + batch_size += sizeof(uint64_t); + + op_types.push_back(WriteOpType::PUT); + batch_size += sizeof(WriteOpType); + + SerializeTxRecord(is_deleted, + rec, + record_tmp_mem_area, + record_parts, + batch_size); + + records_ts.push_back(ckpt_rec.commit_ts_); + batch_size += sizeof(uint64_t); + }; + + // Process records and create batches + for (auto idx : flush_recs) + { + for (auto &ckpt_rec : *entries.at(idx)->data_sync_vec_) + { + // Start a new batch if size limit reached + if (write_batch_size >= MAX_WRITE_BATCH_SIZE) + { + partition_state.AddBatch(PartitionBatchRequest( + std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), 
+ std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); + + key_parts.clear(); + record_parts.clear(); + records_ts.clear(); + records_ttl.clear(); + op_types.clear(); + record_tmp_mem_area.clear(); + write_batch_size = 0; + } + + assert(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal || + ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted); + + // Currently there is no object table in range partitioned table + PrepareRecordData(ckpt_rec, write_batch_size); + } + } + + // Add the last batch if it has data + if (key_parts.size() > 0) + { + partition_state.AddBatch(PartitionBatchRequest( + std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); + } +} + } // namespace EloqDS \ No newline at end of file diff --git a/data_store_service_client.h b/data_store_service_client.h index 4e0348c..fef31fe 100644 --- a/data_store_service_client.h +++ b/data_store_service_client.h @@ -39,6 +39,10 @@ namespace EloqDS { +// Forward declarations for types defined in closure header +struct PartitionFlushState; +struct PartitionBatchRequest; +struct PartitionCallbackData; class DataStoreServiceClient; class BatchWriteRecordsClosure; class ReadClosure; @@ -489,6 +493,27 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler void BatchWriteRecordsInternal(BatchWriteRecordsClosure *closure); + /** + * Helper methods for concurrent PutAll implementation + */ + void PreparePartitionBatches( + PartitionFlushState& partition_state, + const std::vector>& flush_recs, + const std::vector>& entries, + const txservice::TableName& table_name, + uint16_t parts_cnt_per_key, + uint16_t parts_cnt_per_record, + uint64_t now); + + void PrepareRangePartitionBatches( + PartitionFlushState& partition_state, + const std::vector& flush_recs, + const std::vector>& entries, + const txservice::TableName& table_name, + uint16_t 
parts_cnt_per_key, + uint16_t parts_cnt_per_record, + uint64_t now); + /** * Delete range and flush data are not frequent calls, all calls are sent * with rpc. @@ -635,6 +660,10 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler friend class DropTableClosure; friend class ScanNextClosure; friend class CreateSnapshotForBackupClosure; + friend void PartitionBatchCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); friend class SinglePartitionScanner; friend void FetchAllDatabaseCallback(void *data, ::google::protobuf::Closure *closure, diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp index b0204a7..77774ce 100644 --- a/data_store_service_client_closure.cpp +++ b/data_store_service_client_closure.cpp @@ -405,6 +405,48 @@ void SyncPutAllCallback(void *data, callback_data->Finish(result); } +void PartitionBatchCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result) +{ + auto *callback_data = reinterpret_cast(data); + auto *partition_state = callback_data->partition_state; + auto *global_coordinator = callback_data->global_coordinator; + + // Check if the batch failed + if (result.error_code() != remote::DataStoreError::NO_ERROR) { + partition_state->MarkFailed(result); + // Notify the global coordinator that this partition failed + global_coordinator->OnPartitionCompleted(); + return; + } + + // Try to get the next batch for this partition + PartitionBatchRequest next_batch; + if (partition_state->GetNextBatch(next_batch)) { + // Send the next batch + client.BatchWriteRecords( + callback_data->table_name, + partition_state->partition_id, + std::move(next_batch.key_parts), + std::move(next_batch.record_parts), + std::move(next_batch.records_ts), + std::move(next_batch.records_ttl), + std::move(next_batch.op_types), + true, // skip_wal + 
callback_data, + PartitionBatchCallback, + next_batch.parts_cnt_per_key, + next_batch.parts_cnt_per_record); + } else { + // No more batches, mark partition as completed + partition_state->MarkCompleted(); + // Notify the global coordinator that this partition completed + global_coordinator->OnPartitionCompleted(); + } +} + void FetchDatabaseCallback(void *data, ::google::protobuf::Closure *closure, DataStoreServiceClient &client, diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index 16836a5..d02934f 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -28,11 +28,17 @@ #include #include #include +#include #include #include "data_store_service_client.h" -#include "data_store_service_scanner.h" #include "eloq_data_store_service/object_pool.h" +#include "eloq_data_store_service/data_store_service.h" + +// Forward declarations +namespace EloqDS { +class DataStoreServiceClient; +} namespace EloqDS { @@ -143,6 +149,10 @@ struct SyncPutAllData : public Poolable unfinished_request_cnt_ = 0; all_request_started_ = false; result_.Clear(); + // Clear partition states if using new concurrent approach + partition_states_.clear(); + completed_partitions_ = 0; + total_partitions_ = 0; } virtual void Clear() override @@ -150,6 +160,9 @@ struct SyncPutAllData : public Poolable unfinished_request_cnt_ = 0; all_request_started_ = false; result_.Clear(); + partition_states_.clear(); + completed_partitions_ = 0; + total_partitions_ = 0; } void Finish(const remote::CommonResult &res) @@ -169,12 +182,138 @@ struct SyncPutAllData : public Poolable } } + // New method for per-partition coordination + void OnPartitionCompleted() + { + std::unique_lock lk(mux_); + completed_partitions_++; + if (completed_partitions_ >= total_partitions_) { + cv_.notify_one(); + } + } // NOTICE: "unfinished_request_cnt_" must use signed integer. 
int32_t unfinished_request_cnt_{0}; bool all_request_started_{false}; remote::CommonResult result_; - bthread::Mutex mux_; + mutable bthread::Mutex mux_; bthread::ConditionVariable cv_; + + // New fields for per-partition coordination + std::vector> partition_states_; + int32_t completed_partitions_{0}; + int32_t total_partitions_{0}; +}; + +/** + * @brief Represents a single batch request for a partition + */ +struct PartitionBatchRequest { + std::vector key_parts; + std::vector record_parts; + std::vector records_ts; + std::vector records_ttl; + std::vector op_types; + uint16_t parts_cnt_per_key; + uint16_t parts_cnt_per_record; + + PartitionBatchRequest() = default; + + PartitionBatchRequest(std::vector&& keys, + std::vector&& records, + std::vector&& ts, + std::vector&& ttl, + std::vector&& ops, + uint16_t key_parts_count, + uint16_t record_parts_count) + : key_parts(std::move(keys)) + , record_parts(std::move(records)) + , records_ts(std::move(ts)) + , records_ttl(std::move(ttl)) + , op_types(std::move(ops)) + , parts_cnt_per_key(key_parts_count) + , parts_cnt_per_record(record_parts_count) {} +}; + +/** + * @brief Per-partition state management for concurrent flushing + */ +struct PartitionFlushState { + int32_t partition_id; + std::queue pending_batches; + bool has_inflight_request = false; + bool completed = false; + bool failed = false; + remote::CommonResult result; + mutable bthread::Mutex mux; + + PartitionFlushState(int32_t pid) : partition_id(pid) { + result.Clear(); + } + + void Reset() { + std::unique_lock lk(mux); + while (!pending_batches.empty()) { + pending_batches.pop(); + } + has_inflight_request = false; + completed = false; + failed = false; + result.Clear(); + } + + bool HasMoreBatches() const { + std::unique_lock lk(mux); + return !pending_batches.empty() || has_inflight_request; + } + + bool IsCompleted() const { + std::unique_lock lk(mux); + return completed; + } + + bool IsFailed() const { + std::unique_lock lk(mux); + return failed; + 
} + + void MarkCompleted() { + std::unique_lock lk(mux); + completed = true; + } + + void MarkFailed(const remote::CommonResult& error) { + std::unique_lock lk(mux); + failed = true; + result.set_error_code(error.error_code()); + result.set_error_msg(error.error_msg()); + } + + bool GetNextBatch(PartitionBatchRequest& batch) { + std::unique_lock lk(mux); + if (pending_batches.empty()) { + return false; + } + batch = std::move(pending_batches.front()); + pending_batches.pop(); + return true; + } + + void AddBatch(PartitionBatchRequest&& batch) { + std::unique_lock lk(mux); + pending_batches.push(std::move(batch)); + } +}; + +/** + * @brief Wrapper for partition callback data that includes global coordinator + */ +struct PartitionCallbackData { + PartitionFlushState* partition_state; + SyncPutAllData* global_coordinator; + std::string table_name; + + PartitionCallbackData(PartitionFlushState* ps, SyncPutAllData* gc, const std::string& tn) + : partition_state(ps), global_coordinator(gc), table_name(tn) {} }; /** * Generic synchronous callback adapter invoked by closures to signal @@ -2291,6 +2430,17 @@ void SyncPutAllCallback(void *data, DataStoreServiceClient &client, const remote::CommonResult &result); +/** + * Callback for per-partition batch operations in concurrent PutAll. + * + * Handles the completion of a single batch for a partition and chains + * to the next batch if available, or marks the partition as completed. + */ +void PartitionBatchCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); + /** * Callback data for fetching database information. 
* From 5c4457576c69317ab34c50c637497a06e605ffbd Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 11:24:43 +0000 Subject: [PATCH 6/9] put reusable objects in pool --- data_store_service_client.cpp | 544 +++++++++++++++----------- data_store_service_client_closure.cpp | 76 ++-- data_store_service_client_closure.h | 415 +++++++++++--------- 3 files changed, 596 insertions(+), 439 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index c241167..ea54dfd 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -20,7 +20,6 @@ * */ #include "data_store_service_client.h" -#include "data_store_service_client_closure.h" #include @@ -34,6 +33,7 @@ #include #include +#include "data_store_service_client_closure.h" #include "data_store_service_scanner.h" #include "eloq_data_store_service/object_pool.h" // ObjectPool #include "eloq_data_store_service/thread_worker_pool.h" @@ -65,6 +65,9 @@ thread_local ObjectPool thread_local ObjectPool discover_all_tables_callback_data_pool_; thread_local ObjectPool sync_putall_data_pool_; +thread_local ObjectPool sync_concurrent_request_pool_; +thread_local ObjectPool partition_flush_state_pool_; +thread_local ObjectPool partition_callback_data_pool_; static const uint64_t MAX_WRITE_BATCH_SIZE = 64 * 1024 * 1024; // 64MB @@ -93,13 +96,15 @@ DataStoreServiceClient::~DataStoreServiceClient() } /** - * @brief Configures the data store service client with cluster manager information. + * @brief Configures the data store service client with cluster manager + * information. * - * Initializes the client with cluster configuration including node hostnames and ports. - * Logs all node information for debugging purposes and stores the cluster manager - * reference for future use. + * Initializes the client with cluster configuration including node hostnames + * and ports. Logs all node information for debugging purposes and stores the + * cluster manager reference for future use. 
* - * @param cluster_manager Reference to the cluster manager containing shard and node information. + * @param cluster_manager Reference to the cluster manager containing shard and + * node information. */ void DataStoreServiceClient::SetupConfig( const DataStoreServiceClusterManager &cluster_manager) @@ -119,8 +124,8 @@ void DataStoreServiceClient::SetupConfig( * @brief Establishes connection to the data store service. * * Attempts to connect to the data store service with retry logic. Initializes - * pre-built tables and retries up to 5 times with 1-second delays between attempts. - * Returns true if connection succeeds, false otherwise. + * pre-built tables and retries up to 5 times with 1-second delays between + * attempts. Returns true if connection succeeds, false otherwise. * * @return true if connection is successful, false if all retry attempts fail. */ @@ -145,9 +150,9 @@ bool DataStoreServiceClient::Connect() /** * @brief Schedules timer-based tasks for the data store service. * - * Currently not implemented. This method is a placeholder for future timer-based - * functionality such as periodic cleanup, health checks, or maintenance tasks. - * Will assert and log an error if called. + * Currently not implemented. This method is a placeholder for future + * timer-based functionality such as periodic cleanup, health checks, or + * maintenance tasks. Will assert and log an error if called. */ void DataStoreServiceClient::ScheduleTimerTasks() { @@ -156,26 +161,33 @@ void DataStoreServiceClient::ScheduleTimerTasks() } /** - * @brief Batch-writes a set of flush tasks into KV tables. + * @brief Batch-writes a set of flush tasks into KV tables using concurrent + * partition processing. * * Processes the provided flush tasks grouped by table and partition, serializes * each record (object tables use raw encoded blobs; non-object tables encode * tx-records with unpack info), and issues batched PUT/DELETE operations via - * BatchWriteRecords. 
Batches are emitted per KV-partition and sized according - * to SyncPutAllData::max_flying_write_count; the method blocks as necessary to - * respect the global in-flight write limit and waits for all dispatched - * requests to complete before returning. + * BatchWriteRecords. The method uses a concurrent approach where different + * partitions can flush simultaneously, but each partition maintains + * serialization (only one request in-flight per partition at a time). + * + * Key features: + * - Concurrent processing across different partitions + * - Per-partition serialization to respect KV store constraints + * - Automatic batching based on MAX_WRITE_BATCH_SIZE (64MB) + * - Chained callbacks within each partition for sequential processing + * - Global coordination to wait for all partitions to complete * * The function distinguishes hash- and range-partitioned tables, computes * per-partition batches, and updates per-record timestamps/TTLs and operation - * types. Partial batches are flushed at partition boundaries. On any remote or - * batch-level error the function logs the failure and returns false. + * types. On any partition-level error, the function logs the failure and + * returns false. * * @param flush_task Mapping from KV table name to a vector of flush task * entries containing the records to write. Each entry's * data_sync_vec_ provides the sequence of records for that * flush task. - * @return true if all batches completed successfully; false if any batch + * @return true if all partitions completed successfully; false if any partition * reported an error. 
*/ bool DataStoreServiceClient::PutAll( @@ -189,13 +201,13 @@ bool DataStoreServiceClient::PutAll( for (auto &[kv_table_name, entries] : flush_task) { auto &table_name = entries.front()->data_sync_task_->table_name_; - + // Group records by partition std::unordered_map>> hash_partitions_map; std::unordered_map> range_partitions_map; std::unordered_map partition_record_cnt; - + size_t flush_task_entry_idx = 0; for (auto &entry : entries) { @@ -227,7 +239,8 @@ bool DataStoreServiceClient::PutAll( } else { - // All records in the batch are in the same partition for range table + // All records in the batch are in the same partition for range + // table uint32_t parition_id = KvPartitionIdOf(batch[0].partition_id_, true); auto [it, inserted] = @@ -248,68 +261,79 @@ bool DataStoreServiceClient::PutAll( uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5; // Create partition states and prepare batches - std::vector> partition_states; - std::vector> callback_data_list; - + std::vector callback_data_list; + // Process hash partitions for (auto &[partition_id, flush_recs] : hash_partitions_map) { - auto partition_state = std::make_unique(partition_id); - auto callback_data = std::make_unique( - partition_state.get(), sync_putall, std::string(kv_table_name)); - + auto partition_state = partition_flush_state_pool_.NextObject(); + partition_state->Reset(partition_id); + auto callback_data = partition_callback_data_pool_.NextObject(); + callback_data->Reset(partition_state, sync_putall, kv_table_name); + // Prepare batches for this partition - PreparePartitionBatches(*partition_state, flush_recs, entries, - table_name, parts_cnt_per_key, parts_cnt_per_record, now); - - partition_states.push_back(std::move(partition_state)); - callback_data_list.push_back(std::move(callback_data)); + PreparePartitionBatches(*partition_state, + flush_recs, + entries, + table_name, + parts_cnt_per_key, + parts_cnt_per_record, + now); + + 
sync_putall->partition_states_.push_back(partition_state); + callback_data_list.push_back(callback_data); } - + // Process range partitions for (auto &[partition_id, flush_recs] : range_partitions_map) { - auto partition_state = std::make_unique(partition_id); - auto callback_data = std::make_unique( - partition_state.get(), sync_putall, std::string(kv_table_name)); - + auto partition_state = partition_flush_state_pool_.NextObject(); + partition_state->Reset(partition_id); + auto callback_data = partition_callback_data_pool_.NextObject(); + callback_data->Reset(partition_state, sync_putall, kv_table_name); + // Prepare batches for this partition - PrepareRangePartitionBatches(*partition_state, flush_recs, entries, - table_name, parts_cnt_per_key, parts_cnt_per_record, now); - - partition_states.push_back(std::move(partition_state)); - callback_data_list.push_back(std::move(callback_data)); + PrepareRangePartitionBatches(*partition_state, + flush_recs, + entries, + table_name, + parts_cnt_per_key, + parts_cnt_per_record, + now); + + sync_putall->partition_states_.push_back(partition_state); + callback_data_list.push_back(callback_data); } // Set up global coordinator - sync_putall->total_partitions_ = partition_states.size(); - sync_putall->partition_states_ = std::move(partition_states); + sync_putall->total_partitions_ = sync_putall->partition_states_.size(); // Start concurrent processing for each partition for (size_t i = 0; i < callback_data_list.size(); ++i) { - auto* partition_state = sync_putall->partition_states_[i].get(); - auto* callback_data = callback_data_list[i].get(); - + auto *partition_state = sync_putall->partition_states_[i]; + auto *callback_data = callback_data_list[i]; + // Start the first batch for this partition PartitionBatchRequest first_batch; - if (partition_state->GetNextBatch(first_batch)) { - BatchWriteRecords( - callback_data->table_name, - partition_state->partition_id, - std::move(first_batch.key_parts), - 
std::move(first_batch.record_parts), - std::move(first_batch.records_ts), - std::move(first_batch.records_ttl), - std::move(first_batch.op_types), - true, // skip_wal - callback_data, - PartitionBatchCallback, - first_batch.parts_cnt_per_key, - first_batch.parts_cnt_per_record); - } else { + if (partition_state->GetNextBatch(first_batch)) + { + BatchWriteRecords(callback_data->table_name, + partition_state->partition_id, + std::move(first_batch.key_parts), + std::move(first_batch.record_parts), + std::move(first_batch.records_ts), + std::move(first_batch.records_ttl), + std::move(first_batch.op_types), + true, // skip_wal + callback_data, + PartitionBatchCallback, + first_batch.parts_cnt_per_key, + first_batch.parts_cnt_per_record); + } + else + { // No batches for this partition, mark as completed - partition_state->MarkCompleted(); sync_putall->OnPartitionCompleted(); } } @@ -317,22 +341,30 @@ bool DataStoreServiceClient::PutAll( // Wait for all partitions to complete { std::unique_lock lk(sync_putall->mux_); - while (sync_putall->completed_partitions_ < sync_putall->total_partitions_) + while (sync_putall->completed_partitions_ < + sync_putall->total_partitions_) { sync_putall->cv_.wait(lk); } } // Check for errors - for (auto& partition_state : sync_putall->partition_states_) + for (auto &partition_state : sync_putall->partition_states_) { if (partition_state->IsFailed()) { - LOG(ERROR) << "PutAll failed for partition " << partition_state->partition_id - << " with error: " << partition_state->result.error_msg(); - return false; + LOG(ERROR) << "PutAll failed for partition " + << partition_state->partition_id << " with error: " + << partition_state->result.error_msg(); + return false; } } + + for (auto &callback_data : callback_data_list) + { + callback_data->Clear(); + callback_data->Free(); + } } return true; } @@ -341,11 +373,12 @@ bool DataStoreServiceClient::PutAll( * @brief Persists data from specified KV tables to storage. 
* * Flushes data from the provided KV table names to persistent storage using - * asynchronous flush operations. Waits for completion and returns success/failure - * status. Logs warnings on failure and debug info on success. + * asynchronous flush operations. Waits for completion and returns + * success/failure status. Logs warnings on failure and debug info on success. * * @param kv_table_names Vector of KV table names to persist. - * @return true if all tables are persisted successfully, false if any operation fails. + * @return true if all tables are persisted successfully, false if any operation + * fails. */ bool DataStoreServiceClient::PersistKV( const std::vector &kv_table_names) @@ -374,16 +407,19 @@ bool DataStoreServiceClient::PersistKV( * Handles table creation, modification, and deletion operations by updating * table schema information in the data store. Validates leadership, processes * the operation asynchronously, and sets appropriate error codes on failure. - * Supports various operation types including CREATE, DROP, and ALTER operations. + * Supports various operation types including CREATE, DROP, and ALTER + * operations. * - * @param old_table_schema Pointer to the existing table schema (nullptr for CREATE). + * @param old_table_schema Pointer to the existing table schema (nullptr for + * CREATE). * @param new_table_schema Pointer to the new table schema. * @param op_type Type of operation (CREATE, DROP, ALTER, etc.). * @param commit_ts Commit timestamp for the operation. * @param ng_id Node group ID for the operation. * @param tx_term Transaction term for consistency. * @param hd_res Handler result object to store operation outcome. - * @param alter_table_info Information about table alterations (nullptr if not applicable). + * @param alter_table_info Information about table alterations (nullptr if not + * applicable). * @param cc_req CC request base object. * @param ccs CC shard reference. * @param err_code Error code output parameter. 
@@ -442,11 +478,13 @@ void DataStoreServiceClient::UpsertTable( * @brief Fetches table catalog information from the data store. * * Retrieves catalog information for the specified table by reading from the - * KV table catalogs storage. Uses partition ID 0 and the catalog name as the key. - * The operation is performed asynchronously with a callback for completion handling. + * KV table catalogs storage. Uses partition ID 0 and the catalog name as the + * key. The operation is performed asynchronously with a callback for completion + * handling. * * @param ccm_table_name The table name to fetch catalog information for. - * @param fetch_cc Fetch catalog CC object to store the result and handle completion. + * @param fetch_cc Fetch catalog CC object to store the result and handle + * completion. */ void DataStoreServiceClient::FetchTableCatalog( const txservice::TableName &ccm_table_name, @@ -465,11 +503,13 @@ void DataStoreServiceClient::FetchTableCatalog( * @brief Fetches current table statistics from the data store. * * Retrieves the current version of table statistics for the specified table. - * Determines the appropriate KV partition ID and reads from the table statistics - * version storage. The operation is performed asynchronously with callback handling. + * Determines the appropriate KV partition ID and reads from the table + * statistics version storage. The operation is performed asynchronously with + * callback handling. * * @param ccm_table_name The table name to fetch statistics for. - * @param fetch_cc Fetch table statistics CC object to store the result and handle completion. + * @param fetch_cc Fetch table statistics CC object to store the result and + * handle completion. 
*/ void DataStoreServiceClient::FetchCurrentTableStatistics( const txservice::TableName &ccm_table_name, @@ -491,11 +531,13 @@ void DataStoreServiceClient::FetchCurrentTableStatistics( * * Retrieves table statistics for a specific version by constructing key ranges * based on the table name and version number. Clears previous key ranges and - * session information, then constructs start and end keys for the version-specific - * statistics. The operation is performed asynchronously with callback handling. + * session information, then constructs start and end keys for the + * version-specific statistics. The operation is performed asynchronously with + * callback handling. * * @param ccm_table_name The table name to fetch statistics for. - * @param fetch_cc Fetch table statistics CC object containing version information and result storage. + * @param fetch_cc Fetch table statistics CC object containing version + * information and result storage. */ void DataStoreServiceClient::FetchTableStatistics( const txservice::TableName &ccm_table_name, @@ -572,15 +614,18 @@ std::string EncodeTableStatsKey(const txservice::TableName &base_table_name, /** * @brief Upserts table statistics to the data store. * - * Stores table statistics by splitting sample keys into segments and writing them - * to the KV storage. Each segment contains index type, record count, and sample keys. - * Also updates the checkpoint version for the table statistics. Uses batch write - * operations for efficiency and handles both local and remote storage paths. + * Stores table statistics by splitting sample keys into segments and writing + * them to the KV storage. Each segment contains index type, record count, and + * sample keys. Also updates the checkpoint version for the table statistics. + * Uses batch write operations for efficiency and handles both local and remote + * storage paths. * * @param ccm_table_name The table name to store statistics for. 
- * @param sample_pool_map Map of index names to sample pools containing record counts and sample keys. + * @param sample_pool_map Map of index names to sample pools containing record + * counts and sample keys. * @param version The version number for the statistics. - * @return true if all statistics are stored successfully, false if any operation fails. + * @return true if all statistics are stored successfully, false if any + * operation fails. */ bool DataStoreServiceClient::UpsertTableStatistics( const txservice::TableName &ccm_table_name, @@ -762,12 +807,13 @@ bool DataStoreServiceClient::UpsertTableStatistics( /** * @brief Fetches table ranges from the data store. * - * Retrieves range information for the specified table by scanning the range table - * storage. Constructs start and end keys based on the table name and performs - * a scan operation with pagination support. The operation is performed asynchronously - * with callback handling for completion. + * Retrieves range information for the specified table by scanning the range + * table storage. Constructs start and end keys based on the table name and + * performs a scan operation with pagination support. The operation is performed + * asynchronously with callback handling for completion. * - * @param fetch_cc Fetch table ranges CC object containing table name and result storage. + * @param fetch_cc Fetch table ranges CC object containing table name and result + * storage. */ void DataStoreServiceClient::FetchTableRanges( txservice::FetchTableRangesCc *fetch_cc) @@ -801,7 +847,8 @@ void DataStoreServiceClient::FetchTableRanges( * for reading range information. The operation is performed asynchronously * with callback handling for completion. * - * @param fetch_cc Fetch range slices request object containing table name, range entry, and result storage. + * @param fetch_cc Fetch range slices request object containing table name, + * range entry, and result storage. 
*/ void DataStoreServiceClient::FetchRangeSlices( txservice::FetchRangeSlicesReq *fetch_cc) @@ -835,8 +882,8 @@ void DataStoreServiceClient::FetchRangeSlices( * * Removes data from the KV table that falls outside the specified range. * Constructs the appropriate start key based on the provided parameters and - * performs a delete range operation. Handles special cases for negative infinity - * keys and constructs proper key boundaries for the deletion. + * performs a delete range operation. Handles special cases for negative + * infinity keys and constructs proper key boundaries for the deletion. * * @param table_name The table name to delete data from. * @param partition_id The partition ID for the operation. @@ -917,9 +964,9 @@ bool DataStoreServiceClient::Read(const txservice::TableName &table_name, /** * @brief Creates a scanner for forward or backward scanning of table data. * - * Creates and initializes a data store scanner for iterating over records in a table. - * Supports both forward and backward scanning with configurable search conditions. - * The scanner is initialized before returning. + * Creates and initializes a data store scanner for iterating over records in a + * table. Supports both forward and backward scanning with configurable search + * conditions. The scanner is initialized before returning. * * @param table_name The table name to scan. * @param ng_id Node group ID for the operation. @@ -1128,7 +1175,8 @@ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id, * * Creates a composite key by combining table name, range ID, and segment ID. * Uses little-endian encoding for numeric values since range slice operations - * are point reads rather than scans, optimizing for direct key lookup performance. + * are point reads rather than scans, optimizing for direct key lookup + * performance. * * @param table_name The table name for the range slice. * @param range_id The range identifier. 
@@ -1174,11 +1222,11 @@ void DataStoreServiceClient::UpdateEncodedRangeSliceKey( /** * @brief Updates range slices for a table partition. * - * Stores range slice information by segmenting the slices into manageable chunks - * and writing them to the KV storage system. Handles slice serialization with - * proper key encoding and batch size management. Also updates the range information - * with the new version and segment count. Uses both local and remote storage paths - * based on configuration. + * Stores range slice information by segmenting the slices into manageable + * chunks and writing them to the KV storage system. Handles slice serialization + * with proper key encoding and batch size management. Also updates the range + * information with the new version and segment count. Uses both local and + * remote storage paths based on configuration. * * @param table_name The table name for the range slices. * @param version The version number for the slices. @@ -1186,7 +1234,8 @@ void DataStoreServiceClient::UpdateEncodedRangeSliceKey( * @param slices Vector of store slices to update. * @param partition_id The partition ID for the range. * @param range_version The version of the range. - * @return true if all slices are updated successfully, false if any operation fails. + * @return true if all slices are updated successfully, false if any operation + * fails. */ bool DataStoreServiceClient::UpdateRangeSlices( const txservice::TableName &table_name, @@ -1341,15 +1390,16 @@ bool DataStoreServiceClient::UpdateRangeSlices( /** * @brief Upserts range information for a table. * - * Updates range slices for multiple ranges by calling UpdateRangeSlices for each - * range in the provided vector. After updating all ranges, flushes the range table - * data to ensure persistence. Validates that the table name is not empty and - * handles errors from individual range updates. 
+ * Updates range slices for multiple ranges by calling UpdateRangeSlices for + * each range in the provided vector. After updating all ranges, flushes the + * range table data to ensure persistence. Validates that the table name is not + * empty and handles errors from individual range updates. * * @param table_name The table name for the ranges. * @param range_info Vector of split range information to upsert. * @param version The version number for the ranges. - * @return true if all ranges are updated and flushed successfully, false if any operation fails. + * @return true if all ranges are updated and flushed successfully, false if any + * operation fails. */ bool DataStoreServiceClient::UpsertRanges( const txservice::TableName &table_name, @@ -1431,14 +1481,17 @@ bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name, /** * @brief Discovers all table names in the data store. * - * Scans the table catalogs to discover all available table names. Uses pagination - * with session management and supports cooperative scheduling through yield/resume - * function pointers. Performs the scan asynchronously and waits for completion. + * Scans the table catalogs to discover all available table names. Uses + * pagination with session management and supports cooperative scheduling + * through yield/resume function pointers. Performs the scan asynchronously and + * waits for completion. * * @param norm_name_vec Output vector to store the discovered table names. - * @param yield_fptr Optional function pointer for yielding control during pagination. + * @param yield_fptr Optional function pointer for yielding control during + * pagination. * @param resume_fptr Optional function pointer for resuming after yielding. - * @return true if the discovery operation completes successfully, false if any error occurs. + * @return true if the discovery operation completes successfully, false if any + * error occurs. 
*/ bool DataStoreServiceClient::DiscoverAllTableNames( std::vector &norm_name_vec, @@ -1471,13 +1524,14 @@ bool DataStoreServiceClient::DiscoverAllTableNames( * @brief Upserts database definition to the data store. * * Stores database definition information in the KV storage system. The storage - * format uses the database name as the key and the database definition as the value. - * Uses current timestamp for versioning and performs the operation asynchronously - * with synchronous waiting for completion. + * format uses the database name as the key and the database definition as the + * value. Uses current timestamp for versioning and performs the operation + * asynchronously with synchronous waiting for completion. * * @param db The database name to upsert. * @param definition The database definition to store. - * @return true if the database is upserted successfully, false if any operation fails. + * @return true if the database is upserted successfully, false if any operation + * fails. */ bool DataStoreServiceClient::UpsertDatabase(std::string_view db, std::string_view definition) @@ -1527,12 +1581,14 @@ bool DataStoreServiceClient::UpsertDatabase(std::string_view db, /** * @brief Drops a database from the data store. * - * Removes a database definition from the KV storage system by performing a DELETE - * operation on the database catalog. Uses current timestamp for versioning and - * performs the operation asynchronously with synchronous waiting for completion. + * Removes a database definition from the KV storage system by performing a + * DELETE operation on the database catalog. Uses current timestamp for + * versioning and performs the operation asynchronously with synchronous waiting + * for completion. * * @param db The database name to drop. - * @return true if the database is dropped successfully, false if any operation fails. + * @return true if the database is dropped successfully, false if any operation + * fails. 
*/ bool DataStoreServiceClient::DropDatabase(std::string_view db) { @@ -1590,7 +1646,8 @@ bool DataStoreServiceClient::DropDatabase(std::string_view db) * @param found Output parameter indicating if the database was found. * @param yield_fptr Optional function pointer for yielding control. * @param resume_fptr Optional function pointer for resuming after yielding. - * @return true if the fetch operation completes successfully, false if any error occurs. + * @return true if the fetch operation completes successfully, false if any + * error occurs. */ bool DataStoreServiceClient::FetchDatabase( std::string_view db, @@ -1870,14 +1927,17 @@ void DataStoreServiceClient::EncodeArchiveKey( * @brief Decodes an archive key to extract its components. * * Parses an archive key string to extract the table name, transaction key, - * and commit timestamp. The archive key format is: "log:item:{table_name}:{key}:{commit_ts}". - * Validates the key format and extracts each component using string separators. + * and commit timestamp. The archive key format is: + * "log:item:{table_name}:{key}:{commit_ts}". Validates the key format and + * extracts each component using string separators. * * @param archive_key The archive key string to decode. * @param table_name Output parameter for the extracted table name. * @param key Output parameter for the extracted transaction key. - * @param be_commit_ts Output parameter for the extracted commit timestamp (big-endian). - * @return true if the key is successfully decoded, false if the format is invalid. + * @param be_commit_ts Output parameter for the extracted commit timestamp + * (big-endian). + * @return true if the key is successfully decoded, false if the format is + * invalid. 
*/ bool DataStoreServiceClient::DecodeArchiveKey(const std::string &archive_key, std::string &table_name, @@ -1993,24 +2053,36 @@ void DataStoreServiceClient::DecodeArchiveValue( } /** - * @brief Writes multiple MVCC archive records to the MVCC archive KV table in partitioned batches. + * @brief Writes multiple MVCC archive records to the MVCC archive KV table + * using sequential batch processing. + * + * Groups archive entries from the provided flush tasks by archive partition, + * serializes keys and values into batch write requests, and dispatches those + * requests sequentially within each partition. Uses SyncConcurrentRequest for + * global concurrency control to limit the total number of in-flight requests + * across all partitions. + * + * Key features: + * - Sequential processing within each partition to maintain ordering + * - Global concurrency control with max_flying_write_count limit (32) + * - Automatic batching based on MAX_WRITE_BATCH_SIZE (64MB) + * - Flow control to prevent overwhelming the system * - * Groups archive entries from the provided flush tasks by archive partition, serializes keys - * and values into batch write requests, and dispatches those requests (possibly concurrently) - * to the KV layer. Batches are split to respect MAX_WRITE_BATCH_SIZE and an internal limit on - * in-flight write requests; the method waits for all dispatched batches for each partition to - * complete before returning. + * The method waits for all dispatched batches for each partition to complete + * before returning. * * Side effects: - * - Commits serialized archive records to kv_mvcc_archive_name with a default TTL of 1 day. - * - Converts per-record commit timestamps to big-endian form as part of key encoding (the - * in-memory commit_ts field of those records is mutated during processing). + * - Commits serialized archive records to kv_mvcc_archive_name with a default + * TTL of 1 day. 
+ * - Converts per-record commit timestamps to big-endian form as part of key + * encoding (the in-memory commit_ts field of those records is mutated during + * processing). * - * @param flush_task Map from KV table name to a vector of FlushTaskEntry pointers whose - * archive vectors contain the FlushRecord entries to write. Only entries - * with non-empty archive vectors are processed. - * @return true if all batches for all partitions completed successfully; false if any batch - * failed (an error will be logged). + * @param flush_task Map from KV table name to a vector of FlushTaskEntry + * pointers whose archive vectors contain the FlushRecord entries to write. Only + * entries with non-empty archive vectors are processed. + * @return true if all batches for all partitions completed successfully; false + * if any batch failed (an error will be logged). */ bool DataStoreServiceClient::PutArchivesAll( std::unordered_map op_types; // temporary storage for the records in between batch // for keeping record upack info and encoded blob sizes - std::vector record_tmp_mem_area; + std::vector record_tmp_mem_area; record_tmp_mem_area.resize(archive_ptrs.size() * 2); // unpack_info_size + encoded_blob_size size_t write_batch_size = 0; @@ -2073,9 +2145,10 @@ bool DataStoreServiceClient::PutArchivesAll( uint16_t parts_cnt_per_record = 5; // Send the batch request - SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject(); - PoolableGuard guard(sync_putall); - sync_putall->Reset(); + SyncConcurrentRequest *sync_concurrent = + sync_concurrent_request_pool_.NextObject(); + PoolableGuard guard(sync_concurrent); + sync_concurrent->Reset(); size_t recs_cnt = archive_ptrs.size(); keys.reserve(recs_cnt * parts_cnt_per_key); @@ -2091,13 +2164,13 @@ bool DataStoreServiceClient::PutArchivesAll( { // Wait for in-flight requests to decrease if limit reached { - std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_++; - while 
(sync_putall->unfinished_request_cnt_ >= - SyncPutAllData::max_flying_write_count) + std::unique_lock lk(sync_concurrent->mux_); + while (sync_concurrent->unfinished_request_cnt_ >= + SyncConcurrentRequest::max_flying_write_count) { - sync_putall->cv_.wait(lk); + sync_concurrent->cv_.wait(lk); } + sync_concurrent->unfinished_request_cnt_++; } BatchWriteRecords(kv_mvcc_archive_name, partition_id, @@ -2107,8 +2180,8 @@ bool DataStoreServiceClient::PutArchivesAll( std::move(records_ttl), std::move(op_types), true, - sync_putall, - SyncPutAllCallback, + sync_concurrent, + SyncConcurrentRequestCallback, parts_cnt_per_key, parts_cnt_per_record); keys.clear(); @@ -2183,8 +2256,8 @@ bool DataStoreServiceClient::PutArchivesAll( std::move(records_ttl), std::move(op_types), true, - sync_putall, - SyncPutAllCallback, + sync_concurrent, + SyncConcurrentRequestCallback, parts_cnt_per_key, parts_cnt_per_record); keys.clear(); @@ -2200,26 +2273,26 @@ bool DataStoreServiceClient::PutArchivesAll( op_types.reserve(recs_cnt); write_batch_size = 0; { - std::unique_lock lk(sync_putall->mux_); - sync_putall->unfinished_request_cnt_++; + std::unique_lock lk(sync_concurrent->mux_); + sync_concurrent->unfinished_request_cnt_++; } } // Wait the result. { - std::unique_lock lk(sync_putall->mux_); - sync_putall->all_request_started_ = true; - while (sync_putall->unfinished_request_cnt_ != 0) + std::unique_lock lk(sync_concurrent->mux_); + sync_concurrent->all_request_started_ = true; + while (sync_concurrent->unfinished_request_cnt_ != 0) { - sync_putall->cv_.wait(lk); + sync_concurrent->cv_.wait(lk); } } - if (sync_putall->result_.error_code() != + if (sync_concurrent->result_.error_code() != remote::DataStoreError::NO_ERROR) { LOG(ERROR) << "PutArchivesAll failed for error: " - << sync_putall->result_.error_msg(); + << sync_concurrent->result_.error_msg(); return false; } } @@ -2235,8 +2308,10 @@ bool DataStoreServiceClient::PutArchivesAll( * handles both hash and range partitioned tables. 
Uses archive-specific * encoding and TTL settings for the copied data. * - * @param flush_task Map of table names to flush task entries containing base records to copy. - * @return true if all records are successfully copied to archive, false if any operation fails. + * @param flush_task Map of table names to flush task entries containing base + * records to copy. + * @return true if all records are successfully copied to archive, false if any + * operation fails. */ bool DataStoreServiceClient::CopyBaseToArchive( std::unordered_map>& flush_recs, - const std::vector>& entries, - const txservice::TableName& table_name, + EloqDS::PartitionFlushState &partition_state, + const std::vector> &flush_recs, + const std::vector> &entries, + const txservice::TableName &table_name, uint16_t parts_cnt_per_key, uint16_t parts_cnt_per_record, uint64_t now) @@ -4200,11 +4279,8 @@ void DataStoreServiceClient::PreparePartitionBatches( op_types.push_back(WriteOpType::PUT); batch_size += sizeof(WriteOpType); - SerializeTxRecord(is_deleted, - rec, - record_tmp_mem_area, - record_parts, - batch_size); + SerializeTxRecord( + is_deleted, rec, record_tmp_mem_area, record_parts, batch_size); records_ts.push_back(ckpt_rec.commit_ts_); batch_size += sizeof(uint64_t); @@ -4219,15 +4295,16 @@ void DataStoreServiceClient::PreparePartitionBatches( // Start a new batch if size limit reached if (write_batch_size >= MAX_WRITE_BATCH_SIZE) { - partition_state.AddBatch(PartitionBatchRequest( - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); - + partition_state.AddBatch( + PartitionBatchRequest(std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), + std::move(record_tmp_mem_area), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); + key_parts.clear(); record_parts.clear(); records_ts.clear(); @@ -4253,22 +4330,23 @@ 
void DataStoreServiceClient::PreparePartitionBatches( // Add the last batch if it has data if (key_parts.size() > 0) { - partition_state.AddBatch(PartitionBatchRequest( - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); + partition_state.AddBatch( + PartitionBatchRequest(std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), + std::move(record_tmp_mem_area), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); } } void DataStoreServiceClient::PrepareRangePartitionBatches( - EloqDS::PartitionFlushState& partition_state, - const std::vector& flush_recs, - const std::vector>& entries, - const txservice::TableName& table_name, + EloqDS::PartitionFlushState &partition_state, + const std::vector &flush_recs, + const std::vector> &entries, + const txservice::TableName &table_name, uint16_t parts_cnt_per_key, uint16_t parts_cnt_per_record, uint64_t now) @@ -4305,11 +4383,8 @@ void DataStoreServiceClient::PrepareRangePartitionBatches( op_types.push_back(WriteOpType::PUT); batch_size += sizeof(WriteOpType); - SerializeTxRecord(is_deleted, - rec, - record_tmp_mem_area, - record_parts, - batch_size); + SerializeTxRecord( + is_deleted, rec, record_tmp_mem_area, record_parts, batch_size); records_ts.push_back(ckpt_rec.commit_ts_); batch_size += sizeof(uint64_t); @@ -4323,15 +4398,16 @@ void DataStoreServiceClient::PrepareRangePartitionBatches( // Start a new batch if size limit reached if (write_batch_size >= MAX_WRITE_BATCH_SIZE) { - partition_state.AddBatch(PartitionBatchRequest( - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); - + partition_state.AddBatch( + PartitionBatchRequest(std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + 
std::move(records_ttl), + std::move(record_tmp_mem_area), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); + key_parts.clear(); record_parts.clear(); records_ts.clear(); @@ -4341,8 +4417,9 @@ void DataStoreServiceClient::PrepareRangePartitionBatches( write_batch_size = 0; } - assert(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal || - ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted); + assert( + ckpt_rec.payload_status_ == txservice::RecordStatus::Normal || + ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted); // Currently there is no object table in range partitioned table PrepareRecordData(ckpt_rec, write_batch_size); @@ -4352,14 +4429,15 @@ void DataStoreServiceClient::PrepareRangePartitionBatches( // Add the last batch if it has data if (key_parts.size() > 0) { - partition_state.AddBatch(PartitionBatchRequest( - std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); + partition_state.AddBatch( + PartitionBatchRequest(std::move(key_parts), + std::move(record_parts), + std::move(records_ts), + std::move(records_ttl), + std::move(record_tmp_mem_area), + std::move(op_types), + parts_cnt_per_key, + parts_cnt_per_record)); } } diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp index 77774ce..a3a1238 100644 --- a/data_store_service_client_closure.cpp +++ b/data_store_service_client_closure.cpp @@ -20,11 +20,12 @@ * */ +#include "data_store_service_client_closure.h" + #include #include #include -#include "data_store_service_client_closure.h" #include "store_util.h" // host_to_big_endian #include "tx_service/include/cc/cc_request.h" #include "tx_service/include/cc/local_cc_shards.h" @@ -173,8 +174,8 @@ void FetchRecordCallback(void *data, if (!DataStoreServiceClient::DeserializeTxRecordStr( val, is_deleted, offset)) { - LOG(ERROR) << "====fetch record===decode 
error==" - << " key: " << read_closure->Key() + LOG(ERROR) << "====fetch record===decode error==" << " key: " + << read_closure->Key() << " status: " << (int) fetch_cc->rec_status_; std::abort(); } @@ -396,52 +397,53 @@ void FetchTableCallback(void *data, fetch_table_data->Notify(); } -void SyncPutAllCallback(void *data, - ::google::protobuf::Closure *closure, - DataStoreServiceClient &client, - const remote::CommonResult &result) +void SyncConcurrentRequestCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result) { - auto *callback_data = reinterpret_cast(data); + auto *callback_data = reinterpret_cast(data); callback_data->Finish(result); } void PartitionBatchCallback(void *data, - ::google::protobuf::Closure *closure, - DataStoreServiceClient &client, - const remote::CommonResult &result) + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result) { auto *callback_data = reinterpret_cast(data); auto *partition_state = callback_data->partition_state; auto *global_coordinator = callback_data->global_coordinator; - + // Check if the batch failed - if (result.error_code() != remote::DataStoreError::NO_ERROR) { + if (result.error_code() != remote::DataStoreError::NO_ERROR) + { partition_state->MarkFailed(result); // Notify the global coordinator that this partition failed global_coordinator->OnPartitionCompleted(); return; } - + // Try to get the next batch for this partition PartitionBatchRequest next_batch; - if (partition_state->GetNextBatch(next_batch)) { + if (partition_state->GetNextBatch(next_batch)) + { // Send the next batch - client.BatchWriteRecords( - callback_data->table_name, - partition_state->partition_id, - std::move(next_batch.key_parts), - std::move(next_batch.record_parts), - std::move(next_batch.records_ts), - std::move(next_batch.records_ttl), - std::move(next_batch.op_types), - true, // skip_wal - callback_data, - 
PartitionBatchCallback, - next_batch.parts_cnt_per_key, - next_batch.parts_cnt_per_record); - } else { - // No more batches, mark partition as completed - partition_state->MarkCompleted(); + client.BatchWriteRecords(callback_data->table_name, + partition_state->partition_id, + std::move(next_batch.key_parts), + std::move(next_batch.record_parts), + std::move(next_batch.records_ts), + std::move(next_batch.records_ttl), + std::move(next_batch.op_types), + true, // skip_wal + callback_data, + PartitionBatchCallback, + next_batch.parts_cnt_per_key, + next_batch.parts_cnt_per_record); + } + else + { // Notify the global coordinator that this partition completed global_coordinator->OnPartitionCompleted(); } @@ -1428,4 +1430,16 @@ void CreateSnapshotForBackupCallback(void *data, backup_callback_data->Notify(); } + +bool PartitionFlushState::GetNextBatch(PartitionBatchRequest &batch) +{ + std::unique_lock lk(mux); + if (pending_batches.empty()) + { + return false; + } + batch = std::move(pending_batches.front()); + pending_batches.pop(); + return true; +} } // namespace EloqDS diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index d02934f..ddc6dc3 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -26,17 +26,18 @@ #include #include +#include #include #include -#include #include #include "data_store_service_client.h" -#include "eloq_data_store_service/object_pool.h" #include "eloq_data_store_service/data_store_service.h" +#include "eloq_data_store_service/object_pool.h" // Forward declarations -namespace EloqDS { +namespace EloqDS +{ class DataStoreServiceClient; } @@ -123,21 +124,83 @@ struct SyncCallbackData : public Poolable remote::CommonResult result_; }; + /** - * Aggregation and flow-control helper for coordinating many concurrent put-all - * writes. 
+ * @brief Per-partition state management for concurrent flushing + */ +struct PartitionFlushState : public Poolable +{ + int32_t partition_id; + std::queue pending_batches; + bool failed = false; + remote::CommonResult result; + mutable bthread::Mutex mux; + + PartitionFlushState() : partition_id(0) + { + result.Clear(); + } + + void Reset(int32_t pid) + { + partition_id = pid; + while (!pending_batches.empty()) + { + pending_batches.pop(); + } + failed = false; + result.Clear(); + } + + void Clear() override + { + partition_id = 0; + while (!pending_batches.empty()) + { + pending_batches.pop(); + } + } + bool IsFailed() const + { + std::unique_lock lk(mux); + return failed; + } + + void MarkFailed(const remote::CommonResult &error) + { + std::unique_lock lk(mux); + failed = true; + result.set_error_code(error.error_code()); + result.set_error_msg(error.error_msg()); + } + + bool GetNextBatch(PartitionBatchRequest &batch); + + void AddBatch(PartitionBatchRequest &&batch) + { + std::unique_lock lk(mux); + pending_batches.push(std::move(batch)); + } +}; + +/** + * Coordination helper for concurrent partition-based put-all operations. * - * - unfinished_request_cnt_: signed count of outstanding write requests (must - * be signed). - * - all_request_started_: set to true once all requests have been launched. - * - max_flying_write_count: upper bound on concurrent in-flight writes (32). + * This structure manages the coordination of multiple partitions that can + * process concurrently, with each partition maintaining serialization (only one + * request in-flight per partition). It tracks partition completion and provides + * global coordination for the entire PutAll operation. 
* - * Finish(res) will merge the first non-NO_ERROR result into `result_`, - * decrement the unfinished request count, and notify a waiter when either: - * - all requests have been started and the unfinished count reaches zero, or - * - the unfinished count falls to (max_flying_write_count - 1), enabling flow - * control to allow launching further requests while keeping in-flight writes - * bounded. + * Key components: + * - partition_states_: vector of PartitionFlushState objects, one per partition + * - completed_partitions_: count of partitions that have finished processing + * - total_partitions_: total number of partitions to process + * - OnPartitionCompleted(): called when a partition finishes (success or + * failure) + * + * The structure waits for all partitions to complete before the PutAll + * operation can finish. If any partition fails, the entire operation is + * considered failed. */ struct SyncPutAllData : public Poolable @@ -146,23 +209,75 @@ struct SyncPutAllData : public Poolable void Reset() { - unfinished_request_cnt_ = 0; - all_request_started_ = false; - result_.Clear(); // Clear partition states if using new concurrent approach partition_states_.clear(); completed_partitions_ = 0; total_partitions_ = 0; } + virtual void Clear() override + { + completed_partitions_ = 0; + total_partitions_ = 0; + for (auto *partition_state : partition_states_) + { + partition_state->Clear(); + partition_state->Free(); + } + partition_states_.clear(); + } + void OnPartitionCompleted() + { + std::unique_lock lk(mux_); + completed_partitions_++; + if (completed_partitions_ >= total_partitions_) + { + cv_.notify_one(); + } + } + mutable bthread::Mutex mux_; + bthread::ConditionVariable cv_; + + // fields for per-partition coordination + std::vector partition_states_; + int32_t completed_partitions_{0}; + int32_t total_partitions_{0}; +}; + +/** + * Coordination helper for sequential batch operations with global concurrency + * control. 
+ * + * This structure manages the coordination of sequential batch operations (like + * PutArchivesAll) where batches are processed one after another within each + * partition, but with global concurrency control to limit the total number of + * in-flight requests across all partitions. + * + * Key features: + * - Global concurrency control with max_flying_write_count limit (32) + * - Sequential processing within each partition + * - Flow control to prevent overwhelming the system + * - Error aggregation from all batches + * + * This is used by operations that need to maintain sequential ordering within + * partitions while still allowing some concurrency across the system. + */ +struct SyncConcurrentRequest : public Poolable +{ + static constexpr int32_t max_flying_write_count = 32; + + void Reset() + { + unfinished_request_cnt_ = 0; + all_request_started_ = false; + result_.Clear(); + } + virtual void Clear() override { unfinished_request_cnt_ = 0; all_request_started_ = false; result_.Clear(); - partition_states_.clear(); - completed_partitions_ = 0; - total_partitions_ = 0; } void Finish(const remote::CommonResult &res) @@ -182,138 +297,77 @@ struct SyncPutAllData : public Poolable } } - // New method for per-partition coordination - void OnPartitionCompleted() - { - std::unique_lock lk(mux_); - completed_partitions_++; - if (completed_partitions_ >= total_partitions_) { - cv_.notify_one(); - } - } // NOTICE: "unfinished_request_cnt_" must use signed integer. 
int32_t unfinished_request_cnt_{0}; bool all_request_started_{false}; remote::CommonResult result_; mutable bthread::Mutex mux_; bthread::ConditionVariable cv_; - - // New fields for per-partition coordination - std::vector> partition_states_; - int32_t completed_partitions_{0}; - int32_t total_partitions_{0}; }; /** * @brief Represents a single batch request for a partition */ -struct PartitionBatchRequest { +struct PartitionBatchRequest +{ std::vector key_parts; std::vector record_parts; std::vector records_ts; std::vector records_ttl; + std::vector record_tmp_mem_area; std::vector op_types; uint16_t parts_cnt_per_key; uint16_t parts_cnt_per_record; - + PartitionBatchRequest() = default; - - PartitionBatchRequest(std::vector&& keys, - std::vector&& records, - std::vector&& ts, - std::vector&& ttl, - std::vector&& ops, - uint16_t key_parts_count, - uint16_t record_parts_count) - : key_parts(std::move(keys)) - , record_parts(std::move(records)) - , records_ts(std::move(ts)) - , records_ttl(std::move(ttl)) - , op_types(std::move(ops)) - , parts_cnt_per_key(key_parts_count) - , parts_cnt_per_record(record_parts_count) {} -}; -/** - * @brief Per-partition state management for concurrent flushing - */ -struct PartitionFlushState { - int32_t partition_id; - std::queue pending_batches; - bool has_inflight_request = false; - bool completed = false; - bool failed = false; - remote::CommonResult result; - mutable bthread::Mutex mux; - - PartitionFlushState(int32_t pid) : partition_id(pid) { - result.Clear(); - } - - void Reset() { - std::unique_lock lk(mux); - while (!pending_batches.empty()) { - pending_batches.pop(); - } - has_inflight_request = false; - completed = false; - failed = false; - result.Clear(); - } - - bool HasMoreBatches() const { - std::unique_lock lk(mux); - return !pending_batches.empty() || has_inflight_request; - } - - bool IsCompleted() const { - std::unique_lock lk(mux); - return completed; - } - - bool IsFailed() const { - std::unique_lock lk(mux); 
- return failed; - } - - void MarkCompleted() { - std::unique_lock lk(mux); - completed = true; - } - - void MarkFailed(const remote::CommonResult& error) { - std::unique_lock lk(mux); - failed = true; - result.set_error_code(error.error_code()); - result.set_error_msg(error.error_msg()); - } - - bool GetNextBatch(PartitionBatchRequest& batch) { - std::unique_lock lk(mux); - if (pending_batches.empty()) { - return false; - } - batch = std::move(pending_batches.front()); - pending_batches.pop(); - return true; - } - - void AddBatch(PartitionBatchRequest&& batch) { - std::unique_lock lk(mux); - pending_batches.push(std::move(batch)); + PartitionBatchRequest(std::vector &&keys, + std::vector &&records, + std::vector &&ts, + std::vector &&ttl, + std::vector &&record_tmp_mem_area, + std::vector &&ops, + uint16_t key_parts_count, + uint16_t record_parts_count) + : key_parts(std::move(keys)), + record_parts(std::move(records)), + records_ts(std::move(ts)), + records_ttl(std::move(ttl)), + op_types(std::move(ops)), + parts_cnt_per_key(key_parts_count), + parts_cnt_per_record(record_parts_count) + { } }; - /** * @brief Wrapper for partition callback data that includes global coordinator */ -struct PartitionCallbackData { - PartitionFlushState* partition_state; - SyncPutAllData* global_coordinator; - std::string table_name; - - PartitionCallbackData(PartitionFlushState* ps, SyncPutAllData* gc, const std::string& tn) - : partition_state(ps), global_coordinator(gc), table_name(tn) {} +struct PartitionCallbackData : public Poolable +{ + PartitionFlushState *partition_state; + SyncPutAllData *global_coordinator; + std::string_view table_name; + + PartitionCallbackData() + : partition_state(nullptr), global_coordinator(nullptr), table_name("") + { + } + + void Reset(PartitionFlushState *ps, + SyncPutAllData *gc, + const std::string_view tn) + { + partition_state = ps; + global_coordinator = gc; + table_name = tn; + } + + void Clear() override + { + partition_state = nullptr; + 
global_coordinator = nullptr; + table_name = ""; + } }; /** * Generic synchronous callback adapter invoked by closures to signal @@ -336,7 +390,7 @@ void SyncCallback(void *data, /** * Callback data structure for concurrent archive record reading operations. - * + * * Manages synchronization and flow control for reading base records that will * be copied to archive storage. Tracks flying read count and provides mutex * synchronization for concurrent access. @@ -801,7 +855,7 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable /** * Closure for asynchronous data flushing operations to KV storage. - * + * * Manages the lifecycle of flush operations, including RPC communication, * retry logic, and callback invocation. Supports both local and remote * flush operations with configurable retry behavior. @@ -2334,7 +2388,7 @@ class CreateSnapshotForBackupClosure : public ::google::protobuf::Closure, /** * Callback for fetching individual records from the data store. - * + * * Handles the completion of record fetch operations and processes the result. */ void FetchRecordCallback(void *data, @@ -2344,7 +2398,7 @@ void FetchRecordCallback(void *data, /** * Callback for fetching snapshot data from the data store. - * + * * Handles the completion of snapshot fetch operations and processes the result. */ void FetchSnapshotCallback(void *data, @@ -2354,7 +2408,7 @@ void FetchSnapshotCallback(void *data, /** * Callback data for asynchronous table drop operations. - * + * * Contains the KV table name that is being dropped. */ struct AsyncDropTableCallbackData @@ -2364,7 +2418,7 @@ struct AsyncDropTableCallbackData /** * Callback for asynchronous table drop operations. - * + * * Handles the completion of table drop operations and processes the result. */ void AsyncDropTableCallback(void *data, @@ -2374,8 +2428,9 @@ void AsyncDropTableCallback(void *data, /** * Callback for fetching table catalog information. 
- * - * Handles the completion of table catalog fetch operations and processes the result. + * + * Handles the completion of table catalog fetch operations and processes the + * result. */ void FetchTableCatalogCallback(void *data, ::google::protobuf::Closure *closure, @@ -2384,7 +2439,7 @@ void FetchTableCatalogCallback(void *data, /** * Callback data for fetching table information. - * + * * Extends SyncCallbackData to include table-specific information like * schema image, version timestamp, and found status. */ @@ -2420,30 +2475,30 @@ void FetchTableCallback(void *data, const remote::CommonResult &result); /** - * Callback for synchronous put-all operations. - * - * Handles the completion of batch put operations and updates the - * SyncPutAllData structure with the result. + * Callback for synchronous concurrent request operations. + * + * Handles the completion of concurrent request operations and updates the + * SyncConcurrentRequest structure with the result. */ -void SyncPutAllCallback(void *data, - ::google::protobuf::Closure *closure, - DataStoreServiceClient &client, - const remote::CommonResult &result); +void SyncConcurrentRequestCallback(void *data, + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); /** * Callback for per-partition batch operations in concurrent PutAll. - * + * * Handles the completion of a single batch for a partition and chains * to the next batch if available, or marks the partition as completed. */ void PartitionBatchCallback(void *data, - ::google::protobuf::Closure *closure, - DataStoreServiceClient &client, - const remote::CommonResult &result); + ::google::protobuf::Closure *closure, + DataStoreServiceClient &client, + const remote::CommonResult &result); /** * Callback data for fetching database information. 
- * + * * Extends SyncCallbackData to include database-specific information like * database definition, found status, and yield/resume function pointers * for cooperative scheduling. @@ -2506,7 +2561,7 @@ struct FetchDatabaseCallbackData : public SyncCallbackData /** * Callback for fetching database information. - * + * * Handles the completion of database fetch operations and processes the result. */ void FetchDatabaseCallback(void *data, @@ -2516,7 +2571,7 @@ void FetchDatabaseCallback(void *data, /** * Callback data for fetching all database names. - * + * * Extends SyncCallbackData to include database names list and yield/resume * function pointers for cooperative scheduling during pagination. */ @@ -2584,8 +2639,9 @@ struct FetchAllDatabaseCallbackData : public SyncCallbackData /** * Callback for fetching all database names. - * - * Handles the completion of all database names fetch operations and processes the result. + * + * Handles the completion of all database names fetch operations and processes + * the result. */ void FetchAllDatabaseCallback(void *data, ::google::protobuf::Closure *closure, @@ -2594,7 +2650,7 @@ void FetchAllDatabaseCallback(void *data, /** * Callback data for discovering all table names. - * + * * Extends SyncCallbackData to include table names list and yield/resume * function pointers for cooperative scheduling during pagination. */ @@ -2656,8 +2712,9 @@ struct DiscoverAllTableNamesCallbackData : public SyncCallbackData /** * Callback for discovering all table names. - * - * Handles the completion of table name discovery operations and processes the result. + * + * Handles the completion of table name discovery operations and processes the + * result. */ void DiscoverAllTableNamesCallback(void *data, ::google::protobuf::Closure *closure, @@ -2666,8 +2723,9 @@ void DiscoverAllTableNamesCallback(void *data, /** * Callback for fetching table ranges. - * - * Handles the completion of table range fetch operations and processes the result. 
+ * + * Handles the completion of table range fetch operations and processes the + * result. */ void FetchTableRangesCallback(void *data, ::google::protobuf::Closure *closure, @@ -2676,8 +2734,9 @@ void FetchTableRangesCallback(void *data, /** * Callback for fetching range slices. - * - * Handles the completion of range slice fetch operations and processes the result. + * + * Handles the completion of range slice fetch operations and processes the + * result. */ void FetchRangeSlicesCallback(void *data, ::google::protobuf::Closure *closure, @@ -2685,8 +2744,9 @@ void FetchRangeSlicesCallback(void *data, const remote::CommonResult &result); /** * Callback for fetching current table statistics. - * - * Handles the completion of current table statistics fetch operations and processes the result. + * + * Handles the completion of current table statistics fetch operations and + * processes the result. */ void FetchCurrentTableStatsCallback(void *data, ::google::protobuf::Closure *closure, @@ -2695,8 +2755,9 @@ void FetchCurrentTableStatsCallback(void *data, /** * Callback for fetching table statistics. - * - * Handles the completion of table statistics fetch operations and processes the result. + * + * Handles the completion of table statistics fetch operations and processes the + * result. */ void FetchTableStatsCallback(void *data, ::google::protobuf::Closure *closure, @@ -2705,7 +2766,7 @@ void FetchTableStatsCallback(void *data, /** * Callback data for fetching archive records. - * + * * Extends SyncCallbackData to include archive-specific information like * table name, partition ID, key ranges, batch size, and scan direction. */ @@ -2743,8 +2804,9 @@ struct FetchArchivesCallbackData : public SyncCallbackData /** * Callback for fetching archive records. - * - * Handles the completion of archive record fetch operations and processes the result. + * + * Handles the completion of archive record fetch operations and processes the + * result. 
*/ void FetchArchivesCallback(void *data, ::google::protobuf::Closure *closure, @@ -2753,8 +2815,9 @@ void FetchArchivesCallback(void *data, /** * Callback for fetching record archives. - * - * Handles the completion of record archive fetch operations and processes the result. + * + * Handles the completion of record archive fetch operations and processes the + * result. */ void FetchRecordArchivesCallback(void *data, ::google::protobuf::Closure *closure, @@ -2763,8 +2826,9 @@ void FetchRecordArchivesCallback(void *data, /** * Callback for fetching snapshot archives. - * - * Handles the completion of snapshot archive fetch operations and processes the result. + * + * Handles the completion of snapshot archive fetch operations and processes the + * result. */ void FetchSnapshotArchiveCallback(void *data, ::google::protobuf::Closure *closure, @@ -2773,7 +2837,7 @@ void FetchSnapshotArchiveCallback(void *data, /** * Callback data for creating snapshots for backup operations. - * + * * Extends SyncCallbackData to include backup-specific information like * backup name, timestamp, and backup files list. */ @@ -2805,8 +2869,9 @@ struct CreateSnapshotForBackupCallbackData : public SyncCallbackData /** * Callback for creating snapshots for backup operations. - * - * Handles the completion of snapshot creation for backup operations and processes the result. + * + * Handles the completion of snapshot creation for backup operations and + * processes the result. 
*/ void CreateSnapshotForBackupCallback(void *data, ::google::protobuf::Closure *closure, From 479388b7e028a873792c0f7dabbe3b411bb47bef Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 11:45:46 +0000 Subject: [PATCH 7/9] fix record parts corrupted across retry --- data_store_service_client.cpp | 2 +- data_store_service_client_closure.cpp | 2 +- data_store_service_client_closure.h | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index ea54dfd..e40cb60 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -315,7 +315,7 @@ bool DataStoreServiceClient::PutAll( auto *callback_data = callback_data_list[i]; // Start the first batch for this partition - PartitionBatchRequest first_batch; + auto &first_batch = callback_data->inflight_batch; if (partition_state->GetNextBatch(first_batch)) { BatchWriteRecords(callback_data->table_name, diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp index a3a1238..be53357 100644 --- a/data_store_service_client_closure.cpp +++ b/data_store_service_client_closure.cpp @@ -425,7 +425,7 @@ void PartitionBatchCallback(void *data, } // Try to get the next batch for this partition - PartitionBatchRequest next_batch; + PartitionBatchRequest &next_batch = callback_data->inflight_batch; if (partition_state->GetNextBatch(next_batch)) { // Send the next batch diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index ddc6dc3..9da972d 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -333,11 +333,24 @@ struct PartitionBatchRequest record_parts(std::move(records)), records_ts(std::move(ts)), records_ttl(std::move(ttl)), + record_tmp_mem_area(std::move(record_tmp_mem_area)), op_types(std::move(ops)), parts_cnt_per_key(key_parts_count), parts_cnt_per_record(record_parts_count) { } + + void Clear() + { + 
key_parts.clear(); + record_parts.clear(); + records_ts.clear(); + records_ttl.clear(); + record_tmp_mem_area.clear(); + op_types.clear(); + parts_cnt_per_key = 1; + parts_cnt_per_record = 1; + } }; /** * @brief Wrapper for partition callback data that includes global coordinator @@ -347,6 +360,7 @@ struct PartitionCallbackData : public Poolable PartitionFlushState *partition_state; SyncPutAllData *global_coordinator; std::string_view table_name; + PartitionBatchRequest inflight_batch; PartitionCallbackData() : partition_state(nullptr), global_coordinator(nullptr), table_name("") From 3be1e0bbd1c30cf12d252131a3a5dd33aaf00c20 Mon Sep 17 00:00:00 2001 From: liunyl Date: Wed, 17 Sep 2025 23:26:46 +0000 Subject: [PATCH 8/9] Fix bug that record_tmp_area invalid due to vector resize --- data_store_service_client.cpp | 183 +++++++++++++--------------- data_store_service_client_closure.h | 13 ++ 2 files changed, 95 insertions(+), 101 deletions(-) diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp index e40cb60..bfc8e21 100644 --- a/data_store_service_client.cpp +++ b/data_store_service_client.cpp @@ -4198,16 +4198,13 @@ void DataStoreServiceClient::PreparePartitionBatches( uint16_t parts_cnt_per_record, uint64_t now) { - std::vector key_parts; - std::vector record_parts; - std::vector records_ts; - std::vector records_ttl; - std::vector op_types; - std::vector record_tmp_mem_area; size_t write_batch_size = 0; - - auto PrepareObjectData = - [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size) + PartitionBatchRequest batch_request; + batch_request.Reset( + parts_cnt_per_key, parts_cnt_per_record, flush_recs.size()); + auto PrepareObjectData = [&](txservice::FlushRecord &ckpt_rec, + size_t &batch_size, + PartitionBatchRequest &batch_request) { txservice::TxKey tx_key = ckpt_rec.Key(); uint64_t ttl = @@ -4217,72 +4214,77 @@ void DataStoreServiceClient::PreparePartitionBatches( if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal && 
(!ckpt_rec.Payload()->HasTTL() || ttl > now)) { - key_parts.emplace_back( + batch_request.key_parts.emplace_back( std::string_view(tx_key.Data(), tx_key.Size())); batch_size += tx_key.Size(); const txservice::TxRecord *rec = ckpt_rec.Payload(); - record_parts.emplace_back(std::string_view(rec->EncodedBlobData(), - rec->EncodedBlobSize())); + batch_request.record_parts.emplace_back(std::string_view( + rec->EncodedBlobData(), rec->EncodedBlobSize())); batch_size += rec->EncodedBlobSize(); - records_ts.push_back(ckpt_rec.commit_ts_); + batch_request.records_ts.push_back(ckpt_rec.commit_ts_); batch_size += sizeof(uint64_t); - records_ttl.push_back(ttl); + batch_request.records_ttl.push_back(ttl); batch_size += sizeof(uint64_t); - op_types.push_back(WriteOpType::PUT); + batch_request.op_types.push_back(WriteOpType::PUT); batch_size += sizeof(WriteOpType); } else { - key_parts.emplace_back( + batch_request.key_parts.emplace_back( std::string_view(tx_key.Data(), tx_key.Size())); batch_size += tx_key.Size(); - record_parts.emplace_back(std::string_view()); + batch_request.record_parts.emplace_back(std::string_view()); batch_size += 0; - records_ts.push_back(ckpt_rec.commit_ts_); + batch_request.records_ts.push_back(ckpt_rec.commit_ts_); batch_size += sizeof(uint64_t); - records_ttl.push_back(0); + batch_request.records_ttl.push_back(0); batch_size += sizeof(uint64_t); - op_types.push_back(WriteOpType::DELETE); + batch_request.op_types.push_back(WriteOpType::DELETE); batch_size += sizeof(WriteOpType); } }; - auto PrepareRecordData = - [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size) + auto PrepareRecordData = [&](txservice::FlushRecord &ckpt_rec, + size_t &batch_size, + PartitionBatchRequest &batch_request) { uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000; txservice::TxKey tx_key = ckpt_rec.Key(); bool is_deleted = !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal); - key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size())); 
+ batch_request.key_parts.emplace_back( + std::string_view(tx_key.Data(), tx_key.Size())); batch_size += tx_key.Size(); const txservice::TxRecord *rec = ckpt_rec.Payload(); if (is_deleted) { - records_ttl.push_back(retired_ttl_for_deleted); + batch_request.records_ttl.push_back(retired_ttl_for_deleted); } else { - records_ttl.push_back(0); + batch_request.records_ttl.push_back(0); } batch_size += sizeof(uint64_t); - op_types.push_back(WriteOpType::PUT); + batch_request.op_types.push_back(WriteOpType::PUT); batch_size += sizeof(WriteOpType); - SerializeTxRecord( - is_deleted, rec, record_tmp_mem_area, record_parts, batch_size); + SerializeTxRecord(is_deleted, + rec, + batch_request.record_tmp_mem_area, + batch_request.record_parts, + batch_size); - records_ts.push_back(ckpt_rec.commit_ts_); + batch_request.records_ts.push_back(ckpt_rec.commit_ts_); batch_size += sizeof(uint64_t); }; @@ -4293,24 +4295,18 @@ void DataStoreServiceClient::PreparePartitionBatches( entries.at(idx.first)->data_sync_vec_->at(idx.second); // Start a new batch if size limit reached - if (write_batch_size >= MAX_WRITE_BATCH_SIZE) + // or the record_tmp_mem_area is full. Since the record_parts is a + // vector of string_view that references the record_tmp_mem_area, we + // cannot allow the record_tmp_mem_area to be resized which will cause + // the record_parts to be invalid. 
+ if (write_batch_size >= MAX_WRITE_BATCH_SIZE || + batch_request.record_tmp_mem_area.size() == + batch_request.record_tmp_mem_area.capacity()) { - partition_state.AddBatch( - PartitionBatchRequest(std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(record_tmp_mem_area), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); - - key_parts.clear(); - record_parts.clear(); - records_ts.clear(); - records_ttl.clear(); - op_types.clear(); - record_tmp_mem_area.clear(); + partition_state.AddBatch(std::move(batch_request)); + + batch_request.Reset( + parts_cnt_per_key, parts_cnt_per_record, flush_recs.size()); write_batch_size = 0; } @@ -4319,26 +4315,18 @@ void DataStoreServiceClient::PreparePartitionBatches( if (table_name.IsObjectTable()) { - PrepareObjectData(ckpt_rec, write_batch_size); + PrepareObjectData(ckpt_rec, write_batch_size, batch_request); } else { - PrepareRecordData(ckpt_rec, write_batch_size); + PrepareRecordData(ckpt_rec, write_batch_size, batch_request); } } // Add the last batch if it has data - if (key_parts.size() > 0) + if (batch_request.key_parts.size() > 0) { - partition_state.AddBatch( - PartitionBatchRequest(std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(record_tmp_mem_area), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); + partition_state.AddBatch(std::move(batch_request)); } } @@ -4351,69 +4339,70 @@ void DataStoreServiceClient::PrepareRangePartitionBatches( uint16_t parts_cnt_per_record, uint64_t now) { - std::vector key_parts; - std::vector record_parts; - std::vector records_ts; - std::vector records_ttl; - std::vector op_types; - std::vector record_tmp_mem_area; size_t write_batch_size = 0; + PartitionBatchRequest batch_request; - auto PrepareRecordData = - [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size) + auto PrepareRecordData = [&](txservice::FlushRecord &ckpt_rec, 
+ size_t &batch_size, + PartitionBatchRequest &batch_request) { uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000; txservice::TxKey tx_key = ckpt_rec.Key(); bool is_deleted = !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal); - key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size())); + batch_request.key_parts.emplace_back( + std::string_view(tx_key.Data(), tx_key.Size())); batch_size += tx_key.Size(); const txservice::TxRecord *rec = ckpt_rec.Payload(); if (is_deleted) { - records_ttl.push_back(retired_ttl_for_deleted); + batch_request.records_ttl.push_back(retired_ttl_for_deleted); } else { - records_ttl.push_back(0); + batch_request.records_ttl.push_back(0); } batch_size += sizeof(uint64_t); - op_types.push_back(WriteOpType::PUT); + batch_request.op_types.push_back(WriteOpType::PUT); batch_size += sizeof(WriteOpType); - SerializeTxRecord( - is_deleted, rec, record_tmp_mem_area, record_parts, batch_size); + SerializeTxRecord(is_deleted, + rec, + batch_request.record_tmp_mem_area, + batch_request.record_parts, + batch_size); - records_ts.push_back(ckpt_rec.commit_ts_); + batch_request.records_ts.push_back(ckpt_rec.commit_ts_); batch_size += sizeof(uint64_t); }; + size_t rec_cnt = 0; + for (auto idx : flush_recs) + { + rec_cnt += entries.at(idx)->data_sync_vec_->size(); + } + batch_request.Reset(parts_cnt_per_key, parts_cnt_per_record, rec_cnt); + // Process records and create batches for (auto idx : flush_recs) { for (auto &ckpt_rec : *entries.at(idx)->data_sync_vec_) { // Start a new batch if size limit reached - if (write_batch_size >= MAX_WRITE_BATCH_SIZE) + // or the record_tmp_mem_area is full. Since the record_parts is a + // vector of string_view that references the record_tmp_mem_area, we + // cannot allow the record_tmp_mem_area to be resized which will + // cause the record_parts to be invalid. 
+ if (write_batch_size >= MAX_WRITE_BATCH_SIZE || + batch_request.record_tmp_mem_area.size() == + batch_request.record_tmp_mem_area.capacity()) { - partition_state.AddBatch( - PartitionBatchRequest(std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(record_tmp_mem_area), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); - - key_parts.clear(); - record_parts.clear(); - records_ts.clear(); - records_ttl.clear(); - op_types.clear(); - record_tmp_mem_area.clear(); + partition_state.AddBatch(std::move(batch_request)); + + batch_request.Reset( + parts_cnt_per_key, parts_cnt_per_record, rec_cnt); write_batch_size = 0; } @@ -4422,22 +4411,14 @@ void DataStoreServiceClient::PrepareRangePartitionBatches( ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted); // Currently there is no object table in range partitioned table - PrepareRecordData(ckpt_rec, write_batch_size); + PrepareRecordData(ckpt_rec, write_batch_size, batch_request); } } // Add the last batch if it has data - if (key_parts.size() > 0) + if (batch_request.key_parts.size() > 0) { - partition_state.AddBatch( - PartitionBatchRequest(std::move(key_parts), - std::move(record_parts), - std::move(records_ts), - std::move(records_ttl), - std::move(record_tmp_mem_area), - std::move(op_types), - parts_cnt_per_key, - parts_cnt_per_record)); + partition_state.AddBatch(std::move(batch_request)); } } diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h index 9da972d..7878c31 100644 --- a/data_store_service_client_closure.h +++ b/data_store_service_client_closure.h @@ -351,6 +351,19 @@ struct PartitionBatchRequest parts_cnt_per_key = 1; parts_cnt_per_record = 1; } + + void Reset(uint16_t key_parts_count, uint16_t record_parts_count, size_t record_cnt) + { + Clear(); + parts_cnt_per_key = key_parts_count; + parts_cnt_per_record = record_parts_count; + key_parts.reserve(key_parts_count * record_cnt); + 
record_parts.reserve(record_parts_count * record_cnt); + records_ts.reserve(record_cnt); + records_ttl.reserve(record_cnt); + record_tmp_mem_area.reserve(record_cnt * 2); + op_types.reserve(record_cnt); + } }; /** * @brief Wrapper for partition callback data that includes global coordinator From 031ae080ccaf6bd895d9daf93ec4c09cb1cd33be Mon Sep 17 00:00:00 2001 From: Chen Zhao Date: Thu, 18 Sep 2025 12:11:16 +0800 Subject: [PATCH 9/9] fix compile error and change amplification factor to 2 --- eloq_data_store_service/eloq_store_config.cpp | 2 +- eloq_data_store_service/eloq_store_data_store.cpp | 7 +++++++ eloq_data_store_service/eloq_store_data_store.h | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/eloq_data_store_service/eloq_store_config.cpp b/eloq_data_store_service/eloq_store_config.cpp index f9e10b6..6ffa0ea 100644 --- a/eloq_data_store_service/eloq_store_config.cpp +++ b/eloq_data_store_service/eloq_store_config.cpp @@ -85,7 +85,7 @@ DEFINE_uint32(eloq_store_max_archive_tasks, 256, "EloqStore max archive tasks."); DEFINE_uint32(eloq_store_file_amplify_factor, - 4, + 2, "EloqStore file amplify factor."); DEFINE_uint64(eloq_store_local_space_limit, 1ULL << 40, diff --git a/eloq_data_store_service/eloq_store_data_store.cpp b/eloq_data_store_service/eloq_store_data_store.cpp index d36b0f5..c9ebf06 100644 --- a/eloq_data_store_service/eloq_store_data_store.cpp +++ b/eloq_data_store_service/eloq_store_data_store.cpp @@ -495,6 +495,13 @@ void EloqStoreDataStore::SwitchToReadWrite() return; } +void EloqStoreDataStore::CreateSnapshotForBackup( + CreateSnapshotForBackupRequest *req) +{ + return; +} + + void EloqStoreDataStore::ScanDelete(DeleteRangeRequest *delete_range_req) { ::eloqstore::TableIdent eloq_store_table_id; diff --git a/eloq_data_store_service/eloq_store_data_store.h b/eloq_data_store_service/eloq_store_data_store.h index 550a569..f0e12a9 100644 --- a/eloq_data_store_service/eloq_store_data_store.h +++ 
b/eloq_data_store_service/eloq_store_data_store.h @@ -252,6 +252,8 @@ class EloqStoreDataStore : public DataStore */ void SwitchToReadWrite() override; + void CreateSnapshotForBackup(CreateSnapshotForBackupRequest *req) override; + private: static void OnRead(::eloqstore::KvRequest *req); static void OnBatchWrite(::eloqstore::KvRequest *req);