From 797284cdf02d2aec5c86ef38c72aeed4e770eb6e Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 08:13:14 +0000
Subject: [PATCH 1/9] Send out batchwrite req concurrently and wait all at
 once

---
 data_store_service_client.cpp       | 112 ++++++++++++++++------------
 data_store_service_client_closure.h |   5 +-
 2 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index cffc0fe..e91f310 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -280,11 +280,13 @@ bool DataStoreServiceClient::PutAll(
             flush_task_entry_idx++;
         }
 
-        SyncCallbackData *sync_putall = sync_callback_data_pool_.NextObject();
+        SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject();
         PoolableGuard sync_putall_guard(sync_putall);
+        sync_putall->Reset();
 
         uint16_t parts_cnt_per_key = 1;
         uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5;
+        uint32_t batch_cnt = 0;
 
         // Write data for hash_partitioned table
         for (auto part_it = hash_partitions_map.begin();
@@ -305,9 +307,18 @@ bool DataStoreServiceClient::PutAll(
                 txservice::TxKey tx_key = ckpt_rec.Key();
 
                 // Start a new batch if done with current partition.
-                if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
+                if (write_batch_size >= SyncPutAllData::max_flying_write_count)
                 {
-                    sync_putall->Reset();
+                    // Wait for in-flight requests to decrease if limit reached
+                    {
+                        std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                        while (sync_putall->unfinished_request_cnt_ >=
+                               SyncPutAllData::max_flying_write_count)
+                        {
+                            sync_putall->cv_.wait(lk);
+                        }
+                    }
+
                     BatchWriteRecords(kv_table_name,
                                       part_it->first,
                                       std::move(key_parts),
@@ -317,19 +328,9 @@ bool DataStoreServiceClient::PutAll(
                                       std::move(op_types),
                                       true,
                                       sync_putall,
-                                      SyncCallback,
+                                      SyncPutAllCallback,
                                       parts_cnt_per_key,
                                       parts_cnt_per_record);
-                    sync_putall->Wait();
-
-                    if (sync_putall->Result().error_code() !=
-                        EloqDS::remote::DataStoreError::NO_ERROR)
-                    {
-                        LOG(WARNING)
-                            << "DataStoreHandler: Failed to write batch.";
-
-                        return false;
-                    }
                     key_parts.clear();
                     record_parts.clear();
                     records_ts.clear();
@@ -341,6 +342,7 @@ bool DataStoreServiceClient::PutAll(
                     records_ttl.reserve(recs_cnt);
                     op_types.reserve(recs_cnt);
                     write_batch_size = 0;
+                    ++batch_cnt;
                 }
 
                 assert(ckpt_rec.payload_status_ ==
@@ -360,7 +362,6 @@ bool DataStoreServiceClient::PutAll(
             // Send out the last batch
             if (key_parts.size() > 0)
             {
-                sync_putall->Reset();
                 BatchWriteRecords(kv_table_name,
                                   part_it->first,
                                   std::move(key_parts),
@@ -370,23 +371,16 @@ bool DataStoreServiceClient::PutAll(
                                   std::move(op_types),
                                   true,
                                   sync_putall,
-                                  SyncCallback,
+                                  SyncPutAllCallback,
                                   parts_cnt_per_key,
                                   parts_cnt_per_record);
-                sync_putall->Wait();
                 key_parts.clear();
                 record_parts.clear();
                 records_ts.clear();
                 records_ttl.clear();
                 op_types.clear();
                 write_batch_size = 0;
-                if (sync_putall->Result().error_code() !=
-                    EloqDS::remote::DataStoreError::NO_ERROR)
-                {
-                    LOG(WARNING) << "DataStoreHandler: Failed to write batch.";
-
-                    return false;
-                }
+                ++batch_cnt;
             }
         }
 
@@ -409,9 +403,21 @@ bool DataStoreServiceClient::PutAll(
                     txservice::TxKey tx_key = ckpt_rec.Key();
 
                     // Start a new batch if done with current partition.
-                    if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
+                    if (write_batch_size >=
+                        SyncPutAllData::max_flying_write_count)
                     {
-                        sync_putall->Reset();
+                        // Wait for in-flight requests to decrease if limit
+                        // reached
+                        {
+                            std::unique_lock<bthread::Mutex> lk(
+                                sync_putall->mux_);
+                            while (sync_putall->unfinished_request_cnt_ >=
+                                   SyncPutAllData::max_flying_write_count)
+                            {
+                                sync_putall->cv_.wait(lk);
+                            }
+                        }
+
                         BatchWriteRecords(kv_table_name,
                                           part_it->first,
                                           std::move(key_parts),
@@ -421,19 +427,9 @@ bool DataStoreServiceClient::PutAll(
                                           std::move(op_types),
                                           true,
                                           sync_putall,
-                                          SyncCallback,
+                                          SyncPutAllCallback,
                                           parts_cnt_per_key,
                                           parts_cnt_per_record);
-                        sync_putall->Wait();
-
-                        if (sync_putall->Result().error_code() !=
-                            EloqDS::remote::DataStoreError::NO_ERROR)
-                        {
-                            LOG(WARNING)
-                                << "DataStoreHandler: Failed to write batch.";
-
-                            return false;
-                        }
                         record_tmp_mem_area.clear();
                         key_parts.clear();
                         record_parts.clear();
@@ -446,6 +442,7 @@ bool DataStoreServiceClient::PutAll(
                         records_ttl.reserve(recs_cnt);
                         op_types.reserve(recs_cnt);
                         write_batch_size = 0;
+                        ++batch_cnt;
                     }
 
                     assert(ckpt_rec.payload_status_ ==
@@ -460,7 +457,6 @@ bool DataStoreServiceClient::PutAll(
                 // Send out the last batch
                 if (key_parts.size() > 0)
                 {
-                    sync_putall->Reset();
                     BatchWriteRecords(kv_table_name,
                                       part_it->first,
                                       std::move(key_parts),
@@ -470,10 +466,9 @@ bool DataStoreServiceClient::PutAll(
                                       std::move(op_types),
                                       true,
                                       sync_putall,
-                                      SyncCallback,
+                                      SyncPutAllCallback,
                                       parts_cnt_per_key,
                                       parts_cnt_per_record);
-                    sync_putall->Wait();
                     record_tmp_mem_area.clear();
                     key_parts.clear();
                     record_parts.clear();
@@ -481,17 +476,29 @@ bool DataStoreServiceClient::PutAll(
                     records_ttl.clear();
                     op_types.clear();
                     write_batch_size = 0;
-                    if (sync_putall->Result().error_code() !=
-                        EloqDS::remote::DataStoreError::NO_ERROR)
-                    {
-                        LOG(WARNING)
-                            << "DataStoreHandler: Failed to write batch.";
-
-                        return false;
-                    }
+                    ++batch_cnt;
                 }
             }
         }
+
+        // Wait for all requests to complete
+        {
+            std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+            sync_putall->unfinished_request_cnt_ += batch_cnt;
+            sync_putall->all_request_started_ = true;
+            while (sync_putall->unfinished_request_cnt_ != 0)
+            {
+                sync_putall->cv_.wait(lk);
+            }
+        }
+
+        if (sync_putall->result_.error_code() !=
+            remote::DataStoreError::NO_ERROR)
+        {
+            LOG(ERROR) << "PutAll failed for error: "
+                       << sync_putall->result_.error_msg();
+            return false;
+        }
     }
     return true;
 }
@@ -1939,6 +1946,15 @@ bool DataStoreServiceClient::PutArchivesAll(
             // Start a new batch if done with current partition.
             if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
             {
+                // Wait for in-flight requests to decrease if limit reached
+                {
+                    std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                    while (sync_putall->unfinished_request_cnt_ >=
+                           SyncPutAllData::max_flying_write_count)
+                    {
+                        sync_putall->cv_.wait(lk);
+                    }
+                }
                 BatchWriteRecords(kv_mvcc_archive_name,
                                   partition_id,
                                   std::move(keys),
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index 9f8c203..d0d8ddd 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -99,6 +99,8 @@ struct SyncCallbackData : public Poolable
 
 struct SyncPutAllData : public Poolable
 {
+    static constexpr int32_t max_flying_write_count = 32;
+
     void Reset()
     {
         unfinished_request_cnt_ = 0;
@@ -123,7 +125,8 @@ struct SyncPutAllData : public Poolable
         }
 
         --unfinished_request_cnt_;
-        if (all_request_started_ && unfinished_request_cnt_ == 0)
+        if ((all_request_started_ && unfinished_request_cnt_ == 0) ||
+            unfinished_request_cnt_ == max_flying_write_count - 1)
         {
             cv_.notify_one();
         }

From ea30ac22799ac163e2c3f57def6724ab7b49d829 Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 08:32:17 +0000
Subject: [PATCH 2/9] resolve comment

---
 data_store_service_client.cpp | 72 ++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index e91f310..94bf1e4 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -286,7 +286,6 @@ bool DataStoreServiceClient::PutAll(
 
         uint16_t parts_cnt_per_key = 1;
         uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5;
-        uint32_t batch_cnt = 0;
 
         // Write data for hash_partitioned table
         for (auto part_it = hash_partitions_map.begin();
@@ -307,18 +306,8 @@ bool DataStoreServiceClient::PutAll(
                 txservice::TxKey tx_key = ckpt_rec.Key();
 
                 // Start a new batch if done with current partition.
-                if (write_batch_size >= SyncPutAllData::max_flying_write_count)
+                if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
                 {
-                    // Wait for in-flight requests to decrease if limit reached
-                    {
-                        std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-                        while (sync_putall->unfinished_request_cnt_ >=
-                               SyncPutAllData::max_flying_write_count)
-                        {
-                            sync_putall->cv_.wait(lk);
-                        }
-                    }
-
                     BatchWriteRecords(kv_table_name,
                                       part_it->first,
                                       std::move(key_parts),
@@ -331,6 +320,16 @@ bool DataStoreServiceClient::PutAll(
                                       SyncPutAllCallback,
                                       parts_cnt_per_key,
                                       parts_cnt_per_record);
+                    // Wait for in-flight requests to decrease if limit reached
+                    {
+                        std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                        sync_putall->unfinished_request_cnt_++;
+                        while (sync_putall->unfinished_request_cnt_ >=
+                               SyncPutAllData::max_flying_write_count)
+                        {
+                            sync_putall->cv_.wait(lk);
+                        }
+                    }
                     key_parts.clear();
                     record_parts.clear();
                     records_ts.clear();
@@ -342,7 +341,6 @@ bool DataStoreServiceClient::PutAll(
                     records_ttl.reserve(recs_cnt);
                     op_types.reserve(recs_cnt);
                     write_batch_size = 0;
-                    ++batch_cnt;
                 }
 
                 assert(ckpt_rec.payload_status_ ==
@@ -374,13 +372,16 @@ bool DataStoreServiceClient::PutAll(
                                   SyncPutAllCallback,
                                   parts_cnt_per_key,
                                   parts_cnt_per_record);
+                {
+                    std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                    sync_putall->unfinished_request_cnt_++;
+                }
                 key_parts.clear();
                 record_parts.clear();
                 records_ts.clear();
                 records_ttl.clear();
                 op_types.clear();
                 write_batch_size = 0;
-                ++batch_cnt;
             }
         }
 
@@ -403,21 +404,8 @@ bool DataStoreServiceClient::PutAll(
                     txservice::TxKey tx_key = ckpt_rec.Key();
 
                     // Start a new batch if done with current partition.
-                    if (write_batch_size >=
-                        SyncPutAllData::max_flying_write_count)
+                    if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
                     {
-                        // Wait for in-flight requests to decrease if limit
-                        // reached
-                        {
-                            std::unique_lock<bthread::Mutex> lk(
-                                sync_putall->mux_);
-                            while (sync_putall->unfinished_request_cnt_ >=
-                                   SyncPutAllData::max_flying_write_count)
-                            {
-                                sync_putall->cv_.wait(lk);
-                            }
-                        }
-
                         BatchWriteRecords(kv_table_name,
                                           part_it->first,
                                           std::move(key_parts),
@@ -442,7 +430,18 @@ bool DataStoreServiceClient::PutAll(
                         records_ttl.reserve(recs_cnt);
                         op_types.reserve(recs_cnt);
                         write_batch_size = 0;
-                        ++batch_cnt;
+                        // Wait for in-flight requests to decrease if limit
+                        // reached
+                        {
+                            std::unique_lock<bthread::Mutex> lk(
+                                sync_putall->mux_);
+                            sync_putall->unfinished_request_cnt_++;
+                            while (sync_putall->unfinished_request_cnt_ >=
+                                   SyncPutAllData::max_flying_write_count)
+                            {
+                                sync_putall->cv_.wait(lk);
+                            }
+                        }
                     }
 
                     assert(ckpt_rec.payload_status_ ==
@@ -476,7 +475,10 @@ bool DataStoreServiceClient::PutAll(
                     records_ttl.clear();
                     op_types.clear();
                     write_batch_size = 0;
-                    ++batch_cnt;
+                    {
+                        std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                        sync_putall->unfinished_request_cnt_++;
+                    }
                 }
             }
         }
@@ -484,7 +486,6 @@ bool DataStoreServiceClient::PutAll(
         // Wait for all requests to complete
         {
             std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-            sync_putall->unfinished_request_cnt_ += batch_cnt;
             sync_putall->all_request_started_ = true;
             while (sync_putall->unfinished_request_cnt_ != 0)
             {
@@ -1932,7 +1933,6 @@ bool DataStoreServiceClient::PutArchivesAll(
         SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject();
         PoolableGuard guard(sync_putall);
         sync_putall->Reset();
-        uint32_t batch_cnt = 0;
 
         size_t recs_cnt = archive_ptrs.size();
         keys.reserve(recs_cnt * parts_cnt_per_key);
@@ -1949,6 +1949,7 @@ bool DataStoreServiceClient::PutArchivesAll(
                 // Wait for in-flight requests to decrease if limit reached
                 {
                     std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                    sync_putall->unfinished_request_cnt_++;
                     while (sync_putall->unfinished_request_cnt_ >=
                            SyncPutAllData::max_flying_write_count)
                     {
@@ -1979,7 +1980,6 @@ bool DataStoreServiceClient::PutArchivesAll(
                 records_ttl.reserve(recs_cnt);
                 op_types.reserve(recs_cnt);
                 write_batch_size = 0;
-                ++batch_cnt;
             }
 
             txservice::FlushRecord &ckpt_rec = *archive_ptrs[i].second;
@@ -2056,13 +2056,15 @@ bool DataStoreServiceClient::PutArchivesAll(
             records_ttl.reserve(recs_cnt);
             op_types.reserve(recs_cnt);
             write_batch_size = 0;
-            ++batch_cnt;
+            {
+                std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
+                sync_putall->unfinished_request_cnt_++;
+            }
         }
 
         // Wait the result.
         {
             std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-            sync_putall->unfinished_request_cnt_ += batch_cnt;
             sync_putall->all_request_started_ = true;
             while (sync_putall->unfinished_request_cnt_ != 0)
             {

From 09a5b8bb1ba77e105ffcdf6b5e222988810e142d Mon Sep 17 00:00:00 2001
From: "coderabbitai[bot]"
 <136622811+coderabbitai[bot]@users.noreply.github.com>
Date: Wed, 17 Sep 2025 16:32:47 +0800
Subject: [PATCH 3/9] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`bat?=
 =?UTF-8?q?ch=5Fwrite`=20(#84)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Docstrings generation was requested by @liunyl.

* https://github.com/eloqdata/store_handler/pull/83#issuecomment-3301872873

The following files were modified:

* `data_store_service_client.cpp`
* `data_store_service_client_closure.h`

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 data_store_service_client.cpp       | 43 +++++++++++++
 data_store_service_client_closure.h | 93 ++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index 94bf1e4..b030d18 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -130,6 +130,29 @@ void DataStoreServiceClient::ScheduleTimerTasks()
     assert(false);
 }
 
+/**
+ * @brief Batch-writes a set of flush tasks into KV tables.
+ *
+ * Processes the provided flush tasks grouped by table and partition, serializes
+ * each record (object tables use raw encoded blobs; non-object tables encode
+ * tx-records with unpack info), and issues batched PUT/DELETE operations via
+ * BatchWriteRecords. Batches are emitted per KV-partition and flushed once
+ * they reach MAX_WRITE_BATCH_SIZE; the method blocks as necessary to bound
+ * in-flight requests by SyncPutAllData::max_flying_write_count and waits for
+ * all dispatched requests to complete before returning.
+ *
+ * The function distinguishes hash- and range-partitioned tables, computes
+ * per-partition batches, and updates per-record timestamps/TTLs and operation
+ * types. Partial batches are flushed at partition boundaries. On any remote or
+ * batch-level error the function logs the failure and returns false.
+ *
+ * @param flush_task Mapping from KV table name to a vector of flush task
+ *                   entries containing the records to write. Each entry's
+ *                   data_sync_vec_ provides the sequence of records for that
+ *                   flush task.
+ * @return true if all batches completed successfully; false if any batch
+ *         reported an error.
+ */
 bool DataStoreServiceClient::PutAll(
     std::unordered_map<std::string_view,
                        std::vector<std::unique_ptr<txservice::FlushTaskEntry>>>
@@ -1869,6 +1892,26 @@ void DataStoreServiceClient::DecodeArchiveValue(
     value_offset = pos;
 }
 
+/**
+ * @brief Writes multiple MVCC archive records to the MVCC archive KV table in partitioned batches.
+ *
+ * Groups archive entries from the provided flush tasks by archive partition, serializes keys
+ * and values into batch write requests, and dispatches those requests (possibly concurrently)
+ * to the KV layer. Batches are split to respect MAX_WRITE_BATCH_SIZE and an internal limit on
+ * in-flight write requests; the method waits for all dispatched batches for each partition to
+ * complete before returning.
+ *
+ * Side effects:
+ * - Commits serialized archive records to kv_mvcc_archive_name with a default TTL of 1 day.
+ * - Converts per-record commit timestamps to big-endian form as part of key encoding (the
+ *   in-memory commit_ts field of those records is mutated during processing).
+ *
+ * @param flush_task Map from KV table name to a vector of FlushTaskEntry pointers whose
+ *                   archive vectors contain the FlushRecord entries to write. Only entries
+ *                   with non-empty archive vectors are processed.
+ * @return true if all batches for all partitions completed successfully; false if any batch
+ *         failed (an error will be logged).
+ */
 bool DataStoreServiceClient::PutArchivesAll(
     std::unordered_map<std::string_view,
                        std::vector<std::unique_ptr<txservice::FlushTaskEntry>>>
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index d0d8ddd..77af7b5 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -1,4 +1,3 @@
-
 /**
  *    Copyright (C) 2025 EloqData Inc.
  *
@@ -35,7 +34,97 @@
 #include "data_store_service_scanner.h"
 #include "eloq_data_store_service/object_pool.h"
 
-namespace EloqDS
+/**
+     * Callback type invoked on completion of a datastore operation.
+     *
+     * Parameters:
+     *  - data: user-provided context pointer passed through the async call.
+     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
+     *  - client: reference to the DataStoreServiceClient that executed the operation.
+     *  - result: operation result detail (error code/message and any operation-specific fields).
+     */
+    
+    /**
+     * Synchronization helper used to wait for an asynchronous datastore operation to complete.
+     *
+     * Provides a mutex/condition variable pair and a CommonResult to store the outcome.
+     * Typical usage: Reset() before issuing the async operation, Notify() from the async
+     * completion callback, and Wait() from the waiting thread. HasError() reports whether
+     * the stored result represents an error other than NO_ERROR or KEY_NOT_FOUND.
+     */
+    
+    /**
+     * Aggregation and flow-control helper for coordinating many concurrent put-all writes.
+     *
+     * - unfinished_request_cnt_: signed count of outstanding write requests (must be signed).
+     * - all_request_started_: set to true once all requests have been launched.
+     * - max_flying_write_count: upper bound on concurrent in-flight writes (32).
+     *
+     * Finish(res) will merge the first non-NO_ERROR result into `result_`, decrement the
+     * unfinished request count, and notify a waiter when either:
+     *  - all requests have been started and the unfinished count reaches zero, or
+     *  - the unfinished count falls to (max_flying_write_count - 1), enabling flow control
+     *    to allow launching further requests while keeping in-flight writes bounded.
+     */
+    
+    /**
+     * Generic synchronous callback adapter invoked by closures to signal completion.
+     *
+     * Parameters:
+     *  - data: user-provided context pointer passed through the async call.
+     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
+     *  - client: reference to the DataStoreServiceClient that executed the operation.
+     *  - result: operation result detail (error code/message and any operation-specific fields).
+     */
+    
+    /**
+     * Shared helper used when reading archived records concurrently.
+     *
+     * Holds references to external synchronization primitives and counters:
+     *  - mtx_, cv_: external mutex and condition variable used to guard flying_read_cnt_.
+     *  - flying_read_cnt_: reference to the shared in-flight read counter.
+     *  - error_code_: reference to an integer used to capture the first observed error.
+     *
+     * Also stores the most recent read result (partition_id_, key_str_, value_str_, ts_, ttl_).
+     * Thread-safe: methods that mutate or read shared resources acquire the provided mutex.
+     */
+    
+    /**
+     * Callback invoked for batch archive reads to aggregate or forward results.
+     *
+     * Parameters:
+     *  - data: user-provided context pointer passed through the async call.
+     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
+     *  - client: reference to the DataStoreServiceClient that executed the operation.
+     *  - result: operation result detail (error code/message and any operation-specific fields).
+     */
+    
+    /**
+     * Callback invoked to load a range slice (archive or otherwise).
+     *
+     * Parameters:
+     *  - data: user-provided context pointer passed through the async call.
+     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
+     *  - client: reference to the DataStoreServiceClient that executed the operation.
+     *  - result: operation result detail (error code/message and any operation-specific fields).
+     */
+    
+    /**
+     * Closure implementing a datastore Read operation supporting both local and remote paths.
+     *
+     * Use Reset(...) to configure a read (table, partition, key, client, and callback), then:
+     *  - PrepareRequest(is_local): prepare an RPC request if remote, or clear local result for local reads.
+     *  - Run(): executed when an RPC completes (or when local processing is finished). Run()
+     *    handles RPC failures with retry logic, translates NOT_OWNER into sharding handling
+     *    and potential retry, and finally invokes the user callback with the CommonResult.
+     *
+     * Accessors provide access to the brpc::Controller, request/response objects, table/partition/key,
+     * and local-result fields (value, ts, ttl, result). Value accessors return either the local
+     * in-memory values or the response's values depending on the request mode.
+     *
+     * Note: retry behavior is governed by the associated DataStoreServiceClient retry_limit_.
+     */
+    namespace EloqDS
 {
 typedef void (*DataStoreCallback)(void *data,
                                   ::google::protobuf::Closure *closure,

From 13cae07e402651da24d1c61167ca715fff7be9f8 Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 08:52:28 +0000
Subject: [PATCH 4/9] add docstring

---
 data_store_service_client.cpp       | 461 +++++++++++++++++++++++++++-
 data_store_service_client_closure.h | 339 ++++++++++++++------
 2 files changed, 702 insertions(+), 98 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index b030d18..471e2ba 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -92,6 +92,15 @@ DataStoreServiceClient::~DataStoreServiceClient()
     upsert_table_worker_.Shutdown();
 }
 
+/**
+ * @brief Configures the data store service client with cluster manager information.
+ *
+ * Initializes the client with cluster configuration including node hostnames and ports.
+ * Logs all node information for debugging purposes and stores the cluster manager
+ * reference for future use.
+ *
+ * @param cluster_manager Reference to the cluster manager containing shard and node information.
+ */
 void DataStoreServiceClient::SetupConfig(
     const DataStoreServiceClusterManager &cluster_manager)
 {
@@ -106,6 +115,15 @@ void DataStoreServiceClient::SetupConfig(
     cluster_manager_ = cluster_manager;
 }
 
+/**
+ * @brief Establishes connection to the data store service.
+ *
+ * Attempts to connect to the data store service with retry logic. Initializes
+ * pre-built tables and retries up to 5 times with 1-second delays between attempts.
+ * Returns true if connection succeeds, false otherwise.
+ *
+ * @return true if connection is successful, false if all retry attempts fail.
+ */
 bool DataStoreServiceClient::Connect()
 {
     bool succeed = false;
@@ -124,6 +142,13 @@ bool DataStoreServiceClient::Connect()
     return succeed;
 }
 
+/**
+ * @brief Schedules timer-based tasks for the data store service.
+ *
+ * Currently not implemented. This method is a placeholder for future timer-based
+ * functionality such as periodic cleanup, health checks, or maintenance tasks.
+ * Will assert and log an error if called.
+ */
 void DataStoreServiceClient::ScheduleTimerTasks()
 {
     LOG(ERROR) << "ScheduleTimerTasks not implemented";
@@ -527,6 +552,16 @@ bool DataStoreServiceClient::PutAll(
     return true;
 }
 
+/**
+ * @brief Persists data from specified KV tables to storage.
+ *
+ * Flushes data from the provided KV table names to persistent storage using
+ * asynchronous flush operations. Waits for completion and returns success/failure
+ * status. Logs warnings on failure and debug info on success.
+ *
+ * @param kv_table_names Vector of KV table names to persist.
+ * @return true if all tables are persisted successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::PersistKV(
     const std::vector<std::string> &kv_table_names)
 {
@@ -548,6 +583,26 @@ bool DataStoreServiceClient::PersistKV(
     return true;
 }
 
+/**
+ * @brief Upserts table schema information to the data store.
+ *
+ * Handles table creation, modification, and deletion operations by updating
+ * table schema information in the data store. Validates leadership, processes
+ * the operation asynchronously, and sets appropriate error codes on failure.
+ * Supports various operation types including CREATE, DROP, and ALTER operations.
+ *
+ * @param old_table_schema Pointer to the existing table schema (nullptr for CREATE).
+ * @param new_table_schema Pointer to the new table schema.
+ * @param op_type Type of operation (CREATE, DROP, ALTER, etc.).
+ * @param commit_ts Commit timestamp for the operation.
+ * @param ng_id Node group ID for the operation.
+ * @param tx_term Transaction term for consistency.
+ * @param hd_res Handler result object to store operation outcome.
+ * @param alter_table_info Information about table alterations (nullptr if not applicable).
+ * @param cc_req CC request base object.
+ * @param ccs CC shard reference.
+ * @param err_code Error code output parameter.
+ */
 void DataStoreServiceClient::UpsertTable(
     const txservice::TableSchema *old_table_schema,
     const txservice::TableSchema *new_table_schema,
@@ -598,6 +653,16 @@ void DataStoreServiceClient::UpsertTable(
                                     { this->UpsertTable(table_data); });
 }
 
+/**
+ * @brief Fetches table catalog information from the data store.
+ *
+ * Retrieves catalog information for the specified table by reading from the
+ * KV table catalogs storage. Uses partition ID 0 and the catalog name as the key.
+ * The operation is performed asynchronously with a callback for completion handling.
+ *
+ * @param ccm_table_name The table name to fetch catalog information for.
+ * @param fetch_cc Fetch catalog CC object to store the result and handle completion.
+ */
 void DataStoreServiceClient::FetchTableCatalog(
     const txservice::TableName &ccm_table_name,
     txservice::FetchCatalogCc *fetch_cc)
@@ -611,6 +676,16 @@ void DataStoreServiceClient::FetchTableCatalog(
          &FetchTableCatalogCallback);
 }
 
+/**
+ * @brief Fetches current table statistics from the data store.
+ *
+ * Retrieves the current version of table statistics for the specified table.
+ * Determines the appropriate KV partition ID and reads from the table statistics
+ * version storage. The operation is performed asynchronously with callback handling.
+ *
+ * @param ccm_table_name The table name to fetch statistics for.
+ * @param fetch_cc Fetch table statistics CC object to store the result and handle completion.
+ */
 void DataStoreServiceClient::FetchCurrentTableStatistics(
     const txservice::TableName &ccm_table_name,
     txservice::FetchTableStatisticsCc *fetch_cc)
@@ -626,6 +701,17 @@ void DataStoreServiceClient::FetchCurrentTableStatistics(
          &FetchCurrentTableStatsCallback);
 }
 
+/**
+ * @brief Fetches table statistics for a specific version from the data store.
+ *
+ * Retrieves table statistics for a specific version by constructing key ranges
+ * based on the table name and version number. Clears previous key ranges and
+ * session information, then constructs start and end keys for the version-specific
+ * statistics. The operation is performed asynchronously with callback handling.
+ *
+ * @param ccm_table_name The table name to fetch statistics for.
+ * @param fetch_cc Fetch table statistics CC object containing version information and result storage.
+ */
 void DataStoreServiceClient::FetchTableStatistics(
     const txservice::TableName &ccm_table_name,
     txservice::FetchTableStatisticsCc *fetch_cc)
@@ -698,6 +784,19 @@ std::string EncodeTableStatsKey(const txservice::TableName &base_table_name,
     return key;
 }
 
+/**
+ * @brief Upserts table statistics to the data store.
+ *
+ * Stores table statistics by splitting sample keys into segments and writing them
+ * to the KV storage. Each segment contains index type, record count, and sample keys.
+ * Also updates the checkpoint version for the table statistics. Uses batch write
+ * operations for efficiency and handles both local and remote storage paths.
+ *
+ * @param ccm_table_name The table name to store statistics for.
+ * @param sample_pool_map Map of index names to sample pools containing record counts and sample keys.
+ * @param version The version number for the statistics.
+ * @return true if all statistics are stored successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::UpsertTableStatistics(
     const txservice::TableName &ccm_table_name,
     const std::unordered_map<txservice::TableName,
@@ -875,6 +974,16 @@ bool DataStoreServiceClient::UpsertTableStatistics(
     return true;
 }
 
+/**
+ * @brief Fetches table ranges from the data store.
+ *
+ * Retrieves range information for the specified table by scanning the range table
+ * storage. Constructs start and end keys based on the table name and performs
+ * a scan operation with pagination support. The operation is performed asynchronously
+ * with callback handling for completion.
+ *
+ * @param fetch_cc Fetch table ranges CC object containing table name and result storage.
+ */
 void DataStoreServiceClient::FetchTableRanges(
     txservice::FetchTableRangesCc *fetch_cc)
 {
@@ -899,6 +1008,16 @@ void DataStoreServiceClient::FetchTableRanges(
              &FetchTableRangesCallback);
 }
 
+/**
+ * @brief Fetches range slices from the data store.
+ *
+ * Retrieves range slice information for the specified table and range entry.
+ * Validates node group term consistency and constructs the appropriate key
+ * for reading range information. The operation is performed asynchronously
+ * with callback handling for completion.
+ *
+ * @param fetch_cc Fetch range slices request object containing table name, range entry, and result storage.
+ */
 void DataStoreServiceClient::FetchRangeSlices(
     txservice::FetchRangeSlicesReq *fetch_cc)
 {
@@ -926,6 +1045,20 @@ void DataStoreServiceClient::FetchRangeSlices(
          &FetchRangeSlicesCallback);
 }
 
+/**
+ * @brief Deletes data that is out of the specified range.
+ *
+ * Removes data from the KV table that falls outside the specified range.
+ * Constructs the appropriate start key based on the provided parameters and
+ * performs a delete range operation. Handles special cases for negative infinity
+ * keys and constructs proper key boundaries for the deletion.
+ *
+ * @param table_name The table name to delete data from.
+ * @param partition_id The partition ID for the operation.
+ * @param start_key The start key for the range (nullptr for negative infinity).
+ * @param table_schema The table schema containing KV catalog information.
+ * @return true if the deletion operation succeeds, false otherwise.
+ */
 bool DataStoreServiceClient::DeleteOutOfRangeData(
     const txservice::TableName &table_name,
     int32_t partition_id,
@@ -971,6 +1104,20 @@ bool DataStoreServiceClient::DeleteOutOfRangeData(
     return true;
 }
 
+/**
+ * @brief Reads a record from the data store synchronously.
+ *
+ * Currently not implemented. This method is a placeholder for synchronous
+ * record reading functionality. Will log an error and return true.
+ *
+ * @param table_name The table name to read from.
+ * @param key The key to read.
+ * @param rec The record object to store the result.
+ * @param found Output parameter indicating if the record was found.
+ * @param version_ts Output parameter for the version timestamp.
+ * @param table_schema The table schema information.
+ * @return true (placeholder implementation).
+ */
 bool DataStoreServiceClient::Read(const txservice::TableName &table_name,
                                   const txservice::TxKey &key,
                                   txservice::TxRecord &rec,
@@ -982,6 +1129,25 @@ bool DataStoreServiceClient::Read(const txservice::TableName &table_name,
     return true;
 }
 
+/**
+ * @brief Creates a scanner for forward or backward scanning of table data.
+ *
+ * Creates and initializes a data store scanner for iterating over records in a table.
+ * Supports both forward and backward scanning with configurable search conditions.
+ * The scanner is initialized before returning.
+ *
+ * @param table_name The table name to scan.
+ * @param ng_id Node group ID for the operation.
+ * @param start_key The starting key for the scan.
+ * @param inclusive Whether the start key should be included in the scan.
+ * @param key_parts Number of key parts to consider.
+ * @param search_cond Vector of search conditions for filtering results.
+ * @param key_schema Schema information for the keys.
+ * @param rec_schema Schema information for the records.
+ * @param kv_info KV catalog information for the table.
+ * @param scan_forward Whether to scan forward (true) or backward (false).
+ * @return Unique pointer to the initialized scanner.
+ */
 std::unique_ptr<txservice::store::DataStoreScanner>
 DataStoreServiceClient::ScanForward(
     const txservice::TableName &table_name,
@@ -1139,6 +1305,19 @@ std::string DataStoreServiceClient::EncodeRangeKey(
     return key;
 }
 
+/**
+ * @brief Encodes range information into a binary value format.
+ *
+ * Serializes range metadata including range ID, range version, general version,
+ * and segment count into a binary string format for storage in the KV system.
+ * Uses little-endian encoding for all numeric values.
+ *
+ * @param range_id The range identifier.
+ * @param range_version The version of the range.
+ * @param version The general version number.
+ * @param segment_cnt The number of segments in the range.
+ * @return Binary string containing the encoded range value.
+ */
 std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id,
                                                      uint64_t range_version,
                                                      uint64_t version,
@@ -1159,6 +1338,18 @@ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id,
     return kv_range_record;
 }
 
+/**
+ * @brief Encodes a range slice key for storage in the KV system.
+ *
+ * Creates a composite key by combining table name, range ID, and segment ID.
+ * Uses little-endian encoding for numeric values since range slice operations
+ * are point reads rather than scans, optimizing for direct key lookup performance.
+ *
+ * @param table_name The table name for the range slice.
+ * @param range_id The range identifier.
+ * @param segment_id The segment identifier within the range.
+ * @return Binary string containing the encoded range slice key.
+ */
 std::string DataStoreServiceClient::EncodeRangeSliceKey(
     const txservice::TableName &table_name,
     int32_t range_id,
@@ -1176,7 +1367,16 @@ std::string DataStoreServiceClient::EncodeRangeSliceKey(
     return key;
 }
 
-// Replace the segment_id in range_slice_key
+/**
+ * @brief Updates the segment ID in an encoded range slice key.
+ *
+ * Modifies an existing range slice key by replacing the segment ID portion
+ * with a new segment ID value. This is used for updating range slice keys
+ * without recreating the entire key structure.
+ *
+ * @param range_slice_key The range slice key to update (modified in place).
+ * @param new_segment_id The new segment ID to use.
+ */
 void DataStoreServiceClient::UpdateEncodedRangeSliceKey(
     std::string &range_slice_key, uint32_t new_segment_id)
 {
@@ -1186,6 +1386,23 @@ void DataStoreServiceClient::UpdateEncodedRangeSliceKey(
                             sizeof(new_segment_id));
 }
 
+/**
+ * @brief Updates range slices for a table partition.
+ *
+ * Stores range slice information by segmenting the slices into manageable chunks
+ * and writing them to the KV storage system. Handles slice serialization with
+ * proper key encoding and batch size management. Also updates the range information
+ * with the new version and segment count. Uses both local and remote storage paths
+ * based on configuration.
+ *
+ * @param table_name The table name for the range slices.
+ * @param version The version number for the slices.
+ * @param range_start_key The starting key for the range.
+ * @param slices Vector of store slices to update.
+ * @param partition_id The partition ID for the range.
+ * @param range_version The version of the range.
+ * @return true if all slices are updated successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::UpdateRangeSlices(
     const txservice::TableName &table_name,
     uint64_t version,
@@ -1336,6 +1553,19 @@ bool DataStoreServiceClient::UpdateRangeSlices(
     return true;
 }
 
+/**
+ * @brief Upserts range information for a table.
+ *
+ * Updates range slices for multiple ranges by calling UpdateRangeSlices for each
+ * range in the provided vector. After updating all ranges, flushes the range table
+ * data to ensure persistence. Validates that the table name is not empty and
+ * handles errors from individual range updates.
+ *
+ * @param table_name The table name for the ranges.
+ * @param range_info Vector of split range information to upsert.
+ * @param version The version number for the ranges.
+ * @return true if all ranges are updated and flushed successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::UpsertRanges(
     const txservice::TableName &table_name,
     std::vector<txservice::SplitRangeInfo> range_info,
@@ -1374,6 +1604,20 @@ bool DataStoreServiceClient::UpsertRanges(
     return true;
 }
 
+/**
+ * @brief Fetches table schema information synchronously.
+ *
+ * Retrieves table schema information from the data store using asynchronous
+ * operations with synchronous waiting. Uses FetchTableCatalog internally and
+ * waits for completion before returning the result. Provides schema image,
+ * found status, and version timestamp.
+ *
+ * @param table_name The table name to fetch schema for.
+ * @param schema_image Output parameter for the schema image data.
+ * @param found Output parameter indicating if the table was found.
+ * @param version_ts Output parameter for the version timestamp.
+ * @return true if the fetch operation completes successfully, false otherwise.
+ */
 bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name,
                                         std::string &schema_image,
                                         bool &found,
@@ -1399,6 +1643,18 @@ bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name,
     return !callback_data->HasError();
 }
 
+/**
+ * @brief Discovers all table names in the data store.
+ *
+ * Scans the table catalogs to discover all available table names. Uses pagination
+ * with session management and supports cooperative scheduling through yield/resume
+ * function pointers. Performs the scan asynchronously and waits for completion.
+ *
+ * @param norm_name_vec Output vector to store the discovered table names.
+ * @param yield_fptr Optional function pointer for yielding control during pagination.
+ * @param resume_fptr Optional function pointer for resuming after yielding.
+ * @return true if the discovery operation completes successfully, false if any error occurs.
+ */
 bool DataStoreServiceClient::DiscoverAllTableNames(
     std::vector<std::string> &norm_name_vec,
     const std::function<void()> *yield_fptr,
@@ -1426,10 +1682,18 @@ bool DataStoreServiceClient::DiscoverAllTableNames(
     return !callback_data->HasError();
 }
 
-// The store format of database catalog in kvstore is as follows:
-//
-// key: dbname
-// value: db_definition
+/**
+ * @brief Upserts database definition to the data store.
+ *
+ * Stores database definition information in the KV storage system. The storage
+ * format uses the database name as the key and the database definition as the value.
+ * Uses current timestamp for versioning and performs the operation asynchronously
+ * with synchronous waiting for completion.
+ *
+ * @param db The database name to upsert.
+ * @param definition The database definition to store.
+ * @return true if the database is upserted successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::UpsertDatabase(std::string_view db,
                                             std::string_view definition)
 {
@@ -1475,6 +1739,16 @@ bool DataStoreServiceClient::UpsertDatabase(std::string_view db,
     return true;
 }
 
+/**
+ * @brief Drops a database from the data store.
+ *
+ * Removes a database definition from the KV storage system by performing a DELETE
+ * operation on the database catalog. Uses current timestamp for versioning and
+ * performs the operation asynchronously with synchronous waiting for completion.
+ *
+ * @param db The database name to drop.
+ * @return true if the database is dropped successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::DropDatabase(std::string_view db)
 {
     std::vector<std::string_view> keys;
@@ -1519,6 +1793,20 @@ bool DataStoreServiceClient::DropDatabase(std::string_view db)
     return true;
 }
 
+/**
+ * @brief Fetches database definition from the data store.
+ *
+ * Retrieves database definition information from the KV storage system.
+ * Supports cooperative scheduling through yield/resume function pointers
+ * and performs the operation asynchronously with synchronous waiting.
+ *
+ * @param db The database name to fetch.
+ * @param definition Output parameter for the database definition.
+ * @param found Output parameter indicating if the database was found.
+ * @param yield_fptr Optional function pointer for yielding control.
+ * @param resume_fptr Optional function pointer for resuming after yielding.
+ * @return true if the fetch operation completes successfully, false if any error occurs.
+ */
 bool DataStoreServiceClient::FetchDatabase(
     std::string_view db,
     std::string &definition,
@@ -1793,6 +2081,19 @@ void DataStoreServiceClient::EncodeArchiveKey(
     write_batch_size += sizeof(uint64_t);
 }
 
+/**
+ * @brief Decodes an archive key to extract its components.
+ *
+ * Parses an archive key string to extract the table name, transaction key,
+ * and commit timestamp. The archive key format is: "log:item:{table_name}:{key}:{commit_ts}".
+ * Validates the key format and extracts each component using string separators.
+ *
+ * @param archive_key The archive key string to decode.
+ * @param table_name Output parameter for the extracted table name.
+ * @param key Output parameter for the extracted transaction key.
+ * @param be_commit_ts Output parameter for the extracted commit timestamp (big-endian).
+ * @return true if the key is successfully decoded, false if the format is invalid.
+ */
 bool DataStoreServiceClient::DecodeArchiveKey(const std::string &archive_key,
                                               std::string &table_name,
                                               txservice::TxKey &key,
@@ -1834,6 +2135,20 @@ bool DataStoreServiceClient::DecodeArchiveKey(const std::string &archive_key,
     return true;
 }
 
+/**
+ * @brief Encodes archive value data for storage.
+ *
+ * Serializes archive value information including deletion status, unpack info,
+ * and encoded blob data into record parts for batch writing. Handles both
+ * deleted and non-deleted records with appropriate data encoding.
+ *
+ * @param is_deleted Whether the record is marked as deleted.
+ * @param value Pointer to the transaction record (nullptr for deleted records).
+ * @param unpack_info_size Size of the unpack info data.
+ * @param encoded_blob_size Size of the encoded blob data.
+ * @param record_parts Vector to store the encoded record parts.
+ * @param write_batch_size Running total of batch size (updated in place).
+ */
 void DataStoreServiceClient::EncodeArchiveValue(
     bool is_deleted,
     const txservice::TxRecord *value,
@@ -2127,6 +2442,17 @@ bool DataStoreServiceClient::PutArchivesAll(
     return true;
 }
 
+/**
+ * @brief Copies base table data to archive storage.
+ *
+ * Reads base table records and copies them to archive storage with concurrent
+ * read operations. Manages in-flight read count to control concurrency and
+ * handles both hash and range partitioned tables. Uses archive-specific
+ * encoding and TTL settings for the copied data.
+ *
+ * @param flush_task Map of table names to flush task entries containing base records to copy.
+ * @return true if all records are successfully copied to archive, false if any operation fails.
+ */
 bool DataStoreServiceClient::CopyBaseToArchive(
     std::unordered_map<std::string_view,
                        std::vector<std::unique_ptr<txservice::FlushTaskEntry>>>
@@ -2293,6 +2619,20 @@ bool DataStoreServiceClient::CopyBaseToArchive(
     return true;
 }
 
+/**
+ * @brief Fetches archive records for a specific key from a given timestamp.
+ *
+ * Retrieves archived versions of a record from the MVCC archive storage.
+ * Scans the archive table for records matching the specified key and timestamp range.
+ * Currently asserts false as this functionality is not fully implemented.
+ *
+ * @param table_name The table name to fetch archives for.
+ * @param kv_info KV catalog information for the table.
+ * @param key The key to fetch archive records for.
+ * @param archives Output vector to store the fetched archive records.
+ * @param from_ts Starting timestamp for the archive fetch.
+ * @return true when execution reaches the end of the function body.
+ */
 bool DataStoreServiceClient::FetchArchives(
     const txservice::TableName &table_name,
     const txservice::KVCatalogInfo *kv_info,
@@ -2377,6 +2717,23 @@ bool DataStoreServiceClient::FetchArchives(
     return true;
 }
 
+/**
+ * @brief Fetches the visible archive record for a key at a specific timestamp.
+ *
+ * Retrieves the most recent archive record for a given key that is visible
+ * at the specified upper bound timestamp. Scans the archive table in reverse
+ * order to find the latest visible version. Currently asserts false as this
+ * functionality is not fully implemented.
+ *
+ * @param table_name The table name to fetch archive for.
+ * @param kv_info KV catalog information for the table.
+ * @param key The key to fetch archive record for.
+ * @param upper_bound_ts The upper bound timestamp for visibility.
+ * @param rec Output parameter for the fetched record.
+ * @param rec_status Output parameter for the record status.
+ * @param commit_ts Output parameter for the commit timestamp.
+ * @return true when execution reaches the end of the function body.
+ */
 bool DataStoreServiceClient::FetchVisibleArchive(
     const txservice::TableName &table_name,
     const txservice::KVCatalogInfo *kv_info,
@@ -2461,6 +2818,17 @@ bool DataStoreServiceClient::FetchVisibleArchive(
     return true;
 }
 
+/**
+ * @brief Fetches archive records for a fetch record CC operation.
+ *
+ * Retrieves archive records for a specific key and snapshot read timestamp.
+ * Encodes the appropriate key range for scanning the archive table and
+ * initiates a scan operation to fetch all relevant archive versions.
+ * Sets up the fetch CC object with the necessary scan parameters.
+ *
+ * @param fetch_cc Fetch record CC object containing key, timestamp, and result storage.
+ * @return DataStoreOpStatus indicating the operation status.
+ */
 txservice::store::DataStoreHandler::DataStoreOpStatus
 DataStoreServiceClient::FetchArchives(txservice::FetchRecordCc *fetch_cc)
 {
@@ -2530,6 +2898,19 @@ DataStoreServiceClient::FetchVisibleArchive(
     return txservice::store::DataStoreHandler::DataStoreOpStatus::Success;
 }
 
+/**
+ * @brief Creates a snapshot for backup operations.
+ *
+ * Initiates a snapshot creation process across all shards in the cluster.
+ * Collects shard IDs from the cluster manager and coordinates snapshot creation
+ * for both local and remote shards. Waits for completion and returns the
+ * backup files generated during the process.
+ *
+ * @param backup_name The name for the backup snapshot.
+ * @param backup_files Output vector to store the generated backup file paths.
+ * @param backup_ts The timestamp for the backup.
+ * @return true if the snapshot is created successfully, false if any operation fails.
+ */
 bool DataStoreServiceClient::CreateSnapshotForBackup(
     const std::string &backup_name,
     std::vector<std::string> &backup_files,
@@ -2562,6 +2943,16 @@ bool DataStoreServiceClient::CreateSnapshotForBackup(
     return !callback_data->HasError();
 }
 
+/**
+ * @brief Internal method for creating snapshots for backup operations.
+ *
+ * Processes snapshot creation for individual shards, handling both local and
+ * remote shards differently. For local shards, prepares local requests; for
+ * remote shards, prepares RPC requests. Manages the closure lifecycle and
+ * coordinates completion when all shards are processed.
+ *
+ * @param closure The closure object managing the snapshot creation process.
+ */
 void DataStoreServiceClient::CreateSnapshotForBackupInternal(
     CreateSnapshotForBackupClosure *closure)
 {
@@ -2610,11 +3001,30 @@ void DataStoreServiceClient::CreateSnapshotForBackupInternal(
     }
 }
 
+/**
+ * @brief Determines if range copying is needed.
+ *
+ * Currently always returns true, indicating that range copying is always required.
+ * This method is used to determine whether range data needs to be copied during
+ * certain operations.
+ *
+ * @return Always returns true.
+ */
 bool DataStoreServiceClient::NeedCopyRange() const
 {
     return true;
 }
 
+/**
+ * @brief Restores transaction cache for a node group.
+ *
+ * Currently not implemented. This method is a placeholder for restoring
+ * transaction cache state for a specific node group and term.
+ * Will log an error and assert false if called.
+ *
+ * @param cc_ng_id The node group ID to restore cache for.
+ * @param cc_ng_term The term for the node group.
+ */
 void DataStoreServiceClient::RestoreTxCache(txservice::NodeGroupId cc_ng_id,
                                             int64_t cc_ng_term)
 {
@@ -2622,19 +3032,50 @@ void DataStoreServiceClient::RestoreTxCache(txservice::NodeGroupId cc_ng_id,
     assert(false);
 }
 
+/**
+ * @brief Handles leader start event.
+ *
+ * Currently always returns true. This method is called when the node becomes
+ * a leader and can be used to perform leader-specific initialization.
+ *
+ * @param next_leader_node Pointer to store the next leader node ID (unused).
+ * @return Always returns true.
+ */
 bool DataStoreServiceClient::OnLeaderStart(uint32_t *next_leader_node)
 {
     return true;
 }
 
+/**
+ * @brief Handles start following event.
+ *
+ * Currently empty implementation. This method is called when the node starts
+ * following another leader and can be used to perform follower-specific initialization.
+ */
 void DataStoreServiceClient::OnStartFollowing()
 {
 }
 
+/**
+ * @brief Handles shutdown event.
+ *
+ * Currently empty implementation. This method is called when the node is shutting
+ * down and can be used to perform cleanup operations.
+ */
 void DataStoreServiceClient::OnShutdown()
 {
 }
 
+/**
+ * @brief Checks if a shard is local to this node.
+ *
+ * Determines whether the specified shard is owned by this node using the
+ * cluster manager. This is used for scale-up scenarios where data needs to be
+ * migrated from smaller to larger nodes.
+ *
+ * @param shard_id The shard ID to check.
+ * @return true if the shard is local to this node, false otherwise.
+ */
 bool DataStoreServiceClient::IsLocalShard(uint32_t shard_id)
 {
     // this is a temporary solution for scale up scenario (from one smaller
@@ -2642,6 +3083,16 @@ bool DataStoreServiceClient::IsLocalShard(uint32_t shard_id)
     return cluster_manager_.IsOwnerOfShard(shard_id);
 }
 
+/**
+ * @brief Checks if a partition is local to this node.
+ *
+ * Determines whether the specified partition is owned by this node using the
+ * cluster manager. Used for determining whether operations should be performed
+ * locally or remotely.
+ *
+ * @param partition_id The partition ID to check.
+ * @return true if the partition is local to this node, false otherwise.
+ */
 bool DataStoreServiceClient::IsLocalPartition(int32_t partition_id)
 {
     return cluster_manager_.IsOwnerOfPartition(partition_id);
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index 77af7b5..16836a5 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -34,103 +34,35 @@
 #include "data_store_service_scanner.h"
 #include "eloq_data_store_service/object_pool.h"
 
-/**
-     * Callback type invoked on completion of a datastore operation.
-     *
-     * Parameters:
-     *  - data: user-provided context pointer passed through the async call.
-     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
-     *  - client: reference to the DataStoreServiceClient that executed the operation.
-     *  - result: operation result detail (error code/message and any operation-specific fields).
-     */
-    
-    /**
-     * Synchronization helper used to wait for an asynchronous datastore operation to complete.
-     *
-     * Provides a mutex/condition variable pair and a CommonResult to store the outcome.
-     * Typical usage: Reset() before issuing the async operation, Notify() from the async
-     * completion callback, and Wait() from the waiting thread. HasError() reports whether
-     * the stored result represents an error other than NO_ERROR or KEY_NOT_FOUND.
-     */
-    
-    /**
-     * Aggregation and flow-control helper for coordinating many concurrent put-all writes.
-     *
-     * - unfinished_request_cnt_: signed count of outstanding write requests (must be signed).
-     * - all_request_started_: set to true once all requests have been launched.
-     * - max_flying_write_count: upper bound on concurrent in-flight writes (32).
-     *
-     * Finish(res) will merge the first non-NO_ERROR result into `result_`, decrement the
-     * unfinished request count, and notify a waiter when either:
-     *  - all requests have been started and the unfinished count reaches zero, or
-     *  - the unfinished count falls to (max_flying_write_count - 1), enabling flow control
-     *    to allow launching further requests while keeping in-flight writes bounded.
-     */
-    
-    /**
-     * Generic synchronous callback adapter invoked by closures to signal completion.
-     *
-     * Parameters:
-     *  - data: user-provided context pointer passed through the async call.
-     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
-     *  - client: reference to the DataStoreServiceClient that executed the operation.
-     *  - result: operation result detail (error code/message and any operation-specific fields).
-     */
-    
-    /**
-     * Shared helper used when reading archived records concurrently.
-     *
-     * Holds references to external synchronization primitives and counters:
-     *  - mtx_, cv_: external mutex and condition variable used to guard flying_read_cnt_.
-     *  - flying_read_cnt_: reference to the shared in-flight read counter.
-     *  - error_code_: reference to an integer used to capture the first observed error.
-     *
-     * Also stores the most recent read result (partition_id_, key_str_, value_str_, ts_, ttl_).
-     * Thread-safe: methods that mutate or read shared resources acquire the provided mutex.
-     */
-    
-    /**
-     * Callback invoked for batch archive reads to aggregate or forward results.
-     *
-     * Parameters:
-     *  - data: user-provided context pointer passed through the async call.
-     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
-     *  - client: reference to the DataStoreServiceClient that executed the operation.
-     *  - result: operation result detail (error code/message and any operation-specific fields).
-     */
-    
-    /**
-     * Callback invoked to load a range slice (archive or otherwise).
-     *
-     * Parameters:
-     *  - data: user-provided context pointer passed through the async call.
-     *  - closure: protobuf closure associated with the RPC (may be nullptr for local paths).
-     *  - client: reference to the DataStoreServiceClient that executed the operation.
-     *  - result: operation result detail (error code/message and any operation-specific fields).
-     */
-    
-    /**
-     * Closure implementing a datastore Read operation supporting both local and remote paths.
-     *
-     * Use Reset(...) to configure a read (table, partition, key, client, and callback), then:
-     *  - PrepareRequest(is_local): prepare an RPC request if remote, or clear local result for local reads.
-     *  - Run(): executed when an RPC completes (or when local processing is finished). Run()
-     *    handles RPC failures with retry logic, translates NOT_OWNER into sharding handling
-     *    and potential retry, and finally invokes the user callback with the CommonResult.
-     *
-     * Accessors provide access to the brpc::Controller, request/response objects, table/partition/key,
-     * and local-result fields (value, ts, ttl, result). Value accessors return either the local
-     * in-memory values or the response's values depending on the request mode.
-     *
-     * Note: retry behavior is governed by the associated DataStoreServiceClient retry_limit_.
-     */
-    namespace EloqDS
+namespace EloqDS
 {
+/**
+ * Callback type invoked on completion of a datastore operation.
+ *
+ * Parameters:
+ *  - data: user-provided context pointer passed through the async call.
+ *  - closure: protobuf closure associated with the RPC (may be nullptr for
+ * local paths).
+ *  - client: reference to the DataStoreServiceClient that executed the
+ * operation.
+ *  - result: operation result detail (error code/message and any
+ * operation-specific fields).
+ */
 typedef void (*DataStoreCallback)(void *data,
                                   ::google::protobuf::Closure *closure,
                                   DataStoreServiceClient &client,
                                   const remote::CommonResult &result);
 
+/**
+ * Synchronization helper used to wait for an asynchronous datastore operation
+ * to complete.
+ *
+ * Provides a mutex/condition variable pair and a CommonResult to store the
+ * outcome. Typical usage: Reset() before issuing the async operation, Notify()
+ * from the async completion callback, and Wait() from the waiting thread.
+ * HasError() reports whether the stored result represents an error other than
+ * NO_ERROR or KEY_NOT_FOUND.
+ */
 struct SyncCallbackData : public Poolable
 {
     SyncCallbackData() : mtx_(), cv_(), finished_(false)
@@ -185,6 +117,22 @@ struct SyncCallbackData : public Poolable
 
     remote::CommonResult result_;
 };
+/**
+ * Aggregation and flow-control helper for coordinating many concurrent put-all
+ * writes.
+ *
+ * - unfinished_request_cnt_: signed count of outstanding write requests (must
+ * be signed).
+ * - all_request_started_: set to true once all requests have been launched.
+ * - max_flying_write_count: upper bound on concurrent in-flight writes (32).
+ *
+ * Finish(res) will merge the first non-NO_ERROR result into `result_`,
+ * decrement the unfinished request count, and notify a waiter when either:
+ *  - all requests have been started and the unfinished count reaches zero, or
+ *  - the unfinished count falls to (max_flying_write_count - 1), enabling flow
+ * control to allow launching further requests while keeping in-flight writes
+ * bounded.
+ */
 
 struct SyncPutAllData : public Poolable
 {
@@ -228,12 +176,44 @@ struct SyncPutAllData : public Poolable
     bthread::Mutex mux_;
     bthread::ConditionVariable cv_;
 };
+/**
+ * Generic synchronous callback adapter invoked by closures to signal
+ * completion.
+ *
+ * Parameters:
+ *  - data: user-provided context pointer passed through the async call.
+ *  - closure: protobuf closure associated with the RPC (may be nullptr for
+ * local paths).
+ *  - client: reference to the DataStoreServiceClient that executed the
+ * operation.
+ *  - result: operation result detail (error code/message and any
+ * operation-specific fields).
+ */
 
 void SyncCallback(void *data,
                   ::google::protobuf::Closure *closure,
                   DataStoreServiceClient &client,
                   const remote::CommonResult &result);
 
+/**
+ * Callback data structure for concurrent archive record reading operations.
+ * 
+ * Manages synchronization and flow control for reading base records that will
+ * be copied to archive storage. Tracks flying read count and provides mutex
+ * synchronization for concurrent access.
+ *
+ * Holds references to external synchronization primitives and counters:
+ *  - mtx_, cv_: external mutex and condition variable used to guard
+ * flying_read_cnt_.
+ *  - flying_read_cnt_: reference to the shared in-flight read counter.
+ *  - error_code_: reference to an integer used to capture the first observed
+ * error.
+ *
+ * Also stores the most recent read result (partition_id_, key_str_, value_str_,
+ * ts_, ttl_). Thread-safe: methods that mutate or read shared resources acquire
+ * the provided mutex.
+ */
+
 struct ReadBaseForArchiveCallbackData
 {
     ReadBaseForArchiveCallbackData(bthread::Mutex &mtx,
@@ -333,17 +313,60 @@ struct ReadBaseForArchiveCallbackData
     uint64_t ts_;
     uint64_t ttl_;
 };
-
+/**
+ * Callback invoked for batch archive reads to aggregate or forward results.
+ *
+ * Parameters:
+ *  - data: user-provided context pointer passed through the async call.
+ *  - closure: protobuf closure associated with the RPC (may be nullptr for
+ * local paths).
+ *  - client: reference to the DataStoreServiceClient that executed the
+ * operation.
+ *  - result: operation result detail (error code/message and any
+ * operation-specific fields).
+ */
 void SyncBatchReadForArchiveCallback(void *data,
                                      ::google::protobuf::Closure *closure,
                                      DataStoreServiceClient &client,
                                      const remote::CommonResult &result);
 
+/**
+ * Callback invoked to load a range slice (archive or otherwise).
+ *
+ * Parameters:
+ *  - data: user-provided context pointer passed through the async call.
+ *  - closure: protobuf closure associated with the RPC (may be nullptr for
+ * local paths).
+ *  - client: reference to the DataStoreServiceClient that executed the
+ * operation.
+ *  - result: operation result detail (error code/message and any
+ * operation-specific fields).
+ */
 void LoadRangeSliceCallback(void *data,
                             ::google::protobuf::Closure *closure,
                             DataStoreServiceClient &client,
                             const remote::CommonResult &result);
-
+/**
+ * Closure implementing a datastore Read operation supporting both local and
+ * remote paths.
+ *
+ * Use Reset(...) to configure a read (table, partition, key, client, and
+ * callback), then:
+ *  - PrepareRequest(is_local): prepare an RPC request if remote, or clear local
+ * result for local reads.
+ *  - Run(): executed when an RPC completes (or when local processing is
+ * finished). Run() handles RPC failures with retry logic, translates NOT_OWNER
+ * into sharding handling and potential retry, and finally invokes the user
+ * callback with the CommonResult.
+ *
+ * Accessors provide access to the brpc::Controller, request/response objects,
+ * table/partition/key, and local-result fields (value, ts, ttl, result). Value
+ * accessors return either the local in-memory values or the response's values
+ * depending on the request mode.
+ *
+ * Note: retry behavior is governed by the associated DataStoreServiceClient
+ * retry_limit_.
+ */
 class ReadClosure : public ::google::protobuf::Closure, public Poolable
 {
 public:
@@ -637,6 +660,13 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable
     void *callback_data_;
 };
 
+/**
+ * Closure for asynchronous data flushing operations to KV storage.
+ * 
+ * Manages the lifecycle of flush operations, including RPC communication,
+ * retry logic, and callback invocation. Supports both local and remote
+ * flush operations with configurable retry behavior.
+ */
 class FlushDataClosure : public ::google::protobuf::Closure, public Poolable
 {
 public:
@@ -2163,31 +2193,62 @@ class CreateSnapshotForBackupClosure : public ::google::protobuf::Closure,
     void *callback_data_;
 };
 
+/**
+ * Callback for fetching individual records from the data store.
+ * 
+ * Handles the completion of record fetch operations and processes the result.
+ */
 void FetchRecordCallback(void *data,
                          ::google::protobuf::Closure *closure,
                          DataStoreServiceClient &client,
                          const remote::CommonResult &result);
 
+/**
+ * Callback for fetching snapshot data from the data store.
+ * 
+ * Handles the completion of snapshot fetch operations and processes the result.
+ */
 void FetchSnapshotCallback(void *data,
                            ::google::protobuf::Closure *closure,
                            DataStoreServiceClient &client,
                            const remote::CommonResult &result);
 
+/**
+ * Callback data for asynchronous table drop operations.
+ * 
+ * Contains the KV table name that is being dropped.
+ */
 struct AsyncDropTableCallbackData
 {
     std::string kv_table_name_;
 };
 
+/**
+ * Callback for asynchronous table drop operations.
+ * 
+ * Handles the completion of table drop operations and processes the result.
+ */
 void AsyncDropTableCallback(void *data,
                             ::google::protobuf::Closure *closure,
                             DataStoreServiceClient &client,
                             const remote::CommonResult &result);
 
+/**
+ * Callback for fetching table catalog information.
+ * 
+ * Handles the completion of table catalog fetch operations and processes the result.
+ */
 void FetchTableCatalogCallback(void *data,
                                ::google::protobuf::Closure *closure,
                                DataStoreServiceClient &client,
                                const remote::CommonResult &result);
 
+/**
+ * Callback data for fetching table information.
+ * 
+ * Extends SyncCallbackData to include table-specific information like
+ * schema image, version timestamp, and found status.
+ */
 struct FetchTableCallbackData : public SyncCallbackData
 {
     FetchTableCallbackData() = default;
@@ -2219,11 +2280,24 @@ void FetchTableCallback(void *data,
                         DataStoreServiceClient &client,
                         const remote::CommonResult &result);
 
+/**
+ * Callback for synchronous put-all operations.
+ * 
+ * Handles the completion of batch put operations and updates the
+ * SyncPutAllData structure with the result.
+ */
 void SyncPutAllCallback(void *data,
                         ::google::protobuf::Closure *closure,
                         DataStoreServiceClient &client,
                         const remote::CommonResult &result);
 
+/**
+ * Callback data for fetching database information.
+ * 
+ * Extends SyncCallbackData to include database-specific information like
+ * database definition, found status, and yield/resume function pointers
+ * for cooperative scheduling.
+ */
 struct FetchDatabaseCallbackData : public SyncCallbackData
 {
     FetchDatabaseCallbackData() = default;
@@ -2280,11 +2354,22 @@ struct FetchDatabaseCallbackData : public SyncCallbackData
     const std::function<void()> *resume_fptr_;
 };
 
+/**
+ * Callback for fetching database information.
+ * 
+ * Handles the completion of database fetch operations and processes the result.
+ */
 void FetchDatabaseCallback(void *data,
                            ::google::protobuf::Closure *closure,
                            DataStoreServiceClient &client,
                            const remote::CommonResult &result);
 
+/**
+ * Callback data for fetching all database names.
+ * 
+ * Extends SyncCallbackData to include database names list and yield/resume
+ * function pointers for cooperative scheduling during pagination.
+ */
 struct FetchAllDatabaseCallbackData : public SyncCallbackData
 {
     FetchAllDatabaseCallbackData() = default;
@@ -2347,11 +2432,22 @@ struct FetchAllDatabaseCallbackData : public SyncCallbackData
     std::string end_key_;
 };
 
+/**
+ * Callback for fetching all database names.
+ * 
+ * Handles the completion of all database names fetch operations and processes the result.
+ */
 void FetchAllDatabaseCallback(void *data,
                               ::google::protobuf::Closure *closure,
                               DataStoreServiceClient &client,
                               const remote::CommonResult &result);
 
+/**
+ * Callback data for discovering all table names.
+ * 
+ * Extends SyncCallbackData to include table names list and yield/resume
+ * function pointers for cooperative scheduling during pagination.
+ */
 struct DiscoverAllTableNamesCallbackData : public SyncCallbackData
 {
     DiscoverAllTableNamesCallbackData() = default;
@@ -2408,30 +2504,61 @@ struct DiscoverAllTableNamesCallbackData : public SyncCallbackData
     std::string session_id_;
 };
 
+/**
+ * Callback for discovering all table names.
+ * 
+ * Handles the completion of table name discovery operations and processes the result.
+ */
 void DiscoverAllTableNamesCallback(void *data,
                                    ::google::protobuf::Closure *closure,
                                    DataStoreServiceClient &client,
                                    const remote::CommonResult &result);
 
+/**
+ * Callback for fetching table ranges.
+ * 
+ * Handles the completion of table range fetch operations and processes the result.
+ */
 void FetchTableRangesCallback(void *data,
                               ::google::protobuf::Closure *closure,
                               DataStoreServiceClient &client,
                               const remote::CommonResult &result);
 
+/**
+ * Callback for fetching range slices.
+ * 
+ * Handles the completion of range slice fetch operations and processes the result.
+ */
 void FetchRangeSlicesCallback(void *data,
                               ::google::protobuf::Closure *closure,
                               DataStoreServiceClient &client,
                               const remote::CommonResult &result);
+/**
+ * Callback for fetching current table statistics.
+ * 
+ * Handles the completion of current table statistics fetch operations and processes the result.
+ */
 void FetchCurrentTableStatsCallback(void *data,
                                     ::google::protobuf::Closure *closure,
                                     DataStoreServiceClient &client,
                                     const remote::CommonResult &result);
 
+/**
+ * Callback for fetching table statistics.
+ * 
+ * Handles the completion of table statistics fetch operations and processes the result.
+ */
 void FetchTableStatsCallback(void *data,
                              ::google::protobuf::Closure *closure,
                              DataStoreServiceClient &client,
                              const remote::CommonResult &result);
 
+/**
+ * Callback data for fetching archive records.
+ * 
+ * Extends SyncCallbackData to include archive-specific information like
+ * table name, partition ID, key ranges, batch size, and scan direction.
+ */
 struct FetchArchivesCallbackData : public SyncCallbackData
 {
     FetchArchivesCallbackData(const std::string_view kv_table_name,
@@ -2464,21 +2591,42 @@ struct FetchArchivesCallbackData : public SyncCallbackData
     std::vector<uint64_t> archive_commit_ts_;
 };
 
+/**
+ * Callback for fetching archive records.
+ * 
+ * Handles the completion of archive record fetch operations and processes the result.
+ */
 void FetchArchivesCallback(void *data,
                            ::google::protobuf::Closure *closure,
                            DataStoreServiceClient &client,
                            const remote::CommonResult &result);
 
+/**
+ * Callback for fetching record archives.
+ * 
+ * Handles the completion of record archive fetch operations and processes the result.
+ */
 void FetchRecordArchivesCallback(void *data,
                                  ::google::protobuf::Closure *closure,
                                  DataStoreServiceClient &client,
                                  const remote::CommonResult &result);
 
+/**
+ * Callback for fetching snapshot archives.
+ * 
+ * Handles the completion of snapshot archive fetch operations and processes the result.
+ */
 void FetchSnapshotArchiveCallback(void *data,
                                   ::google::protobuf::Closure *closure,
                                   DataStoreServiceClient &client,
                                   const remote::CommonResult &result);
 
+/**
+ * Callback data for creating snapshots for backup operations.
+ * 
+ * Extends SyncCallbackData to include backup-specific information like
+ * backup name, timestamp, and backup files list.
+ */
 struct CreateSnapshotForBackupCallbackData : public SyncCallbackData
 {
     CreateSnapshotForBackupCallbackData() = default;
@@ -2505,6 +2653,11 @@ struct CreateSnapshotForBackupCallbackData : public SyncCallbackData
     std::vector<std::string> *backup_files_;
 };
 
+/**
+ * Callback for creating snapshots for backup operations.
+ * 
+ * Handles the completion of snapshot creation for backup operations and processes the result.
+ */
 void CreateSnapshotForBackupCallback(void *data,
                                      ::google::protobuf::Closure *closure,
                                      DataStoreServiceClient &client,

From 383e3925cbc89017ff3b7da614b4316ac3d39de5 Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 10:22:21 +0000
Subject: [PATCH 5/9] Refactor PutAll to flush different partitions
 concurrently

---
 data_store_service_client.cpp         | 618 ++++++++++++++------------
 data_store_service_client.h           |  29 ++
 data_store_service_client_closure.cpp |  42 ++
 data_store_service_client_closure.h   | 154 ++++++-
 4 files changed, 551 insertions(+), 292 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index 471e2ba..c241167 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -20,6 +20,7 @@
  *
  */
 #include "data_store_service_client.h"
+#include "data_store_service_client_closure.h"
 
 #include <glog/logging.h>
 
@@ -33,7 +34,6 @@
 #include <utility>
 #include <vector>
 
-#include "data_store_service_client_closure.h"
 #include "data_store_service_scanner.h"
 #include "eloq_data_store_service/object_pool.h"  // ObjectPool
 #include "eloq_data_store_service/thread_worker_pool.h"
@@ -183,107 +183,19 @@ bool DataStoreServiceClient::PutAll(
                        std::vector<std::unique_ptr<txservice::FlushTaskEntry>>>
         &flush_task)
 {
-    std::vector<std::string_view> key_parts;
-    std::vector<std::string_view> record_parts;
-    std::vector<uint64_t> records_ts;
-    std::vector<uint64_t> records_ttl;
-    std::vector<WriteOpType> op_types;
-    std::vector<size_t> record_tmp_mem_area;
     uint64_t now = txservice::LocalCcShards::ClockTsInMillseconds();
 
-    auto PrepareObjectData =
-        [&](txservice::FlushRecord &ckpt_rec, size_t &write_batch_size)
-    {
-        txservice::TxKey tx_key = ckpt_rec.Key();
-        uint64_t ttl =
-            ckpt_rec.payload_status_ == txservice::RecordStatus::Normal
-                ? ckpt_rec.Payload()->GetTTL()
-                : 0;
-        if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal &&
-            (!ckpt_rec.Payload()->HasTTL() || ttl > now))
-        {
-            key_parts.emplace_back(
-                std::string_view(tx_key.Data(), tx_key.Size()));
-            write_batch_size += tx_key.Size();
-
-            const txservice::TxRecord *rec = ckpt_rec.Payload();
-            // Upserts a key to the k-v store
-            record_parts.emplace_back(std::string_view(rec->EncodedBlobData(),
-                                                       rec->EncodedBlobSize()));
-            write_batch_size += rec->EncodedBlobSize();
-
-            records_ts.push_back(ckpt_rec.commit_ts_);
-            write_batch_size += sizeof(uint64_t);  // commit_ts
-                                                   //
-            records_ttl.push_back(ttl);
-            write_batch_size += sizeof(uint64_t);  // ttl
-
-            op_types.push_back(WriteOpType::PUT);
-            write_batch_size += sizeof(WriteOpType);
-        }
-        else
-        {
-            key_parts.emplace_back(
-                std::string_view(tx_key.Data(), tx_key.Size()));
-            write_batch_size += tx_key.Size();
-
-            record_parts.emplace_back(std::string_view());
-
-            records_ts.push_back(ckpt_rec.commit_ts_);
-            write_batch_size += sizeof(uint64_t);  // commit_ts
-
-            records_ttl.push_back(0);              // no ttl
-            write_batch_size += sizeof(uint64_t);  // ttl
-
-            op_types.push_back(WriteOpType::DELETE);
-            write_batch_size += sizeof(WriteOpType);
-        }
-    };
-
-    auto PrepareRecordData =
-        [&](txservice::FlushRecord &ckpt_rec, size_t &write_batch_size)
-    {
-        uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000;
-        txservice::TxKey tx_key = ckpt_rec.Key();
-        bool is_deleted =
-            !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal);
-        key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size()));
-        write_batch_size += tx_key.Size();
-
-        const txservice::TxRecord *rec = ckpt_rec.Payload();
-        // encode is_delete, encoded_blob_data and unpack_info
-        if (is_deleted)
-        {
-            records_ttl.push_back(retired_ttl_for_deleted);
-        }
-        else
-        {
-            records_ttl.push_back(0);  // no ttl
-        }
-        write_batch_size += sizeof(uint64_t);  // ttl
-
-        op_types.push_back(WriteOpType::PUT);
-        write_batch_size += sizeof(WriteOpType);
-
-        SerializeTxRecord(is_deleted,
-                          rec,
-                          record_tmp_mem_area,
-                          record_parts,
-                          write_batch_size);
-
-        records_ts.push_back(ckpt_rec.commit_ts_);
-        write_batch_size += sizeof(uint64_t);
-    };
-    // map from (table_name, partition_id) to the index of the records in the
-    // batch
+    // Process each table
     for (auto &[kv_table_name, entries] : flush_task)
     {
         auto &table_name = entries.front()->data_sync_task_->table_name_;
+        
+        // Group records by partition
         std::unordered_map<uint32_t, std::vector<std::pair<size_t, size_t>>>
             hash_partitions_map;
         std::unordered_map<uint32_t, std::vector<size_t>> range_partitions_map;
         std::unordered_map<uint32_t, size_t> partition_record_cnt;
-        size_t write_batch_size = 0;
+        
         size_t flush_task_entry_idx = 0;
         for (auto &entry : entries)
         {
@@ -315,8 +227,7 @@ bool DataStoreServiceClient::PutAll(
             }
             else
             {
-                // All records in the batch are in the same partition for range
-                // table.
+                // All records in the batch are in the same partition for range table
                 uint32_t parition_id =
                     KvPartitionIdOf(batch[0].partition_id_, true);
                 auto [it, inserted] =
@@ -328,6 +239,7 @@ bool DataStoreServiceClient::PutAll(
             flush_task_entry_idx++;
         }
 
+        // Create global coordinator
         SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject();
         PoolableGuard sync_putall_guard(sync_putall);
         sync_putall->Reset();
@@ -335,218 +247,91 @@ bool DataStoreServiceClient::PutAll(
         uint16_t parts_cnt_per_key = 1;
         uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5;
 
-        // Write data for hash_partitioned table
-        for (auto part_it = hash_partitions_map.begin();
-             part_it != hash_partitions_map.end();
-             ++part_it)
+        // Create partition states and prepare batches
+        std::vector<std::unique_ptr<PartitionFlushState>> partition_states;
+        std::vector<std::unique_ptr<PartitionCallbackData>> callback_data_list;
+        
+        // Process hash partitions
+        for (auto &[partition_id, flush_recs] : hash_partitions_map)
         {
-            auto &flush_recs = part_it->second;
-            size_t recs_cnt = partition_record_cnt[part_it->first];
-            key_parts.reserve(recs_cnt * parts_cnt_per_key);
-            record_parts.reserve(recs_cnt * parts_cnt_per_record);
-            records_ts.reserve(recs_cnt);
-            records_ttl.reserve(recs_cnt);
-            op_types.reserve(recs_cnt);
-            for (auto idx : flush_recs)
-            {
-                txservice::FlushRecord &ckpt_rec =
-                    entries.at(idx.first)->data_sync_vec_->at(idx.second);
-                txservice::TxKey tx_key = ckpt_rec.Key();
-
-                // Start a new batch if done with current partition.
-                if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
-                {
-                    BatchWriteRecords(kv_table_name,
-                                      part_it->first,
-                                      std::move(key_parts),
-                                      std::move(record_parts),
-                                      std::move(records_ts),
-                                      std::move(records_ttl),
-                                      std::move(op_types),
-                                      true,
-                                      sync_putall,
-                                      SyncPutAllCallback,
-                                      parts_cnt_per_key,
-                                      parts_cnt_per_record);
-                    // Wait for in-flight requests to decrease if limit reached
-                    {
-                        std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-                        sync_putall->unfinished_request_cnt_++;
-                        while (sync_putall->unfinished_request_cnt_ >=
-                               SyncPutAllData::max_flying_write_count)
-                        {
-                            sync_putall->cv_.wait(lk);
-                        }
-                    }
-                    key_parts.clear();
-                    record_parts.clear();
-                    records_ts.clear();
-                    records_ttl.clear();
-                    op_types.clear();
-                    key_parts.reserve(recs_cnt * parts_cnt_per_key);
-                    record_parts.reserve(recs_cnt * parts_cnt_per_record);
-                    records_ts.reserve(recs_cnt);
-                    records_ttl.reserve(recs_cnt);
-                    op_types.reserve(recs_cnt);
-                    write_batch_size = 0;
-                }
-
-                assert(ckpt_rec.payload_status_ ==
-                           txservice::RecordStatus::Normal ||
-                       ckpt_rec.payload_status_ ==
-                           txservice::RecordStatus::Deleted);
-
-                if (table_name.IsObjectTable())
-                {
-                    PrepareObjectData(ckpt_rec, write_batch_size);
-                }
-                else
-                {
-                    PrepareRecordData(ckpt_rec, write_batch_size);
-                }
-            }
-            // Send out the last batch
-            if (key_parts.size() > 0)
-            {
-                BatchWriteRecords(kv_table_name,
-                                  part_it->first,
-                                  std::move(key_parts),
-                                  std::move(record_parts),
-                                  std::move(records_ts),
-                                  std::move(records_ttl),
-                                  std::move(op_types),
-                                  true,
-                                  sync_putall,
-                                  SyncPutAllCallback,
-                                  parts_cnt_per_key,
-                                  parts_cnt_per_record);
-                {
-                    std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-                    sync_putall->unfinished_request_cnt_++;
-                }
-                key_parts.clear();
-                record_parts.clear();
-                records_ts.clear();
-                records_ttl.clear();
-                op_types.clear();
-                write_batch_size = 0;
-            }
+            auto partition_state = std::make_unique<PartitionFlushState>(partition_id);
+            auto callback_data = std::make_unique<PartitionCallbackData>(
+                partition_state.get(), sync_putall, std::string(kv_table_name));
+            
+            // Prepare batches for this partition
+            PreparePartitionBatches(*partition_state, flush_recs, entries, 
+                                  table_name, parts_cnt_per_key, parts_cnt_per_record, now);
+            
+            partition_states.push_back(std::move(partition_state));
+            callback_data_list.push_back(std::move(callback_data));
         }
-
-        // Write data for range_partitioned table
-        for (auto part_it = range_partitions_map.begin();
-             part_it != range_partitions_map.end();
-             ++part_it)
+        
+        // Process range partitions
+        for (auto &[partition_id, flush_recs] : range_partitions_map)
         {
-            size_t recs_cnt = partition_record_cnt[part_it->first];
-            key_parts.reserve(recs_cnt * parts_cnt_per_key);
-            record_parts.reserve(recs_cnt * parts_cnt_per_record);
-            records_ts.reserve(recs_cnt);
-            records_ttl.reserve(recs_cnt);
-            op_types.reserve(recs_cnt);
-            record_tmp_mem_area.reserve(recs_cnt * 2);
-            for (auto idx : part_it->second)
-            {
-                for (auto &ckpt_rec : *entries.at(idx)->data_sync_vec_)
-                {
-                    txservice::TxKey tx_key = ckpt_rec.Key();
-
-                    // Start a new batch if done with current partition.
-                    if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
-                    {
-                        BatchWriteRecords(kv_table_name,
-                                          part_it->first,
-                                          std::move(key_parts),
-                                          std::move(record_parts),
-                                          std::move(records_ts),
-                                          std::move(records_ttl),
-                                          std::move(op_types),
-                                          true,
-                                          sync_putall,
-                                          SyncPutAllCallback,
-                                          parts_cnt_per_key,
-                                          parts_cnt_per_record);
-                        record_tmp_mem_area.clear();
-                        key_parts.clear();
-                        record_parts.clear();
-                        records_ts.clear();
-                        records_ttl.clear();
-                        op_types.clear();
-                        key_parts.reserve(recs_cnt * parts_cnt_per_key);
-                        record_parts.reserve(recs_cnt * parts_cnt_per_record);
-                        records_ts.reserve(recs_cnt);
-                        records_ttl.reserve(recs_cnt);
-                        op_types.reserve(recs_cnt);
-                        write_batch_size = 0;
-                        // Wait for in-flight requests to decrease if limit
-                        // reached
-                        {
-                            std::unique_lock<bthread::Mutex> lk(
-                                sync_putall->mux_);
-                            sync_putall->unfinished_request_cnt_++;
-                            while (sync_putall->unfinished_request_cnt_ >=
-                                   SyncPutAllData::max_flying_write_count)
-                            {
-                                sync_putall->cv_.wait(lk);
-                            }
-                        }
-                    }
+            auto partition_state = std::make_unique<PartitionFlushState>(partition_id);
+            auto callback_data = std::make_unique<PartitionCallbackData>(
+                partition_state.get(), sync_putall, std::string(kv_table_name));
+            
+            // Prepare batches for this partition
+            PrepareRangePartitionBatches(*partition_state, flush_recs, entries,
+                                       table_name, parts_cnt_per_key, parts_cnt_per_record, now);
+            
+            partition_states.push_back(std::move(partition_state));
+            callback_data_list.push_back(std::move(callback_data));
+        }
 
-                    assert(ckpt_rec.payload_status_ ==
-                               txservice::RecordStatus::Normal ||
-                           ckpt_rec.payload_status_ ==
-                               txservice::RecordStatus::Deleted);
+        // Set up global coordinator
+        sync_putall->total_partitions_ = partition_states.size();
+        sync_putall->partition_states_ = std::move(partition_states);
 
-                    // currently there is no object table in range partitioned
-                    // table
-                    PrepareRecordData(ckpt_rec, write_batch_size);
-                }
-                // Send out the last batch
-                if (key_parts.size() > 0)
-                {
-                    BatchWriteRecords(kv_table_name,
-                                      part_it->first,
-                                      std::move(key_parts),
-                                      std::move(record_parts),
-                                      std::move(records_ts),
-                                      std::move(records_ttl),
-                                      std::move(op_types),
-                                      true,
-                                      sync_putall,
-                                      SyncPutAllCallback,
-                                      parts_cnt_per_key,
-                                      parts_cnt_per_record);
-                    record_tmp_mem_area.clear();
-                    key_parts.clear();
-                    record_parts.clear();
-                    records_ts.clear();
-                    records_ttl.clear();
-                    op_types.clear();
-                    write_batch_size = 0;
-                    {
-                        std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-                        sync_putall->unfinished_request_cnt_++;
-                    }
-                }
+        // Start concurrent processing for each partition
+        for (size_t i = 0; i < callback_data_list.size(); ++i)
+        {
+            auto* partition_state = sync_putall->partition_states_[i].get();
+            auto* callback_data = callback_data_list[i].get();
+            
+            // Start the first batch for this partition
+            PartitionBatchRequest first_batch;
+            if (partition_state->GetNextBatch(first_batch)) {
+                BatchWriteRecords(
+                    callback_data->table_name,
+                    partition_state->partition_id,
+                    std::move(first_batch.key_parts),
+                    std::move(first_batch.record_parts),
+                    std::move(first_batch.records_ts),
+                    std::move(first_batch.records_ttl),
+                    std::move(first_batch.op_types),
+                    true, // skip_wal
+                    callback_data,
+                    PartitionBatchCallback,
+                    first_batch.parts_cnt_per_key,
+                    first_batch.parts_cnt_per_record);
+            } else {
+                // No batches for this partition, mark as completed
+                partition_state->MarkCompleted();
+                sync_putall->OnPartitionCompleted();
             }
         }
 
-        // Wait for all requests to complete
+        // Wait for all partitions to complete
         {
             std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-            sync_putall->all_request_started_ = true;
-            while (sync_putall->unfinished_request_cnt_ != 0)
+            while (sync_putall->completed_partitions_ < sync_putall->total_partitions_)
             {
                 sync_putall->cv_.wait(lk);
             }
         }
 
-        if (sync_putall->result_.error_code() !=
-            remote::DataStoreError::NO_ERROR)
+        // Check for errors
+        for (auto& partition_state : sync_putall->partition_states_)
         {
-            LOG(ERROR) << "PutAll failed for error: "
-                       << sync_putall->result_.error_msg();
+            if (partition_state->IsFailed())
+            {
+                LOG(ERROR) << "PutAll failed for partition " << partition_state->partition_id 
+                           << " with error: " << partition_state->result.error_msg();
             return false;
+            }
         }
     }
     return true;
@@ -4325,4 +4110,257 @@ bool DataStoreServiceClient::DeleteCatalog(
     return true;
 }
 
+void DataStoreServiceClient::PreparePartitionBatches(
+    EloqDS::PartitionFlushState& partition_state,
+    const std::vector<std::pair<size_t, size_t>>& flush_recs,
+    const std::vector<std::unique_ptr<txservice::FlushTaskEntry>>& entries,
+    const txservice::TableName& table_name,
+    uint16_t parts_cnt_per_key,
+    uint16_t parts_cnt_per_record,
+    uint64_t now)
+{  // Builds partition_state's queue of size-bounded write batches from flush_recs.
+    std::vector<std::string_view> key_parts;
+    std::vector<std::string_view> record_parts;
+    std::vector<uint64_t> records_ts;
+    std::vector<uint64_t> records_ttl;
+    std::vector<WriteOpType> op_types;
+    std::vector<size_t> record_tmp_mem_area;
+    size_t write_batch_size = 0;
+    // Object tables: a Normal record with no TTL or a TTL later than `now` becomes a PUT of its encoded blob; anything else a DELETE.
+    auto PrepareObjectData =
+        [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size)
+    {
+        txservice::TxKey tx_key = ckpt_rec.Key();
+        uint64_t ttl =
+            ckpt_rec.payload_status_ == txservice::RecordStatus::Normal
+                ? ckpt_rec.Payload()->GetTTL()
+                : 0;
+        if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal &&
+            (!ckpt_rec.Payload()->HasTTL() || ttl > now))
+        {
+            key_parts.emplace_back(
+                std::string_view(tx_key.Data(), tx_key.Size()));
+            batch_size += tx_key.Size();
+
+            const txservice::TxRecord *rec = ckpt_rec.Payload();
+            record_parts.emplace_back(std::string_view(rec->EncodedBlobData(),
+                                                       rec->EncodedBlobSize()));
+            batch_size += rec->EncodedBlobSize();
+
+            records_ts.push_back(ckpt_rec.commit_ts_);
+            batch_size += sizeof(uint64_t);
+
+            records_ttl.push_back(ttl);
+            batch_size += sizeof(uint64_t);
+
+            op_types.push_back(WriteOpType::PUT);
+            batch_size += sizeof(WriteOpType);
+        }
+        else
+        {
+            key_parts.emplace_back(
+                std::string_view(tx_key.Data(), tx_key.Size()));
+            batch_size += tx_key.Size();
+
+            record_parts.emplace_back(std::string_view());
+            batch_size += 0;
+
+            records_ts.push_back(ckpt_rec.commit_ts_);
+            batch_size += sizeof(uint64_t);
+
+            records_ttl.push_back(0);
+            batch_size += sizeof(uint64_t);
+
+            op_types.push_back(WriteOpType::DELETE);
+            batch_size += sizeof(WriteOpType);
+        }
+    };
+    // Other tables: always a PUT; non-Normal records get ttl = now + 24 * 60 * 60 * 1000 (presumably 24h in ms - confirm unit of `now`).
+    auto PrepareRecordData =
+        [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size)
+    {
+        uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000;
+        txservice::TxKey tx_key = ckpt_rec.Key();
+        bool is_deleted =
+            !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal);
+        key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size()));
+        batch_size += tx_key.Size();
+
+        const txservice::TxRecord *rec = ckpt_rec.Payload();
+        if (is_deleted)
+        {
+            records_ttl.push_back(retired_ttl_for_deleted);
+        }
+        else
+        {
+            records_ttl.push_back(0);
+        }
+        batch_size += sizeof(uint64_t);
+
+        op_types.push_back(WriteOpType::PUT);
+        batch_size += sizeof(WriteOpType);
+
+        SerializeTxRecord(is_deleted,
+                          rec,
+                          record_tmp_mem_area,
+                          record_parts,
+                          batch_size);
+
+        records_ts.push_back(ckpt_rec.commit_ts_);
+        batch_size += sizeof(uint64_t);
+    };
+
+    // Each flush_recs entry is (index into entries, index into that entry's data_sync_vec_).
+    for (auto idx : flush_recs)
+    {
+        txservice::FlushRecord &ckpt_rec =
+            entries.at(idx.first)->data_sync_vec_->at(idx.second);
+
+        // Seal the current batch once its accumulated byte size has reached MAX_WRITE_BATCH_SIZE (checked before adding the next record).
+        if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
+        {
+            partition_state.AddBatch(PartitionBatchRequest(
+                std::move(key_parts),
+                std::move(record_parts),
+                std::move(records_ts),
+                std::move(records_ttl),
+                std::move(op_types),
+                parts_cnt_per_key,
+                parts_cnt_per_record));
+            // NOTE(review): confirm no queued string_view points into record_tmp_mem_area, which is cleared here and destroyed on return.
+            key_parts.clear();
+            record_parts.clear();
+            records_ts.clear();
+            records_ttl.clear();
+            op_types.clear();
+            record_tmp_mem_area.clear();
+            write_batch_size = 0;
+        }
+
+        assert(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal ||
+               ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted);
+
+        if (table_name.IsObjectTable())
+        {
+            PrepareObjectData(ckpt_rec, write_batch_size);
+        }
+        else
+        {
+            PrepareRecordData(ckpt_rec, write_batch_size);
+        }
+    }
+
+    // Queue the trailing, partially filled batch; nothing is queued when no record was added.
+    if (key_parts.size() > 0)
+    {
+        partition_state.AddBatch(PartitionBatchRequest(
+            std::move(key_parts),
+            std::move(record_parts),
+            std::move(records_ts),
+            std::move(records_ttl),
+            std::move(op_types),
+            parts_cnt_per_key,
+            parts_cnt_per_record));
+    }
+}
+
+void DataStoreServiceClient::PrepareRangePartitionBatches(
+    EloqDS::PartitionFlushState& partition_state,
+    const std::vector<size_t>& flush_recs,
+    const std::vector<std::unique_ptr<txservice::FlushTaskEntry>>& entries,
+    const txservice::TableName& table_name,
+    uint16_t parts_cnt_per_key,
+    uint16_t parts_cnt_per_record,
+    uint64_t now)
+{  // Builds partition_state's queue of size-bounded write batches from whole flush task entries.
+    std::vector<std::string_view> key_parts;
+    std::vector<std::string_view> record_parts;
+    std::vector<uint64_t> records_ts;
+    std::vector<uint64_t> records_ttl;
+    std::vector<WriteOpType> op_types;
+    std::vector<size_t> record_tmp_mem_area;
+    size_t write_batch_size = 0;
+    // Always a PUT; non-Normal records get ttl = now + 24 * 60 * 60 * 1000 (presumably 24h in ms - confirm unit of `now`).
+    auto PrepareRecordData =
+        [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size)
+    {
+        uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000;
+        txservice::TxKey tx_key = ckpt_rec.Key();
+        bool is_deleted =
+            !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal);
+        key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size()));
+        batch_size += tx_key.Size();
+
+        const txservice::TxRecord *rec = ckpt_rec.Payload();
+        if (is_deleted)
+        {
+            records_ttl.push_back(retired_ttl_for_deleted);
+        }
+        else
+        {
+            records_ttl.push_back(0);
+        }
+        batch_size += sizeof(uint64_t);
+
+        op_types.push_back(WriteOpType::PUT);
+        batch_size += sizeof(WriteOpType);
+
+        SerializeTxRecord(is_deleted,
+                          rec,
+                          record_tmp_mem_area,
+                          record_parts,
+                          batch_size);
+
+        records_ts.push_back(ckpt_rec.commit_ts_);
+        batch_size += sizeof(uint64_t);
+    };
+
+    // Each flush_recs entry indexes `entries`; every record in that entry's data_sync_vec_ is batched.
+    for (auto idx : flush_recs)
+    {
+        for (auto &ckpt_rec : *entries.at(idx)->data_sync_vec_)
+        {
+            // Seal the current batch once its accumulated byte size has reached MAX_WRITE_BATCH_SIZE (checked before adding the next record).
+            if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
+            {
+                partition_state.AddBatch(PartitionBatchRequest(
+                    std::move(key_parts),
+                    std::move(record_parts),
+                    std::move(records_ts),
+                    std::move(records_ttl),
+                    std::move(op_types),
+                    parts_cnt_per_key,
+                    parts_cnt_per_record));
+                // NOTE(review): confirm no queued string_view points into record_tmp_mem_area, which is cleared here and destroyed on return.
+                key_parts.clear();
+                record_parts.clear();
+                records_ts.clear();
+                records_ttl.clear();
+                op_types.clear();
+                record_tmp_mem_area.clear();
+                write_batch_size = 0;
+            }
+
+            assert(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal ||
+                   ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted);
+
+            // Range-partitioned tables currently never hold object tables, so table_name is not consulted here.
+            PrepareRecordData(ckpt_rec, write_batch_size);
+        }
+    }
+
+    // Queue the trailing, partially filled batch; nothing is queued when no record was added.
+    if (key_parts.size() > 0)
+    {
+        partition_state.AddBatch(PartitionBatchRequest(
+            std::move(key_parts),
+            std::move(record_parts),
+            std::move(records_ts),
+            std::move(records_ttl),
+            std::move(op_types),
+            parts_cnt_per_key,
+            parts_cnt_per_record));
+    }
+}
+
 }  // namespace EloqDS
\ No newline at end of file
diff --git a/data_store_service_client.h b/data_store_service_client.h
index 4e0348c..fef31fe 100644
--- a/data_store_service_client.h
+++ b/data_store_service_client.h
@@ -39,6 +39,10 @@
 
 namespace EloqDS
 {
+// Forward declarations for types defined in closure header
+struct PartitionFlushState;
+struct PartitionBatchRequest;
+struct PartitionCallbackData;
 class DataStoreServiceClient;
 class BatchWriteRecordsClosure;
 class ReadClosure;
@@ -489,6 +493,27 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler
 
     void BatchWriteRecordsInternal(BatchWriteRecordsClosure *closure);
 
+    /** PutAll helper: splits the records of one hash partition into write
+     *  batches queued on partition_state. flush_recs holds (index into entries,
+     *  index into that entry's data_sync_vec_) pairs. */
+    void PreparePartitionBatches(
+        PartitionFlushState& partition_state,
+        const std::vector<std::pair<size_t, size_t>>& flush_recs,
+        const std::vector<std::unique_ptr<txservice::FlushTaskEntry>>& entries,
+        const txservice::TableName& table_name,
+        uint16_t parts_cnt_per_key,
+        uint16_t parts_cnt_per_record,
+        uint64_t now);
+    /** Range-partition variant: flush_recs holds indexes into entries, and every record of each listed entry is batched. */
+    void PrepareRangePartitionBatches(
+        PartitionFlushState& partition_state,
+        const std::vector<size_t>& flush_recs,
+        const std::vector<std::unique_ptr<txservice::FlushTaskEntry>>& entries,
+        const txservice::TableName& table_name,
+        uint16_t parts_cnt_per_key,
+        uint16_t parts_cnt_per_record,
+        uint64_t now);
+
     /**
      * Delete range and flush data are not frequent calls, all calls are sent
      * with rpc.
@@ -635,6 +660,10 @@ class DataStoreServiceClient : public txservice::store::DataStoreHandler
     friend class DropTableClosure;
     friend class ScanNextClosure;
     friend class CreateSnapshotForBackupClosure;
+    friend void PartitionBatchCallback(void *data,
+                                      ::google::protobuf::Closure *closure,
+                                      DataStoreServiceClient &client,
+                                      const remote::CommonResult &result);
     friend class SinglePartitionScanner;
     friend void FetchAllDatabaseCallback(void *data,
                                          ::google::protobuf::Closure *closure,
diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp
index b0204a7..77774ce 100644
--- a/data_store_service_client_closure.cpp
+++ b/data_store_service_client_closure.cpp
@@ -405,6 +405,48 @@ void SyncPutAllCallback(void *data,
     callback_data->Finish(result);
 }
 
+void PartitionBatchCallback(void *data,
+                           ::google::protobuf::Closure *closure,
+                           DataStoreServiceClient &client,
+                           const remote::CommonResult &result)
+{  // Completion handler for one batch write: chains the partition's next batch or reports the partition finished.
+    auto *callback_data = reinterpret_cast<PartitionCallbackData *>(data);
+    auto *partition_state = callback_data->partition_state;
+    auto *global_coordinator = callback_data->global_coordinator;
+    // `data` is treated as a PartitionCallbackData; `closure` is not used by this callback.
+    // A failed batch ends the partition: its remaining queued batches are never sent
+    if (result.error_code() != remote::DataStoreError::NO_ERROR) {
+        partition_state->MarkFailed(result);
+        // A failed partition still counts towards the coordinator's completed total
+        global_coordinator->OnPartitionCompleted();
+        return;
+    }
+    // Success: the next batch is only dispatched from here, so one partition has one request outstanding at a time.
+    // Try to get the next batch for this partition
+    PartitionBatchRequest next_batch;
+    if (partition_state->GetNextBatch(next_batch)) {
+        // Send the next batch, re-registering this callback with the same callback_data
+        client.BatchWriteRecords(
+            callback_data->table_name,
+            partition_state->partition_id,
+            std::move(next_batch.key_parts),
+            std::move(next_batch.record_parts),
+            std::move(next_batch.records_ts),
+            std::move(next_batch.records_ttl),
+            std::move(next_batch.op_types),
+            true, // skip_wal
+            callback_data,
+            PartitionBatchCallback,
+            next_batch.parts_cnt_per_key,
+            next_batch.parts_cnt_per_record);
+    } else {
+        // No more batches, mark partition as completed
+        partition_state->MarkCompleted();
+        // Notify the global coordinator that this partition completed
+        global_coordinator->OnPartitionCompleted();
+    }
+}
+
 void FetchDatabaseCallback(void *data,
                            ::google::protobuf::Closure *closure,
                            DataStoreServiceClient &client,
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index 16836a5..d02934f 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -28,11 +28,17 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include <queue>
 #include <vector>
 
 #include "data_store_service_client.h"
-#include "data_store_service_scanner.h"
 #include "eloq_data_store_service/object_pool.h"
+#include "eloq_data_store_service/data_store_service.h"
+
+// Forward declarations
+namespace EloqDS {
+class DataStoreServiceClient;
+}
 
 namespace EloqDS
 {
@@ -143,6 +149,10 @@ struct SyncPutAllData : public Poolable
         unfinished_request_cnt_ = 0;
         all_request_started_ = false;
         result_.Clear();
+        // Clear partition states if using new concurrent approach
+        partition_states_.clear();
+        completed_partitions_ = 0;
+        total_partitions_ = 0;
     }
 
     virtual void Clear() override
@@ -150,6 +160,9 @@ struct SyncPutAllData : public Poolable
         unfinished_request_cnt_ = 0;
         all_request_started_ = false;
         result_.Clear();
+        partition_states_.clear();
+        completed_partitions_ = 0;
+        total_partitions_ = 0;
     }
 
     void Finish(const remote::CommonResult &res)
@@ -169,12 +182,138 @@ struct SyncPutAllData : public Poolable
         }
     }
 
+    // Counts one partition as finished under mux_ and signals cv_ once completed_partitions_ reaches total_partitions_.
+    void OnPartitionCompleted()
+    {
+        std::unique_lock<bthread::Mutex> lk(mux_);
+        completed_partitions_++;
+        if (completed_partitions_ >= total_partitions_) {
+            cv_.notify_one();
+        }
+    }
     // NOTICE: "unfinished_request_cnt_" must use signed integer.
     int32_t unfinished_request_cnt_{0};
     bool all_request_started_{false};
     remote::CommonResult result_;
-    bthread::Mutex mux_;
+    mutable bthread::Mutex mux_;
     bthread::ConditionVariable cv_;
+    
+    // New fields for per-partition coordination
+    std::vector<std::unique_ptr<PartitionFlushState>> partition_states_;
+    int32_t completed_partitions_{0};
+    int32_t total_partitions_{0};
+};
+
+/**
+ * @brief One queued batch-write payload for a partition. The string_views are non-owning: the bytes they reference must outlive the request.
+ */
+struct PartitionBatchRequest {
+    std::vector<std::string_view> key_parts;
+    std::vector<std::string_view> record_parts;
+    std::vector<uint64_t> records_ts;
+    std::vector<uint64_t> records_ttl;
+    std::vector<WriteOpType> op_types;
+    uint16_t parts_cnt_per_key{0};
+    uint16_t parts_cnt_per_record{0};
+    // Default construction yields an empty batch with zeroed part counts; used as the target of GetNextBatch().
+    PartitionBatchRequest() = default;
+    // Takes ownership of the five parallel vectors by move; the part counts are copied.
+    PartitionBatchRequest(std::vector<std::string_view>&& keys,
+                         std::vector<std::string_view>&& records,
+                         std::vector<uint64_t>&& ts,
+                         std::vector<uint64_t>&& ttl,
+                         std::vector<WriteOpType>&& ops,
+                         uint16_t key_parts_count,
+                         uint16_t record_parts_count)
+        : key_parts(std::move(keys))
+        , record_parts(std::move(records))
+        , records_ts(std::move(ts))
+        , records_ttl(std::move(ttl))
+        , op_types(std::move(ops))
+        , parts_cnt_per_key(key_parts_count)
+        , parts_cnt_per_record(record_parts_count) {}
+};
+
+/**
+ * @brief FIFO of not-yet-sent batches plus completed/failed status for one partition; every method except the constructor locks `mux`.
+ */
+struct PartitionFlushState {
+    int32_t partition_id;
+    std::queue<PartitionBatchRequest> pending_batches;
+    bool has_inflight_request = false;
+    bool completed = false;
+    bool failed = false;
+    remote::CommonResult result;
+    mutable bthread::Mutex mux;
+    // explicit: a bare partition id must not silently convert into a flush state.
+    explicit PartitionFlushState(int32_t pid) : partition_id(pid) {
+        result.Clear();
+    }
+    // Drops all queued batches and clears every flag and the stored result; partition_id is kept.
+    void Reset() {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        while (!pending_batches.empty()) {
+            pending_batches.pop();
+        }
+        has_inflight_request = false;
+        completed = false;
+        failed = false;
+        result.Clear();
+    }
+    // True while batches are queued or has_inflight_request is set. NOTE(review): nothing in this struct ever sets that flag.
+    bool HasMoreBatches() const {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        return !pending_batches.empty() || has_inflight_request;
+    }
+    // True once MarkCompleted() has been called (until the next Reset()).
+    bool IsCompleted() const {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        return completed;
+    }
+    // True once MarkFailed() has been called (until the next Reset()).
+    bool IsFailed() const {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        return failed;
+    }
+    // Flags the partition as completed; does not touch `failed` or the queue.
+    void MarkCompleted() {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        completed = true;
+    }
+    // Flags the partition as failed and keeps a copy of the error code and message in `result`.
+    void MarkFailed(const remote::CommonResult& error) {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        failed = true;
+        result.set_error_code(error.error_code());
+        result.set_error_msg(error.error_msg());
+    }
+    // Moves the oldest queued batch into `batch` and returns true; returns false, leaving `batch` untouched, when the queue is empty.
+    bool GetNextBatch(PartitionBatchRequest& batch) {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        if (pending_batches.empty()) {
+            return false;
+        }
+        batch = std::move(pending_batches.front());
+        pending_batches.pop();
+        return true;
+    }
+    // Appends `batch` to the back of the queue.
+    void AddBatch(PartitionBatchRequest&& batch) {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        pending_batches.push(std::move(batch));
+    }
+};
+
+/**
+ * @brief Context handed to PartitionBatchCallback as its opaque `data` argument: the partition's state, the PutAll-wide coordinator and the table name.
+ */
+struct PartitionCallbackData {
+    PartitionFlushState* partition_state;
+    SyncPutAllData* global_coordinator;
+    std::string table_name;
+    // Stores both pointers as given (this struct never frees them) and copies the table name.
+    PartitionCallbackData(PartitionFlushState* ps, SyncPutAllData* gc, const std::string& tn)
+        : partition_state(ps), global_coordinator(gc), table_name(tn) {}
 };
 /**
  * Generic synchronous callback adapter invoked by closures to signal
@@ -2291,6 +2430,17 @@ void SyncPutAllCallback(void *data,
                         DataStoreServiceClient &client,
                         const remote::CommonResult &result);
 
+/**
+ * Completion callback for one batch of a partition in concurrent PutAll;
+ * `data` must point to a PartitionCallbackData.
+ * On error the partition is marked failed; otherwise the next queued batch is
+ * sent or, if none is left, the partition is marked completed.
+ */
+void PartitionBatchCallback(void *data,
+                           ::google::protobuf::Closure *closure,
+                           DataStoreServiceClient &client,
+                           const remote::CommonResult &result);
+
 /**
  * Callback data for fetching database information.
  * 

From 5c4457576c69317ab34c50c637497a06e605ffbd Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 11:24:43 +0000
Subject: [PATCH 6/9] put reusable objects in pool

---
 data_store_service_client.cpp         | 544 +++++++++++++++-----------
 data_store_service_client_closure.cpp |  76 ++--
 data_store_service_client_closure.h   | 415 +++++++++++---------
 3 files changed, 596 insertions(+), 439 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index c241167..ea54dfd 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -20,7 +20,6 @@
  *
  */
 #include "data_store_service_client.h"
-#include "data_store_service_client_closure.h"
 
 #include <glog/logging.h>
 
@@ -34,6 +33,7 @@
 #include <utility>
 #include <vector>
 
+#include "data_store_service_client_closure.h"
 #include "data_store_service_scanner.h"
 #include "eloq_data_store_service/object_pool.h"  // ObjectPool
 #include "eloq_data_store_service/thread_worker_pool.h"
@@ -65,6 +65,9 @@ thread_local ObjectPool<FetchAllDatabaseCallbackData>
 thread_local ObjectPool<DiscoverAllTableNamesCallbackData>
     discover_all_tables_callback_data_pool_;
 thread_local ObjectPool<SyncPutAllData> sync_putall_data_pool_;
+thread_local ObjectPool<SyncConcurrentRequest> sync_concurrent_request_pool_;
+thread_local ObjectPool<PartitionFlushState> partition_flush_state_pool_;
+thread_local ObjectPool<PartitionCallbackData> partition_callback_data_pool_;
 
 static const uint64_t MAX_WRITE_BATCH_SIZE = 64 * 1024 * 1024;  // 64MB
 
@@ -93,13 +96,15 @@ DataStoreServiceClient::~DataStoreServiceClient()
 }
 
 /**
- * @brief Configures the data store service client with cluster manager information.
+ * @brief Configures the data store service client with cluster manager
+ * information.
  *
- * Initializes the client with cluster configuration including node hostnames and ports.
- * Logs all node information for debugging purposes and stores the cluster manager
- * reference for future use.
+ * Initializes the client with cluster configuration including node hostnames
+ * and ports. Logs all node information for debugging purposes and stores the
+ * cluster manager reference for future use.
  *
- * @param cluster_manager Reference to the cluster manager containing shard and node information.
+ * @param cluster_manager Reference to the cluster manager containing shard and
+ * node information.
  */
 void DataStoreServiceClient::SetupConfig(
     const DataStoreServiceClusterManager &cluster_manager)
@@ -119,8 +124,8 @@ void DataStoreServiceClient::SetupConfig(
  * @brief Establishes connection to the data store service.
  *
  * Attempts to connect to the data store service with retry logic. Initializes
- * pre-built tables and retries up to 5 times with 1-second delays between attempts.
- * Returns true if connection succeeds, false otherwise.
+ * pre-built tables and retries up to 5 times with 1-second delays between
+ * attempts. Returns true if connection succeeds, false otherwise.
  *
  * @return true if connection is successful, false if all retry attempts fail.
  */
@@ -145,9 +150,9 @@ bool DataStoreServiceClient::Connect()
 /**
  * @brief Schedules timer-based tasks for the data store service.
  *
- * Currently not implemented. This method is a placeholder for future timer-based
- * functionality such as periodic cleanup, health checks, or maintenance tasks.
- * Will assert and log an error if called.
+ * Currently not implemented. This method is a placeholder for future
+ * timer-based functionality such as periodic cleanup, health checks, or
+ * maintenance tasks. Will assert and log an error if called.
  */
 void DataStoreServiceClient::ScheduleTimerTasks()
 {
@@ -156,26 +161,33 @@ void DataStoreServiceClient::ScheduleTimerTasks()
 }
 
 /**
- * @brief Batch-writes a set of flush tasks into KV tables.
+ * @brief Batch-writes a set of flush tasks into KV tables using concurrent
+ * partition processing.
  *
  * Processes the provided flush tasks grouped by table and partition, serializes
  * each record (object tables use raw encoded blobs; non-object tables encode
  * tx-records with unpack info), and issues batched PUT/DELETE operations via
- * BatchWriteRecords. Batches are emitted per KV-partition and sized according
- * to SyncPutAllData::max_flying_write_count; the method blocks as necessary to
- * respect the global in-flight write limit and waits for all dispatched
- * requests to complete before returning.
+ * BatchWriteRecords. The method uses a concurrent approach where different
+ * partitions can flush simultaneously, but each partition maintains
+ * serialization (only one request in-flight per partition at a time).
+ *
+ * Key features:
+ * - Concurrent processing across different partitions
+ * - Per-partition serialization to respect KV store constraints
+ * - Automatic batching based on MAX_WRITE_BATCH_SIZE (64MB)
+ * - Chained callbacks within each partition for sequential processing
+ * - Global coordination to wait for all partitions to complete
  *
  * The function distinguishes hash- and range-partitioned tables, computes
  * per-partition batches, and updates per-record timestamps/TTLs and operation
- * types. Partial batches are flushed at partition boundaries. On any remote or
- * batch-level error the function logs the failure and returns false.
+ * types. On any partition-level error, the function logs the failure and
+ * returns false.
  *
  * @param flush_task Mapping from KV table name to a vector of flush task
  *                   entries containing the records to write. Each entry's
  *                   data_sync_vec_ provides the sequence of records for that
  *                   flush task.
- * @return true if all batches completed successfully; false if any batch
+ * @return true if all partitions completed successfully; false if any partition
  *         reported an error.
  */
 bool DataStoreServiceClient::PutAll(
@@ -189,13 +201,13 @@ bool DataStoreServiceClient::PutAll(
     for (auto &[kv_table_name, entries] : flush_task)
     {
         auto &table_name = entries.front()->data_sync_task_->table_name_;
-        
+
         // Group records by partition
         std::unordered_map<uint32_t, std::vector<std::pair<size_t, size_t>>>
             hash_partitions_map;
         std::unordered_map<uint32_t, std::vector<size_t>> range_partitions_map;
         std::unordered_map<uint32_t, size_t> partition_record_cnt;
-        
+
         size_t flush_task_entry_idx = 0;
         for (auto &entry : entries)
         {
@@ -227,7 +239,8 @@ bool DataStoreServiceClient::PutAll(
             }
             else
             {
-                // All records in the batch are in the same partition for range table
+                // All records in the batch are in the same partition for range
+                // table
                 uint32_t parition_id =
                     KvPartitionIdOf(batch[0].partition_id_, true);
                 auto [it, inserted] =
@@ -248,68 +261,79 @@ bool DataStoreServiceClient::PutAll(
         uint16_t parts_cnt_per_record = table_name.IsObjectTable() ? 1 : 5;
 
         // Create partition states and prepare batches
-        std::vector<std::unique_ptr<PartitionFlushState>> partition_states;
-        std::vector<std::unique_ptr<PartitionCallbackData>> callback_data_list;
-        
+        std::vector<PartitionCallbackData *> callback_data_list;
+
         // Process hash partitions
         for (auto &[partition_id, flush_recs] : hash_partitions_map)
         {
-            auto partition_state = std::make_unique<PartitionFlushState>(partition_id);
-            auto callback_data = std::make_unique<PartitionCallbackData>(
-                partition_state.get(), sync_putall, std::string(kv_table_name));
-            
+            auto partition_state = partition_flush_state_pool_.NextObject();
+            partition_state->Reset(partition_id);
+            auto callback_data = partition_callback_data_pool_.NextObject();
+            callback_data->Reset(partition_state, sync_putall, kv_table_name);
+
             // Prepare batches for this partition
-            PreparePartitionBatches(*partition_state, flush_recs, entries, 
-                                  table_name, parts_cnt_per_key, parts_cnt_per_record, now);
-            
-            partition_states.push_back(std::move(partition_state));
-            callback_data_list.push_back(std::move(callback_data));
+            PreparePartitionBatches(*partition_state,
+                                    flush_recs,
+                                    entries,
+                                    table_name,
+                                    parts_cnt_per_key,
+                                    parts_cnt_per_record,
+                                    now);
+
+            sync_putall->partition_states_.push_back(partition_state);
+            callback_data_list.push_back(callback_data);
         }
-        
+
         // Process range partitions
         for (auto &[partition_id, flush_recs] : range_partitions_map)
         {
-            auto partition_state = std::make_unique<PartitionFlushState>(partition_id);
-            auto callback_data = std::make_unique<PartitionCallbackData>(
-                partition_state.get(), sync_putall, std::string(kv_table_name));
-            
+            auto partition_state = partition_flush_state_pool_.NextObject();
+            partition_state->Reset(partition_id);
+            auto callback_data = partition_callback_data_pool_.NextObject();
+            callback_data->Reset(partition_state, sync_putall, kv_table_name);
+
             // Prepare batches for this partition
-            PrepareRangePartitionBatches(*partition_state, flush_recs, entries,
-                                       table_name, parts_cnt_per_key, parts_cnt_per_record, now);
-            
-            partition_states.push_back(std::move(partition_state));
-            callback_data_list.push_back(std::move(callback_data));
+            PrepareRangePartitionBatches(*partition_state,
+                                         flush_recs,
+                                         entries,
+                                         table_name,
+                                         parts_cnt_per_key,
+                                         parts_cnt_per_record,
+                                         now);
+
+            sync_putall->partition_states_.push_back(partition_state);
+            callback_data_list.push_back(callback_data);
         }
 
         // Set up global coordinator
-        sync_putall->total_partitions_ = partition_states.size();
-        sync_putall->partition_states_ = std::move(partition_states);
+        sync_putall->total_partitions_ = sync_putall->partition_states_.size();
 
         // Start concurrent processing for each partition
         for (size_t i = 0; i < callback_data_list.size(); ++i)
         {
-            auto* partition_state = sync_putall->partition_states_[i].get();
-            auto* callback_data = callback_data_list[i].get();
-            
+            auto *partition_state = sync_putall->partition_states_[i];
+            auto *callback_data = callback_data_list[i];
+
             // Start the first batch for this partition
             PartitionBatchRequest first_batch;
-            if (partition_state->GetNextBatch(first_batch)) {
-                BatchWriteRecords(
-                    callback_data->table_name,
-                    partition_state->partition_id,
-                    std::move(first_batch.key_parts),
-                    std::move(first_batch.record_parts),
-                    std::move(first_batch.records_ts),
-                    std::move(first_batch.records_ttl),
-                    std::move(first_batch.op_types),
-                    true, // skip_wal
-                    callback_data,
-                    PartitionBatchCallback,
-                    first_batch.parts_cnt_per_key,
-                    first_batch.parts_cnt_per_record);
-            } else {
+            if (partition_state->GetNextBatch(first_batch))
+            {
+                BatchWriteRecords(callback_data->table_name,
+                                  partition_state->partition_id,
+                                  std::move(first_batch.key_parts),
+                                  std::move(first_batch.record_parts),
+                                  std::move(first_batch.records_ts),
+                                  std::move(first_batch.records_ttl),
+                                  std::move(first_batch.op_types),
+                                  true,  // skip_wal
+                                  callback_data,
+                                  PartitionBatchCallback,
+                                  first_batch.parts_cnt_per_key,
+                                  first_batch.parts_cnt_per_record);
+            }
+            else
+            {
                 // No batches for this partition, mark as completed
-                partition_state->MarkCompleted();
                 sync_putall->OnPartitionCompleted();
             }
         }
@@ -317,22 +341,30 @@ bool DataStoreServiceClient::PutAll(
         // Wait for all partitions to complete
         {
             std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-            while (sync_putall->completed_partitions_ < sync_putall->total_partitions_)
+            while (sync_putall->completed_partitions_ <
+                   sync_putall->total_partitions_)
             {
                 sync_putall->cv_.wait(lk);
             }
         }
 
         // Check for errors
-        for (auto& partition_state : sync_putall->partition_states_)
+        for (auto &partition_state : sync_putall->partition_states_)
         {
             if (partition_state->IsFailed())
             {
-                LOG(ERROR) << "PutAll failed for partition " << partition_state->partition_id 
-                           << " with error: " << partition_state->result.error_msg();
-            return false;
+                LOG(ERROR) << "PutAll failed for partition "
+                           << partition_state->partition_id << " with error: "
+                           << partition_state->result.error_msg();
+                return false;
             }
         }
+
+        for (auto &callback_data : callback_data_list)
+        {
+            callback_data->Clear();
+            callback_data->Free();
+        }
     }
     return true;
 }
@@ -341,11 +373,12 @@ bool DataStoreServiceClient::PutAll(
  * @brief Persists data from specified KV tables to storage.
  *
  * Flushes data from the provided KV table names to persistent storage using
- * asynchronous flush operations. Waits for completion and returns success/failure
- * status. Logs warnings on failure and debug info on success.
+ * asynchronous flush operations. Waits for completion and returns
+ * success/failure status. Logs warnings on failure and debug info on success.
  *
  * @param kv_table_names Vector of KV table names to persist.
- * @return true if all tables are persisted successfully, false if any operation fails.
+ * @return true if all tables are persisted successfully, false if any operation
+ * fails.
  */
 bool DataStoreServiceClient::PersistKV(
     const std::vector<std::string> &kv_table_names)
@@ -374,16 +407,19 @@ bool DataStoreServiceClient::PersistKV(
  * Handles table creation, modification, and deletion operations by updating
  * table schema information in the data store. Validates leadership, processes
  * the operation asynchronously, and sets appropriate error codes on failure.
- * Supports various operation types including CREATE, DROP, and ALTER operations.
+ * Supports various operation types including CREATE, DROP, and ALTER
+ * operations.
  *
- * @param old_table_schema Pointer to the existing table schema (nullptr for CREATE).
+ * @param old_table_schema Pointer to the existing table schema (nullptr for
+ * CREATE).
  * @param new_table_schema Pointer to the new table schema.
  * @param op_type Type of operation (CREATE, DROP, ALTER, etc.).
  * @param commit_ts Commit timestamp for the operation.
  * @param ng_id Node group ID for the operation.
  * @param tx_term Transaction term for consistency.
  * @param hd_res Handler result object to store operation outcome.
- * @param alter_table_info Information about table alterations (nullptr if not applicable).
+ * @param alter_table_info Information about table alterations (nullptr if not
+ * applicable).
  * @param cc_req CC request base object.
  * @param ccs CC shard reference.
  * @param err_code Error code output parameter.
@@ -442,11 +478,13 @@ void DataStoreServiceClient::UpsertTable(
  * @brief Fetches table catalog information from the data store.
  *
  * Retrieves catalog information for the specified table by reading from the
- * KV table catalogs storage. Uses partition ID 0 and the catalog name as the key.
- * The operation is performed asynchronously with a callback for completion handling.
+ * KV table catalogs storage. Uses partition ID 0 and the catalog name as the
+ * key. The operation is performed asynchronously with a callback for completion
+ * handling.
  *
  * @param ccm_table_name The table name to fetch catalog information for.
- * @param fetch_cc Fetch catalog CC object to store the result and handle completion.
+ * @param fetch_cc Fetch catalog CC object to store the result and handle
+ * completion.
  */
 void DataStoreServiceClient::FetchTableCatalog(
     const txservice::TableName &ccm_table_name,
@@ -465,11 +503,13 @@ void DataStoreServiceClient::FetchTableCatalog(
  * @brief Fetches current table statistics from the data store.
  *
  * Retrieves the current version of table statistics for the specified table.
- * Determines the appropriate KV partition ID and reads from the table statistics
- * version storage. The operation is performed asynchronously with callback handling.
+ * Determines the appropriate KV partition ID and reads from the table
+ * statistics version storage. The operation is performed asynchronously with
+ * callback handling.
  *
  * @param ccm_table_name The table name to fetch statistics for.
- * @param fetch_cc Fetch table statistics CC object to store the result and handle completion.
+ * @param fetch_cc Fetch table statistics CC object to store the result and
+ * handle completion.
  */
 void DataStoreServiceClient::FetchCurrentTableStatistics(
     const txservice::TableName &ccm_table_name,
@@ -491,11 +531,13 @@ void DataStoreServiceClient::FetchCurrentTableStatistics(
  *
  * Retrieves table statistics for a specific version by constructing key ranges
  * based on the table name and version number. Clears previous key ranges and
- * session information, then constructs start and end keys for the version-specific
- * statistics. The operation is performed asynchronously with callback handling.
+ * session information, then constructs start and end keys for the
+ * version-specific statistics. The operation is performed asynchronously with
+ * callback handling.
  *
  * @param ccm_table_name The table name to fetch statistics for.
- * @param fetch_cc Fetch table statistics CC object containing version information and result storage.
+ * @param fetch_cc Fetch table statistics CC object containing version
+ * information and result storage.
  */
 void DataStoreServiceClient::FetchTableStatistics(
     const txservice::TableName &ccm_table_name,
@@ -572,15 +614,18 @@ std::string EncodeTableStatsKey(const txservice::TableName &base_table_name,
 /**
  * @brief Upserts table statistics to the data store.
  *
- * Stores table statistics by splitting sample keys into segments and writing them
- * to the KV storage. Each segment contains index type, record count, and sample keys.
- * Also updates the checkpoint version for the table statistics. Uses batch write
- * operations for efficiency and handles both local and remote storage paths.
+ * Stores table statistics by splitting sample keys into segments and writing
+ * them to the KV storage. Each segment contains index type, record count, and
+ * sample keys. Also updates the checkpoint version for the table statistics.
+ * Uses batch write operations for efficiency and handles both local and remote
+ * storage paths.
  *
  * @param ccm_table_name The table name to store statistics for.
- * @param sample_pool_map Map of index names to sample pools containing record counts and sample keys.
+ * @param sample_pool_map Map of index names to sample pools containing record
+ * counts and sample keys.
  * @param version The version number for the statistics.
- * @return true if all statistics are stored successfully, false if any operation fails.
+ * @return true if all statistics are stored successfully, false if any
+ * operation fails.
  */
 bool DataStoreServiceClient::UpsertTableStatistics(
     const txservice::TableName &ccm_table_name,
@@ -762,12 +807,13 @@ bool DataStoreServiceClient::UpsertTableStatistics(
 /**
  * @brief Fetches table ranges from the data store.
  *
- * Retrieves range information for the specified table by scanning the range table
- * storage. Constructs start and end keys based on the table name and performs
- * a scan operation with pagination support. The operation is performed asynchronously
- * with callback handling for completion.
+ * Retrieves range information for the specified table by scanning the range
+ * table storage. Constructs start and end keys based on the table name and
+ * performs a scan operation with pagination support. The operation is performed
+ * asynchronously with callback handling for completion.
  *
- * @param fetch_cc Fetch table ranges CC object containing table name and result storage.
+ * @param fetch_cc Fetch table ranges CC object containing table name and result
+ * storage.
  */
 void DataStoreServiceClient::FetchTableRanges(
     txservice::FetchTableRangesCc *fetch_cc)
@@ -801,7 +847,8 @@ void DataStoreServiceClient::FetchTableRanges(
  * for reading range information. The operation is performed asynchronously
  * with callback handling for completion.
  *
- * @param fetch_cc Fetch range slices request object containing table name, range entry, and result storage.
+ * @param fetch_cc Fetch range slices request object containing table name,
+ * range entry, and result storage.
  */
 void DataStoreServiceClient::FetchRangeSlices(
     txservice::FetchRangeSlicesReq *fetch_cc)
@@ -835,8 +882,8 @@ void DataStoreServiceClient::FetchRangeSlices(
  *
  * Removes data from the KV table that falls outside the specified range.
  * Constructs the appropriate start key based on the provided parameters and
- * performs a delete range operation. Handles special cases for negative infinity
- * keys and constructs proper key boundaries for the deletion.
+ * performs a delete range operation. Handles special cases for negative
+ * infinity keys and constructs proper key boundaries for the deletion.
  *
  * @param table_name The table name to delete data from.
  * @param partition_id The partition ID for the operation.
@@ -917,9 +964,9 @@ bool DataStoreServiceClient::Read(const txservice::TableName &table_name,
 /**
  * @brief Creates a scanner for forward or backward scanning of table data.
  *
- * Creates and initializes a data store scanner for iterating over records in a table.
- * Supports both forward and backward scanning with configurable search conditions.
- * The scanner is initialized before returning.
+ * Creates and initializes a data store scanner for iterating over records in a
+ * table. Supports both forward and backward scanning with configurable search
+ * conditions. The scanner is initialized before returning.
  *
  * @param table_name The table name to scan.
  * @param ng_id Node group ID for the operation.
@@ -1128,7 +1175,8 @@ std::string DataStoreServiceClient::EncodeRangeValue(int32_t range_id,
  *
  * Creates a composite key by combining table name, range ID, and segment ID.
  * Uses little-endian encoding for numeric values since range slice operations
- * are point reads rather than scans, optimizing for direct key lookup performance.
+ * are point reads rather than scans, optimizing for direct key lookup
+ * performance.
  *
  * @param table_name The table name for the range slice.
  * @param range_id The range identifier.
@@ -1174,11 +1222,11 @@ void DataStoreServiceClient::UpdateEncodedRangeSliceKey(
 /**
  * @brief Updates range slices for a table partition.
  *
- * Stores range slice information by segmenting the slices into manageable chunks
- * and writing them to the KV storage system. Handles slice serialization with
- * proper key encoding and batch size management. Also updates the range information
- * with the new version and segment count. Uses both local and remote storage paths
- * based on configuration.
+ * Stores range slice information by segmenting the slices into manageable
+ * chunks and writing them to the KV storage system. Handles slice serialization
+ * with proper key encoding and batch size management. Also updates the range
+ * information with the new version and segment count. Uses both local and
+ * remote storage paths based on configuration.
  *
  * @param table_name The table name for the range slices.
  * @param version The version number for the slices.
@@ -1186,7 +1234,8 @@ void DataStoreServiceClient::UpdateEncodedRangeSliceKey(
  * @param slices Vector of store slices to update.
  * @param partition_id The partition ID for the range.
  * @param range_version The version of the range.
- * @return true if all slices are updated successfully, false if any operation fails.
+ * @return true if all slices are updated successfully, false if any operation
+ * fails.
  */
 bool DataStoreServiceClient::UpdateRangeSlices(
     const txservice::TableName &table_name,
@@ -1341,15 +1390,16 @@ bool DataStoreServiceClient::UpdateRangeSlices(
 /**
  * @brief Upserts range information for a table.
  *
- * Updates range slices for multiple ranges by calling UpdateRangeSlices for each
- * range in the provided vector. After updating all ranges, flushes the range table
- * data to ensure persistence. Validates that the table name is not empty and
- * handles errors from individual range updates.
+ * Updates range slices for multiple ranges by calling UpdateRangeSlices for
+ * each range in the provided vector. After updating all ranges, flushes the
+ * range table data to ensure persistence. Validates that the table name is not
+ * empty and handles errors from individual range updates.
  *
  * @param table_name The table name for the ranges.
  * @param range_info Vector of split range information to upsert.
  * @param version The version number for the ranges.
- * @return true if all ranges are updated and flushed successfully, false if any operation fails.
+ * @return true if all ranges are updated and flushed successfully, false if any
+ * operation fails.
  */
 bool DataStoreServiceClient::UpsertRanges(
     const txservice::TableName &table_name,
@@ -1431,14 +1481,17 @@ bool DataStoreServiceClient::FetchTable(const txservice::TableName &table_name,
 /**
  * @brief Discovers all table names in the data store.
  *
- * Scans the table catalogs to discover all available table names. Uses pagination
- * with session management and supports cooperative scheduling through yield/resume
- * function pointers. Performs the scan asynchronously and waits for completion.
+ * Scans the table catalogs to discover all available table names. Uses
+ * pagination with session management and supports cooperative scheduling
+ * through yield/resume function pointers. Performs the scan asynchronously and
+ * waits for completion.
  *
  * @param norm_name_vec Output vector to store the discovered table names.
- * @param yield_fptr Optional function pointer for yielding control during pagination.
+ * @param yield_fptr Optional function pointer for yielding control during
+ * pagination.
  * @param resume_fptr Optional function pointer for resuming after yielding.
- * @return true if the discovery operation completes successfully, false if any error occurs.
+ * @return true if the discovery operation completes successfully, false if any
+ * error occurs.
  */
 bool DataStoreServiceClient::DiscoverAllTableNames(
     std::vector<std::string> &norm_name_vec,
@@ -1471,13 +1524,14 @@ bool DataStoreServiceClient::DiscoverAllTableNames(
  * @brief Upserts database definition to the data store.
  *
  * Stores database definition information in the KV storage system. The storage
- * format uses the database name as the key and the database definition as the value.
- * Uses current timestamp for versioning and performs the operation asynchronously
- * with synchronous waiting for completion.
+ * format uses the database name as the key and the database definition as the
+ * value. Uses the current timestamp for versioning, issues the operation
+ * asynchronously, and then blocks until it completes.
  *
  * @param db The database name to upsert.
  * @param definition The database definition to store.
- * @return true if the database is upserted successfully, false if any operation fails.
+ * @return true if the database is upserted successfully, false if any operation
+ * fails.
  */
 bool DataStoreServiceClient::UpsertDatabase(std::string_view db,
                                             std::string_view definition)
@@ -1527,12 +1581,14 @@ bool DataStoreServiceClient::UpsertDatabase(std::string_view db,
 /**
  * @brief Drops a database from the data store.
  *
- * Removes a database definition from the KV storage system by performing a DELETE
- * operation on the database catalog. Uses current timestamp for versioning and
- * performs the operation asynchronously with synchronous waiting for completion.
+ * Removes a database definition from the KV storage system by performing a
+ * DELETE operation on the database catalog. Uses the current timestamp for
+ * versioning, issues the operation asynchronously, and then blocks until it
+ * completes.
  *
  * @param db The database name to drop.
- * @return true if the database is dropped successfully, false if any operation fails.
+ * @return true if the database is dropped successfully, false if any operation
+ * fails.
  */
 bool DataStoreServiceClient::DropDatabase(std::string_view db)
 {
@@ -1590,7 +1646,8 @@ bool DataStoreServiceClient::DropDatabase(std::string_view db)
  * @param found Output parameter indicating if the database was found.
  * @param yield_fptr Optional function pointer for yielding control.
  * @param resume_fptr Optional function pointer for resuming after yielding.
- * @return true if the fetch operation completes successfully, false if any error occurs.
+ * @return true if the fetch operation completes successfully, false if any
+ * error occurs.
  */
 bool DataStoreServiceClient::FetchDatabase(
     std::string_view db,
@@ -1870,14 +1927,17 @@ void DataStoreServiceClient::EncodeArchiveKey(
  * @brief Decodes an archive key to extract its components.
  *
  * Parses an archive key string to extract the table name, transaction key,
- * and commit timestamp. The archive key format is: "log:item:{table_name}:{key}:{commit_ts}".
- * Validates the key format and extracts each component using string separators.
+ * and commit timestamp. The archive key format is:
+ * "log:item:{table_name}:{key}:{commit_ts}". Validates the key format and
+ * extracts each component using string separators.
  *
  * @param archive_key The archive key string to decode.
  * @param table_name Output parameter for the extracted table name.
  * @param key Output parameter for the extracted transaction key.
- * @param be_commit_ts Output parameter for the extracted commit timestamp (big-endian).
- * @return true if the key is successfully decoded, false if the format is invalid.
+ * @param be_commit_ts Output parameter for the extracted commit timestamp
+ * (big-endian).
+ * @return true if the key is successfully decoded, false if the format is
+ * invalid.
  */
 bool DataStoreServiceClient::DecodeArchiveKey(const std::string &archive_key,
                                               std::string &table_name,
@@ -1993,24 +2053,36 @@ void DataStoreServiceClient::DecodeArchiveValue(
 }
 
 /**
- * @brief Writes multiple MVCC archive records to the MVCC archive KV table in partitioned batches.
+ * @brief Writes multiple MVCC archive records to the MVCC archive KV table
+ * using sequential batch processing.
+ *
+ * Groups archive entries from the provided flush tasks by archive partition,
+ * serializes keys and values into batch write requests, and dispatches those
+ * requests sequentially within each partition. Uses SyncConcurrentRequest for
+ * global concurrency control to limit the total number of in-flight requests
+ * across all partitions.
+ *
+ * Key features:
+ * - Sequential processing within each partition to maintain ordering
+ * - Global concurrency control with max_flying_write_count limit (32)
+ * - Automatic batching based on MAX_WRITE_BATCH_SIZE (64MB)
+ * - Flow control to prevent overwhelming the system
  *
- * Groups archive entries from the provided flush tasks by archive partition, serializes keys
- * and values into batch write requests, and dispatches those requests (possibly concurrently)
- * to the KV layer. Batches are split to respect MAX_WRITE_BATCH_SIZE and an internal limit on
- * in-flight write requests; the method waits for all dispatched batches for each partition to
- * complete before returning.
+ * The method waits for all dispatched batches for each partition to complete
+ * before returning.
  *
  * Side effects:
- * - Commits serialized archive records to kv_mvcc_archive_name with a default TTL of 1 day.
- * - Converts per-record commit timestamps to big-endian form as part of key encoding (the
- *   in-memory commit_ts field of those records is mutated during processing).
+ * - Commits serialized archive records to kv_mvcc_archive_name with a default
+ *   TTL of 1 day.
+ * - Converts per-record commit timestamps to big-endian form as part of key
+ *   encoding (the in-memory commit_ts field of those records is mutated during
+ *   processing).
  *
- * @param flush_task Map from KV table name to a vector of FlushTaskEntry pointers whose
- *                   archive vectors contain the FlushRecord entries to write. Only entries
- *                   with non-empty archive vectors are processed.
- * @return true if all batches for all partitions completed successfully; false if any batch
- *         failed (an error will be logged).
+ * @param flush_task Map from KV table name to a vector of FlushTaskEntry
+ * pointers whose archive vectors contain the FlushRecord entries to write. Only
+ * entries with non-empty archive vectors are processed.
+ * @return true if all batches for all partitions completed successfully; false
+ * if any batch failed (an error will be logged).
  */
 bool DataStoreServiceClient::PutArchivesAll(
     std::unordered_map<std::string_view,
@@ -2060,7 +2132,7 @@ bool DataStoreServiceClient::PutArchivesAll(
         std::vector<WriteOpType> op_types;
         // temporary storage for the records in between batch
         // for keeping record upack info and encoded blob sizes
-        std::vector<uint64_t> record_tmp_mem_area;
+        std::vector<size_t> record_tmp_mem_area;
         record_tmp_mem_area.resize(archive_ptrs.size() *
                                    2);  // unpack_info_size + encoded_blob_size
         size_t write_batch_size = 0;
@@ -2073,9 +2145,10 @@ bool DataStoreServiceClient::PutArchivesAll(
         uint16_t parts_cnt_per_record = 5;
 
         // Send the batch request
-        SyncPutAllData *sync_putall = sync_putall_data_pool_.NextObject();
-        PoolableGuard guard(sync_putall);
-        sync_putall->Reset();
+        SyncConcurrentRequest *sync_concurrent =
+            sync_concurrent_request_pool_.NextObject();
+        PoolableGuard guard(sync_concurrent);
+        sync_concurrent->Reset();
 
         size_t recs_cnt = archive_ptrs.size();
         keys.reserve(recs_cnt * parts_cnt_per_key);
@@ -2091,13 +2164,13 @@ bool DataStoreServiceClient::PutArchivesAll(
             {
                 // Wait for in-flight requests to decrease if limit reached
                 {
-                    std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-                    sync_putall->unfinished_request_cnt_++;
-                    while (sync_putall->unfinished_request_cnt_ >=
-                           SyncPutAllData::max_flying_write_count)
+                    std::unique_lock<bthread::Mutex> lk(sync_concurrent->mux_);
+                    while (sync_concurrent->unfinished_request_cnt_ >=
+                           SyncConcurrentRequest::max_flying_write_count)
                     {
-                        sync_putall->cv_.wait(lk);
+                        sync_concurrent->cv_.wait(lk);
                     }
+                    sync_concurrent->unfinished_request_cnt_++;
                 }
                 BatchWriteRecords(kv_mvcc_archive_name,
                                   partition_id,
@@ -2107,8 +2180,8 @@ bool DataStoreServiceClient::PutArchivesAll(
                                   std::move(records_ttl),
                                   std::move(op_types),
                                   true,
-                                  sync_putall,
-                                  SyncPutAllCallback,
+                                  sync_concurrent,
+                                  SyncConcurrentRequestCallback,
                                   parts_cnt_per_key,
                                   parts_cnt_per_record);
                 keys.clear();
@@ -2183,8 +2256,8 @@ bool DataStoreServiceClient::PutArchivesAll(
                               std::move(records_ttl),
                               std::move(op_types),
                               true,
-                              sync_putall,
-                              SyncPutAllCallback,
+                              sync_concurrent,
+                              SyncConcurrentRequestCallback,
                               parts_cnt_per_key,
                               parts_cnt_per_record);
             keys.clear();
@@ -2200,26 +2273,26 @@ bool DataStoreServiceClient::PutArchivesAll(
             op_types.reserve(recs_cnt);
             write_batch_size = 0;
             {
-                std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-                sync_putall->unfinished_request_cnt_++;
+                std::unique_lock<bthread::Mutex> lk(sync_concurrent->mux_);
+                sync_concurrent->unfinished_request_cnt_++;
             }
         }
 
         // Wait the result.
         {
-            std::unique_lock<bthread::Mutex> lk(sync_putall->mux_);
-            sync_putall->all_request_started_ = true;
-            while (sync_putall->unfinished_request_cnt_ != 0)
+            std::unique_lock<bthread::Mutex> lk(sync_concurrent->mux_);
+            sync_concurrent->all_request_started_ = true;
+            while (sync_concurrent->unfinished_request_cnt_ != 0)
             {
-                sync_putall->cv_.wait(lk);
+                sync_concurrent->cv_.wait(lk);
             }
         }
 
-        if (sync_putall->result_.error_code() !=
+        if (sync_concurrent->result_.error_code() !=
             remote::DataStoreError::NO_ERROR)
         {
             LOG(ERROR) << "PutArchivesAll failed for error: "
-                       << sync_putall->result_.error_msg();
+                       << sync_concurrent->result_.error_msg();
             return false;
         }
     }
@@ -2235,8 +2308,10 @@ bool DataStoreServiceClient::PutArchivesAll(
  * handles both hash and range partitioned tables. Uses archive-specific
  * encoding and TTL settings for the copied data.
  *
- * @param flush_task Map of table names to flush task entries containing base records to copy.
- * @return true if all records are successfully copied to archive, false if any operation fails.
+ * @param flush_task Map of table names to flush task entries containing base
+ * records to copy.
+ * @return true if all records are successfully copied to archive, false if any
+ * operation fails.
  */
 bool DataStoreServiceClient::CopyBaseToArchive(
     std::unordered_map<std::string_view,
@@ -2408,8 +2483,9 @@ bool DataStoreServiceClient::CopyBaseToArchive(
  * @brief Fetches archive records for a specific key from a given timestamp.
  *
  * Retrieves archived versions of a record from the MVCC archive storage.
- * Scans the archive table for records matching the specified key and timestamp range.
- * Currently asserts false as this functionality is not fully implemented.
+ * Scans the archive table for records matching the specified key and timestamp
+ * range. Currently asserts false as this functionality is not fully
+ * implemented.
  *
  * @param table_name The table name to fetch archives for.
  * @param kv_info KV catalog information for the table.
@@ -2611,7 +2687,8 @@ bool DataStoreServiceClient::FetchVisibleArchive(
  * initiates a scan operation to fetch all relevant archive versions.
  * Sets up the fetch CC object with the necessary scan parameters.
  *
- * @param fetch_cc Fetch record CC object containing key, timestamp, and result storage.
+ * @param fetch_cc Fetch record CC object containing key, timestamp, and result
+ * storage.
  * @return DataStoreOpStatus indicating the operation status.
  */
 txservice::store::DataStoreHandler::DataStoreOpStatus
@@ -2694,7 +2771,8 @@ DataStoreServiceClient::FetchVisibleArchive(
  * @param backup_name The name for the backup snapshot.
  * @param backup_files Output vector to store the generated backup file paths.
  * @param backup_ts The timestamp for the backup.
- * @return true if the snapshot is created successfully, false if any operation fails.
+ * @return true if the snapshot is created successfully, false if any operation
+ * fails.
  */
 bool DataStoreServiceClient::CreateSnapshotForBackup(
     const std::string &backup_name,
@@ -2789,9 +2867,9 @@ void DataStoreServiceClient::CreateSnapshotForBackupInternal(
 /**
  * @brief Determines if range copying is needed.
  *
- * Currently always returns true, indicating that range copying is always required.
- * This method is used to determine whether range data needs to be copied during
- * certain operations.
+ * Currently always returns true, indicating that range copying is always
+ * required. This method is used to determine whether range data needs to be
+ * copied during certain operations.
  *
  * @return Always returns true.
  */
@@ -2835,7 +2913,8 @@ bool DataStoreServiceClient::OnLeaderStart(uint32_t *next_leader_node)
  * @brief Handles start following event.
  *
  * Currently empty implementation. This method is called when the node starts
- * following another leader and can be used to perform follower-specific initialization.
+ * following another leader and can be used to perform follower-specific
+ * initialization.
  */
 void DataStoreServiceClient::OnStartFollowing()
 {
@@ -2844,8 +2923,8 @@ void DataStoreServiceClient::OnStartFollowing()
 /**
  * @brief Handles shutdown event.
  *
- * Currently empty implementation. This method is called when the node is shutting
- * down and can be used to perform cleanup operations.
+ * Currently empty implementation. This method is called when the node is
+ * shutting down and can be used to perform cleanup operations.
  */
 void DataStoreServiceClient::OnShutdown()
 {
@@ -4111,10 +4190,10 @@ bool DataStoreServiceClient::DeleteCatalog(
 }
 
 void DataStoreServiceClient::PreparePartitionBatches(
-    EloqDS::PartitionFlushState& partition_state,
-    const std::vector<std::pair<size_t, size_t>>& flush_recs,
-    const std::vector<std::unique_ptr<txservice::FlushTaskEntry>>& entries,
-    const txservice::TableName& table_name,
+    EloqDS::PartitionFlushState &partition_state,
+    const std::vector<std::pair<size_t, size_t>> &flush_recs,
+    const std::vector<std::unique_ptr<txservice::FlushTaskEntry>> &entries,
+    const txservice::TableName &table_name,
     uint16_t parts_cnt_per_key,
     uint16_t parts_cnt_per_record,
     uint64_t now)
@@ -4200,11 +4279,8 @@ void DataStoreServiceClient::PreparePartitionBatches(
         op_types.push_back(WriteOpType::PUT);
         batch_size += sizeof(WriteOpType);
 
-        SerializeTxRecord(is_deleted,
-                          rec,
-                          record_tmp_mem_area,
-                          record_parts,
-                          batch_size);
+        SerializeTxRecord(
+            is_deleted, rec, record_tmp_mem_area, record_parts, batch_size);
 
         records_ts.push_back(ckpt_rec.commit_ts_);
         batch_size += sizeof(uint64_t);
@@ -4219,15 +4295,16 @@ void DataStoreServiceClient::PreparePartitionBatches(
         // Start a new batch if size limit reached
         if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
         {
-            partition_state.AddBatch(PartitionBatchRequest(
-                std::move(key_parts),
-                std::move(record_parts),
-                std::move(records_ts),
-                std::move(records_ttl),
-                std::move(op_types),
-                parts_cnt_per_key,
-                parts_cnt_per_record));
-            
+            partition_state.AddBatch(
+                PartitionBatchRequest(std::move(key_parts),
+                                      std::move(record_parts),
+                                      std::move(records_ts),
+                                      std::move(records_ttl),
+                                      std::move(record_tmp_mem_area),
+                                      std::move(op_types),
+                                      parts_cnt_per_key,
+                                      parts_cnt_per_record));
+
             key_parts.clear();
             record_parts.clear();
             records_ts.clear();
@@ -4253,22 +4330,23 @@ void DataStoreServiceClient::PreparePartitionBatches(
     // Add the last batch if it has data
     if (key_parts.size() > 0)
     {
-        partition_state.AddBatch(PartitionBatchRequest(
-            std::move(key_parts),
-            std::move(record_parts),
-            std::move(records_ts),
-            std::move(records_ttl),
-            std::move(op_types),
-            parts_cnt_per_key,
-            parts_cnt_per_record));
+        partition_state.AddBatch(
+            PartitionBatchRequest(std::move(key_parts),
+                                  std::move(record_parts),
+                                  std::move(records_ts),
+                                  std::move(records_ttl),
+                                  std::move(record_tmp_mem_area),
+                                  std::move(op_types),
+                                  parts_cnt_per_key,
+                                  parts_cnt_per_record));
     }
 }
 
 void DataStoreServiceClient::PrepareRangePartitionBatches(
-    EloqDS::PartitionFlushState& partition_state,
-    const std::vector<size_t>& flush_recs,
-    const std::vector<std::unique_ptr<txservice::FlushTaskEntry>>& entries,
-    const txservice::TableName& table_name,
+    EloqDS::PartitionFlushState &partition_state,
+    const std::vector<size_t> &flush_recs,
+    const std::vector<std::unique_ptr<txservice::FlushTaskEntry>> &entries,
+    const txservice::TableName &table_name,
     uint16_t parts_cnt_per_key,
     uint16_t parts_cnt_per_record,
     uint64_t now)
@@ -4305,11 +4383,8 @@ void DataStoreServiceClient::PrepareRangePartitionBatches(
         op_types.push_back(WriteOpType::PUT);
         batch_size += sizeof(WriteOpType);
 
-        SerializeTxRecord(is_deleted,
-                          rec,
-                          record_tmp_mem_area,
-                          record_parts,
-                          batch_size);
+        SerializeTxRecord(
+            is_deleted, rec, record_tmp_mem_area, record_parts, batch_size);
 
         records_ts.push_back(ckpt_rec.commit_ts_);
         batch_size += sizeof(uint64_t);
@@ -4323,15 +4398,16 @@ void DataStoreServiceClient::PrepareRangePartitionBatches(
             // Start a new batch if size limit reached
             if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
             {
-                partition_state.AddBatch(PartitionBatchRequest(
-                    std::move(key_parts),
-                    std::move(record_parts),
-                    std::move(records_ts),
-                    std::move(records_ttl),
-                    std::move(op_types),
-                    parts_cnt_per_key,
-                    parts_cnt_per_record));
-                
+                partition_state.AddBatch(
+                    PartitionBatchRequest(std::move(key_parts),
+                                          std::move(record_parts),
+                                          std::move(records_ts),
+                                          std::move(records_ttl),
+                                          std::move(record_tmp_mem_area),
+                                          std::move(op_types),
+                                          parts_cnt_per_key,
+                                          parts_cnt_per_record));
+
                 key_parts.clear();
                 record_parts.clear();
                 records_ts.clear();
@@ -4341,8 +4417,9 @@ void DataStoreServiceClient::PrepareRangePartitionBatches(
                 write_batch_size = 0;
             }
 
-            assert(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal ||
-                   ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted);
+            assert(
+                ckpt_rec.payload_status_ == txservice::RecordStatus::Normal ||
+                ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted);
 
             // Currently there is no object table in range partitioned table
             PrepareRecordData(ckpt_rec, write_batch_size);
@@ -4352,14 +4429,15 @@ void DataStoreServiceClient::PrepareRangePartitionBatches(
     // Add the last batch if it has data
     if (key_parts.size() > 0)
     {
-        partition_state.AddBatch(PartitionBatchRequest(
-            std::move(key_parts),
-            std::move(record_parts),
-            std::move(records_ts),
-            std::move(records_ttl),
-            std::move(op_types),
-            parts_cnt_per_key,
-            parts_cnt_per_record));
+        partition_state.AddBatch(
+            PartitionBatchRequest(std::move(key_parts),
+                                  std::move(record_parts),
+                                  std::move(records_ts),
+                                  std::move(records_ttl),
+                                  std::move(record_tmp_mem_area),
+                                  std::move(op_types),
+                                  parts_cnt_per_key,
+                                  parts_cnt_per_record));
     }
 }
 
diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp
index 77774ce..a3a1238 100644
--- a/data_store_service_client_closure.cpp
+++ b/data_store_service_client_closure.cpp
@@ -20,11 +20,12 @@
  *
  */
 
+#include "data_store_service_client_closure.h"
+
 #include <memory>
 #include <string>
 #include <utility>
 
-#include "data_store_service_client_closure.h"
 #include "store_util.h"  // host_to_big_endian
 #include "tx_service/include/cc/cc_request.h"
 #include "tx_service/include/cc/local_cc_shards.h"
@@ -173,8 +174,8 @@ void FetchRecordCallback(void *data,
             if (!DataStoreServiceClient::DeserializeTxRecordStr(
                     val, is_deleted, offset))
             {
-                LOG(ERROR) << "====fetch record===decode error=="
-                           << " key: " << read_closure->Key()
+                LOG(ERROR) << "====fetch record===decode error==" << " key: "
+                           << read_closure->Key()
                            << " status: " << (int) fetch_cc->rec_status_;
                 std::abort();
             }
@@ -396,52 +397,53 @@ void FetchTableCallback(void *data,
     fetch_table_data->Notify();
 }
 
-void SyncPutAllCallback(void *data,
-                        ::google::protobuf::Closure *closure,
-                        DataStoreServiceClient &client,
-                        const remote::CommonResult &result)
+void SyncConcurrentRequestCallback(void *data,
+                                   ::google::protobuf::Closure *closure,
+                                   DataStoreServiceClient &client,
+                                   const remote::CommonResult &result)
 {
-    auto *callback_data = reinterpret_cast<SyncPutAllData *>(data);
+    auto *callback_data = reinterpret_cast<SyncConcurrentRequest *>(data);
     callback_data->Finish(result);
 }
 
 void PartitionBatchCallback(void *data,
-                           ::google::protobuf::Closure *closure,
-                           DataStoreServiceClient &client,
-                           const remote::CommonResult &result)
+                            ::google::protobuf::Closure *closure,
+                            DataStoreServiceClient &client,
+                            const remote::CommonResult &result)
 {
     auto *callback_data = reinterpret_cast<PartitionCallbackData *>(data);
     auto *partition_state = callback_data->partition_state;
     auto *global_coordinator = callback_data->global_coordinator;
-    
+
     // Check if the batch failed
-    if (result.error_code() != remote::DataStoreError::NO_ERROR) {
+    if (result.error_code() != remote::DataStoreError::NO_ERROR)
+    {
         partition_state->MarkFailed(result);
         // Notify the global coordinator that this partition failed
         global_coordinator->OnPartitionCompleted();
         return;
     }
-    
+
     // Try to get the next batch for this partition
     PartitionBatchRequest next_batch;
-    if (partition_state->GetNextBatch(next_batch)) {
+    if (partition_state->GetNextBatch(next_batch))
+    {
         // Send the next batch
-        client.BatchWriteRecords(
-            callback_data->table_name,
-            partition_state->partition_id,
-            std::move(next_batch.key_parts),
-            std::move(next_batch.record_parts),
-            std::move(next_batch.records_ts),
-            std::move(next_batch.records_ttl),
-            std::move(next_batch.op_types),
-            true, // skip_wal
-            callback_data,
-            PartitionBatchCallback,
-            next_batch.parts_cnt_per_key,
-            next_batch.parts_cnt_per_record);
-    } else {
-        // No more batches, mark partition as completed
-        partition_state->MarkCompleted();
+        client.BatchWriteRecords(callback_data->table_name,
+                                 partition_state->partition_id,
+                                 std::move(next_batch.key_parts),
+                                 std::move(next_batch.record_parts),
+                                 std::move(next_batch.records_ts),
+                                 std::move(next_batch.records_ttl),
+                                 std::move(next_batch.op_types),
+                                 true,  // skip_wal
+                                 callback_data,
+                                 PartitionBatchCallback,
+                                 next_batch.parts_cnt_per_key,
+                                 next_batch.parts_cnt_per_record);
+    }
+    else
+    {
         // Notify the global coordinator that this partition completed
         global_coordinator->OnPartitionCompleted();
     }
@@ -1428,4 +1430,23 @@ void CreateSnapshotForBackupCallback(void *data,
 
     backup_callback_data->Notify();
 }
+
+/**
+ * @brief Dequeues the oldest pending batch of this partition.
+ *
+ * @param batch Receives the dequeued batch by move assignment; left untouched
+ * when the queue is empty.
+ * @return true if a batch was handed out, false if none are pending.
+ */
+bool PartitionFlushState::GetNextBatch(PartitionBatchRequest &batch)
+{
+    std::unique_lock<bthread::Mutex> guard(mux);
+    const bool has_pending = !pending_batches.empty();
+    if (has_pending)
+    {
+        batch = std::move(pending_batches.front());
+        pending_batches.pop();
+    }
+    return has_pending;
+}
 }  // namespace EloqDS
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index d02934f..ddc6dc3 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -26,17 +26,18 @@
 #include <bthread/mutex.h>
 
 #include <memory>
+#include <queue>
 #include <string>
 #include <utility>
-#include <queue>
 #include <vector>
 
 #include "data_store_service_client.h"
-#include "eloq_data_store_service/object_pool.h"
 #include "eloq_data_store_service/data_store_service.h"
+#include "eloq_data_store_service/object_pool.h"
 
 // Forward declarations
-namespace EloqDS {
+namespace EloqDS
+{
 class DataStoreServiceClient;
 }
 
@@ -123,21 +124,83 @@ struct SyncCallbackData : public Poolable
 
     remote::CommonResult result_;
 };
+
 /**
- * Aggregation and flow-control helper for coordinating many concurrent put-all
- * writes.
+ * @brief Per-partition state management for concurrent flushing
+ */
+struct PartitionFlushState : public Poolable
+{
+    int32_t partition_id;
+    std::queue<PartitionBatchRequest> pending_batches;
+    bool failed = false;
+    remote::CommonResult result;
+    mutable bthread::Mutex mux;
+
+    PartitionFlushState() : partition_id(0)
+    {
+        result.Clear();
+    }
+
+    void Reset(int32_t pid)
+    {
+        partition_id = pid;
+        while (!pending_batches.empty())
+        {
+            pending_batches.pop();
+        }
+        failed = false;
+        result.Clear();
+    }
+
+    /**
+     * @brief Restores the pristine state: partition id 0, no queued batches
+     * and no recorded failure. Delegates to Reset() so the two never diverge.
+     */
+    void Clear() override
+    {
+        Reset(0);
+    }
+    bool IsFailed() const
+    {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        return failed;
+    }
+
+    void MarkFailed(const remote::CommonResult &error)
+    {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        failed = true;
+        result.set_error_code(error.error_code());
+        result.set_error_msg(error.error_msg());
+    }
+
+    bool GetNextBatch(PartitionBatchRequest &batch);
+
+    void AddBatch(PartitionBatchRequest &&batch)
+    {
+        std::unique_lock<bthread::Mutex> lk(mux);
+        pending_batches.push(std::move(batch));
+    }
+};
+
+/**
+ * Coordination helper for concurrent partition-based put-all operations.
  *
- * - unfinished_request_cnt_: signed count of outstanding write requests (must
- * be signed).
- * - all_request_started_: set to true once all requests have been launched.
- * - max_flying_write_count: upper bound on concurrent in-flight writes (32).
+ * This structure manages the coordination of multiple partitions that can
+ * process concurrently, with each partition maintaining serialization (only one
+ * request in-flight per partition). It tracks partition completion and provides
+ * global coordination for the entire PutAll operation.
  *
- * Finish(res) will merge the first non-NO_ERROR result into `result_`,
- * decrement the unfinished request count, and notify a waiter when either:
- *  - all requests have been started and the unfinished count reaches zero, or
- *  - the unfinished count falls to (max_flying_write_count - 1), enabling flow
- * control to allow launching further requests while keeping in-flight writes
- * bounded.
+ * Key components:
+ * - partition_states_: pointers to PartitionFlushState, one per partition
+ * - completed_partitions_: count of partitions that have finished processing
+ * - total_partitions_: total number of partitions to process
+ * - OnPartitionCompleted(): called when a partition finishes (success or
+ * failure)
+ *
+ * The structure waits for all partitions to complete before the PutAll
+ * operation can finish. If any partition fails, the entire operation is
+ * considered failed.
  */
 
 struct SyncPutAllData : public Poolable
@@ -146,23 +209,75 @@ struct SyncPutAllData : public Poolable
 
     void Reset()
     {
-        unfinished_request_cnt_ = 0;
-        all_request_started_ = false;
-        result_.Clear();
         // Clear partition states if using new concurrent approach
         partition_states_.clear();
         completed_partitions_ = 0;
         total_partitions_ = 0;
     }
 
+    // Clears and frees every tracked partition state, then zeroes counters.
+    virtual void Clear() override
+    {
+        for (PartitionFlushState *state : partition_states_)
+        {
+            state->Clear();
+            state->Free();
+        }
+        partition_states_.clear();
+        total_partitions_ = completed_partitions_ = 0;
+    }
+    // Counts one finished partition; wakes a waiter once all have reported.
+    void OnPartitionCompleted()
+    {
+        std::unique_lock<bthread::Mutex> guard(mux_);
+        if (++completed_partitions_ >= total_partitions_)
+        {
+            cv_.notify_one();
+        }
+    }
+    mutable bthread::Mutex mux_;
+    bthread::ConditionVariable cv_;
+
+    // fields for per-partition coordination
+    std::vector<PartitionFlushState *> partition_states_;
+    int32_t completed_partitions_{0};
+    int32_t total_partitions_{0};
+};
+
+/**
+ * Coordination helper for sequential batch operations with global concurrency
+ * control.
+ *
+ * This structure manages the coordination of sequential batch operations (like
+ * PutArchivesAll) where batches are processed one after another within each
+ * partition, but with global concurrency control to limit the total number of
+ * in-flight requests across all partitions.
+ *
+ * Key features:
+ * - Global concurrency control with max_flying_write_count limit (32)
+ * - Sequential processing within each partition
+ * - Flow control to prevent overwhelming the system
+ * - Error aggregation from all batches
+ *
+ * This is used by operations that need to maintain sequential ordering within
+ * partitions while still allowing some concurrency across the system.
+ */
+struct SyncConcurrentRequest : public Poolable
+{
+    static constexpr int32_t max_flying_write_count = 32;
+
+    void Reset()
+    {
+        unfinished_request_cnt_ = 0;
+        all_request_started_ = false;
+        result_.Clear();
+    }
+
     virtual void Clear() override
     {
         unfinished_request_cnt_ = 0;
         all_request_started_ = false;
         result_.Clear();
-        partition_states_.clear();
-        completed_partitions_ = 0;
-        total_partitions_ = 0;
     }
 
     void Finish(const remote::CommonResult &res)
@@ -182,138 +297,77 @@ struct SyncPutAllData : public Poolable
         }
     }
 
-    // New method for per-partition coordination
-    void OnPartitionCompleted()
-    {
-        std::unique_lock<bthread::Mutex> lk(mux_);
-        completed_partitions_++;
-        if (completed_partitions_ >= total_partitions_) {
-            cv_.notify_one();
-        }
-    }
     // NOTICE: "unfinished_request_cnt_" must use signed integer.
     int32_t unfinished_request_cnt_{0};
     bool all_request_started_{false};
     remote::CommonResult result_;
     mutable bthread::Mutex mux_;
     bthread::ConditionVariable cv_;
-    
-    // New fields for per-partition coordination
-    std::vector<std::unique_ptr<PartitionFlushState>> partition_states_;
-    int32_t completed_partitions_{0};
-    int32_t total_partitions_{0};
 };
 
 /**
  * @brief Represents a single batch request for a partition
  */
-struct PartitionBatchRequest {
+struct PartitionBatchRequest
+{
     std::vector<std::string_view> key_parts;
     std::vector<std::string_view> record_parts;
     std::vector<uint64_t> records_ts;
     std::vector<uint64_t> records_ttl;
+    std::vector<size_t> record_tmp_mem_area;
     std::vector<WriteOpType> op_types;
     uint16_t parts_cnt_per_key;
     uint16_t parts_cnt_per_record;
-    
+
     PartitionBatchRequest() = default;
-    
-    PartitionBatchRequest(std::vector<std::string_view>&& keys,
-                         std::vector<std::string_view>&& records,
-                         std::vector<uint64_t>&& ts,
-                         std::vector<uint64_t>&& ttl,
-                         std::vector<WriteOpType>&& ops,
-                         uint16_t key_parts_count,
-                         uint16_t record_parts_count)
-        : key_parts(std::move(keys))
-        , record_parts(std::move(records))
-        , records_ts(std::move(ts))
-        , records_ttl(std::move(ttl))
-        , op_types(std::move(ops))
-        , parts_cnt_per_key(key_parts_count)
-        , parts_cnt_per_record(record_parts_count) {}
-};
 
-/**
- * @brief Per-partition state management for concurrent flushing
- */
-struct PartitionFlushState {
-    int32_t partition_id;
-    std::queue<PartitionBatchRequest> pending_batches;
-    bool has_inflight_request = false;
-    bool completed = false;
-    bool failed = false;
-    remote::CommonResult result;
-    mutable bthread::Mutex mux;
-    
-    PartitionFlushState(int32_t pid) : partition_id(pid) {
-        result.Clear();
-    }
-    
-    void Reset() {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        while (!pending_batches.empty()) {
-            pending_batches.pop();
-        }
-        has_inflight_request = false;
-        completed = false;
-        failed = false;
-        result.Clear();
-    }
-    
-    bool HasMoreBatches() const {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        return !pending_batches.empty() || has_inflight_request;
-    }
-    
-    bool IsCompleted() const {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        return completed;
-    }
-    
-    bool IsFailed() const {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        return failed;
-    }
-    
-    void MarkCompleted() {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        completed = true;
-    }
-    
-    void MarkFailed(const remote::CommonResult& error) {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        failed = true;
-        result.set_error_code(error.error_code());
-        result.set_error_msg(error.error_msg());
-    }
-    
-    bool GetNextBatch(PartitionBatchRequest& batch) {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        if (pending_batches.empty()) {
-            return false;
-        }
-        batch = std::move(pending_batches.front());
-        pending_batches.pop();
-        return true;
-    }
-    
-    void AddBatch(PartitionBatchRequest&& batch) {
-        std::unique_lock<bthread::Mutex> lk(mux);
-        pending_batches.push(std::move(batch));
+    // Takes ownership of every buffer handed in, record_tmp_mem_area included.
+    PartitionBatchRequest(std::vector<std::string_view> &&keys,
+                          std::vector<std::string_view> &&records,
+                          std::vector<uint64_t> &&ts,
+                          std::vector<uint64_t> &&ttl,
+                          std::vector<size_t> &&record_tmp_mem_area,
+                          std::vector<WriteOpType> &&ops,
+                          uint16_t key_parts_count,
+                          uint16_t record_parts_count)
+        : key_parts(std::move(keys)), record_parts(std::move(records)),
+          records_ts(std::move(ts)), records_ttl(std::move(ttl)),
+          record_tmp_mem_area(std::move(record_tmp_mem_area)),
+          op_types(std::move(ops)),
+          parts_cnt_per_key(key_parts_count),
+          parts_cnt_per_record(record_parts_count)
+    {
     }
 };
-
 /**
  * @brief Wrapper for partition callback data that includes global coordinator
  */
-struct PartitionCallbackData {
-    PartitionFlushState* partition_state;
-    SyncPutAllData* global_coordinator;
-    std::string table_name;
-    
-    PartitionCallbackData(PartitionFlushState* ps, SyncPutAllData* gc, const std::string& tn)
-        : partition_state(ps), global_coordinator(gc), table_name(tn) {}
+struct PartitionCallbackData : public Poolable
+{
+    // Non-owning pointers and view; target lifetimes are managed elsewhere.
+    PartitionFlushState *partition_state{nullptr};
+    SyncPutAllData *global_coordinator{nullptr};
+    std::string_view table_name{""};
+
+    PartitionCallbackData() = default;
+
+    // Points this object at a partition state, coordinator and table name.
+    void Reset(PartitionFlushState *ps,
+               SyncPutAllData *gc,
+               const std::string_view tn)
+    {
+        table_name = tn;
+        global_coordinator = gc;
+        partition_state = ps;
+    }
+
+    // Drops every reference held by this object.
+    void Clear() override
+    {
+        table_name = "";
+        global_coordinator = nullptr;
+        partition_state = nullptr;
+    }
 };
 /**
  * Generic synchronous callback adapter invoked by closures to signal
@@ -336,7 +390,7 @@ void SyncCallback(void *data,
 
 /**
  * Callback data structure for concurrent archive record reading operations.
- * 
+ *
  * Manages synchronization and flow control for reading base records that will
  * be copied to archive storage. Tracks flying read count and provides mutex
  * synchronization for concurrent access.
@@ -801,7 +855,7 @@ class ReadClosure : public ::google::protobuf::Closure, public Poolable
 
 /**
  * Closure for asynchronous data flushing operations to KV storage.
- * 
+ *
  * Manages the lifecycle of flush operations, including RPC communication,
  * retry logic, and callback invocation. Supports both local and remote
  * flush operations with configurable retry behavior.
@@ -2334,7 +2388,7 @@ class CreateSnapshotForBackupClosure : public ::google::protobuf::Closure,
 
 /**
  * Callback for fetching individual records from the data store.
- * 
+ *
  * Handles the completion of record fetch operations and processes the result.
  */
 void FetchRecordCallback(void *data,
@@ -2344,7 +2398,7 @@ void FetchRecordCallback(void *data,
 
 /**
  * Callback for fetching snapshot data from the data store.
- * 
+ *
  * Handles the completion of snapshot fetch operations and processes the result.
  */
 void FetchSnapshotCallback(void *data,
@@ -2354,7 +2408,7 @@ void FetchSnapshotCallback(void *data,
 
 /**
  * Callback data for asynchronous table drop operations.
- * 
+ *
  * Contains the KV table name that is being dropped.
  */
 struct AsyncDropTableCallbackData
@@ -2364,7 +2418,7 @@ struct AsyncDropTableCallbackData
 
 /**
  * Callback for asynchronous table drop operations.
- * 
+ *
  * Handles the completion of table drop operations and processes the result.
  */
 void AsyncDropTableCallback(void *data,
@@ -2374,8 +2428,9 @@ void AsyncDropTableCallback(void *data,
 
 /**
  * Callback for fetching table catalog information.
- * 
- * Handles the completion of table catalog fetch operations and processes the result.
+ *
+ * Handles the completion of table catalog fetch operations and processes the
+ * result.
  */
 void FetchTableCatalogCallback(void *data,
                                ::google::protobuf::Closure *closure,
@@ -2384,7 +2439,7 @@ void FetchTableCatalogCallback(void *data,
 
 /**
  * Callback data for fetching table information.
- * 
+ *
  * Extends SyncCallbackData to include table-specific information like
  * schema image, version timestamp, and found status.
  */
@@ -2420,30 +2475,30 @@ void FetchTableCallback(void *data,
                         const remote::CommonResult &result);
 
 /**
- * Callback for synchronous put-all operations.
- * 
- * Handles the completion of batch put operations and updates the
- * SyncPutAllData structure with the result.
+ * Callback for synchronous concurrent request operations.
+ *
+ * Handles the completion of concurrent request operations and updates the
+ * SyncConcurrentRequest structure with the result.
  */
-void SyncPutAllCallback(void *data,
-                        ::google::protobuf::Closure *closure,
-                        DataStoreServiceClient &client,
-                        const remote::CommonResult &result);
+void SyncConcurrentRequestCallback(void *data,
+                                   ::google::protobuf::Closure *closure,
+                                   DataStoreServiceClient &client,
+                                   const remote::CommonResult &result);
 
 /**
  * Callback for per-partition batch operations in concurrent PutAll.
- * 
+ *
  * Handles the completion of a single batch for a partition and chains
- * to the next batch if available, or marks the partition as completed.
+ * to the next batch if available, or reports partition completion.
  */
 void PartitionBatchCallback(void *data,
-                           ::google::protobuf::Closure *closure,
-                           DataStoreServiceClient &client,
-                           const remote::CommonResult &result);
+                            ::google::protobuf::Closure *closure,
+                            DataStoreServiceClient &client,
+                            const remote::CommonResult &result);
 
 /**
  * Callback data for fetching database information.
- * 
+ *
  * Extends SyncCallbackData to include database-specific information like
  * database definition, found status, and yield/resume function pointers
  * for cooperative scheduling.
@@ -2506,7 +2561,7 @@ struct FetchDatabaseCallbackData : public SyncCallbackData
 
 /**
  * Callback for fetching database information.
- * 
+ *
  * Handles the completion of database fetch operations and processes the result.
  */
 void FetchDatabaseCallback(void *data,
@@ -2516,7 +2571,7 @@ void FetchDatabaseCallback(void *data,
 
 /**
  * Callback data for fetching all database names.
- * 
+ *
  * Extends SyncCallbackData to include database names list and yield/resume
  * function pointers for cooperative scheduling during pagination.
  */
@@ -2584,8 +2639,9 @@ struct FetchAllDatabaseCallbackData : public SyncCallbackData
 
 /**
  * Callback for fetching all database names.
- * 
- * Handles the completion of all database names fetch operations and processes the result.
+ *
+ * Handles the completion of all database names fetch operations and processes
+ * the result.
  */
 void FetchAllDatabaseCallback(void *data,
                               ::google::protobuf::Closure *closure,
@@ -2594,7 +2650,7 @@ void FetchAllDatabaseCallback(void *data,
 
 /**
  * Callback data for discovering all table names.
- * 
+ *
  * Extends SyncCallbackData to include table names list and yield/resume
  * function pointers for cooperative scheduling during pagination.
  */
@@ -2656,8 +2712,9 @@ struct DiscoverAllTableNamesCallbackData : public SyncCallbackData
 
 /**
  * Callback for discovering all table names.
- * 
- * Handles the completion of table name discovery operations and processes the result.
+ *
+ * Handles the completion of table name discovery operations and processes the
+ * result.
  */
 void DiscoverAllTableNamesCallback(void *data,
                                    ::google::protobuf::Closure *closure,
@@ -2666,8 +2723,9 @@ void DiscoverAllTableNamesCallback(void *data,
 
 /**
  * Callback for fetching table ranges.
- * 
- * Handles the completion of table range fetch operations and processes the result.
+ *
+ * Handles the completion of table range fetch operations and processes the
+ * result.
  */
 void FetchTableRangesCallback(void *data,
                               ::google::protobuf::Closure *closure,
@@ -2676,8 +2734,9 @@ void FetchTableRangesCallback(void *data,
 
 /**
  * Callback for fetching range slices.
- * 
- * Handles the completion of range slice fetch operations and processes the result.
+ *
+ * Handles the completion of range slice fetch operations and processes the
+ * result.
  */
 void FetchRangeSlicesCallback(void *data,
                               ::google::protobuf::Closure *closure,
@@ -2685,8 +2744,9 @@ void FetchRangeSlicesCallback(void *data,
                               const remote::CommonResult &result);
 /**
  * Callback for fetching current table statistics.
- * 
- * Handles the completion of current table statistics fetch operations and processes the result.
+ *
+ * Handles the completion of current table statistics fetch operations and
+ * processes the result.
  */
 void FetchCurrentTableStatsCallback(void *data,
                                     ::google::protobuf::Closure *closure,
@@ -2695,8 +2755,9 @@ void FetchCurrentTableStatsCallback(void *data,
 
 /**
  * Callback for fetching table statistics.
- * 
- * Handles the completion of table statistics fetch operations and processes the result.
+ *
+ * Handles the completion of table statistics fetch operations and processes the
+ * result.
  */
 void FetchTableStatsCallback(void *data,
                              ::google::protobuf::Closure *closure,
@@ -2705,7 +2766,7 @@ void FetchTableStatsCallback(void *data,
 
 /**
  * Callback data for fetching archive records.
- * 
+ *
  * Extends SyncCallbackData to include archive-specific information like
  * table name, partition ID, key ranges, batch size, and scan direction.
  */
@@ -2743,8 +2804,9 @@ struct FetchArchivesCallbackData : public SyncCallbackData
 
 /**
  * Callback for fetching archive records.
- * 
- * Handles the completion of archive record fetch operations and processes the result.
+ *
+ * Handles the completion of archive record fetch operations and processes the
+ * result.
  */
 void FetchArchivesCallback(void *data,
                            ::google::protobuf::Closure *closure,
@@ -2753,8 +2815,9 @@ void FetchArchivesCallback(void *data,
 
 /**
  * Callback for fetching record archives.
- * 
- * Handles the completion of record archive fetch operations and processes the result.
+ *
+ * Handles the completion of record archive fetch operations and processes the
+ * result.
  */
 void FetchRecordArchivesCallback(void *data,
                                  ::google::protobuf::Closure *closure,
@@ -2763,8 +2826,9 @@ void FetchRecordArchivesCallback(void *data,
 
 /**
  * Callback for fetching snapshot archives.
- * 
- * Handles the completion of snapshot archive fetch operations and processes the result.
+ *
+ * Handles the completion of snapshot archive fetch operations and processes the
+ * result.
  */
 void FetchSnapshotArchiveCallback(void *data,
                                   ::google::protobuf::Closure *closure,
@@ -2773,7 +2837,7 @@ void FetchSnapshotArchiveCallback(void *data,
 
 /**
  * Callback data for creating snapshots for backup operations.
- * 
+ *
  * Extends SyncCallbackData to include backup-specific information like
  * backup name, timestamp, and backup files list.
  */
@@ -2805,8 +2869,9 @@ struct CreateSnapshotForBackupCallbackData : public SyncCallbackData
 
 /**
  * Callback for creating snapshots for backup operations.
- * 
- * Handles the completion of snapshot creation for backup operations and processes the result.
+ *
+ * Handles the completion of snapshot creation for backup operations and
+ * processes the result.
  */
 void CreateSnapshotForBackupCallback(void *data,
                                      ::google::protobuf::Closure *closure,

From 479388b7e028a873792c0f7dabbe3b411bb47bef Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 11:45:46 +0000
Subject: [PATCH 7/9] fix record parts corrupted across retry

---
 data_store_service_client.cpp         |  2 +-
 data_store_service_client_closure.cpp |  2 +-
 data_store_service_client_closure.h   | 14 ++++++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index ea54dfd..e40cb60 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -315,7 +315,7 @@ bool DataStoreServiceClient::PutAll(
             auto *callback_data = callback_data_list[i];
 
             // Start the first batch for this partition
-            PartitionBatchRequest first_batch;
+            auto &first_batch = callback_data->inflight_batch;
             if (partition_state->GetNextBatch(first_batch))
             {
                 BatchWriteRecords(callback_data->table_name,
diff --git a/data_store_service_client_closure.cpp b/data_store_service_client_closure.cpp
index a3a1238..be53357 100644
--- a/data_store_service_client_closure.cpp
+++ b/data_store_service_client_closure.cpp
@@ -425,7 +425,7 @@ void PartitionBatchCallback(void *data,
     }
 
     // Try to get the next batch for this partition
-    PartitionBatchRequest next_batch;
+    PartitionBatchRequest &next_batch = callback_data->inflight_batch;
     if (partition_state->GetNextBatch(next_batch))
     {
         // Send the next batch
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index ddc6dc3..9da972d 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -333,11 +333,24 @@ struct PartitionBatchRequest
           record_parts(std::move(records)),
           records_ts(std::move(ts)),
           records_ttl(std::move(ttl)),
+          record_tmp_mem_area(std::move(record_tmp_mem_area)),
           op_types(std::move(ops)),
           parts_cnt_per_key(key_parts_count),
           parts_cnt_per_record(record_parts_count)
     {
     }
+
+    void Clear()
+    {
+        key_parts.clear();
+        record_parts.clear();
+        records_ts.clear();
+        records_ttl.clear();
+        record_tmp_mem_area.clear();
+        op_types.clear();
+        parts_cnt_per_key = 1;
+        parts_cnt_per_record = 1;
+    }
 };
 /**
  * @brief Wrapper for partition callback data that includes global coordinator
@@ -347,6 +360,7 @@ struct PartitionCallbackData : public Poolable
     PartitionFlushState *partition_state;
     SyncPutAllData *global_coordinator;
     std::string_view table_name;
+    PartitionBatchRequest inflight_batch;
 
     PartitionCallbackData()
         : partition_state(nullptr), global_coordinator(nullptr), table_name("")

From 3be1e0bbd1c30cf12d252131a3a5dd33aaf00c20 Mon Sep 17 00:00:00 2001
From: liunyl <lukeliu970702@gmail.com>
Date: Wed, 17 Sep 2025 23:26:46 +0000
Subject: [PATCH 8/9] Fix record_parts invalidated by record_tmp_mem_area resize

---
 data_store_service_client.cpp       | 183 +++++++++++++---------------
 data_store_service_client_closure.h |  13 ++
 2 files changed, 95 insertions(+), 101 deletions(-)

diff --git a/data_store_service_client.cpp b/data_store_service_client.cpp
index e40cb60..bfc8e21 100644
--- a/data_store_service_client.cpp
+++ b/data_store_service_client.cpp
@@ -4198,16 +4198,13 @@ void DataStoreServiceClient::PreparePartitionBatches(
     uint16_t parts_cnt_per_record,
     uint64_t now)
 {
-    std::vector<std::string_view> key_parts;
-    std::vector<std::string_view> record_parts;
-    std::vector<uint64_t> records_ts;
-    std::vector<uint64_t> records_ttl;
-    std::vector<WriteOpType> op_types;
-    std::vector<size_t> record_tmp_mem_area;
     size_t write_batch_size = 0;
-
-    auto PrepareObjectData =
-        [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size)
+    PartitionBatchRequest batch_request;
+    batch_request.Reset(
+        parts_cnt_per_key, parts_cnt_per_record, flush_recs.size());
+    auto PrepareObjectData = [&](txservice::FlushRecord &ckpt_rec,
+                                 size_t &batch_size,
+                                 PartitionBatchRequest &batch_request)
     {
         txservice::TxKey tx_key = ckpt_rec.Key();
         uint64_t ttl =
@@ -4217,72 +4214,77 @@ void DataStoreServiceClient::PreparePartitionBatches(
         if (ckpt_rec.payload_status_ == txservice::RecordStatus::Normal &&
             (!ckpt_rec.Payload()->HasTTL() || ttl > now))
         {
-            key_parts.emplace_back(
+            batch_request.key_parts.emplace_back(
                 std::string_view(tx_key.Data(), tx_key.Size()));
             batch_size += tx_key.Size();
 
             const txservice::TxRecord *rec = ckpt_rec.Payload();
-            record_parts.emplace_back(std::string_view(rec->EncodedBlobData(),
-                                                       rec->EncodedBlobSize()));
+            batch_request.record_parts.emplace_back(std::string_view(
+                rec->EncodedBlobData(), rec->EncodedBlobSize()));
             batch_size += rec->EncodedBlobSize();
 
-            records_ts.push_back(ckpt_rec.commit_ts_);
+            batch_request.records_ts.push_back(ckpt_rec.commit_ts_);
             batch_size += sizeof(uint64_t);
 
-            records_ttl.push_back(ttl);
+            batch_request.records_ttl.push_back(ttl);
             batch_size += sizeof(uint64_t);
 
-            op_types.push_back(WriteOpType::PUT);
+            batch_request.op_types.push_back(WriteOpType::PUT);
             batch_size += sizeof(WriteOpType);
         }
         else
         {
-            key_parts.emplace_back(
+            batch_request.key_parts.emplace_back(
                 std::string_view(tx_key.Data(), tx_key.Size()));
             batch_size += tx_key.Size();
 
-            record_parts.emplace_back(std::string_view());
+            batch_request.record_parts.emplace_back(std::string_view());
             batch_size += 0;
 
-            records_ts.push_back(ckpt_rec.commit_ts_);
+            batch_request.records_ts.push_back(ckpt_rec.commit_ts_);
             batch_size += sizeof(uint64_t);
 
-            records_ttl.push_back(0);
+            batch_request.records_ttl.push_back(0);
             batch_size += sizeof(uint64_t);
 
-            op_types.push_back(WriteOpType::DELETE);
+            batch_request.op_types.push_back(WriteOpType::DELETE);
             batch_size += sizeof(WriteOpType);
         }
     };
 
-    auto PrepareRecordData =
-        [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size)
+    auto PrepareRecordData = [&](txservice::FlushRecord &ckpt_rec,
+                                 size_t &batch_size,
+                                 PartitionBatchRequest &batch_request)
     {
         uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000;
         txservice::TxKey tx_key = ckpt_rec.Key();
         bool is_deleted =
             !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal);
-        key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size()));
+        batch_request.key_parts.emplace_back(
+            std::string_view(tx_key.Data(), tx_key.Size()));
         batch_size += tx_key.Size();
 
         const txservice::TxRecord *rec = ckpt_rec.Payload();
         if (is_deleted)
         {
-            records_ttl.push_back(retired_ttl_for_deleted);
+            batch_request.records_ttl.push_back(retired_ttl_for_deleted);
         }
         else
         {
-            records_ttl.push_back(0);
+            batch_request.records_ttl.push_back(0);
         }
         batch_size += sizeof(uint64_t);
 
-        op_types.push_back(WriteOpType::PUT);
+        batch_request.op_types.push_back(WriteOpType::PUT);
         batch_size += sizeof(WriteOpType);
 
-        SerializeTxRecord(
-            is_deleted, rec, record_tmp_mem_area, record_parts, batch_size);
+        SerializeTxRecord(is_deleted,
+                          rec,
+                          batch_request.record_tmp_mem_area,
+                          batch_request.record_parts,
+                          batch_size);
 
-        records_ts.push_back(ckpt_rec.commit_ts_);
+        batch_request.records_ts.push_back(ckpt_rec.commit_ts_);
         batch_size += sizeof(uint64_t);
     };
 
@@ -4293,24 +4295,18 @@ void DataStoreServiceClient::PreparePartitionBatches(
             entries.at(idx.first)->data_sync_vec_->at(idx.second);
 
         // Start a new batch if size limit reached
-        if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
+        // or record_tmp_mem_area is at its reserved capacity. record_parts
+        // holds string_views that point into record_tmp_mem_area, so letting
+        // that vector grow would reallocate its buffer and leave every view in
+        // record_parts dangling.
+        if (write_batch_size >= MAX_WRITE_BATCH_SIZE ||
+            batch_request.record_tmp_mem_area.size() ==
+                batch_request.record_tmp_mem_area.capacity())
         {
-            partition_state.AddBatch(
-                PartitionBatchRequest(std::move(key_parts),
-                                      std::move(record_parts),
-                                      std::move(records_ts),
-                                      std::move(records_ttl),
-                                      std::move(record_tmp_mem_area),
-                                      std::move(op_types),
-                                      parts_cnt_per_key,
-                                      parts_cnt_per_record));
-
-            key_parts.clear();
-            record_parts.clear();
-            records_ts.clear();
-            records_ttl.clear();
-            op_types.clear();
-            record_tmp_mem_area.clear();
+            partition_state.AddBatch(std::move(batch_request));
+
+            batch_request.Reset(
+                parts_cnt_per_key, parts_cnt_per_record, flush_recs.size());
             write_batch_size = 0;
         }
 
@@ -4319,26 +4315,18 @@ void DataStoreServiceClient::PreparePartitionBatches(
 
         if (table_name.IsObjectTable())
         {
-            PrepareObjectData(ckpt_rec, write_batch_size);
+            PrepareObjectData(ckpt_rec, write_batch_size, batch_request);
         }
         else
         {
-            PrepareRecordData(ckpt_rec, write_batch_size);
+            PrepareRecordData(ckpt_rec, write_batch_size, batch_request);
         }
     }
 
     // Add the last batch if it has data
-    if (key_parts.size() > 0)
+    if (batch_request.key_parts.size() > 0)
     {
-        partition_state.AddBatch(
-            PartitionBatchRequest(std::move(key_parts),
-                                  std::move(record_parts),
-                                  std::move(records_ts),
-                                  std::move(records_ttl),
-                                  std::move(record_tmp_mem_area),
-                                  std::move(op_types),
-                                  parts_cnt_per_key,
-                                  parts_cnt_per_record));
+        partition_state.AddBatch(std::move(batch_request));
     }
 }
 
@@ -4351,69 +4339,70 @@ void DataStoreServiceClient::PrepareRangePartitionBatches(
     uint16_t parts_cnt_per_record,
     uint64_t now)
 {
-    std::vector<std::string_view> key_parts;
-    std::vector<std::string_view> record_parts;
-    std::vector<uint64_t> records_ts;
-    std::vector<uint64_t> records_ttl;
-    std::vector<WriteOpType> op_types;
-    std::vector<size_t> record_tmp_mem_area;
     size_t write_batch_size = 0;
+    PartitionBatchRequest batch_request;
 
-    auto PrepareRecordData =
-        [&](txservice::FlushRecord &ckpt_rec, size_t &batch_size)
+    auto PrepareRecordData = [&](txservice::FlushRecord &ckpt_rec,
+                                 size_t &batch_size,
+                                 PartitionBatchRequest &batch_request)
     {
         uint64_t retired_ttl_for_deleted = now + 24 * 60 * 60 * 1000;
         txservice::TxKey tx_key = ckpt_rec.Key();
         bool is_deleted =
             !(ckpt_rec.payload_status_ == txservice::RecordStatus::Normal);
-        key_parts.emplace_back(std::string_view(tx_key.Data(), tx_key.Size()));
+        batch_request.key_parts.emplace_back(
+            std::string_view(tx_key.Data(), tx_key.Size()));
         batch_size += tx_key.Size();
 
         const txservice::TxRecord *rec = ckpt_rec.Payload();
         if (is_deleted)
         {
-            records_ttl.push_back(retired_ttl_for_deleted);
+            batch_request.records_ttl.push_back(retired_ttl_for_deleted);
         }
         else
         {
-            records_ttl.push_back(0);
+            batch_request.records_ttl.push_back(0);
         }
         batch_size += sizeof(uint64_t);
 
-        op_types.push_back(WriteOpType::PUT);
+        batch_request.op_types.push_back(WriteOpType::PUT);
         batch_size += sizeof(WriteOpType);
 
-        SerializeTxRecord(
-            is_deleted, rec, record_tmp_mem_area, record_parts, batch_size);
+        SerializeTxRecord(is_deleted,
+                          rec,
+                          batch_request.record_tmp_mem_area,
+                          batch_request.record_parts,
+                          batch_size);
 
-        records_ts.push_back(ckpt_rec.commit_ts_);
+        batch_request.records_ts.push_back(ckpt_rec.commit_ts_);
         batch_size += sizeof(uint64_t);
     };
 
+    size_t rec_cnt = 0;
+    for (auto idx : flush_recs)
+    {
+        rec_cnt += entries.at(idx)->data_sync_vec_->size();
+    }
+    batch_request.Reset(parts_cnt_per_key, parts_cnt_per_record, rec_cnt);
+
     // Process records and create batches
     for (auto idx : flush_recs)
     {
         for (auto &ckpt_rec : *entries.at(idx)->data_sync_vec_)
         {
             // Start a new batch if size limit reached
-            if (write_batch_size >= MAX_WRITE_BATCH_SIZE)
+            // or record_tmp_mem_area is at its reserved capacity. record_parts
+            // holds string_views that point into record_tmp_mem_area, so
+            // letting that vector grow would reallocate its buffer and leave
+            // every view in record_parts dangling.
+            if (write_batch_size >= MAX_WRITE_BATCH_SIZE ||
+                batch_request.record_tmp_mem_area.size() ==
+                    batch_request.record_tmp_mem_area.capacity())
             {
-                partition_state.AddBatch(
-                    PartitionBatchRequest(std::move(key_parts),
-                                          std::move(record_parts),
-                                          std::move(records_ts),
-                                          std::move(records_ttl),
-                                          std::move(record_tmp_mem_area),
-                                          std::move(op_types),
-                                          parts_cnt_per_key,
-                                          parts_cnt_per_record));
-
-                key_parts.clear();
-                record_parts.clear();
-                records_ts.clear();
-                records_ttl.clear();
-                op_types.clear();
-                record_tmp_mem_area.clear();
+                partition_state.AddBatch(std::move(batch_request));
+
+                batch_request.Reset(
+                    parts_cnt_per_key, parts_cnt_per_record, rec_cnt);
                 write_batch_size = 0;
             }
 
@@ -4422,22 +4411,14 @@ void DataStoreServiceClient::PrepareRangePartitionBatches(
                 ckpt_rec.payload_status_ == txservice::RecordStatus::Deleted);
 
             // Currently there is no object table in range partitioned table
-            PrepareRecordData(ckpt_rec, write_batch_size);
+            PrepareRecordData(ckpt_rec, write_batch_size, batch_request);
         }
     }
 
     // Add the last batch if it has data
-    if (key_parts.size() > 0)
+    if (batch_request.key_parts.size() > 0)
     {
-        partition_state.AddBatch(
-            PartitionBatchRequest(std::move(key_parts),
-                                  std::move(record_parts),
-                                  std::move(records_ts),
-                                  std::move(records_ttl),
-                                  std::move(record_tmp_mem_area),
-                                  std::move(op_types),
-                                  parts_cnt_per_key,
-                                  parts_cnt_per_record));
+        partition_state.AddBatch(std::move(batch_request));
     }
 }
 
diff --git a/data_store_service_client_closure.h b/data_store_service_client_closure.h
index 9da972d..7878c31 100644
--- a/data_store_service_client_closure.h
+++ b/data_store_service_client_closure.h
@@ -351,6 +351,19 @@ struct PartitionBatchRequest
         parts_cnt_per_key = 1;
         parts_cnt_per_record = 1;
     }
+
+    void Reset(uint16_t key_parts_count, uint16_t record_parts_count, size_t record_cnt)
+    {
+        Clear();
+        parts_cnt_per_key = key_parts_count;
+        parts_cnt_per_record = record_parts_count;
+        key_parts.reserve(key_parts_count * record_cnt);
+        record_parts.reserve(record_parts_count * record_cnt);
+        records_ts.reserve(record_cnt);
+        records_ttl.reserve(record_cnt);
+        record_tmp_mem_area.reserve(record_cnt * 2);
+        op_types.reserve(record_cnt);
+    }
 };
 /**
  * @brief Wrapper for partition callback data that includes global coordinator

From 031ae080ccaf6bd895d9daf93ec4c09cb1cd33be Mon Sep 17 00:00:00 2001
From: Chen Zhao <ch3nzhao@gmail.com>
Date: Thu, 18 Sep 2025 12:11:16 +0800
Subject: [PATCH 9/9] fix compile error and change amplification factor to 2

---
 eloq_data_store_service/eloq_store_config.cpp     | 2 +-
 eloq_data_store_service/eloq_store_data_store.cpp | 7 +++++++
 eloq_data_store_service/eloq_store_data_store.h   | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/eloq_data_store_service/eloq_store_config.cpp b/eloq_data_store_service/eloq_store_config.cpp
index f9e10b6..6ffa0ea 100644
--- a/eloq_data_store_service/eloq_store_config.cpp
+++ b/eloq_data_store_service/eloq_store_config.cpp
@@ -85,7 +85,7 @@ DEFINE_uint32(eloq_store_max_archive_tasks,
               256,
               "EloqStore max archive tasks.");
 DEFINE_uint32(eloq_store_file_amplify_factor,
-              4,
+              2,
               "EloqStore file amplify factor.");
 DEFINE_uint64(eloq_store_local_space_limit,
               1ULL << 40,
diff --git a/eloq_data_store_service/eloq_store_data_store.cpp b/eloq_data_store_service/eloq_store_data_store.cpp
index d36b0f5..c9ebf06 100644
--- a/eloq_data_store_service/eloq_store_data_store.cpp
+++ b/eloq_data_store_service/eloq_store_data_store.cpp
@@ -495,6 +495,13 @@ void EloqStoreDataStore::SwitchToReadWrite()
     return;
 }
 
+void EloqStoreDataStore::CreateSnapshotForBackup(
+    CreateSnapshotForBackupRequest *req)
+{
+    return;
+}
+
+
 void EloqStoreDataStore::ScanDelete(DeleteRangeRequest *delete_range_req)
 {
     ::eloqstore::TableIdent eloq_store_table_id;
diff --git a/eloq_data_store_service/eloq_store_data_store.h b/eloq_data_store_service/eloq_store_data_store.h
index 550a569..f0e12a9 100644
--- a/eloq_data_store_service/eloq_store_data_store.h
+++ b/eloq_data_store_service/eloq_store_data_store.h
@@ -252,6 +252,8 @@ class EloqStoreDataStore : public DataStore
      */
     void SwitchToReadWrite() override;
 
+    void CreateSnapshotForBackup(CreateSnapshotForBackupRequest *req) override;
+
 private:
     static void OnRead(::eloqstore::KvRequest *req);
     static void OnBatchWrite(::eloqstore::KvRequest *req);