diff --git a/be/src/exec/scan/olap_scanner.cpp b/be/src/exec/scan/olap_scanner.cpp
index 320976814679b9..efa536ea690779 100644
--- a/be/src/exec/scan/olap_scanner.cpp
+++ b/be/src/exec/scan/olap_scanner.cpp
@@ -152,7 +152,10 @@ static bool has_file_cache_statistics(const io::FileCacheStatistics& stats) {
            stats.inverted_index_bytes_read_from_remote != 0 ||
            stats.inverted_index_bytes_read_from_peer != 0 ||
            stats.inverted_index_local_io_timer != 0 || stats.inverted_index_remote_io_timer != 0 ||
-           stats.inverted_index_peer_io_timer != 0 || stats.inverted_index_io_timer != 0;
+           stats.inverted_index_peer_io_timer != 0 || stats.inverted_index_io_timer != 0 ||
+           stats.inverted_index_request_bytes != 0 || stats.inverted_index_read_bytes != 0 ||
+           stats.inverted_index_range_read_count != 0 ||
+           stats.inverted_index_serial_read_rounds != 0;
 }
 
 Status OlapScanner::_prepare_impl() {
diff --git a/be/src/io/cache/block_file_cache_profile.cpp b/be/src/io/cache/block_file_cache_profile.cpp
index 8f9c167c9989e6..10ea52670789a0 100644
--- a/be/src/io/cache/block_file_cache_profile.cpp
+++ b/be/src/io/cache/block_file_cache_profile.cpp
@@ -98,6 +98,10 @@ FileCacheStatistics diff_file_cache_statistics(const FileCacheStatistics& curren
     SUBTRACT_FIELD(inverted_index_remote_io_timer);
     SUBTRACT_FIELD(inverted_index_peer_io_timer);
     SUBTRACT_FIELD(inverted_index_io_timer);
+    SUBTRACT_FIELD(inverted_index_request_bytes);
+    SUBTRACT_FIELD(inverted_index_read_bytes);
+    SUBTRACT_FIELD(inverted_index_range_read_count);
+    SUBTRACT_FIELD(inverted_index_serial_read_rounds);
 #undef SUBTRACT_FIELD
     return diff;
 }
@@ -156,6 +160,14 @@ FileCacheProfileReporter::FileCacheProfileReporter(RuntimeProfile* profile) {
             ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexPeerIOUseTimer", cache_profile, 1);
     inverted_index_io_timer =
             ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexIOTimer", cache_profile, 1);
+    inverted_index_request_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(
+            profile, "InvertedIndexRequestBytes", TUnit::BYTES, cache_profile, 1);
+    inverted_index_read_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "InvertedIndexReadBytes",
+                                                             TUnit::BYTES, cache_profile, 1);
+    inverted_index_range_read_count = ADD_CHILD_COUNTER_WITH_LEVEL(
+            profile, "InvertedIndexRangeReadCount", TUnit::UNIT, cache_profile, 1);
+    inverted_index_serial_read_rounds = ADD_CHILD_COUNTER_WITH_LEVEL(
+            profile, "InvertedIndexSerialReadRounds", TUnit::UNIT, cache_profile, 1);
 }
 
 void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) const {
@@ -193,6 +205,11 @@ void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) con
     COUNTER_UPDATE(inverted_index_remote_io_timer, statistics->inverted_index_remote_io_timer);
     COUNTER_UPDATE(inverted_index_peer_io_timer, statistics->inverted_index_peer_io_timer);
     COUNTER_UPDATE(inverted_index_io_timer, statistics->inverted_index_io_timer);
+    COUNTER_UPDATE(inverted_index_request_bytes, statistics->inverted_index_request_bytes);
+    COUNTER_UPDATE(inverted_index_read_bytes, statistics->inverted_index_read_bytes);
+    COUNTER_UPDATE(inverted_index_range_read_count, statistics->inverted_index_range_read_count);
+    COUNTER_UPDATE(inverted_index_serial_read_rounds,
+                   statistics->inverted_index_serial_read_rounds);
 }
 
 } // namespace doris::io
diff --git a/be/src/io/cache/block_file_cache_profile.h b/be/src/io/cache/block_file_cache_profile.h
index 6c95e49791c054..41cc2e0c01b41a 100644
--- a/be/src/io/cache/block_file_cache_profile.h
+++ b/be/src/io/cache/block_file_cache_profile.h
@@ -58,7 +58,6 @@ class FileCacheMetrics {
     void register_entity();
     void update_metrics_callback();
 
-private:
     std::mutex _mtx;
     // use shared_ptr for concurrent
     std::shared_ptr<AtomicStatistics> _statistics;
@@ -97,6 +96,10 @@ struct FileCacheProfileReporter {
     RuntimeProfile::Counter* inverted_index_remote_io_timer = nullptr;
     RuntimeProfile::Counter* inverted_index_peer_io_timer = nullptr;
     RuntimeProfile::Counter* inverted_index_io_timer = nullptr;
+    RuntimeProfile::Counter* inverted_index_request_bytes = nullptr;
+    RuntimeProfile::Counter* inverted_index_read_bytes = nullptr;
+    RuntimeProfile::Counter* inverted_index_range_read_count = nullptr;
+    RuntimeProfile::Counter* inverted_index_serial_read_rounds = nullptr;
 
     FileCacheProfileReporter(RuntimeProfile* profile);
     void update(const FileCacheStatistics* statistics) const;
diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h
index 36b20517afb87c..391f3b15c34e8d 100644
--- a/be/src/io/io_common.h
+++ b/be/src/io/io_common.h
@@ -74,6 +74,10 @@ struct FileCacheStatistics {
     int64_t inverted_index_remote_io_timer = 0;
     int64_t inverted_index_peer_io_timer = 0;
     int64_t inverted_index_io_timer = 0;
+    int64_t inverted_index_request_bytes = 0;
+    int64_t inverted_index_read_bytes = 0;
+    int64_t inverted_index_range_read_count = 0;
+    int64_t inverted_index_serial_read_rounds = 0;
 };
 
 struct IOContext {
diff --git a/be/src/snii/common/slice.h b/be/src/snii/common/slice.h
new file mode 100644
index 00000000000000..db10b2dfc52b6f
--- /dev/null
+++ b/be/src/snii/common/slice.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+namespace snii {
+
+// Non-owning, read-only view over a contiguous byte range; the caller keeps the backing buffer alive.
+class Slice {
+public:
+    Slice() = default; // empty view: data() == nullptr, size() == 0
+    Slice(const uint8_t* d, size_t n) : data_(d), size_(n) {}
+    explicit Slice(const std::vector<uint8_t>& v) : data_(v.data()), size_(v.size()) {}
+    explicit Slice(std::string_view sv)
+            : data_(reinterpret_cast<const uint8_t*>(sv.data())), size_(sv.size()) {}
+
+    const uint8_t* data() const { return data_; }
+    size_t size() const { return size_; }
+    bool empty() const { return size_ == 0; }
+
+    uint8_t operator[](size_t i) const {
+        assert(i < size_); // debug-only bounds check (assert is compiled out under NDEBUG)
+        return data_[i];
+    }
+
+    Slice subslice(size_t off, size_t n) const {
+        assert(off <= size_ && n <= size_ - off); // overflow-safe: off + n could wrap in size_t
+        return Slice(data_ + off, n);
+    }
+
+private:
+    const uint8_t* data_ = nullptr;
+    size_t size_ = 0;
+};
+
+} // namespace snii
diff --git a/be/src/snii/common/status.h b/be/src/snii/common/status.h
new file mode 100644
index 00000000000000..a8e21da814184a
--- /dev/null
+++ b/be/src/snii/common/status.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <string>
+#include <utility>
+
+namespace snii {
+
+enum class StatusCode {
+    kOk,         // success; the code a default-constructed Status carries
+    kCorruption, // every non-OK code has a same-named Status factory (Status::Corruption, ...)
+    kNotFound,
+    kInvalidArgument,
+    kIoError,
+    kUnsupported,
+    kInternal,
+};
+
+// Lightweight error type: success is kOk with an empty message; failure carries a code plus a human-readable message.
+// Always return Status across API boundaries; silent failures are not allowed.
+class Status {
+public:
+    Status() = default; // success: code_ == kOk, message_ empty
+
+    static Status OK() { return Status(); }
+    static Status Corruption(std::string m) {
+        return Status(StatusCode::kCorruption, std::move(m));
+    }
+    static Status NotFound(std::string m) { return Status(StatusCode::kNotFound, std::move(m)); }
+    static Status InvalidArgument(std::string m) {
+        return Status(StatusCode::kInvalidArgument, std::move(m));
+    }
+    static Status IoError(std::string m) { return Status(StatusCode::kIoError, std::move(m)); }
+    static Status Unsupported(std::string m) {
+        return Status(StatusCode::kUnsupported, std::move(m));
+    }
+    static Status Internal(std::string m) { return Status(StatusCode::kInternal, std::move(m)); }
+
+    bool ok() const { return code_ == StatusCode::kOk; }
+    StatusCode code() const { return code_; }
+    const std::string& message() const { return message_; }
+    std::string to_string() const; // declared only; the definition is not in this header
+
+private:
+    Status(StatusCode c, std::string m) : code_(c), message_(std::move(m)) {} // reached only via the factories
+
+    StatusCode code_ = StatusCode::kOk;
+    std::string message_;
+};
+
+} // namespace snii
+
+// Evaluates expr once into a local Status; if it is not ok(), returns it from the enclosing function.
+#define SNII_RETURN_IF_ERROR(expr)  \
+    do {                            \
+        ::snii::Status _s = (expr); \
+        if (!_s.ok()) return _s;    \
+    } while (0)
diff --git a/be/src/snii/encoding/byte_sink.h b/be/src/snii/encoding/byte_sink.h
new file mode 100644
index 00000000000000..604e307228cf39
--- /dev/null
+++ b/be/src/snii/encoding/byte_sink.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+
+namespace snii {
+
+// Append-only write cursor: all section serialization goes through this; manual byte assembly is forbidden.
+// All multi-byte fixed-width fields are little-endian.
+class ByteSink {
+public:
+    void put_u8(uint8_t v) { buf_.push_back(v); }
+    void put_fixed16(uint16_t v);
+    void put_fixed32(uint32_t v);
+    void put_fixed64(uint64_t v);
+    void put_varint32(uint32_t v);
+    void put_varint64(uint64_t v);
+    void put_zigzag(int64_t v);
+    void put_bytes(Slice s);
+
+    size_t size() const { return buf_.size(); } // number of bytes written so far
+    const std::vector<uint8_t>& buffer() const { return buf_; }
+    Slice view() const { return Slice(buf_); } // non-owning view into buf_; the sink must outlive it
+
+    // Resets the cursor to empty while RETAINING the backing capacity, so a sink can
+    // be reused across many small encodes (e.g. per-window region/prx scratch in the
+    // windowed posting builder) without re-allocating each time -- this avoids the
+    // cumulative small-allocation churn that fragments the heap arena and inflates
+    // peak RSS during the merge of a high-df term split into thousands of windows.
+    void clear() { buf_.clear(); }
+
+    // Moves the backing buffer OUT to the caller (the sink is left empty), so an encoded
+    // section can be handed off without the copy (+ copy-induced capacity slack) that
+    // reading buffer() and copy-assigning would incur. Use only when the sink is not
+    // reused afterward (a stack-local about to die, or one that is clear()'d next).
+    std::vector<uint8_t> take() { return std::move(buf_); }
+
+private:
+    std::vector<uint8_t> buf_;
+};
+
+} // namespace snii
diff --git a/be/src/snii/encoding/byte_source.h b/be/src/snii/encoding/byte_source.h
new file mode 100644
index 00000000000000..96cf4eed665269
--- /dev/null
+++ b/be/src/snii/encoding/byte_source.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+
+namespace snii {
+
+// Read cursor over a Slice: all section deserialization goes through this; any overrun returns Corruption.
+class ByteSource {
+public:
+    explicit ByteSource(Slice s) : s_(s) {} // cursor starts at position 0; s is a view, bytes are not copied
+
+    Status get_u8(uint8_t* v);
+    Status get_fixed16(uint16_t* v);
+    Status get_fixed32(uint32_t* v);
+    Status get_fixed64(uint64_t* v);
+    Status get_varint32(uint32_t* v);
+    Status get_varint64(uint64_t* v);
+    Status get_zigzag(int64_t* v);
+    Status get_bytes(size_t n, Slice* out);
+
+    size_t remaining() const { return s_.size() - pos_; } // bytes between the cursor and the end of the view
+    size_t position() const { return pos_; }
+    bool eof() const { return pos_ == s_.size(); }
+
+    // Returns a sub-view at absolute offset start with length len, without moving the cursor (used by the framer etc. to rewind over the CRC coverage region).
+    Slice slice_from(size_t start, size_t len) const { return s_.subslice(start, len); } // no Status: range checking is left to Slice::subslice
+
+private:
+    Slice s_;
+    size_t pos_ = 0;
+};
+
+} // namespace snii
diff --git a/be/src/snii/encoding/crc32c.h b/be/src/snii/encoding/crc32c.h
new file mode 100644
index 00000000000000..08210379064d91
--- /dev/null
+++ b/be/src/snii/encoding/crc32c.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+
+#include "snii/common/slice.h"
+
+namespace snii {
+
+// CRC32C (Castagnoli, polynomial 0x1EDC6F41). Used to checksum the tail of each format block.
+uint32_t crc32c_extend(uint32_t crc, Slice data);
+
+inline uint32_t crc32c(Slice data) { // one-shot checksum: crc32c_extend seeded with 0
+    return crc32c_extend(0, data);
+}
+
+} // namespace snii
diff --git a/be/src/snii/encoding/pfor.h b/be/src/snii/encoding/pfor.h
new file mode 100644
index 00000000000000..743cfe6f58e1a7
--- /dev/null
+++ b/be/src/snii/encoding/pfor.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+
+namespace snii {
+
+// PFOR integer block encoder/decoder (unsigned uint32 array).
+// Encoded layout: [u8 bit_width][varint n_exceptions][bit-packed low
+// bits][exception table]. Selects the bit_width that minimizes total byte size;
+// values exceeding it go into the exception table (index_delta, full_value).
+// delta/zigzag is handled by the upper layer (.frq window); PFOR only processes
+// unsigned integer arrays.
+void pfor_encode(const uint32_t* values, size_t n, ByteSink* out);
+Status pfor_decode(ByteSource* src, size_t n, uint32_t* out);
+Status pfor_skip(ByteSource* src, size_t n);
+
+} // namespace snii
diff --git a/be/src/snii/encoding/section_framer.h b/be/src/snii/encoding/section_framer.h
new file mode 100644
index 00000000000000..cd8594f589a8da
--- /dev/null
+++ b/be/src/snii/encoding/section_framer.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstdint>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+
+namespace snii {
+
+// Result of reading one framed section: its type byte plus a non-owning view of the payload.
+struct FramedSection {
+    uint8_t type = 0;
+    Slice payload; // a Slice does not own memory, so the source bytes must outlive this struct
+};
+
+// Unified section framing: [u8 type][varint64 len][payload][fixed32 crc32c(type+len+payload)].
+// All full-format sections reuse this encode/checksum path to avoid ad-hoc hand-assembly.
+// Unknown optional sections are dispatched by the caller based on type; read still verifies the CRC and skips the payload.
+class SectionFramer {
+public:
+    static void write(ByteSink& sink, uint8_t section_type, Slice payload); // appends one framed section to sink
+    static Status read(ByteSource& src, FramedSection* out); // fills *out with the section's type and payload view
+};
+
+} // namespace snii
diff --git a/be/src/snii/encoding/varint.h b/be/src/snii/encoding/varint.h
new file mode 100644
index 00000000000000..8a878b1d2928b4
--- /dev/null
+++ b/be/src/snii/encoding/varint.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "snii/common/status.h"
+
+namespace snii {
+
+// LEB128 variable-length integer encoding + zigzag. out buffer must be >=10 bytes; returns number of bytes written.
+size_t varint_len(uint64_t v);
+size_t encode_varint32(uint32_t v, uint8_t* out);
+size_t encode_varint64(uint64_t v, uint8_t* out);
+
+// Decode a varint from the range [p, end); on success *next points to the next byte after the consumed input.
+Status decode_varint32(const uint8_t* p, const uint8_t* end, uint32_t* v, const uint8_t** next);
+Status decode_varint64(const uint8_t* p, const uint8_t* end, uint64_t* v, const uint8_t** next);
+
+inline uint64_t zigzag_encode(int64_t v) { // interleaves signs: 0, -1, 1, -2 -> 0, 1, 2, 3
+    return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63); // v >> 63 is all-ones for negative v
+}
+inline int64_t zigzag_decode(uint64_t v) { // inverse of zigzag_encode: 0, 1, 2, 3 -> 0, -1, 1, -2
+    return static_cast<int64_t>(v >> 1) ^ -static_cast<int64_t>(v & 1); // low bit selects the sign mask (0 or -1)
+}
+
+} // namespace snii
diff --git a/be/src/snii/encoding/zstd_codec.h b/be/src/snii/encoding/zstd_codec.h
new file mode 100644
index 00000000000000..838df9af41b617
--- /dev/null
+++ b/be/src/snii/encoding/zstd_codec.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+
+namespace snii {
+
+// Thin ZSTD wrapper. Used for compressing large payloads such as .prx windows. Decompression requires the caller to supply the original uncompressed length (from the block header).
+Status zstd_compress(Slice input, int level, std::vector<uint8_t>* out);
+Status zstd_decompress(Slice input, size_t expected_uncomp_len, std::vector<uint8_t>* out);
+
+} // namespace snii
diff --git a/be/src/snii/format/bootstrap_header.h b/be/src/snii/format/bootstrap_header.h
new file mode 100644
index 00000000000000..1face0347596c6
--- /dev/null
+++ b/be/src/snii/format/bootstrap_header.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <cstdint>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+// Fixed container header at the very start of a {rowset_id}_{seg_id}.idx file.
+// Identifies the SNII container and carries basic compatibility info so a
+// reader can fail fast before touching any streamed section or the tail meta
+// region.
+//
+// On-disk layout (all multi-byte fields little-endian, fixed width; NOT framed
+// by SectionFramer because it must be parseable without prior knowledge of the
+// file):
+//   u32 magic              == kContainerMagic
+//   u16 format_version      == kFormatVersion
+//   u16 min_reader_version  readers with kFormatVersion < this MUST refuse to
+//   read u32 flags               container-level feature flags u32
+//   header_length       total bytes of this header including the checksum u8
+//   tail_pointer_size   size of the fixed tail pointer at EOF (hint for the
+//   reader) u32 header_checksum     crc32c over all preceding header bytes
+struct BootstrapHeader { // in-memory form of the fixed header; the on-disk checksum has no field here
+    uint32_t magic = kContainerMagic;
+    uint16_t format_version = kFormatVersion;
+    uint16_t min_reader_version = kMinReaderVersion;
+    uint32_t flags = 0;            // container-level feature flags; none set by default
+    uint32_t header_length = 0;    // ignored on encode, which always writes kBootstrapHeaderSize
+    uint8_t tail_pointer_size = 0; // size of the fixed tail pointer at EOF
+};
+
+// Total fixed on-disk size of the header, including the trailing crc32c (the terms below sum to 21 bytes).
+inline constexpr uint32_t kBootstrapHeaderSize =
+        4 /*magic*/ + 2 /*format_version*/ + 2 /*min_reader_version*/ + 4 /*flags*/ +
+        4 /*header_length*/ + 1 /*tail_pointer_size*/ + 4 /*header_checksum*/;
+
+// Serializes the header to sink: writes header_length = kBootstrapHeaderSize
+// and appends a crc32c over all preceding bytes. The caller's header_length
+// field is ignored on input (it is always derived). Returns OK.
+Status encode_bootstrap_header(const BootstrapHeader& header, ByteSink* sink);
+
+// Parses and validates a bootstrap header from the front of data.
+//   - too short / trailing bytes beyond the fixed header -> kCorruption
+//   - magic != kContainerMagic                           -> kCorruption
+//   - checksum mismatch                                  -> kCorruption
+//   - format_version != kFormatVersion                   -> kUnsupported
+//   - min_reader_version > kFormatVersion                -> kUnsupported
+Status decode_bootstrap_header(Slice data, BootstrapHeader* out);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/bsbf.h b/be/src/snii/format/bsbf.h
new file mode 100644
index 00000000000000..42a4e80f4dac12
--- /dev/null
+++ b/be/src/snii/format/bsbf.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/io/file_reader.h"
+
+// Block-split bloom filter (BSBF) -- Apache Parquet split-block spec, with an
+// S3-native on-demand single-block probe that none of the reference implementations
+// (Apache Parquet, Doris storage, Doris format/parquet) ship.
+//
+// BIT FORMAT IS PARQUET-CANONICAL (interoperable with Apache Parquet / Doris
+// format/parquet for the bitset bytes):
+//   - 256-bit (32-byte) blocks, 8 bits set per block.
+//   - key = XXH64(term, seed=0); high 32 bits select the block via FASTRANGE
+//     `block = ((hash>>32) * num_blocks) >> 32` (no power-of-2 requirement); low 32
+//     bits select 8 in-block positions `1 << ((key * SALT[i]) >> 27)`.
+//   - num_bytes via Parquet OptimalNumOfBytes: power of 2 in [32, 128 MiB].
+//
+// SNII WRAPPER (NOT Parquet's variable thrift header): a FIXED 28-byte header, then
+// the contiguous, uncompressed, little-endian bitset. Because the header size is a
+// constant, the bitset start is a constant offset (`section_base + 28`) and block i
+// is at `section_base + 28 + i*32` -- so a single 32-byte block can be range-read on
+// demand WITHOUT parsing a variable-length header and WITHOUT loading the whole blob.
+namespace snii::format {
+
+constexpr uint32_t kBsbfBytesPerBlock = 32;  // 256-bit block
+constexpr uint32_t kBsbfBitsSetPerBlock = 8; // 8 uint32 words / block
+constexpr uint32_t kBsbfMinBytes = 32;
+constexpr uint32_t kBsbfMaxBytes = 128u * 1024 * 1024; // Parquet kMaximumBloomFilterBytes
+constexpr uint32_t kBsbfHeaderSize = 28;               // FIXED (constant bitset offset)
+// L0/L1 tiering threshold (design "fast filtering of non-existent terms"): a bsbf section whose total
+// size is <= this is loaded WHOLE into the resident reader at open (L0 -> free
+// in-memory probe, no per-lookup round); larger filters stay L1 (header-only, probed
+// one 32-byte block on demand). 256 KiB fits in a single cloud FileCache block.
+constexpr uint32_t kBsbfResidentMaxBytes = 256u * 1024;
+
+// Canonical Parquet/Doris split-block SALT (8 odd 32-bit constants).
+extern const uint32_t kBsbfSalt[kBsbfBitsSetPerBlock];
+
+// XXH64(term, seed=0) -- the Parquet-canonical key (NOT XXH3, NOT Doris murmur).
+uint64_t bsbf_hash(std::string_view term);
+
+// Parquet OptimalNumOfBytes(ndv, fpp): power of 2 in [32, 128 MiB].
+uint32_t bsbf_optimal_num_bytes(uint32_t ndv, double fpp);
+
+// Fastrange block index: scales the high 32 bits of the hash into [0, num_blocks) without a modulo.
+inline uint32_t bsbf_block_index(uint64_t hash, uint32_t num_blocks) {
+    return static_cast<uint32_t>(((hash >> 32) * num_blocks) >> 32); // 64-bit product of two 32-bit values; yields 0 when num_blocks == 0
+}
+
+// Pure 32-byte-block kernel: does `block` contain the key's 8 bits? SIMD (AVX2)
+// accelerated at runtime when available, scalar otherwise. Returns true => the term
+// MAY be present (could be a false positive); false => DEFINITELY ABSENT.
+bool bsbf_block_contains(uint64_t hash, const uint8_t block[kBsbfBytesPerBlock]);
+
+// In-memory builder + serializer for one block-split bloom filter.
+class BsbfBuilder {
+public:
+    BsbfBuilder() = default; // all sizes are 0 in this state; presumably populated via create() -- confirm
+
+    // Sizes the filter for `ndv` distinct keys at target `fpp`. fpp in (0,1).
+    static Status create(uint32_t ndv, double fpp, BsbfBuilder* out);
+
+    // Insert a key / term. SIMD-accelerated.
+    void insert(uint64_t hash);
+    void insert_term(std::string_view term) { insert(bsbf_hash(term)); } // hashes term, then inserts the hash
+
+    // In-memory probe over the resident bitset (build/warm path). SIMD-accelerated.
+    bool maybe_contains(uint64_t hash) const;
+    bool maybe_contains_term(std::string_view term) const {
+        return maybe_contains(bsbf_hash(term));
+    }
+
+    // Serialize [28-byte header][contiguous LE bitset] into `sink`. The header carries
+    // magic/version/hash+index strategy/num_bytes/num_blocks/ndv + header & bitset
+    // crc32c. The bitset is Parquet-canonical bytes.
+    Status serialize(ByteSink* sink) const;
+
+    uint32_t num_bytes() const { return num_bytes_; }
+    uint32_t num_blocks() const { return num_blocks_; }
+
+private:
+    std::vector<uint32_t> words_; // num_bytes_/4, blocks of 8 words
+    uint32_t num_bytes_ = 0;
+    uint32_t num_blocks_ = 0;
+    uint32_t ndv_ = 0;
+};
+
+// Resident header (28 bytes), parsed once at open. Validates magic/version/crc/bounds.
+struct BsbfHeader {
+    uint32_t num_bytes = 0;
+    uint32_t num_blocks = 0;
+    uint32_t bitset_crc = 0;  // stored crc32c of the bitset body (for L0 verification)
+    uint64_t bitset_base = 0; // absolute file offset of block 0 = section_base + 28
+
+    // Parse a 28-byte header located at `section_base` in the file. The bitset_base
+    // is set to section_base + kBsbfHeaderSize.
+    static Status parse(Slice header28, uint64_t section_base, BsbfHeader* out);
+
+    // Absolute file offset of the 32-byte block this hash maps to.
+    uint64_t block_offset(uint64_t hash) const {
+        return bitset_base + // the block index is widened to 64 bits before scaling by the block size
+               static_cast<uint64_t>(bsbf_block_index(hash, num_blocks)) * kBsbfBytesPerBlock;
+    }
+};
+
+// On-demand probe: read EXACTLY ONE 32-byte block via `reader`, then test. No whole
+// blob load, no deep copy. *maybe_present=false means DEFINITELY ABSENT.
+Status bsbf_probe(snii::io::FileReader* reader, const BsbfHeader& header, uint64_t hash,
+                  bool* maybe_present);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/dict_block.h b/be/src/snii/format/dict_block.h
new file mode 100644
index 00000000000000..82ae2476c53561
--- /dev/null
+++ b/be/src/snii/format/dict_block.h
@@ -0,0 +1,144 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/format_constants.h"
+
+// DICT block —— a positioning unit mapping term → postings read plan, and also
+// the unit for remote on-demand fetching, caching, and CRC checksum
+// verification (see docs/design/SNII-design-spec.source.md "DICT block" and
+// "dict lookup flow summary" sections).
+//
+// Byte layout (strictly implemented; multi-byte fixed-width fields are
+// little-endian, variable-length integers use LEB128):
+//   header:
+//     n_entries        varint
+//     entry_format_ver u8        # = kDictBlockFormatVer
+//     block_flags      u8        # bit0 = has_positions (consistency check
+//     against the value passed to reader) frq_base         varint64 prx_base
+//     varint64  # present only when has_positions is set
+//   entries[n_entries]           # variable-length DictEntry, front-coded in
+//   lexicographic order anchor_offsets[n_anchors]    # u32 * n_anchors, byte
+//   offset of each anchor entry within the block n_anchors        u32 crc32c
+//   u32         # covers [header .. n_anchors], detects corruption (sole CRC
+//   layer)
+//
+// Anchor rule: every anchor_interval entries, one "term anchor" is forced —
+// that entry is encoded with prev_term="" (prefix_len=0, storing the full
+// term), and its byte offset is recorded in anchor_offsets; non-anchor entries
+// use the preceding entry's term as prev_term for front coding. The reader can
+// start from any anchor and scan independently without needing earlier terms,
+// enabling anchor binary search + local scan for exact term lookup.
+namespace snii::format {
+
+// DICT block entry_format_ver: self-describing version of the DictEntry
+// encoding. Reader rejects a mismatch so a query-only run cannot silently read
+// an older dict-entry layout as the current one.
+inline constexpr uint8_t kDictBlockFormatVer = 2;
+
+// Bit definitions for the block_flags header byte.
+namespace dict_block_flags {
+inline constexpr uint8_t kHasPositions = 1u << 0; // whether to write prx_base / .prx fields
+// bit1-7 reserved
+} // namespace dict_block_flags
+
+// DICT block writer: entries are added in lexicographic order via add_entry;
+// internally maintains prev_term, determines anchors, accumulates size
+// estimates, and on finish serializes header + entries + anchor table + CRC in
+// one pass.
+class DictBlockBuilder {
+public:
+    DictBlockBuilder(IndexTier tier, bool has_positions, uint64_t frq_base, uint64_t prx_base,
+                     uint32_t anchor_interval = 16);
+
+    // Append one entry (caller must guarantee lexicographic term order).
+    // Internally decides whether it becomes an anchor.
+    void add_entry(const DictEntry& entry);
+
+    // Upper-bound estimate of the serialized size of the current block (including
+    // header + entries + anchor table + CRC footer), used by the upper layer to
+    // decide when to cut a new block based on target_dict_block_bytes.
+    size_t estimated_bytes() const;
+
+    // Number of entries.
+    uint32_t n_entries() const { return n_entries_; }
+
+    // Serialize the entire block and append it to sink.
+    void finish(ByteSink* sink) const;
+
+private:
+    bool is_anchor(uint32_t index) const { return index % anchor_interval_ == 0; } // NOTE(review): divides by anchor_interval_; confirm the constructor rejects 0
+
+    IndexTier tier_;
+    bool has_positions_;
+    uint64_t frq_base_;
+    uint64_t prx_base_;
+    uint32_t anchor_interval_;
+
+    uint32_t n_entries_ = 0;
+    std::vector<DictEntry> entries_;
+    std::string prev_term_;  // term of the previous entry (front coding base)
+    size_t entries_est_ = 0; // accumulated byte estimate for the entries section
+    size_t n_anchors_ = 0;   // number of anchors
+};
+
+// DICT block reader: on open, verifies the CRC and parses the header / anchor
+// table; find_term uses anchor binary search + local scan to locate a
+// DictEntry. Holds a byte view of the block (non-owning); lifetime is managed
+// by the caller.
+class DictBlockReader {
+public:
+    DictBlockReader() = default; // empty reader (no block attached); open() takes one as its out-parameter
+
+    // Parse and verify the entire block. CRC mismatch / truncation / invalid
+    // structure → Corruption; has_positions in the header inconsistent with the
+    // supplied argument → InvalidArgument.
+    static Status open(Slice block, IndexTier tier, bool has_positions, DictBlockReader* out);
+
+    // Anchor binary search + local scan to locate target. Hit → *found=true and
+    // *out is filled; miss (including out-of-range, gap) → *found=false.
+    // Structural error → non-OK Status.
+    Status find_term(std::string_view target, bool* found, DictEntry* out) const;
+
+    // Decodes EVERY entry in the block in lexicographic order into *out (each a
+    // self-contained DictEntry, owning its term). Used for ordered term
+    // enumeration (prefix / range scans). Resets the front-coding base at each
+    // anchor segment.
+    Status decode_all(std::vector<DictEntry>* out) const;
+
+    uint64_t frq_base() const { return frq_base_; }
+    uint64_t prx_base() const { return prx_base_; }
+    uint32_t n_entries() const { return n_entries_; }
+
+private:
+    // Sequentially scan from anchor anchor_idx to the end of that anchor segment,
+    // searching for target.
+    Status scan_from_anchor(size_t anchor_idx, std::string_view target, bool* found,
+                            DictEntry* out) const;
+
+    // Find the last anchor index where first_term(anchor) <= target; return false
+    // if none exists.
+    bool locate_anchor(std::string_view target, size_t* anchor_idx) const;
+
+    Slice block_; // [header .. crc) full block view; non-owning, so the block bytes must outlive the reader
+    IndexTier tier_ = IndexTier::kT1;
+    bool has_positions_ = false;
+    uint64_t frq_base_ = 0;
+    uint64_t prx_base_ = 0;
+    uint32_t n_entries_ = 0;
+
+    size_t entries_begin_ = 0;             // absolute offset of the start of the entries section
+    std::vector<uint32_t> anchor_offsets_; // byte offset within the block for each anchor entry
+    std::vector<std::string>
+            anchor_terms_; // full term of each anchor entry (used for binary search)
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/dict_block_directory.h b/be/src/snii/format/dict_block_directory.h
new file mode 100644
index 00000000000000..a1d70e9ed5aec9
--- /dev/null
+++ b/be/src/snii/format/dict_block_directory.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// Bit masks for BlockRef.flags (a u8). Only bit0 is defined in this header.
+namespace block_ref_flags {
+// bit0: the on-disk block bytes are zstd(uncompressed_block). When set, the
+// directory also stores uncomp_len, and the reader zstd-decompresses the fetched
+// [offset, offset+length) range to uncomp_len before parsing the dict block. The
+// block-level crc32c (and BlockRef.checksum) cover the UNCOMPRESSED bytes, so
+// compression shrinks the bytes fetched from S3 while integrity is still checked
+// on the decompressed block in RAM.
+inline constexpr uint8_t kZstd = 1u << 0;
+} // namespace block_ref_flags
+
+// Physical location and checksum of one DICT block. Refs are aligned with SampledTermIndex by
+// ordinal: SampledTermIndex[i]'s first_term corresponds to DictBlockDirectory[i] (see design spec
+// "sampled dict index"). The read path issues a single range read over [offset, offset+length).
+struct BlockRef {
+    uint64_t offset = 0;     // absolute byte offset of the block within the container
+    uint64_t length = 0;     // ON-DISK byte length of the block (compressed size when kZstd)
+    uint32_t n_entries = 0;  // number of DictEntry records within this block
+    uint8_t flags = 0;       // block-level flags (bit masks in block_ref_flags::*)
+    uint32_t checksum = 0;   // crc32c of the block's UNCOMPRESSED content (verified after read)
+    uint64_t uncomp_len = 0; // uncompressed block byte length (stored only when kZstd set)
+};
+
+// DICT block directory: block ordinal → physical location mapping.
+// on-disk layout (framed by SectionFramer with a unified type+len+crc32c wrapper):
+//   [u8 type=kDictBlockDirectory][varint64 payload_len][payload][fixed32 crc32c]
+//   payload = varint32 n_blocks
+//             then n_blocks × block_ref{
+//               varint64 offset, varint64 length, varint32 n_entries,
+//               u8 flags, fixed32 checksum }
+// Section-level crc detects truncation/corruption; block_ref.checksum is the per-block crc.
+// NOTE(review): kZstd refs also store uncomp_len (see BlockRef); the layout above omits it.
+class DictBlockDirectoryBuilder {
+public:
+    void add(const BlockRef& ref) { refs_.push_back(ref); } // appends a copy of ref
+
+    // Encodes as a kDictBlockDirectory framed section (with embedded crc32c) and appends to sink.
+    void finish(ByteSink* sink) const;
+
+private:
+    std::vector<BlockRef> refs_; // refs in add() order
+};
+
+// Reads and verifies a kDictBlockDirectory framed section; provides ordinal → BlockRef lookup.
+// After parsing, all block_refs are held by the reader (it enters the searcher cache with meta).
+class DictBlockDirectoryReader {
+public:
+    // Verifies the section crc and deserializes all block_refs.
+    // crc mismatch / truncation / trailing bytes → kCorruption; wrong section type → kInvalidArgument.
+    static Status open(Slice section, DictBlockDirectoryReader* out);
+
+    uint32_t n_blocks() const { return static_cast<uint32_t>(refs_.size()); }
+
+    // Returns the ordinal-th block_ref; ordinal >= n_blocks → kNotFound.
+    Status get(uint32_t ordinal, BlockRef* out) const;
+
+private:
+    std::vector<BlockRef> refs_; // one entry per block; n_blocks() reports its size
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/dict_entry.h b/be/src/snii/format/dict_entry.h
new file mode 100644
index 00000000000000..e2b434ece3a22f
--- /dev/null
+++ b/be/src/snii/format/dict_entry.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/frq_pod.h"
+
+// DictEntry —— on-disk encoding/decoding of a dict entry.
+//
+// Byte layout (see docs/design/SNII-design-spec.source.md "dict entry" section):
+//   entry_len   varint   # byte length of entry body, allowing reader to skip
+//                        # unknown extensions or fast-skip entries
+//   --- entry body begins here, covered by entry_len ---
+//   prefix_len  varint   # length of shared prefix with prev_term
+//   suffix_len  varint   # number of suffix bytes
+//   suffix      u8[]     # suffix bytes that differ from prev_term
+//   flags       u8       # bit0 kind / bit1 enc / bit2 has_sb /
+//                        # bit3 has_champion(=0) / bit4 offsets_ref(=0)
+//   df          varint
+//   ttf_delta   varint   # only when tier>=T2
+//   max_freq    varint   # only when tier>=T2
+//   locator:
+//     pod_ref: frq_off_delta varint, frq_len varint,
+//              [prelude_len varint, frq_docs_len varint when enc=windowed]
+//                  # docs-only prefix [prelude][dd-block]; windowed entries carry
+//                  # per-window region metadata in the prelude.
+//              [frq_docs_len varint, slim region meta when enc=slim]:
+//                  # frq_docs_len == dd region on-disk length; the docs-only prefix
+//                  # [frq_off, frq_off+frq_docs_len) a docid-only reader fetches
+//                  # without the freq region.
+//                  win_mode u8 (bit0 dd_zstd, bit1 freq_zstd)
+//                  dd_uncomp_len varint, crc_dd u32
+//                  [freq_uncomp_len varint, crc_freq u32 when tier>=T2]
+//                  # The single slim window is [dd_region][freq_region];
+//                  # dd_disk_len = frq_docs_len, freq_disk_len = frq_len - frq_docs_len.
+//              [prx_off_delta varint, prx_len varint when tier>=T2]
+//     inline:  frq_len varint, frq_bytes u8[],   # frq_bytes = [dd_region][freq_region]
+//              slim region meta (as above, sans frq_docs_len which == dd disk len
+//                  carried as inline_dd_disk_len varint),
+//              [prx_len varint, prx_bytes u8[] when tier>=T2]
+//   --- entry body ends ---
+//
+// CRC verification is performed at the DICT block level (covering block header
+// + all entries + anchor offset table), no per-entry CRC to keep slim/inline
+// low-frequency terms compact (spec §DICT block line 330/348). tier and
+// positions capability are provided by per-index meta (not stored redundantly
+// inside entries): when tier>=T2, ttf_delta / max_freq and .prx locator/bytes
+// are written.
+namespace snii::format {
+
+// In-memory form of one dict entry: either inline or pod-ref, self-described
+// length on disk, front-coded against the previous term within a block.
+struct DictEntry {
+    // Full term key. Front coding relative to prev_term is applied only during
+    // encode/decode; the complete term is stored here.
+    std::string term;
+
+    // Fields carried in the flags byte (bit0 kind / bit1 enc / bit2 has_sb).
+    DictEntryKind kind = DictEntryKind::kPodRef;
+    DictEntryEnc enc = DictEntryEnc::kSlim;
+    bool has_sb = false;
+
+    // term stats.
+    uint32_t df = 0;
+    uint64_t ttf_delta = 0; // only when tier>=T2
+    uint64_t max_freq = 0;  // only when tier>=T2
+
+    // pod_ref locator (frq_len is also written for inline entries, see layout).
+    uint64_t frq_off_delta = 0;
+    uint64_t frq_len = 0;
+    uint64_t prelude_len = 0;   // only when enc=windowed
+    uint64_t frq_docs_len = 0;  // pod_ref docs-only prefix length
+    uint64_t prx_off_delta = 0; // only when tier>=T2
+    uint64_t prx_len = 0;       // only when tier>=T2
+
+    // slim/inline single-window region codecs. The window is
+    // [dd_region][freq_region] (no self-describing header). dd_meta drives the
+    // docs-only decode; freq_meta the scoring decode (only when tier>=T2). For
+    // slim pod_ref dd_meta.disk_len == frq_docs_len; for inline it is stored as
+    // inline_dd_disk_len.
+    FrqRegionMeta dd_meta;
+    FrqRegionMeta freq_meta;         // only when tier>=T2
+    uint64_t inline_dd_disk_len = 0; // only for inline: dd region on-disk length
+
+    // inline payload (bytes embedded in the dict entry itself).
+    std::vector<uint8_t> frq_bytes; // = [dd_region][freq_region]
+    std::vector<uint8_t> prx_bytes; // only when tier>=T2
+};
+
+// Appends the encoded entry to sink using the layout above. The term is
+// front-coded relative to prev_term; tier determines whether the optional
+// tier>=T2 fields (ttf_delta / max_freq / .prx locator or bytes) are written.
+Status encode_dict_entry(const DictEntry& entry, std::string_view prev_term, IndexTier tier,
+                         ByteSink* sink);
+
+// Decodes one entry from the current position of src; term is reconstructed from prev_term +
+// suffix. Out-of-range / CRC mismatch / invalid prefix_len all return Corruption.
+// NOTE(review): the layout above stores no per-entry CRC (block-level only) — confirm the CRC claim.
+Status decode_dict_entry(ByteSource* src, std::string_view prev_term, IndexTier tier,
+                         DictEntry* out);
+
+// Advances src past one entry using only its entry_len prefix; internal fields
+// are not parsed and no CRC is verified.
+Status skip_dict_entry(ByteSource* src);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/format_constants.h b/be/src/snii/format/format_constants.h
new file mode 100644
index 00000000000000..188266d02910cf
--- /dev/null
+++ b/be/src/snii/format/format_constants.h
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <cstdint>
+
+// SNII container and per-section on-disk contract constants.
+// Once published, these values are format semantics; changes require bumping
+// format_version and maintaining a compatibility policy. All multi-byte
+// fixed-width fields are little-endian; variable-length integers use LEB128
+// (see snii/encoding/varint.h).
+namespace snii::format {
+
+// ---- Container-level magic / version ----
+// Magics are ASCII tags read as little-endian u32: "SNII" -> 0x49494E53, "TAIL" -> 0x4C494154.
+inline constexpr uint32_t kContainerMagic = 0x49494E53u; // 'S''N''I''I'
+inline constexpr uint32_t kTailMagic = 0x4C494154u;      // 'T''A''I''L'
+inline constexpr uint16_t kFormatVersion = 2;    // container format version
+inline constexpr uint16_t kMinReaderVersion = 2; // presumably the minimum reader version — confirm
+// Self-describing version of the meta layout (the per-index meta header AND the
+// tail meta region share this single constant; a reader fails fast with
+// Corruption on any mismatch). This is a from-scratch, pre-launch format: there
+// is exactly ONE meta layout, so the value is 1. Bump it only AFTER launch,
+// when a real on-disk change must coexist with already-written indexes --
+// pre-launch changes just fold into v1.
+inline constexpr uint16_t kMetaFormatVersion = 1;
+
+// ---- SectionFramer section type ids ----
+// Used within the per-index meta / tail region; the id is the u8 type byte of a framed section.
+enum class SectionType : uint8_t {
+    kStatsBlock = 1,
+    kSampledTermIndex = 2,
+    kDictBlockDirectory = 3,
+    kXFilter = 4, // reserved: legacy embedded XFilter; meta no longer emits/reads it
+    kSectionRefs = 5,
+    kPerIndexMetaHeader = 6,
+    kLogicalIndexDirectory = 7,
+    kTailMetaHeader = 8,
+    kFeatureBits = 9,
+};
+
+// ---- Logical index postings storage content configuration ----
+// Fixed per logical index (not per-term). Determines whether to write
+// freq / positions / norms+stats.
+enum class IndexConfig : uint8_t {
+    kDocsOnly = 0,             // docid only: term/match filtering
+    kDocsPositions = 1,        // docid+freq+positions: MATCH_PHRASE
+    kDocsPositionsScoring = 2, // + norms + stats: phrase + BM25
+    kPositionsOffsets = 3,     // reserved (highlight/RAG), not implemented in this release
+};
+
+// Capability tiers for term stats / postings, derived from IndexConfig via tier_of().
+// Only tier>=kT2 writes ttf_delta/max_freq and .prx.
+enum class IndexTier : uint8_t {
+    kT1 = 1, // docs-only
+    kT2 = 2, // docs-positions
+    kT3 = 3, // docs-positions-scoring
+};
+
+// Capability tier for a storage config; scoring and the reserved offsets config both map to kT3.
+inline constexpr IndexTier tier_of(IndexConfig cfg) {
+    if (cfg == IndexConfig::kDocsOnly) return IndexTier::kT1;
+    return cfg == IndexConfig::kDocsPositions ? IndexTier::kT2 : IndexTier::kT3;
+}
+inline constexpr bool has_positions(IndexConfig cfg) {
+    return !(cfg == IndexConfig::kDocsOnly); // every config except docs-only writes positions
+}
+inline constexpr bool has_scoring(IndexConfig cfg) {
+    return IndexConfig::kDocsPositionsScoring == cfg; // only the norms + stats (BM25) config
+}
+
+// ---- DictEntry flags bit definitions (masks over the entry's u8 flags byte) ----
+namespace dict_flags {
+inline constexpr uint8_t kKind = 1u << 0;        // 0=pod_ref / 1=inline (DictEntryKind)
+inline constexpr uint8_t kEnc = 1u << 1;         // 0=slim / 1=windowed (DictEntryEnc)
+inline constexpr uint8_t kHasSb = 1u << 2;       // posting prelude includes sub-block directory
+inline constexpr uint8_t kHasChampion = 1u << 3; // v1 always 0
+inline constexpr uint8_t kOffsetsRef = 1u << 4;  // v1 always 0
+// bit5-7 reserved
+} // namespace dict_flags
+
+enum class DictEntryKind : uint8_t { kPodRef = 0, kInline = 1 }; // value of dict_flags::kKind bit
+enum class DictEntryEnc : uint8_t { kSlim = 0, kWindowed = 1 };  // value of dict_flags::kEnc bit
+
+// ---- .prx window codec (codec byte bit0-5) ----
+// kRaw  : plaintext varint payload (doc_count, per-doc pos_count + position
+//         deltas).
+// kZstd : zstd-compressed plaintext payload (legacy reader still supported).
+// kPfor : doc_count + per-doc pos_count (varint), then position deltas
+//         bit-packed as PFOR runs (kFrqBaseUnit each). No entropy coding ->
+//         far cheaper build CPU than zstd while staying competitive on size
+//         for ascending deltas.
+enum class PrxCodec : uint8_t {
+    kRaw = 0,
+    kZstd = 1,
+    kPfor = 2 /* bit7 cont-reserved */
+};
+
+// ---- Build-time parameters ----
+// Not format semantics; may be tuned against real metrics.
+inline constexpr uint32_t kFrqBaseUnit = 256;            // window base unit (docs)
+inline constexpr uint32_t kSlimDfThreshold = 512;        // df < this → slim
+inline constexpr uint32_t kDefaultInlineThreshold = 256; // slim encoded bytes ≤ this → inline
+// Adaptive window sizing (design #4): high-df windowed terms use larger windows
+// to cut prelude rows + per-window header/crc overhead. Windows remain a whole
+// multiple of kFrqBaseUnit so .prx alignment and win_base/last_docid semantics
+// are preserved. A term whose df >= kAdaptiveWindowDfThreshold splits into
+// kAdaptiveWindowDocs-sized windows instead of kFrqBaseUnit-sized ones.
+inline constexpr uint32_t kAdaptiveWindowDfThreshold = 8192; // df >= this -> larger windows
+inline constexpr uint32_t kAdaptiveWindowDocs = 1024;        // larger window size (4 * base unit)
+inline constexpr uint32_t kDefaultTargetDictBlockBytes = 64 * 1024; // 64 KiB
+
+} // namespace snii::format
diff --git a/be/src/snii/format/frq_pod.h b/be/src/snii/format/frq_pod.h
new file mode 100644
index 00000000000000..aa3b36b23a4af5
--- /dev/null
+++ b/be/src/snii/format/frq_pod.h
@@ -0,0 +1,101 @@
+#pragma once
+
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+// .frq region codec (FrqPod): doc-delta (dd) and freq postings, columnar + PFOR
+// (see docs/design SNII "frq design" and the read-byte-optimizations
+// design 1.6).
+//
+// PHASE D (posting-level dd/freq grouping): windows are NO LONGER
+// self-describing. A windowed .frq payload is laid out as
+//   [prelude][dd-block][freq-block]
+// where the dd-block concatenates every window's dd_region and the freq-block
+// concatenates every window's freq_region. Each region is independently encoded
+// (raw or zstd, chosen by size) and the per-window codec metadata (mode,
+// lengths, crc, offsets) is hoisted into the frq_prelude rows -- the region
+// bytes carry NO header. This makes the docs-only prefix ([prelude][dd-block])
+// ONE contiguous run a docid-only / phrase reader can fetch in a single range,
+// skipping the freq-block entirely.
+//
+// dd_region plaintext   = VInt n ++ PFOR_runs(doc_delta)   # n = doc count
+//   dd[0] = first_docid - win_base; dd[i] = docid[i] - docid[i-1]; win_base is
+//   the previous window's last docid (first window = 0).
+// freq_region plaintext = PFOR_runs(freq)                  # present iff has_freq
+//
+// PFOR runs are segmented at 256 docs (kFrqBaseUnit); a partial segment writes
+// the remainder. Variable-length integers reuse snii/encoding/varint; PFOR
+// reuses snii/encoding/pfor; crc32c covers each region's ON-DISK bytes.
+namespace snii::format {
+
+// Codec metadata for ONE encoded region (dd or freq), hoisted into the prelude.
+// The region's on-disk bytes are pure payload (no header); these fields drive
+// the decode. crc covers the on-disk (disk_len) bytes.
+struct FrqRegionMeta {
+    bool zstd = false;       // true => disk bytes are zstd(plaintext); false => raw
+    uint64_t uncomp_len = 0; // plaintext byte length (== disk_len when raw)
+    uint64_t disk_len = 0;   // on-disk byte length of this region
+    uint32_t crc = 0;        // crc32c of the on-disk (disk_len) bytes
+    // When false, decode_*_region SKIPS the per-region crc check (and the writer
+    // omits the 4-byte crc from the dict entry). Set false for INLINE entries:
+    // their region bytes live inside the dict block, whose own block-level crc32c
+    // already covers them, so a per-region crc is fully redundant. POD-ref
+    // regions (slim/windowed) live in the separately-fetched .frq POD, so their
+    // crc is kept and verified.
+    bool verify_crc = true;
+};
+
+// Encodes one window's dd_region plaintext (VInt n ++ PFOR_runs(doc_delta)) as
+// raw or zstd, APPENDS the on-disk bytes to out, and fills meta
+// (mode/uncomp_len/disk_len/crc). The region carries no header.
+//   docids_ascending: ascending docids in this window (single doc or empty allowed).
+//   win_base: previous window's last docid (first window = 0); requires
+//     docids[0] >= win_base.
+//   zstd_level_or_neg_for_auto: <0 auto (zstd when large enough, else raw);
+//     0 force raw; >0 force zstd at that level.
+// Non-ascending docids / first_docid < win_base / null out returns
+// InvalidArgument.
+Status build_dd_region(std::span<const uint32_t> docids_ascending, uint64_t win_base,
+                       int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta);
+
+// Convenience overload for std::vector input: views the elements as a span (no copy) and forwards.
+inline Status build_dd_region(const std::vector<uint32_t>& docids_ascending, uint64_t win_base,
+                              int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta) {
+    const std::span<const uint32_t> docids_view(docids_ascending);
+    return build_dd_region(docids_view, win_base, zstd_level_or_neg_for_auto, out, meta);
+}
+
+// Encodes one window's freq_region plaintext (PFOR_runs(freq)) as raw or zstd,
+// APPENDS the on-disk bytes to out, and fills meta. Empty freqs yields a
+// zero-length region. Null out returns InvalidArgument.
+Status build_freq_region(std::span<const uint32_t> freqs, int zstd_level_or_neg_for_auto,
+                         ByteSink* out, FrqRegionMeta* meta);
+
+// Convenience overload for std::vector input: views the elements as a span (no copy) and forwards.
+inline Status build_freq_region(const std::vector<uint32_t>& freqs, int zstd_level_or_neg_for_auto,
+                                ByteSink* out, FrqRegionMeta* meta) {
+    const std::span<const uint32_t> freqs_view(freqs);
+    return build_freq_region(freqs_view, zstd_level_or_neg_for_auto, out, meta);
+}
+
+// Decodes a dd_region from its on-disk slice (exactly disk_len bytes) + meta +
+// win_base, reconstructing ascending docids. Verifies meta.crc against the
+// slice (skipped when meta.verify_crc is false). crc mismatch / wrong slice
+// length / truncation / decompression / oversized count all return a non-OK
+// Status. The freq region is irrelevant here (docs-only path).
+Status decode_dd_region(Slice dd_disk, const FrqRegionMeta& meta, uint64_t win_base,
+                        std::vector<uint32_t>* docids);
+
+// Decodes a freq_region from its on-disk slice (exactly disk_len bytes) + meta,
+// producing doc_count freqs. Verifies meta.crc unless meta.verify_crc is false.
+// doc_count == 0 yields empty freqs (and requires a zero-length region). crc
+// mismatch / wrong slice length / etc. return a non-OK Status.
+Status decode_freq_region(Slice freq_disk, const FrqRegionMeta& meta, size_t doc_count,
+                          std::vector<uint32_t>* freqs);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/frq_prelude.h b/be/src/snii/format/frq_prelude.h
new file mode 100644
index 00000000000000..848e2bf0e2926b
--- /dev/null
+++ b/be/src/snii/format/frq_prelude.h
@@ -0,0 +1,178 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+// FrqPrelude: a TWO-LEVEL (super-block -> window) skippable directory that
+// precedes a windowed .frq posting whose payload is laid out (PHASE D, design
+// 1.6) with dd and freq regions GROUPED at posting level:
+//   windowed .frq payload = [prelude][dd-block][freq-block]
+//     dd-block   = dd_region_0 ++ dd_region_1 ++ ... ++ dd_region_{N-1}
+//     freq-block = freq_region_0 ++ ... ++ freq_region_{N-1}   (iff has_freq)
+// Windows are NOT self-describing: each window's full codec metadata (region
+// offsets, on-disk/uncompressed lengths, modes, crcs) lives in the prelude rows.
+// The docs-only prefix [prelude][dd-block] is therefore ONE contiguous run a
+// docid-only / phrase reader fetches in a single range, skipping the freq-block.
+//
+// DictEntry records prelude_len, frq_len (whole payload) and frq_docs_len
+// (= prelude_len + dd_block_len) so a reader can range-fetch the prelude first,
+// then fetch either the contiguous dd-block (docs-only) or both blocks (scoring).
+//
+// On-disk layout (strict; all multi-byte fixed fields little-endian, VInt =
+// LEB128 via snii/encoding):
+//   header:
+//     u8   flags        # bit0 has_freq, bit1 has_prx
+//     VInt N            # number of .frq windows
+//     VInt G            # windows per super-block (group_size; >=1)
+//     VInt n_super      # = ceil(N / G); 0 when N==0
+//     VInt sbdir_len    # byte length of the super_block_dir region
+//     u32  crc32c       # covers header + super_block_dir (NOT the window blocks)
+//   super_block_dir[n_super]:  # small, resident: one row per super-block
+//     VInt sb_last_docid_delta # cumulative across super-blocks => absolute last
+//                              #   docid of the super-block's last window
+//     VInt sb_block_off        # byte offset of this super-block's window block,
+//                              #   measured from the start of the window_dir region
+//     VInt sb_block_len        # byte length of this super-block's window block
+//   window_dir: n_super self-contained blocks, each holding <=G window rows.
+//     per window row:
+//       VInt last_docid_delta  # cumulative WITHIN the block => absolute last docid
+//                              #   (previous window's absolute last docid = win_base;
+//                              #    first window of first block: win_base = 0)
+//       VInt doc_count         # number of docs in the window (frq_pod needs it)
+//       u8   win_mode          # bit0 dd_zstd, bit1 freq_zstd
+//       VInt dd_off            # dd_region byte offset within the dd-block
+//       VInt dd_disk_len       # dd_region on-disk byte length
+//       VInt dd_uncomp_len     # dd_region plaintext byte length
+//       u32  crc_dd            # crc32c of the dd_region on-disk bytes
+//       VInt freq_off          # freq_region offset within the freq-block (has_freq)
+//       VInt freq_disk_len     # freq_region on-disk byte length (has_freq)
+//       VInt freq_uncomp_len   # freq_region plaintext byte length (has_freq)
+//       u32  crc_freq          # crc32c of the freq_region on-disk bytes (has_freq)
+//       VInt prx_off           # .prx payload byte offset (present iff has_prx)
+//       VInt prx_len           # .prx payload byte length (present iff has_prx)
+//       VInt max_freq          # window max term frequency (WAND block-max)
+//       u8   max_norm          # window score-max norm (WAND); 0 acceptable
+//
+// Reconstructing win_base / absolute last_docid (READER CONTRACT) is unchanged:
+// the writer chains absolute last docids across windows; each row stores the delta
+// of its absolute last docid from the previous window, and sb_last_docid seeds
+// each block, so super-block binary search then in-block window binary search
+// locate the window covering any docid without decoding the .frq blocks.
+//
+// The trailing crc32c covers only header + super_block_dir; every region carries
+// its own crc (crc_dd / crc_freq) in the row.
+namespace snii::format {
+
+namespace frq_prelude_flags {
+inline constexpr uint8_t kHasFreq = 1u << 0; // header flags bit0: rows carry the freq_* fields
+inline constexpr uint8_t kHasPrx = 1u << 1;  // header flags bit1: rows carry prx_off / prx_len
+} // namespace frq_prelude_flags
+
+// Per-window codec mode bits (the win_mode byte of a window row / slim region meta).
+namespace frq_win_mode {
+inline constexpr uint8_t kDdZstd = 1u << 0;   // bit0: dd_region is zstd-compressed
+inline constexpr uint8_t kFreqZstd = 1u << 1; // bit1: freq_region is zstd-compressed
+inline constexpr uint8_t kKnownBits = kDdZstd | kFreqZstd; // mask of all bits defined here
+} // namespace frq_win_mode
+
+// Absolute, decoded metadata for one window (as the reader exposes it). The dd /
+// freq region locators are offsets WITHIN the dd-block / freq-block respectively
+// (both blocks follow the prelude). The reader derives the dd-block length from
+// the last window's dd_off + dd_disk_len.
+struct WindowMeta {
+    uint32_t last_docid = 0; // absolute last docid in the window
+    uint64_t win_base = 0;   // absolute last docid of the previous window (0 for w==0)
+    uint32_t doc_count = 0;  // number of docs in the window
+
+    // dd_region locator (within the dd-block).
+    bool dd_zstd = false;       // win_mode bit0
+    uint64_t dd_off = 0;        // byte offset within the dd-block
+    uint64_t dd_disk_len = 0;   // on-disk byte length
+    uint64_t dd_uncomp_len = 0; // plaintext byte length
+    uint32_t crc_dd = 0;        // crc32c of the on-disk dd_region bytes
+
+    // freq_region locator (within the freq-block); valid only when has_freq.
+    bool freq_zstd = false;       // win_mode bit1
+    uint64_t freq_off = 0;        // byte offset within the freq-block
+    uint64_t freq_disk_len = 0;   // on-disk byte length
+    uint64_t freq_uncomp_len = 0; // plaintext byte length
+    uint32_t crc_freq = 0;        // crc32c of the on-disk freq_region bytes
+
+    uint64_t prx_off = 0; // valid only when has_prx
+    uint64_t prx_len = 0; // valid only when has_prx
+    uint32_t max_freq = 0; // window max term frequency (WAND block-max)
+    uint8_t max_norm = 0;  // window score-max norm (WAND); 0 acceptable
+
+    // In-memory only (NOT serialized in the prelude row). When false, the dd/freq
+    // region decode skips crc verification -- used when these region bytes are
+    // covered by an enclosing crc (e.g. an INLINE entry inside its dict block).
+    // Windowed/slim POD-ref rows leave this true (their regions carry a crc).
+    bool verify_crc = true;
+};
+
+// Builder input: one fully-computed WindowMeta per window, in term order, plus the
+// super-block grouping factor. The writer fills last_docid (absolute), doc_count,
+// the region locators/crcs, prx locator, max_freq and max_norm; win_base is derived
+// during build (so callers may leave it 0). group_size must be >= 1.
+struct FrqPreludeColumns {
+    bool has_freq = true;     // sets header flag bit0 (frq_prelude_flags::kHasFreq)
+    bool has_prx = false;     // sets header flag bit1 (frq_prelude_flags::kHasPrx)
+    uint32_t group_size = 64; // windows per super-block (G)
+    std::vector<WindowMeta> windows; // one row per window, in term order
+};
+
+// Serializes cols into prelude bytes and appends them to out.
+// Returns InvalidArgument when out is null, group_size is 0, or the windows are
+// not in non-decreasing last_docid order (a window's absolute last docid must be
+// >= the previous window's).
+Status build_frq_prelude(const FrqPreludeColumns& cols, ByteSink* out);
+
+// Reads and verifies a prelude buffer, exposing two-level skip access. The reader
+// parses the header + super_block_dir on open (verifying the trailing crc) and
+// eagerly decodes every window block into owned WindowMeta rows (the prelude is
+// small relative to the postings). It does not retain the input.
+class FrqPreludeReader {
+public:
+    // Parses + verifies the prelude. crc mismatch / truncation / inconsistent
+    // offsets-or-lengths / oversized counts => kCorruption.
+    static Status open(Slice prelude, FrqPreludeReader* out);
+
+    uint32_t n_windows() const { return static_cast<uint32_t>(windows_.size()); }
+    uint32_t n_super_blocks() const { return n_super_; }
+    bool has_freq() const { return has_freq_; }
+    bool has_prx() const { return has_prx_; }
+
+    // Total on-disk byte length of the dd-block (== sum of dd_disk_len; the docs-only
+    // prefix after the prelude). 0 when there are no windows.
+    uint64_t dd_block_len() const { return dd_block_len_; }
+    // Total on-disk byte length of the freq-block (== sum of freq_disk_len). 0 when
+    // !has_freq or no windows.
+    uint64_t freq_block_len() const { return freq_block_len_; }
+
+    // Returns the absolute WindowMeta for window w. Out-of-range => InvalidArgument.
+    Status window(uint32_t w, WindowMeta* out) const;
+
+    // Locates the window covering docid via super-block binary search then window
+    // binary search. *found=false (with OK) when docid is past the term's last
+    // docid; otherwise *w is the index of the covering window (the first window
+    // whose absolute last_docid >= docid).
+    Status locate_window(uint32_t docid, bool* found, uint32_t* w) const;
+
+private:
+    bool has_freq_ = false;       // header flag bit0
+    bool has_prx_ = false;        // header flag bit1
+    uint32_t group_size_ = 1;     // windows per super-block (G)
+    uint32_t n_super_ = 0;        // number of super-blocks
+    uint64_t dd_block_len_ = 0;   // see dd_block_len()
+    uint64_t freq_block_len_ = 0; // see freq_block_len()
+    // Absolute last docid at each super-block boundary (size n_super_).
+    std::vector<uint64_t> sb_last_docid_;
+    // All windows decoded with absolute fields, in term order (size N).
+    std::vector<WindowMeta> windows_;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/logical_index_directory.h b/be/src/snii/format/logical_index_directory.h
new file mode 100644
index 00000000000000..3cfddbd7227bb8
--- /dev/null
+++ b/be/src/snii/format/logical_index_directory.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// Container-level directory entry: maps a logical index identity (index_id, index_suffix)
+// to the physical location of its per-index meta block. Aligned with the Doris key system
+// (see design spec "footer meta region" logical index directory). The reader issues a
+// single range read over [meta_off, meta_off + meta_len) to load that per-index meta.
+struct LogicalIndexRef {
+    uint64_t index_id = 0;    // logical index id (matches Doris InvertedIndexDescriptor key)
+    std::string index_suffix; // UTF-8 sub-index suffix; may be empty for the primary index
+    uint64_t meta_off = 0;    // absolute byte offset of the per-index meta block in the container
+    uint64_t meta_len = 0;    // byte length of the per-index meta block
+};
+
+// Logical index directory: (index_id, index_suffix) -> per-index meta block reference.
+//
+// on-disk layout (framed by SectionFramer with a unified type+len+crc32c wrapper):
+//   [u8 type=kLogicalIndexDirectory][varint64 payload_len][payload][fixed32 crc32c]
+//   payload = varint32 n_entries
+//             then n_entries x {
+//               varint64 index_id,
+//               varint32 suffix_len, suffix_bytes,
+//               varint64 per_index_meta_off,
+//               varint64 per_index_meta_len }
+// The section-level crc covers the whole directory, so no per-entry crc is stored
+// (the spec lists a per-entry crc32c as optional; it is folded into the framer crc here).
+class LogicalIndexDirectoryBuilder {
+public:
+    void add(const LogicalIndexRef& ref) { refs_.push_back(ref); } // appends a copy of ref
+
+    // Encodes as a kLogicalIndexDirectory framed section (with embedded crc32c) and appends to sink.
+    void finish(ByteSink* sink) const;
+
+private:
+    std::vector<LogicalIndexRef> refs_; // entries in add() order
+};
+
+// Reads and verifies a kLogicalIndexDirectory framed section; provides ordinal access and
+// (index_id, suffix) lookup. After parsing, all entries are held by the reader (it enters the
+// searcher cache along with the rest of the tail meta region).
+class LogicalIndexDirectoryReader {
+public:
+    // Verifies the section crc and deserializes all entries.
+    // crc mismatch / truncation / trailing bytes / oversized counts -> kCorruption;
+    // wrong section type -> kInvalidArgument; null out -> kInvalidArgument.
+    static Status open(Slice framed, LogicalIndexDirectoryReader* out);
+
+    uint32_t size() const { return static_cast<uint32_t>(refs_.size()); }
+
+    // Returns the i-th entry in encounter order; i >= size -> kNotFound.
+    Status get(uint32_t i, LogicalIndexRef* out) const;
+
+    // Looks up the entry for (index_id, suffix). On match, *found=true and *out is populated;
+    // when absent, *found=false and *out is left untouched. Returns kInvalidArgument on null
+    // output pointers. The pair (index_id, suffix) is the unique key.
+    Status find(uint64_t index_id, std::string_view suffix, bool* found,
+                LogicalIndexRef* out) const;
+
+private:
+    std::vector<LogicalIndexRef> refs_; // parsed entries; size() reports the count
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/norms_pod.h b/be/src/snii/format/norms_pod.h
new file mode 100644
index 00000000000000..6580b1df2ffcc1
--- /dev/null
+++ b/be/src/snii/format/norms_pod.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// norms POD: per logical index / field stores 1-byte encoded doc length per doc,
+// used by BM25 length normalization (SniiStatsProvider::encoded_norm) for per-docid lookup.
+//
+// On-disk layout (the whole section is framed by SectionFramer, which adds a type+len+crc32c envelope):
+//   framer payload = [varint64 doc_count][bytes encoded_norm[doc_count]]
+//   framer envelope = [u8 type][varint64 payload_len][payload][fixed32 crc32c]
+// The encoding of encoded_norm (length -> 1B) is out of scope for this module; here we only handle raw byte storage and retrieval.
+class NormsPodWriter {
+public:
+    // Appends the encoded_norm for the next docid (docid is implicit, assigned in append order starting from 0).
+    void add(uint8_t encoded_norm) { norms_.push_back(encoded_norm); }
+
+    // Number of docs accumulated so far (i.e., the next docid to be assigned).
+    size_t count() const { return norms_.size(); } // NOTE(review): size_t; reader exposes uint32_t
+
+    // Writes [doc_count][bytes] framed by SectionFramer into sink (appends; does not clear sink).
+    void finish(ByteSink* sink) const;
+
+private:
+    std::vector<uint8_t> norms_; // norms_[docid] is that doc's encoded_norm
+};
+
+// Read-only view: on open, verifies the framer CRC and checks that doc_count/payload length are consistent,
+// afterwards encoded_norm(docid) is O(1) direct indexing (zero-copy, borrows the underlying buffer).
+class NormsPodReader {
+public:
+    NormsPodReader() = default;
+
+    // Parses the entire section (including the framer envelope). Returns Corruption on CRC mismatch, truncation, or length inconsistency.
+    // On success, *out borrows the memory that `framed` points to; the caller must keep that buffer alive.
+    static Status open(Slice framed, NormsPodReader* out);
+
+    uint32_t doc_count() const { return doc_count_; }
+
+    // Precondition (hard contract, as for std::vector::operator[]): docid < doc_count(). The caller guarantees it
+    // (docid comes from trusted postings decoded internally by SNII); asserted in debug builds, unchecked in Release.
+    uint8_t encoded_norm(uint32_t docid) const {
+        assert(docid < doc_count_);
+        return norms_[docid];
+    }
+
+    // Checked access for untrusted docids: InvalidArgument on a null out or an out-of-range docid; never reads out of range.
+    Status try_encoded_norm(uint32_t docid, uint8_t* out) const {
+        if (out == nullptr) return Status::InvalidArgument("norms: null out");
+        if (docid >= doc_count_) return Status::InvalidArgument("norms: docid out of range");
+        *out = norms_[docid];
+        return Status::OK();
+    }
+
+private:
+    const uint8_t* norms_ = nullptr; // borrowed view of the norm bytes (not owned)
+    uint32_t doc_count_ = 0;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/null_bitmap.h b/be/src/snii/format/null_bitmap.h
new file mode 100644
index 00000000000000..21c6f92be59709
--- /dev/null
+++ b/be/src/snii/format/null_bitmap.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+// Forward-declare the CRoaring C++ bitmap so this header stays free of the
+// (large) roaring include; the concrete type is only needed in the .cpp.
+namespace roaring {
+class Roaring;
+} // namespace roaring
+
+namespace snii::format {
+
+// SectionFramer type byte for the null-bitmap POD. SectionType has no dedicated
+// enumerator for it yet, so a documented literal (0x20) is used; it lies outside
+// the currently allocated enum range (1..9), so it cannot collide with existing types.
+inline constexpr uint8_t kNullBitmapSectionType = 0x20;
+
+// NullBitmap POD: per logical index, a Roaring bitmap of null docids (docs whose
+// value is NULL / not indexed). It decouples per-doc NULL information from the
+// per-term dictionary / postings so NULL handling can pull only this side POD.
+//
+// On-disk layout (the whole section is framed by SectionFramer, which adds a
+// type + varint64 len + payload + fixed32 crc32c envelope):
+//   framer payload = [varint64 doc_count][varint64 roaring_size][roaring_bytes]
+// roaring_bytes is the portable CRoaring serialization (Roaring::write).
+class NullBitmapWriter {
+public:
+    NullBitmapWriter();
+    ~NullBitmapWriter(); // declared only: roaring::Roaring is an incomplete type in this header
+
+    NullBitmapWriter(const NullBitmapWriter&) = delete;
+    NullBitmapWriter& operator=(const NullBitmapWriter&) = delete;
+
+    // Marks docid as NULL (adding the same docid twice is idempotent).
+    void add_null(uint32_t docid);
+
+    // Number of distinct null docids accumulated so far.
+    uint32_t null_count() const;
+
+    // Serializes [doc_count][roaring_size][roaring_bytes] framed by SectionFramer
+    // and appends it to sink (does not clear sink). doc_count is the total number
+    // of docs in the logical index (recorded so the reader can round-trip it).
+    void finish(uint32_t doc_count, ByteSink* sink) const;
+
+private:
+    std::unique_ptr<roaring::Roaring> bitmap_; // held by pointer; Roaring is forward-declared
+};
+
+// Read-only view: on open, SectionFramer verifies the CRC and truncation; this
+// class then guards roaring_size against the remaining payload bytes before
+// deserializing the Roaring bitmap (anti-DoS), so a corrupt size cannot trigger
+// an oversized allocation/read. is_null() is then an O(1) membership test.
+class NullBitmapReader {
+public:
+    NullBitmapReader();
+    ~NullBitmapReader(); // declared only: roaring::Roaring is an incomplete type in this header
+
+    NullBitmapReader(const NullBitmapReader&) = delete;
+    NullBitmapReader& operator=(const NullBitmapReader&) = delete;
+    NullBitmapReader(NullBitmapReader&&) noexcept;
+    NullBitmapReader& operator=(NullBitmapReader&&) noexcept;
+
+    // Parses the entire section (framer envelope + payload). Returns Corruption on
+    // CRC mismatch, truncation, doc_count overflow, or an oversized roaring_size.
+    static Status open(Slice framed, NullBitmapReader* out);
+
+    // True iff docid was marked NULL. docids outside the null set (including those
+    // >= doc_count) return false.
+    bool is_null(uint32_t docid) const;
+
+    // Number of distinct null docids in the bitmap.
+    uint32_t null_count() const;
+
+    // Copies the decoded bitmap into the caller-owned Roaring object.
+    void copy_to(roaring::Roaring* out) const; // NOTE(review): presumably out must be non-null
+
+    // Total doc count of the logical index, as recorded by the writer.
+    uint32_t doc_count() const { return doc_count_; }
+
+private:
+    std::unique_ptr<roaring::Roaring> bitmap_; // held by pointer; Roaring is forward-declared
+    uint32_t doc_count_ = 0;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/per_index_meta.h b/be/src/snii/format/per_index_meta.h
new file mode 100644
index 00000000000000..1a89a8710fbd7a
--- /dev/null
+++ b/be/src/snii/format/per_index_meta.h
@@ -0,0 +1,150 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/stats_block.h"
+
+// PerIndexMeta -- the per-logical-index metadata block that enters the searcher
+// cache. It COMPOSES already-built sub-sections (StatsBlock, SampledTermIndex,
+// DICT block directory, optional XFilter) plus the physical SectionRefs into a
+// single contiguous block. See design spec "Per-index meta block".
+//
+// On-disk layout:
+//   PerIndexMetaHeader (fixed prefix, self-checksummed):
+//     u16      meta_format_version (== kMetaFormatVersion), little-endian
+//     varint64 index_id
+//     varint32 suffix_len
+//     u8[]     suffix_bytes
+//     u32      flags (fixed32, little-endian)   # feature bits, e.g. kHasBsbf
+//     u32      crc32c (fixed32) over all preceding header bytes
+//   then framed sub-sections (each via SectionFramer, type+len+payload+crc32c):
+//     StatsBlock            (kStatsBlock,        built here)
+//     SampledTermIndex      (kSampledTermIndex,  embedded already-framed bytes)
+//     DICT block directory  (kDictBlockDirectory,embedded already-framed bytes)
+//     SectionRefs           (kSectionRefs,       built here; carries the bsbf ref)
+//     (+ any extra raw framed sections appended by add_raw_section)
+//
+// Design choice: the SampledTermIndex / DICT block directory / XFilter
+// sub-sections are EMBEDDED as their producers' already-framed output (the raw
+// SectionFramer frame), not re-framed. This lets the reader hand the exact frame
+// Slice straight back to each sub-module's open() (which expects a full frame),
+// and reuses the framer instead of re-implementing sub-section parsing.
+namespace snii::format {
+
+// Physical reference to a contiguous region within the container. (0, 0) means
+// the region is absent (e.g. no norms POD for a non-scoring index). A present-
+// but-empty region (e.g. an all-INLINE index's posting_region) is (off, 0).
+struct RegionRef {
+    uint64_t offset = 0; // start of the region within the container
+    uint64_t length = 0; // region length; (offset, length) == (0, 0) marks an absent region
+};
+
+// Physical references to the data sections / side PODs of one logical index.
+// Each RegionRef is encoded as varint64 offset followed by varint64 length, in
+// the field order below.
+//
+// posting_region is the single interleaved [prx][frq] posting region (it replaced
+// the former two separate frq_pod + prx_pod refs). Each pod_ref term writes its
+// prx span first then its frq span, contiguously, in term order; both
+// frq_off_delta and prx_off_delta now index into this one region. NO positions
+// capability is inferred from posting_region.length -- it is non-zero for any
+// docs-only index with a pod_ref term, and zero for an all-INLINE positional
+// index; capability lives in the header kHasPositions flag instead.
+struct SectionRefs {
+    RegionRef dict_region; // first field in the encoded order
+    RegionRef posting_region; // interleaved [prx][frq] per term; was frq_pod + prx_pod
+    RegionRef norms; // (0, 0) when there is no norms POD (e.g. a non-scoring index)
+    RegionRef null_bitmap;
+    // Block-split bloom XFilter section ([28B header][bitset]); {0,0} when absent.
+    // A PHYSICAL section (not embedded in the resident meta) so a single 32-byte block
+    // can be probed on demand without loading the whole filter at open.
+    RegionRef bsbf;
+};
+
+// Builds a per-index meta block by composing already-built sub-sections.
+class PerIndexMetaBuilder {
+public:
+    // Header flags / feature bits.
+    static constexpr uint32_t kHasPositions = 1u << 0; // index is positions-capable (tier>=T2)
+    static constexpr uint32_t kHasBsbf = 1u << 1;      // block-split bloom XFilter (section ref)
+
+    PerIndexMetaBuilder(uint64_t index_id, std::string index_suffix, uint32_t flags);
+
+    void set_stats(const StatsBlock& stats); // emitted as the kStatsBlock sub-section
+
+    // Raw output of SampledTermIndexBuilder::finish (a full kSampledTermIndex frame).
+    void set_sampled_term_index(Slice framed_bytes);
+
+    // Raw output of DictBlockDirectoryBuilder::finish (a full kDictBlockDirectory frame).
+    void set_dict_block_directory(Slice framed_bytes);
+
+    void set_section_refs(const SectionRefs& refs); // emitted as the kSectionRefs sub-section
+
+    // Appends an arbitrary already-framed section verbatim. Used for forward-compat
+    // optional sections; the reader skips unrecognized types.
+    void add_raw_section(Slice framed_bytes);
+
+    // Serializes the header and all sub-sections into sink.
+    // sink == nullptr -> kInvalidArgument.
+    Status finish(ByteSink* sink) const;
+
+private:
+    uint64_t index_id_;
+    std::string index_suffix_;
+    uint32_t flags_; // header feature bits (kHas*)
+    StatsBlock stats_;
+    std::vector<uint8_t> sampled_term_index_;   // owned bytes; presumably the frame given to the setter
+    std::vector<uint8_t> dict_block_directory_; // owned bytes; presumably the frame given to the setter
+    SectionRefs section_refs_;
+    std::vector<std::vector<uint8_t>> extra_sections_; // presumably add_raw_section() frames -- confirm
+};
+
+// Parses a per-index meta block: verifies the header crc, then walks the framed
+// sub-sections (each crc-verified by the framer), capturing the full frame Slice
+// of each known sub-section so callers can re-open it with the sub-module reader.
+// Unrecognized optional section types are skipped.
+class PerIndexMetaReader {
+public:
+    PerIndexMetaReader() = default;
+
+    // block == the full per-index meta block bytes; out must be non-null.
+    // Header crc mismatch / truncation / a sub-section crc mismatch -> kCorruption;
+    // missing a required sub-section -> kCorruption; out == nullptr -> kInvalidArgument.
+    static Status open(Slice block, PerIndexMetaReader* out);
+
+    uint64_t index_id() const { return index_id_; }
+    const std::string& index_suffix() const { return index_suffix_; }
+    uint32_t flags() const { return flags_; }
+
+    const StatsBlock& stats() const { return stats_; }
+    const SectionRefs& section_refs() const { return section_refs_; }
+
+    // Full kSampledTermIndex frame Slice, ready for SampledTermIndexReader::open.
+    Slice sampled_term_index_bytes() const { return sampled_term_index_; }
+    // Full kDictBlockDirectory frame Slice, ready for DictBlockDirectoryReader::open.
+    Slice dict_block_directory_bytes() const { return dict_block_directory_; }
+
+    // Block-split bloom XFilter: present iff a non-empty bsbf section ref exists.
+    bool has_bsbf() const { return section_refs_.bsbf.length > 0; }
+
+    // Positions capability, read from the persisted header flag (NOT from any region
+    // length). True iff the index was built as docs-positions(+scoring) (tier>=T2).
+    bool has_positions() const { return (flags_ & PerIndexMetaBuilder::kHasPositions) != 0; }
+
+private:
+    uint64_t index_id_ = 0;
+    std::string index_suffix_;
+    uint32_t flags_ = 0;
+    StatsBlock stats_;
+    SectionRefs section_refs_;
+    Slice sampled_term_index_;   // NOTE(review): presumably views `block`, which must then outlive
+    Slice dict_block_directory_; // this reader -- the lifetime is not documented; confirm
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/prx_pod.h b/be/src/snii/format/prx_pod.h
new file mode 100644
index 00000000000000..50c8536acb4cfe
--- /dev/null
+++ b/be/src/snii/format/prx_pod.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+
+// .prx position window (PrxPod): stores term position information for several
+// docs within one window.
+//
+// Single-window on-disk byte layout (see docs/design SNII "prx design"):
+//   u8   codec        # PrxCodec: 0=raw / 1=zstd / 2=pfor (bit7 cont-reserved)
+//   VInt uncomp_len   # payload length (raw/pfor: on-disk payload bytes; zstd: plaintext)
+//   VInt comp_len     # present only when codec==zstd
+//   u32  crc32c       # covers header (codec..comp_len) + payload bytes
+//   payload           # raw: varint plaintext; zstd: compressed; pfor: bit-packed
+//
+// raw/zstd plaintext payload (self-describing per-doc boundaries):
+//   VInt doc_count
+//   per doc: VInt pos_count, followed by pos_count position deltas (VInt)
+//   positions within a doc are ascending, stored as deltas (first absolute).
+//
+// pfor payload (default build codec; no entropy coding):
+//   VInt doc_count
+//   VInt total_pos                   # sum of all pos_counts
+//   per doc: VInt pos_count
+//   PFOR_runs(position_deltas)       # total_pos deltas, kFrqBaseUnit per run,
+//                                    #   flat doc order (first delta of each
+//                                    #   doc is absolute)
+//
+// Multi-byte fixed-length fields are little-endian; variable-length integers
+// reuse snii/encoding/varint. crc32c checksum at window tail detects
+// corruption.
+namespace snii::format {
+
+// Build a .prx window and append it to sink.
+// per_doc_positions[d] is the position list for the d-th doc within this
+// window; must be ascending (duplicates allowed).
+// zstd_level_or_negative_for_auto:
+//   <0  → auto: use ZSTD (default level) when payload is large enough, otherwise raw.
+//   0   → force raw (no compression).
+//   >0  → force ZSTD with the given level.
+// Non-ascending positions within a doc return InvalidArgument.
+Status build_prx_window(std::span<const std::vector<uint32_t>> per_doc_positions,
+                        int zstd_level_or_negative_for_auto, ByteSink* sink);
+
+// Convenience overload for callers that already hold a std::vector of per-doc
+// position lists: wraps the vector in a non-owning std::span and forwards to the
+// span overload, so the inner vectors are viewed in place instead of being
+// deep-copied into a fresh std::vector<std::vector<uint32_t>> per window.
+inline Status build_prx_window(const std::vector<std::vector<uint32_t>>& per_doc_positions,
+                               int zstd_level_or_negative_for_auto, ByteSink* sink) {
+    const std::span<const std::vector<uint32_t>> docs_view(per_doc_positions);
+    return build_prx_window(docs_view, zstd_level_or_negative_for_auto, sink);
+}
+
+// FLAT-positions builder: produces byte-identical output to build_prx_window,
+// but takes the window's positions as one flat span that `freqs` partitions per
+// doc (doc d owns the next freqs[d] entries; freqs.size() == doc count and
+// sum(freqs) == positions_flat.size()). The writer can therefore pass a subspan
+// of the term's flat positions/freqs with NO vector-of-vectors materialization.
+Status build_prx_window_flat(std::span<const uint32_t> positions_flat,
+                             std::span<const uint32_t> freqs, int zstd_level_or_negative_for_auto,
+                             ByteSink* sink);
+
+// Reads one .prx window from source, verifies it, and reconstructs the per-doc
+// position lists into *per_doc_positions. CRC mismatch / invalid codec /
+// truncation / decompression failure all return a non-OK Status.
+Status read_prx_window(ByteSource* source, std::vector<std::vector<uint32_t>>* per_doc_positions);
+
+// CSR variant of read_prx_window: decodes ALL docs' positions into one flat
+// buffer `pos_flat` plus per-doc offsets `pos_off` (size doc_count+1,
+// pos_off[0]==0); doc d's positions are the half-open range
+// pos_flat[pos_off[d] .. pos_off[d+1]). This avoids read_prx_window's per-doc
+// std::vector allocation: both outputs are flat uint32 buffers whose capacity
+// a caller can retain (clear()) across windows/queries.
+Status read_prx_window_csr(ByteSource* source, std::vector<uint32_t>* pos_flat,
+                           std::vector<uint32_t>* pos_off);
+
+// Selective CSR variant: decodes positions only for the requested local doc
+// ordinals within this PRX window. `doc_ordinals` must be strictly ascending.
+// The output keeps the CSR shape, with doc_ordinals.size()+1 offsets in pos_off.
+Status read_prx_window_csr_selective(ByteSource* source, std::span<const uint32_t> doc_ordinals,
+                                     std::vector<uint32_t>* pos_flat,
+                                     std::vector<uint32_t>* pos_off);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/sampled_term_index.h b/be/src/snii/format/sampled_term_index.h
new file mode 100644
index 00000000000000..b4348dd74eccd9
--- /dev/null
+++ b/be/src/snii/format/sampled_term_index.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/format_constants.h"
+
+// SampledTermIndex -- resident metadata for locating a query term to a candidate DICT block.
+//
+// Sampling granularity is per DICT block (not a fixed term count): each time the writer produces a DICT block,
+// it writes the block's first_term into this index. Size grows proportionally to block count. At read time it is
+// loaded into the searcher cache together with SniiLogicalIndexReader. See design spec "Sampled Term Index".
+//
+// On-disk layout (framed by SectionFramer, uniform type+len+crc32c):
+//   [u8 type=kSampledTermIndex][varint64 payload_len][payload][fixed32 crc32c]
+//   payload =
+//     n_blocks       varint32
+//     min_term        len(varint32) + bytes        # == sample_terms[0], omitted when n_blocks=0
+//     max_term        len(varint32) + bytes        # == sample_terms[n-1], omitted when n_blocks=0
+//     sample_terms[n_blocks]:                       # first_term of each block, in ascending order
+//       prefix_len   varint32                       # shared prefix length with the previous sample_term
+//       suffix_len   varint32
+//       suffix       u8[suffix_len]
+//
+// Term bytes are compared as unsigned byte order (UTF-8 friendly, binary-safe). Front coding reuses
+// the same prefix/suffix primitives as DictEntry; do not reimplement.
+namespace snii::format {
+
+// Builder: appends the first_term of each DICT block in block ordinal order (must be strictly ascending),
+// and serializes the entire set into a single kSampledTermIndex framed section on finish.
+class SampledTermIndexBuilder {
+public:
+    // Appends the first_term of the next DICT block. Call order determines block ordinal order.
+    void add_block_first_term(std::string_view first_term);
+
+    // Serializes and appends to sink. An empty collection (no blocks) is valid; n_blocks=0.
+    void finish(ByteSink* sink); // NOTE(review): non-const, unlike the other builders' finish()
+
+private:
+    std::vector<std::string> first_terms_; // presumably one owned first_term per DICT block
+};
+
+// Reader: verifies the checksum and materializes all sample_terms on open; subsequent locate calls are pure in-memory binary search.
+class SampledTermIndexReader {
+public:
+    SampledTermIndexReader() = default;
+
+    // Parses a kSampledTermIndex framed section.
+    // CRC mismatch / truncation / field overrun → kCorruption; type != kSampledTermIndex → kInvalidArgument.
+    static Status open(Slice section, SampledTermIndexReader* out);
+
+    // Binary-search locate: on a hit, *maybe_present=true and *block_ordinal is the ordinal of the last sample_term <= target.
+    //   target < min_term or target > max_term (including empty index) → *maybe_present=false (out of range, term is definitely absent).
+    //   NOTE(review): max_term is documented as sample_terms[n-1], the last block's FIRST term -- confirm that block's later terms still pass.
+    Status locate(std::string_view target, bool* maybe_present, uint32_t* block_ordinal) const;
+
+    uint32_t n_blocks() const { return static_cast<uint32_t>(sample_terms_.size()); }
+
+private:
+    std::vector<std::string> sample_terms_; // sample terms materialized by open()
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/stats_block.h b/be/src/snii/format/stats_block.h
new file mode 100644
index 00000000000000..20ef0c6613f85d
--- /dev/null
+++ b/be/src/snii/format/stats_block.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cstdint>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+// Statistics block within the per-index meta block. Carries only the counting stats
+// needed for query planning and BM25; section location info is stored separately in SectionRefs (see design spec "Per-index meta block").
+//
+// On-disk layout (framed by SectionFramer with unified type+len+crc32c):
+//   [u8 type=kStatsBlock][varint64 payload_len][payload][fixed32 crc32c]
+//   payload = varint64{ doc_count, indexed_doc_count, term_count,
+//                       sum_total_term_freq, null_count }
+// For field semantics see design spec "Scoring statistics design".
+struct StatsBlock { // fields are declared in the same order as the on-disk payload
+    uint64_t doc_count = 0;           // total doc count at segment level (including unindexed/NULL)
+    uint64_t indexed_doc_count = 0;   // number of docs actually indexed (denominator for avgdl)
+    uint64_t term_count = 0;          // number of unique terms in this index
+    uint64_t sum_total_term_freq = 0; // total token count across all indexed docs
+    uint64_t null_count = 0;          // number of NULL / not-indexed docs
+};
+
+// Encodes sb as one kStatsBlock framed section (with built-in crc32c checksum) and appends it to sink.
+void encode_stats_block(const StatsBlock& sb, ByteSink* sink);
+
+// Reads and verifies one kStatsBlock framed section from src and populates *out.
+// CRC mismatch / truncation → kCorruption; type is not kStatsBlock → kInvalidArgument.
+Status decode_stats_block(ByteSource* src, StatsBlock* out);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/tail_meta_region.h b/be/src/snii/format/tail_meta_region.h
new file mode 100644
index 00000000000000..21fd737e55cf30
--- /dev/null
+++ b/be/src/snii/format/tail_meta_region.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/logical_index_directory.h"
+
+namespace snii::format {
+
+// TailMetaRegion: the container's tail metadata region, located via the fixed
+// tail pointer and read in one range. It bundles the per-logical-index meta
+// blocks and the logical index directory so a reader can, after a single read,
+// map (index_id, index_suffix) -> per-index meta block. See spec "footer meta
+// region".
+//
+// On-disk layout (offsets are relative to the region start; the region is read
+// whole into memory, so internal refs need not be file-absolute):
+//   TailMetaHeader:
+//     u32 meta_format_version (== kMetaFormatVersion)
+//     u32 flags
+//     u64 meta_region_len      (== total region byte length)
+//     u32 n_logical_indexes
+//     u64 directory_offset     (offset of the logical index directory in-region)
+//     u64 directory_length
+//     u32 header_crc32c        (covers the header fields above)
+//   [per-index meta block #0][per-index meta block #1]...   (opaque payloads)
+//   [logical index directory]  (framed via LogicalIndexDirectory)
+//   u32 meta_region_checksum   (crc32c over everything before it)
+class TailMetaRegionBuilder {
+public:
+    // Adds a per-index meta block (already serialized by PerIndexMetaBuilder) keyed
+    // by (index_id, index_suffix). Bytes are copied.
+    void add_index(uint64_t index_id, std::string index_suffix, Slice per_index_meta_bytes);
+
+    // Serializes the whole region and appends it to sink.
+    void finish(ByteSink* sink) const;
+
+private:
+    struct Entry {
+        uint64_t index_id;
+        std::string suffix;
+        std::vector<uint8_t> bytes; // owned copy of the serialized per-index meta block
+    };
+    std::vector<Entry> entries_; // presumably one Entry per add_index() call
+};
+
+class TailMetaRegionReader {
+public:
+    TailMetaRegionReader() = default;
+
+    // Parses and validates the region (header crc + region checksum + directory).
+    // region must outlive this reader (find() returns sub-views of it).
+    static Status open(Slice region, TailMetaRegionReader* out);
+
+    uint32_t n_logical_indexes() const { return n_; }
+    const LogicalIndexDirectoryReader& directory() const { return dir_; }
+
+    // Locates the per-index meta block bytes for (index_id, suffix). On match,
+    // *found=true and *per_index_meta_bytes views into the region; else *found=false.
+    Status find(uint64_t index_id, std::string_view suffix, bool* found,
+                Slice* per_index_meta_bytes) const;
+
+private:
+    Slice region_; // borrowed view of the whole region; the caller keeps it alive
+    LogicalIndexDirectoryReader dir_;
+    uint32_t n_ = 0; // value returned by n_logical_indexes()
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/tail_pointer.h b/be/src/snii/format/tail_pointer.h
new file mode 100644
index 00000000000000..655635bf071fb8
--- /dev/null
+++ b/be/src/snii/format/tail_pointer.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// Fixed-size entry written at the very end of a segment's .idx file. It lets a
+// reader locate the tail meta region with a single read of the trailing
+// tail_pointer_size() bytes (see design spec "fixed tail pointer").
+//
+// On-disk layout (all multi-byte fields little-endian, FIXED total size so the
+// reader can read exactly the last tail_pointer_size() bytes):
+//   [u32 magic = kTailMagic]
+//   [u16 format_version = kFormatVersion]
+//   [u64 meta_region_offset]
+//   [u64 meta_region_length]
+//   [u64 hot_off]                  (offset of the hot region [hot_off, EOF);
+//                                   0 if absent)
+//   [u32 meta_region_checksum]
+//   [u32 bootstrap_header_checksum]
+//   [u8  tail_pointer_size]        (== tail_pointer_size())
+//   [u32 tail_checksum]            (crc32c over all preceding tail-pointer bytes)
+//
+// The fixed layout deliberately does NOT use the SectionFramer (which is
+// variable-length): a footer needs a constant trailing size the reader knows up
+// front.
+struct TailPointer {
+    uint64_t meta_region_offset = 0;
+    uint64_t meta_region_length = 0;
+    uint64_t hot_off = 0; // offset of the hot region [hot_off, EOF); 0 if absent
+    uint32_t meta_region_checksum = 0;
+    uint32_t bootstrap_header_checksum = 0;
+};
+
+// Constant on-disk size of the tail pointer, so the reader knows how many trailing
+// bytes to read. NOTE(review): the documented layout sums to 43 bytes -- confirm.
+size_t tail_pointer_size();
+
+// Appends the fixed-layout tail-pointer bytes (magic / version / fields / size /
+// tail_checksum) for tp to sink. Returns Internal if the encoded size would not
+// fit the fixed-size contract (a programming error, never expected at runtime).
+Status encode_tail_pointer(const TailPointer& tp, ByteSink* sink);
+
+// Parses the trailing tail-pointer bytes (exactly tail_pointer_size() of them).
+// Verifies magic and tail_checksum, then fills *out with the parsed fields.
+// Wrong magic / checksum mismatch / wrong length -> Corruption.
+// NOTE(review): format_version / size-byte validation is not stated -- confirm.
+Status decode_tail_pointer(Slice last_bytes, TailPointer* out);
+
+} // namespace snii::format
diff --git a/be/src/snii/io/batch_range_fetcher.h b/be/src/snii/io/batch_range_fetcher.h
new file mode 100644
index 00000000000000..c9fc7bd083558e
--- /dev/null
+++ b/be/src/snii/io/batch_range_fetcher.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/file_reader.h"
+
+namespace snii::io {
+
+// Collects the byte ranges a query plan needs, coalesces overlapping/adjacent
+// ranges into physical reads, and fetches them in a single batch (one serial
+// I/O round on a MeteredFileReader). Callers retrieve each requested range by
+// the handle returned from add(). This is the SNII read path's batching layer:
+// it front-loads range planning so reads are issued concurrently rather than
+// cursor-by-cursor.
+class BatchRangeFetcher {
+public:
+    // coalesce_gap: requests separated by a gap <= this many bytes are merged into
+    // one physical read (reads a few extra bytes to save a request). 0 merges only
+    // overlapping/adjacent ranges.
+    explicit BatchRangeFetcher(FileReader* reader, uint64_t coalesce_gap = 0);
+
+    // Registers a desired range; returns a handle usable with get() after fetch().
+    size_t add(uint64_t offset, uint64_t len);
+
+    // Coalesces and issues one batched read; fills internal buffers.
+    Status fetch();
+
+    // Bytes for handle h (valid only after a successful fetch(), until clear()).
+    Slice get(size_t h) const;
+
+    size_t pending() const { return reqs_.size(); } // number of registered requests
+    void clear();
+
+private:
+    struct Req {
+        uint64_t offset = 0;   // zero-initialized like the other fields (was uninitialized)
+        uint64_t len = 0;      // zero-initialized like the other fields (was uninitialized)
+        size_t len_size = 0;   // validated size_t length after successful fetch()
+        size_t phys_idx = 0;   // index into phys_ after fetch
+        size_t sub_offset = 0; // byte offset of this req within its physical read
+    };
+
+    FileReader* reader_; // raw pointer; presumably not owned -- confirm
+    uint64_t coalesce_gap_;
+    std::vector<Req> reqs_;
+    std::vector<std::vector<uint8_t>> phys_; // physical read buffers after fetch
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/file_reader.h b/be/src/snii/io/file_reader.h
new file mode 100644
index 00000000000000..b8aae0c9957d1a
--- /dev/null
+++ b/be/src/snii/io/file_reader.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/io_metrics.h"
+
+namespace snii::io {
+
+// One logical read request: `len` bytes starting at byte position `offset`.
+struct Range {
+    uint64_t offset = 0; // handed to read_at() as its offset by FileReader::read_batch()
+    size_t len = 0;      // handed to read_at() as its len by FileReader::read_batch()
+};
+
+// The single physical-read primitive (a BE-internal read_at). All higher layers
+// route reads through this so I/O can be accounted and backed by local files or
+// object storage interchangeably.
+class FileReader {
+public:
+    virtual ~FileReader() = default;
+
+    // Fills *out with exactly len bytes taken from position offset; *out is resized
+    // to len. A read that runs past EOF fails (Corruption/IoError).
+    virtual Status read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) = 0;
+
+    // Batch form: outs receives one buffer per range, in order. This default reads them
+    // sequentially; MeteredFileReader / object-storage readers override it for concurrency.
+    virtual Status read_batch(const std::vector<Range>& ranges,
+                              std::vector<std::vector<uint8_t>>* outs) {
+        outs->resize(ranges.size());
+        auto dst = outs->begin();
+        for (const Range& range : ranges) {
+            SNII_RETURN_IF_ERROR(read_at(range.offset, range.len, &*dst));
+            ++dst;
+        }
+        return Status::OK();
+    }
+
+    // Byte length of the underlying object.
+    virtual uint64_t size() const = 0;
+    // Live I/O counters, or nullptr (the default) for readers that keep none.
+    virtual const IoMetrics* io_metrics() const { return nullptr; }
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/file_writer.h b/be/src/snii/io/file_writer.h
new file mode 100644
index 00000000000000..a216898423c209
--- /dev/null
+++ b/be/src/snii/io/file_writer.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <cstdint>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+
+namespace snii::io {
+
+// Append-only writer (no seek-back), so the format can be produced in a single
+// streaming pass compatible with S3FileWriter / StreamSinkFileWriter / packed
+// writer. All container bytes are written front-to-back; back-references are
+// resolved by writing metadata last.
+class FileWriter {
+public:
+    virtual ~FileWriter() = default;
+
+    virtual Status append(Slice data) = 0;      // adds data at the end; no seek-back exists
+    virtual Status finalize() = 0;              // presumably completes the output -- confirm
+    virtual uint64_t bytes_written() const = 0; // presumably bytes appended so far -- confirm
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/io_metrics.h b/be/src/snii/io/io_metrics.h
new file mode 100644
index 00000000000000..27e4d21bb0c2f8
--- /dev/null
+++ b/be/src/snii/io/io_metrics.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstdint>
+
+namespace snii::io {
+
+// Object-storage access counters taken at FileReader boundaries; all start at 0 (see delta()).
+struct IoMetrics {
+    uint64_t read_at_calls = 0;       // BE-internal logical read requests issued
+    uint64_t serial_rounds = 0;       // dependent serial I/O rounds
+    uint64_t range_gets = 0;          // remote range GETs after cache coalescing
+    uint64_t remote_bytes = 0;        // bytes fetched from remote
+    uint64_t total_request_bytes = 0; // sum of requested lengths before cache
+};
+
+inline IoMetrics delta(const IoMetrics& after, const IoMetrics& before) {
+    IoMetrics diff = after; // start from the later snapshot, then subtract the earlier one
+    diff.read_at_calls -= before.read_at_calls;
+    diff.serial_rounds -= before.serial_rounds;
+    diff.range_gets -= before.range_gets;
+    diff.remote_bytes -= before.remote_bytes;
+    diff.total_request_bytes -= before.total_request_bytes;
+    return diff;
+}
+
+} // namespace snii::io
diff --git a/be/src/snii/io/local_file.h b/be/src/snii/io/local_file.h
new file mode 100644
index 00000000000000..a67477750c2be3
--- /dev/null
+++ b/be/src/snii/io/local_file.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/file_reader.h"
+#include "snii/io/file_writer.h"
+
+namespace snii::io {
+
+// Local-filesystem FileReader. Uses pread for positional, thread-safe reads
+// (so concurrent batch fetches do not contend on a shared file offset).
+class LocalFileReader : public FileReader {
+public:
+    LocalFileReader() = default;
+    ~LocalFileReader() override;
+
+    LocalFileReader(const LocalFileReader&) = delete; // non-copyable (holds fd_)
+    LocalFileReader& operator=(const LocalFileReader&) = delete;
+
+    Status open(const std::string& path); // presumably sets fd_ and size_ -- confirm in the .cpp
+    Status read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) override;
+    uint64_t size() const override { return size_; }
+
+private:
+    int fd_ = -1;       // starts at -1; presumably the descriptor used for pread -- confirm
+    uint64_t size_ = 0; // value reported by size()
+};
+
+// Local-filesystem append-only FileWriter. Appends accumulate in a fixed
+// userspace buffer and are flushed to the fd in large chunks, collapsing the
+// many tiny per-append ::write() syscalls of the build path (e.g. ~53k writes
+// averaging ~683 B each) into a handful of big writes. The produced file is
+// byte-identical to the unbuffered path; only the syscall count drops.
+class LocalFileWriter : public FileWriter {
+public:
+    LocalFileWriter() = default;
+    ~LocalFileWriter() override;
+
+    LocalFileWriter(const LocalFileWriter&) = delete; // non-copyable (holds fd_ and buf_)
+    LocalFileWriter& operator=(const LocalFileWriter&) = delete;
+
+    Status open(const std::string& path); // presumably sets fd_ -- confirm in the .cpp
+    Status append(Slice data) override;
+    Status finalize() override;
+    uint64_t bytes_written() const override { return bytes_written_; }
+
+private:
+    // Userspace write buffer size. 256 KiB amortizes the write() syscall cost over
+    // many appends while keeping transient RAM negligible vs the index sections.
+    static constexpr size_t kBufCapacity = 256u * 1024;
+
+    // Flushes the userspace buffer to the fd with a robust partial-write loop.
+    Status flush_buffer();
+    // Writes a raw byte span straight to the fd (used for spans larger than the
+    // buffer, bypassing a needless copy).
+    Status write_all(const uint8_t* data, size_t len);
+
+    int fd_ = -1;                // starts at -1; target of flush_buffer() / write_all()
+    uint64_t bytes_written_ = 0; // value reported by bytes_written()
+    std::vector<uint8_t> buf_;   // userspace write buffer (capacity constant: kBufCapacity)
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/metered_file_reader.h b/be/src/snii/io/metered_file_reader.h
new file mode 100644
index 00000000000000..41fed3eb7ac49a
--- /dev/null
+++ b/be/src/snii/io/metered_file_reader.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <unordered_set>
+#include <vector>
+
+#include "snii/io/file_reader.h"
+#include "snii/io/io_metrics.h"
+
+namespace snii::io {
+
+// A FileReader decorator that models an object-storage FileCache: reads are
+// aligned to fixed (default 1MiB) blocks; only not-yet-resident blocks become
+// remote range GETs (adjacent misses are coalesced). It is the single shared
+// "yardstick" through which both single blocking reads and batched concurrent
+// reads are measured.
+//
+//   - read_at(): a single blocking read. Any cache miss => +1 serial round
+//     (the cursor must wait for bytes before the next offset is known).
+//   - read_batch(): all ranges submitted concurrently => the whole batch is at
+//     most one serial round (+1 iff any range misses).
+class MeteredFileReader : public FileReader {
+public:
+    explicit MeteredFileReader(FileReader* inner, size_t block_size = (1u << 20));
+
+    Status read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) override;
+    Status read_batch(const std::vector<Range>& ranges,
+                      std::vector<std::vector<uint8_t>>* outs) override;
+    uint64_t size() const override { return inner_->size(); } // forwarded to the wrapped reader
+
+    const IoMetrics& metrics() const { return metrics_; }
+    const IoMetrics* io_metrics() const override { return &metrics_; } // never nullptr here
+    // Clears counters AND the resident block set, modelling a cold (cache-empty) query.
+    void reset_metrics();
+
+private:
+    Status validate_range(uint64_t offset, size_t len) const; // presumably a bounds check
+
+    // Accounts the cache effect of touching [offset, offset+len): records misses,
+    // coalesced GETs, and remote bytes. Returns true iff at least one block missed.
+    bool account_blocks(uint64_t offset, size_t len);
+
+    FileReader* inner_; // wrapped reader; not owned (no destructor is declared)
+    size_t block_size_; // cache block size in bytes; the constructor default is 1 MiB
+    std::unordered_set<uint64_t> resident_; // resident blocks; emptied by reset_metrics()
+    IoMetrics metrics_; // counters returned by metrics() / io_metrics()
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/s3_object_store.h b/be/src/snii/io/s3_object_store.h
new file mode 100644
index 00000000000000..2cf2270d751bb6
--- /dev/null
+++ b/be/src/snii/io/s3_object_store.h
@@ -0,0 +1,122 @@
+#pragma once
+
+// S3 / OSS object-storage backend for snii::io.
+//
+// ISOLATION: the ENTIRE body of this header (and its .cpp) is guarded by
+// SNII_WITH_S3. When the option is OFF the translation unit compiles to nothing
+// and pulls in NO aws-sdk headers, so core stays free of any aws dependency by
+// default. Only when CMake is configured with -DSNII_WITH_S3=ON is the macro
+// defined and aws linked.
+#ifdef SNII_WITH_S3
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/file_reader.h"
+#include "snii/io/file_writer.h"
+
+// Forward declarations only -- aws types are pimpl'd in the .cpp so that this
+// header never leaks aws-sdk includes to its consumers.
+namespace Aws::S3 {
+class S3Client;
+} // namespace Aws::S3
+
+namespace snii::io {
+
+// Connection / addressing parameters for an S3-compatible endpoint (tested
+// against Aliyun OSS, which requires virtual-hosted addressing).
+struct S3Config {
+    std::string endpoint; // e.g. "oss-cn-hongkong.aliyuncs.com"
+    std::string region;   // e.g. "cn-hongkong"
+    std::string bucket;   // e.g. "doris-community-test"
+    std::string prefix;   // object key prefix (no trailing slash required)
+    std::string ak;       // access key id
+    std::string sk;       // secret access key
+    long connect_timeout_ms = 10000;       // milliseconds; default 10 s
+    long request_timeout_ms = 180000;      // milliseconds; default 180 s
+    long http_request_timeout_ms = 180000; // milliseconds; default 180 s
+};
+
+// Process-wide aws InitAPI / ShutdownAPI lifecycle guard.
+//
+// aws-sdk-cpp requires Aws::InitAPI to be called exactly once before any client
+// is used and Aws::ShutdownAPI once at teardown. Construct a single
+// AwsApiGuard (e.g. on the stack of main, or as a static) that lives for the
+// whole duration during which S3FileReader / S3FileWriter are used. The guard is
+// reference counted, so nested guards are safe; the underlying InitAPI runs only
+// for the first live instance and ShutdownAPI when the last one is destroyed.
+class AwsApiGuard {
+public:
+    AwsApiGuard();  // the first live instance runs Aws::InitAPI
+    ~AwsApiGuard(); // destroying the last live instance runs Aws::ShutdownAPI
+
+    AwsApiGuard(const AwsApiGuard&) = delete; // copying is disabled
+    AwsApiGuard& operator=(const AwsApiGuard&) = delete;
+};
+
+// Read-only FileReader backed by an S3/OSS object. Range reads use a ranged
+// GetObject; size() is the object length cached from a HeadObject at open().
+class S3FileReader : public FileReader {
+public:
+    S3FileReader() = default;
+    ~S3FileReader() override;
+
+    S3FileReader(const S3FileReader&) = delete; // copying disabled; moves are declared below
+    S3FileReader& operator=(const S3FileReader&) = delete;
+    S3FileReader(S3FileReader&&) noexcept;
+    S3FileReader& operator=(S3FileReader&&) noexcept;
+
+    // Opens the object (prefix + "/" + key) and caches its size via HeadObject.
+    static Status open(const S3Config& cfg, const std::string& key, S3FileReader* out);
+
+    Status read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) override;
+    // Concurrent batch: issues the ranges' GetObjects in parallel (bounded), so a
+    // planned read round costs ~one round-trip instead of the sum of all GETs.
+    Status read_batch(const std::vector<Range>& ranges,
+                      std::vector<std::vector<uint8_t>>* outs) override;
+    uint64_t size() const override { return size_; }
+
+private:
+    std::shared_ptr<Aws::S3::S3Client> client_; // aws client type is only forward-declared here
+    std::string bucket_;     // presumably S3Config::bucket -- confirm in the .cpp
+    std::string object_key_; // full key (prefix + "/" + key)
+    uint64_t size_ = 0;      // object length cached from HeadObject at open()
+};
+
+// Append-only FileWriter backed by an S3/OSS object. Appends are buffered in
+// memory; finalize() flushes the whole buffer in a single PutObject. Multipart
+// upload is a future optimization.
+class S3FileWriter : public FileWriter {
+public:
+    S3FileWriter() = default;
+    ~S3FileWriter() override;
+
+    S3FileWriter(const S3FileWriter&) = delete; // copying disabled; moves are declared below
+    S3FileWriter& operator=(const S3FileWriter&) = delete;
+    S3FileWriter(S3FileWriter&&) noexcept;
+    S3FileWriter& operator=(S3FileWriter&&) noexcept;
+
+    // Opens a writer targeting object (prefix + "/" + key).
+    Status open(const S3Config& cfg, const std::string& key);
+
+    Status append(Slice data) override;
+    Status finalize() override;
+    uint64_t bytes_written() const override { return bytes_written_; }
+
+private:
+    std::shared_ptr<Aws::S3::S3Client> client_; // aws client type is only forward-declared here
+    std::string bucket_;          // presumably S3Config::bucket -- confirm in the .cpp
+    std::string object_key_;      // full key (prefix + "/" + key)
+    std::vector<uint8_t> buffer_; // in-memory buffer that finalize() flushes in one PutObject
+    uint64_t bytes_written_ = 0;  // value reported by bytes_written()
+    bool finalized_ = false;      // presumably set once finalize() has run -- confirm
+};
+
+} // namespace snii::io
+
+#endif // SNII_WITH_S3
diff --git a/be/src/snii/query/bm25_scorer.h b/be/src/snii/query/bm25_scorer.h
new file mode 100644
index 00000000000000..85df67d3f5e1be
--- /dev/null
+++ b/be/src/snii/query/bm25_scorer.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <cstdint>
+
+// Bm25Scorer -- classic Okapi BM25 relevance scoring over SNII native stats.
+//
+// Per query term, idf is precomputed once from the collection statistics:
+//   idf = log(1 + (N - df + 0.5) / (df + 0.5))
+// where N = indexed doc count and df = the term's document frequency. The
+// per-document contribution of a term then is:
+//   score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl))
+// where tf is the in-doc term frequency, dl the document length decoded from the
+// 1-byte encoded norm, and avgdl the average document length.
+//
+// Norm encode/decode (DOCUMENTED CONTRACT): the writer stores doc length as a
+// byte-quantized value floor-clamped to [1, 255]; decode is the identity map
+// back to a double length. encode_norm(len) = clamp(len, 1, 255);
+// decode_norm(b) = (b == 0 ? 1.0 : (double)b). This keeps short docs (len <= 255)
+// exact and saturates longer docs at 255, matching the reference oracle.
+namespace snii::query {
+
+// BM25 free parameters. Defaults are the classic Lucene/Elasticsearch values.
+struct Bm25Params {
+    double k1 = 1.2; // k1 of the score formula: appears as (k1 + 1) and as the denominator scale
+    double b = 0.75; // b of the score formula: weight given to dl / avgdl
+};
+
+// Decodes a 1-byte encoded norm into a document length. byte 0 maps to 1.0 to
+// avoid a zero-length divisor; otherwise it is the byte value itself.
+double decode_norm(uint8_t encoded);
+
+// Encodes a document length into a 1-byte norm (clamped to [1, 255]). Provided
+// so writers and test oracles share one quantization.
+uint8_t encode_norm(uint64_t doc_length);
+
+// Per-term scoring context: the precomputed idf and the term's df. Built once per
+// query term, then reused for every candidate document of that term.
+class ScorerContext {
+public:
+    // Builds the context from collection size n (indexed doc count) and the term's
+    // document frequency df. avgdl and params are supplied per score call.
+    static ScorerContext make(uint64_t n, uint64_t df);
+
+    double idf() const { return idf_; } // read-only accessor
+    uint64_t df() const { return df_; } // read-only accessor
+
+    // Scores one document occurrence: tf is the in-doc term frequency, encoded_norm
+    // the doc's 1-byte length norm, avgdl the collection average length.
+    double score(uint32_t tf, uint8_t encoded_norm, double avgdl, const Bm25Params& params) const;
+
+    // Upper bound on score() over any document, given a window's maximum tf and the
+    // shortest doc length in the window (smallest dl maximizes the score). Used by
+    // the WAND-style block-max pruner. max_freq is the window's max tf; min_norm is
+    // the smallest encoded norm (=> smallest dl => largest score).
+    double max_score(uint32_t max_freq, uint8_t min_norm, double avgdl,
+                     const Bm25Params& params) const;
+
+private:
+    double idf_ = 0.0; // 0.0 by default; presumably computed once in make() -- confirm
+    uint64_t df_ = 0;  // 0 by default; presumably the df passed to make() -- confirm
+};
+
+} // namespace snii::query
diff --git a/be/src/snii/query/boolean_query.h b/be/src/snii/query/boolean_query.h
new file mode 100644
index 00000000000000..f9cba6485eb37c
--- /dev/null
+++ b/be/src/snii/query/boolean_query.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/query_profile.h"
+#include "snii/reader/logical_index_reader.h"
+
+// boolean_or -- MATCH_ANY semantics: return the sorted docid set containing at
+// least one query term. Empty terms or all-absent terms produce an empty
+// result. Duplicate input terms are ignored semantically and do not duplicate
+// output docids.
+namespace snii::query {
+
+Status boolean_or(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, std::vector<uint32_t>* docids);
+Status boolean_or(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, std::vector<uint32_t>* docids,
+                  QueryProfile* profile);
+Status boolean_or(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, DocIdSink* sink);
+
+// boolean_and (MATCH all-terms): sorted docid set of docs containing EVERY
+// term, no positional constraint. Valid on docs-only indexes. Empty terms or
+// any absent term -> empty result.
+Status boolean_and(const snii::reader::LogicalIndexReader& idx,
+                   const std::vector<std::string>& terms, std::vector<uint32_t>* docids);
+Status boolean_and(const snii::reader::LogicalIndexReader& idx,
+                   const std::vector<std::string>& terms, std::vector<uint32_t>* docids,
+                   QueryProfile* profile);
+
+} // namespace snii::query
diff --git a/be/src/snii/query/docid_sink.h b/be/src/snii/query/docid_sink.h
new file mode 100644
index 00000000000000..9fc5dc2d9739d3
--- /dev/null
+++ b/be/src/snii/query/docid_sink.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <span>
+#include <vector>
+
+#include "snii/common/status.h"
+
+namespace snii::query {
+
+// Bulk docid handoff for query operators: append_sorted() takes an ascending span,
+// append_range() the run [first, last_exclusive). VectorDocIdSink collects into one vector.
+class DocIdSink {
+public:
+    virtual ~DocIdSink() = default;
+    virtual Status append_sorted(std::span<const uint32_t> docids) = 0;
+    virtual Status append_range(uint32_t first, uint64_t last_exclusive) = 0;
+};
+
+// DocIdSink that appends into a caller-owned vector, held by reference (must outlive the sink).
+class VectorDocIdSink final : public DocIdSink {
+public:
+    explicit VectorDocIdSink(std::vector<uint32_t>& docids) : docids_(docids) {}
+
+    Status append_sorted(std::span<const uint32_t> docids) override {
+        docids_.insert(docids_.end(), docids.begin(), docids.end());
+        return Status::OK();
+    }
+
+    // Appends [first, last_exclusive); resize() (not an exact reserve()) keeps growth amortized.
+    Status append_range(uint32_t first, uint64_t last_exclusive) override {
+        if (last_exclusive <= first) return Status::OK();
+        if (last_exclusive > static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1) {
+            return Status::InvalidArgument("docid_sink: range exceeds uint32 docid space");
+        }
+        const uint64_t count = last_exclusive - first;
+        if (count > static_cast<uint64_t>(docids_.max_size() - docids_.size())) {
+            return Status::InvalidArgument("docid_sink: range too large");
+        }
+        const size_t base = docids_.size();
+        docids_.resize(base + static_cast<size_t>(count));
+        for (uint64_t i = 0; i < count; ++i) {
+            docids_[base + static_cast<size_t>(i)] = static_cast<uint32_t>(first + i);
+        }
+        return Status::OK();
+    }
+private:
+    std::vector<uint32_t>& docids_;
+};
+
+} // namespace snii::query
diff --git a/be/src/snii/query/internal/docid_conjunction.h b/be/src/snii/query/internal/docid_conjunction.h
new file mode 100644
index 00000000000000..3cb6cc42f5a294
--- /dev/null
+++ b/be/src/snii/query/internal/docid_conjunction.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/io/batch_range_fetcher.h"
+#include "snii/reader/logical_index_reader.h"
+
+namespace snii::query::internal {
+
+struct ResolvedQueryTerm { // type of resolve_query_term()'s `resolved` out-parameter
+    snii::format::DictEntry entry; // presumably the term's dictionary entry -- confirm
+    uint64_t frq_base = 0;         // NOTE(review): looks like a base offset for frq data -- confirm
+    uint64_t prx_base = 0;         // NOTE(review): looks like a base offset for prx data -- confirm
+};
+
+struct TermPlan { // element type of the `plans` vector used by plan_terms() / open_preludes()
+    snii::format::DictEntry entry; // same field trio as ResolvedQueryTerm
+    uint64_t frq_base = 0;
+    uint64_t prx_base = 0;
+    uint32_t df = 0;           // presumably the term's document frequency -- confirm
+    size_t order = 0;          // NOTE(review): meaning is not visible in this header
+    size_t frq_handle = 0;     // presumably a BatchRangeFetcher::add() handle -- confirm
+    size_t prx_handle = 0;     // presumably a BatchRangeFetcher::add() handle -- confirm
+    size_t prelude_handle = 0; // presumably a BatchRangeFetcher::add() handle -- confirm
+    bool pod_ref = false;      // NOTE(review): meaning is not visible in this header
+    bool windowed = false;     // NOTE(review): meaning is not visible in this header
+    snii::format::FrqPreludeReader prelude; // presumably set up by open_preludes() -- confirm
+};
+
+struct DocidChunk { // element type of DocidSource::chunks
+    std::vector<uint32_t> docids;
+    std::vector<uint32_t> prx_doc_ordinals; // NOTE(review): meaning is not visible here
+    uint32_t prx_doc_count = 0;             // NOTE(review): meaning is not visible here
+    bool windowed = false;
+    uint32_t window = 0; // presumably only meaningful when windowed is true -- confirm
+};
+
+struct DocidSource { // element type of build_docid_only_conjunction()'s `sources` output
+    std::vector<DocidChunk> chunks;
+    bool docids_are_final_candidates = false; // NOTE(review): set by code outside this header
+};
+
+Status resolve_query_term(const snii::reader::LogicalIndexReader& idx, const std::string& term,
+                          ResolvedQueryTerm* resolved, bool* found);
+
+Status plan_terms(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, snii::io::BatchRangeFetcher* fetcher,
+                  std::vector<TermPlan>* plans, bool* all_present, bool need_positions);
+
+Status plan_resolved_terms(const snii::reader::LogicalIndexReader& idx,
+                           const std::vector<ResolvedQueryTerm>& terms,
+                           snii::io::BatchRangeFetcher* fetcher, std::vector<TermPlan>* plans,
+                           bool need_positions);
+
+Status open_preludes(const snii::io::BatchRangeFetcher& fetcher, std::vector<TermPlan>* plans,
+                     bool need_positions);
+
+Status inline_dd_region(const snii::format::DictEntry& entry, Slice* out);
+
+Status build_docid_only_conjunction(const snii::reader::LogicalIndexReader& idx,
+                                    const snii::io::BatchRangeFetcher& round1,
+                                    const std::vector<TermPlan>& plans,
+                                    std::vector<uint32_t>* candidates);
+
+Status build_docid_only_conjunction(const snii::reader::LogicalIndexReader& idx,
+                                    const snii::io::BatchRangeFetcher& round1,
+                                    const std::vector<TermPlan>& plans,
+                                    std::vector<uint32_t>* candidates,
+                                    std::vector<DocidSource>* sources);
+
+} // namespace snii::query::internal
diff --git a/be/src/snii/query/internal/docid_posting_reader.h b/be/src/snii/query/internal/docid_posting_reader.h
new file mode 100644
index 00000000000000..bf5927b5857335
--- /dev/null
+++ b/be/src/snii/query/internal/docid_posting_reader.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/format/dict_entry.h"
+#include "snii/query/docid_sink.h"
+#include "snii/reader/logical_index_reader.h"
+
+namespace snii::query::internal {
+
+struct ResolvedDocidPosting { // input element of read_docid_postings_batched()
+    snii::format::DictEntry entry; // same trio read_docid_posting() takes as separate arguments
+    uint64_t frq_base = 0;
+    uint64_t prx_base = 0;
+};
+
+// Decodes the docid-only posting for a resolved term. The caller owns term
+// lookup and can batch/plan lookups independently; this module owns only the
+// three posting encodings (inline, slim pod_ref, windowed pod_ref).
+Status read_docid_posting(const snii::reader::LogicalIndexReader& idx,
+                          const snii::format::DictEntry& entry, uint64_t frq_base,
+                          uint64_t prx_base, std::vector<uint32_t>* docids);
+
+Status read_docid_posting(const snii::reader::LogicalIndexReader& idx,
+                          const snii::format::DictEntry& entry, uint64_t frq_base,
+                          uint64_t prx_base, snii::query::DocIdSink* sink);
+
+// Batch counterpart for multi-term docid-only operators. Windowed terms share one
+// prelude fetch round and one docid fetch round, so OR-style operators pay by
+// stage rather than by term.
+Status read_docid_postings_batched(const snii::reader::LogicalIndexReader& idx,
+                                   const std::vector<ResolvedDocidPosting>& postings,
+                                   std::vector<std::vector<uint32_t>>* docids);
+
+} // namespace snii::query::internal
diff --git a/be/src/snii/query/internal/docid_set_ops.h b/be/src/snii/query/internal/docid_set_ops.h
new file mode 100644
index 00000000000000..8aae88b90fa974
--- /dev/null
+++ b/be/src/snii/query/internal/docid_set_ops.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace snii::query::internal {
+
+std::vector<uint32_t> intersect_sorted(const std::vector<uint32_t>& a,
+                                       const std::vector<uint32_t>& b);
+
+void union_sorted_into(std::vector<uint32_t>* acc, const std::vector<uint32_t>& next);
+
+std::vector<uint32_t> union_sorted_many(const std::vector<std::vector<uint32_t>>& lists);
+
+} // namespace snii::query::internal
diff --git a/be/src/snii/query/internal/docid_union.h b/be/src/snii/query/internal/docid_union.h
new file mode 100644
index 00000000000000..89c53f103d2343
--- /dev/null
+++ b/be/src/snii/query/internal/docid_union.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/internal/docid_posting_reader.h"
+#include "snii/reader/logical_index_reader.h"
+
+namespace snii::query::internal {
+
+// Reads already-resolved docid postings in planned batches, merges them as a
+// sorted deduplicated union, then emits one bulk span to the sink.
+Status build_docid_union(const snii::reader::LogicalIndexReader& idx,
+                         const std::vector<ResolvedDocidPosting>& postings,
+                         std::vector<uint32_t>* out);
+
+Status emit_docid_union(const snii::reader::LogicalIndexReader& idx,
+                        const std::vector<ResolvedDocidPosting>& postings, DocIdSink* sink);
+
+} // namespace snii::query::internal
diff --git a/be/src/snii/query/internal/position_math.h b/be/src/snii/query/internal/position_math.h
new file mode 100644
index 00000000000000..04e964a67b6e7e
--- /dev/null
+++ b/be/src/snii/query/internal/position_math.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <vector>
+
+namespace snii::query::internal {
+
+inline bool build_position_offsets(size_t count, std::vector<uint32_t>* out) {
+    // Every offset must fit in uint32_t; counts at or above its maximum are rejected.
+    const bool representable = count < std::numeric_limits<uint32_t>::max();
+    if (!representable) {
+        return false;
+    }
+    out->clear();
+    out->reserve(count);
+    for (size_t i = 0; i < count; ++i) {
+        out->push_back(static_cast<uint32_t>(i));
+    }
+    return true;
+}
+
+inline bool add_position_offset(uint32_t start, uint32_t offset, uint32_t* out) {
+    const uint32_t headroom = std::numeric_limits<uint32_t>::max() - start; // room left above start
+    if (offset <= headroom) *out = start + offset; // sum fits; otherwise *out stays untouched
+    return offset <= headroom;
+}
+
+} // namespace snii::query::internal
diff --git a/be/src/snii/query/internal/term_expansion.h b/be/src/snii/query/internal/term_expansion.h
new file mode 100644
index 00000000000000..3393c31dc8457a
--- /dev/null
+++ b/be/src/snii/query/internal/term_expansion.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <string_view>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/reader/logical_index_reader.h"
+
+namespace snii::query::internal {
+
+using TermMatcher = std::function<bool(std::string_view)>;
+
+// Enumerates dictionary terms from `enum_prefix`, filters them with `matches`,
+// and emits the sorted docid union for matching entries. PrefixHit carries the
+// DictEntry and block bases, so callers avoid a second lookup per expanded term.
+Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx,
+                                 std::string_view enum_prefix, const TermMatcher& matches,
+                                 DocIdSink* const sink, int32_t max_expansions = 0);
+
+} // namespace snii::query::internal
diff --git a/be/src/snii/query/phrase_query.h b/be/src/snii/query/phrase_query.h
new file mode 100644
index 00000000000000..0de44c1fdbd921
--- /dev/null
+++ b/be/src/snii/query/phrase_query.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/query_profile.h"
+#include "snii/reader/logical_index_reader.h"
+
+// phrase_query -- MATCH_PHRASE: return the sorted docid set in which the terms
+// occur consecutively (for some i, every term k appears at position pos+k in
+// the same doc). It first builds the docid conjunction with docs-only posting
+// reads, then fetches PRX only for chunks that can contain final candidates:
+//   1. read preludes / docs-only posting ranges and intersect per-term docids;
+//   2. fetch retained PRX chunks and stream positions for survivors;
+//   3. for each surviving doc, check that some position p exists with
+//      term[0]@p, term[1]@p+1, ... term[n-1]@p+(n-1).
+// An empty term list -> empty result. Any term absent -> empty result.
+namespace snii::query {
+
+Status phrase_query(const snii::reader::LogicalIndexReader& idx,
+                    const std::vector<std::string>& terms, std::vector<uint32_t>* docids);
+Status phrase_query(const snii::reader::LogicalIndexReader& idx,
+                    const std::vector<std::string>& terms, std::vector<uint32_t>* docids,
+                    QueryProfile* profile);
+
+// phrase_prefix_query -- MATCH_PHRASE_PREFIX: the last item in `terms` is a
+// term prefix and preceding items are exact terms. For example {"quick", "bro"}
+// matches "quick brown" and "quick bronze". Empty terms -> empty result.
+Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx,
+                           const std::vector<std::string>& terms,
+                           std::vector<uint32_t>* const docids, int32_t max_expansions = 0);
+Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx,
+                           const std::vector<std::string>& terms,
+                           std::vector<uint32_t>* const docids, QueryProfile* profile,
+                           int32_t max_expansions = 0);
+
+} // namespace snii::query
diff --git a/be/src/snii/query/prefix_query.h b/be/src/snii/query/prefix_query.h
new file mode 100644
index 00000000000000..cd8dc5559f3232
--- /dev/null
+++ b/be/src/snii/query/prefix_query.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/query_profile.h"
+#include "snii/reader/logical_index_reader.h"
+
+// prefix_query -- MATCH_PREFIX semantics: enumerate dictionary terms with the
+// requested prefix, then return the sorted docid set containing any enumerated
+// term. Empty prefix enumerates all terms. No matching terms -> empty result.
+namespace snii::query {
+
+Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix,
+                    std::vector<uint32_t>* const docids, int32_t max_expansions = 0);
+Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix,
+                    std::vector<uint32_t>* const docids, QueryProfile* profile,
+                    int32_t max_expansions = 0);
+Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix,
+                    DocIdSink* const sink, int32_t max_expansions = 0);
+
+} // namespace snii::query
diff --git a/be/src/snii/query/query_profile.h b/be/src/snii/query/query_profile.h
new file mode 100644
index 00000000000000..a4988f6a80c8d1
--- /dev/null
+++ b/be/src/snii/query/query_profile.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+
+#include "snii/io/io_metrics.h"
+
+namespace snii::io {
+class FileReader;
+}
+
+namespace snii::query {
+
+struct QueryProfile {
+    uint64_t elapsed_ns = 0;
+    bool has_io_metrics = false;
+    snii::io::IoMetrics io_before;
+    snii::io::IoMetrics io_after;
+    snii::io::IoMetrics io_delta;
+};
+
+class QueryProfileScope {
+public:
+    QueryProfileScope(snii::io::FileReader* reader, QueryProfile* profile);
+    ~QueryProfileScope();
+    QueryProfileScope(const QueryProfileScope&) = delete;
+    QueryProfileScope& operator=(const QueryProfileScope&) = delete;
+
+    void finish();
+
+private:
+    snii::io::FileReader* reader_ = nullptr;
+    QueryProfile* profile_ = nullptr;
+    std::chrono::steady_clock::time_point start_;
+    bool finished_ = false;
+};
+
+} // namespace snii::query
diff --git a/be/src/snii/query/regexp_query.h b/be/src/snii/query/regexp_query.h
new file mode 100644
index 00000000000000..a088ed42dcc1f8
--- /dev/null
+++ b/be/src/snii/query/regexp_query.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/query_profile.h"
+#include "snii/reader/logical_index_reader.h"
+
+// regexp_query -- MATCH_REGEXP semantics over dictionary terms. The pattern is
+// evaluated with std::regex_match, so it must match the whole term. Matching
+// terms are executed as a sorted deduplicated docid union.
+namespace snii::query {
+
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    std::vector<uint32_t>* const docids, int32_t max_expansions = 0);
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    std::vector<uint32_t>* const docids, QueryProfile* profile,
+                    int32_t max_expansions = 0);
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    DocIdSink* const sink, int32_t max_expansions = 0);
+
+} // namespace snii::query
diff --git a/be/src/snii/query/scoring_query.h b/be/src/snii/query/scoring_query.h
new file mode 100644
index 00000000000000..dc2ea75f0751e7
--- /dev/null
+++ b/be/src/snii/query/scoring_query.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/bm25_scorer.h"
+#include "snii/reader/logical_index_reader.h"
+#include "snii/stats/snii_stats_provider.h"
+
+// scoring_query -- top-K BM25 scored retrieval over one logical index for one or
+// more query terms. Two entry points produce IDENTICAL rankings:
+//   - scoring_query_exhaustive(): scores every candidate document (the baseline
+//     correctness oracle).
+//   - scoring_query_wand(): a block-max / WAND-style optimization that uses the
+//     per-window max_freq / max_norm columns from the frq_prelude to bound each
+//     window's best possible score and SKIP windows that cannot enter the
+//     current top-K. A window without block-max stats (slim/inline entries or a
+//     missing prelude) is never pruned, so the result still equals the
+//     exhaustive ranking.
+//
+// Results are sorted by score descending; ties are broken by ascending docid so
+// the ordering is deterministic and the two paths compare equal.
+namespace snii::query {
+
+// One scored hit.
+struct ScoredDoc {
+    uint32_t docid = 0;
+    double score = 0.0;
+};
+
+// Exhaustive baseline: score every doc that contains any query term, return the
+// top-k by score. params controls k1/b. Unknown terms are skipped.
+Status scoring_query_exhaustive(const snii::reader::LogicalIndexReader& idx,
+                                const snii::stats::SniiStatsProvider& stats,
+                                const std::vector<std::string>& terms, uint32_t k,
+                                const Bm25Params& params, std::vector<ScoredDoc>* out);
+
+// WAND-style block-max pruning. MUST return the same top-k as the exhaustive
+// path. Windows whose block-max upper bound cannot beat the current k-th score
+// are skipped; windows lacking block-max stats are scored fully.
+Status scoring_query_wand(const snii::reader::LogicalIndexReader& idx,
+                          const snii::stats::SniiStatsProvider& stats,
+                          const std::vector<std::string>& terms, uint32_t k,
+                          const Bm25Params& params, std::vector<ScoredDoc>* out);
+
+// SELECTIVE-FETCH block-max WAND (design spec section 5, "Phase C"). Same WAND /
+// theta / >= tie machinery as scoring_query_wand, but it DEFERS the .frq window
+// fetch: for each windowed term it first reads ONLY the frq_prelude (block-max
+// columns), then fetches a term's .frq window lazily and at most once -- and ONLY
+// when the running block-max bound proves a doc in that window can still reach the
+// top-K (bound >= theta). A window the bound rules out is never fetched. The
+// result (top-K docids AND scores, INCLUDING ties) is byte-identical to
+// scoring_query_exhaustive / scoring_query_wand; only the bytes read differ.
+// Slim/inline terms (no prelude) are fetched fully, as in scoring_query_wand.
+Status scoring_query_wand_selective(const snii::reader::LogicalIndexReader& idx,
+                                    const snii::stats::SniiStatsProvider& stats,
+                                    const std::vector<std::string>& terms, uint32_t k,
+                                    const Bm25Params& params, std::vector<ScoredDoc>* out);
+
+} // namespace snii::query
diff --git a/be/src/snii/query/term_query.h b/be/src/snii/query/term_query.h
new file mode 100644
index 00000000000000..c804405a2ec104
--- /dev/null
+++ b/be/src/snii/query/term_query.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/query_profile.h"
+#include "snii/reader/logical_index_reader.h"
+
+// term_query -- the simplest SNII query: return the sorted docid set that
+// contains term. It runs the term lookup on the logical index, then issues a
+// single batched .frq range read (one serial round) to decode the postings.
+// Absent term -> empty result (OK status).
+namespace snii::query {
+
+Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term,
+                  std::vector<uint32_t>* docids);
+Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term,
+                  DocIdSink* sink);
+Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term,
+                  std::vector<uint32_t>* docids, QueryProfile* profile);
+
+} // namespace snii::query
diff --git a/be/src/snii/query/wildcard_query.h b/be/src/snii/query/wildcard_query.h
new file mode 100644
index 00000000000000..1cb0d5551dcf09
--- /dev/null
+++ b/be/src/snii/query/wildcard_query.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/query_profile.h"
+#include "snii/reader/logical_index_reader.h"
+
+// wildcard_query -- MATCH_WILDCARD semantics over dictionary terms. `*` matches
+// any byte sequence, `?` matches one byte, and all other bytes match literally.
+// Matching terms are executed as a sorted deduplicated docid union.
+namespace snii::query {
+
+Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                      std::vector<uint32_t>* const docids, int32_t max_expansions = 0);
+Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                      std::vector<uint32_t>* const docids, QueryProfile* profile,
+                      int32_t max_expansions = 0);
+Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                      DocIdSink* const sink, int32_t max_expansions = 0);
+
+} // namespace snii::query
diff --git a/be/src/snii/reader/logical_index_reader.h b/be/src/snii/reader/logical_index_reader.h
new file mode 100644
index 00000000000000..b10a5d7c7791f5
--- /dev/null
+++ b/be/src/snii/reader/logical_index_reader.h
@@ -0,0 +1,129 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/format/bsbf.h"
+#include "snii/format/dict_block.h"
+#include "snii/format/dict_block_directory.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/per_index_meta.h"
+#include "snii/format/sampled_term_index.h"
+#include "snii/format/stats_block.h"
+#include "snii/io/file_reader.h"
+
+// LogicalIndexReader -- read-side counterpart of LogicalIndexWriter for one
+// logical index. It owns the resident per-index meta sub-readers (XFilter,
+// SampledTermIndex, DICT block directory, StatsBlock, SectionRefs) parsed from
+// the per-index meta block, and resolves a query term to its DictEntry through
+// the documented lookup flow:
+//   XFilter (reject absent) -> SampledTermIndex (candidate block ordinal) ->
+//   DICT block directory (block range) -> resident small-DICT block or one
+//   range read of the DICT block -> DictBlockReader::find_term.
+//
+// lookup() also returns the block's frq_base/prx_base (captured by the
+// DictBlockReader) so callers can resolve a pod_ref entry's absolute .frq/.prx
+// offsets via the writer's contract. Both deltas index into the SAME
+// interleaved posting region (prx_base == frq_base; the prx span precedes the
+// frq span):
+//   abs_frq = posting_region.offset + frq_base + entry.frq_off_delta
+//   abs_prx = posting_region.offset + prx_base + entry.prx_off_delta
+//
+// The meta block bytes must outlive this reader (they are owned by the parent
+// SniiSegmentReader's resident meta region).
+namespace snii::reader {
+
+class LogicalIndexReader {
+public:
+    LogicalIndexReader() = default;
+
+    // Parses the per-index meta block and binds the reader to file_reader.
+    // file_reader / meta_block must outlive this reader.
+    static Status open(snii::io::FileReader* file_reader, snii::format::IndexTier tier,
+                       bool has_positions, Slice meta_block, LogicalIndexReader* out);
+
+    // Resolves term to a DictEntry. *found=false when the term is absent (XFilter
+    // rejection, out-of-range sample, or DICT-block miss). On a hit, *entry is
+    // filled and *frq_base / *prx_base carry the candidate block's bases.
+    Status lookup(std::string_view term, bool* found, snii::format::DictEntry* entry,
+                  uint64_t* frq_base, uint64_t* prx_base) const;
+
+    // One enumerated term whose key has the requested prefix, with its DictEntry
+    // and the owning DICT block's frq/prx bases (for posting resolution).
+    struct PrefixHit {
+        std::string term;
+        snii::format::DictEntry entry;
+        uint64_t frq_base = 0;
+        uint64_t prx_base = 0;
+    };
+
+    using PrefixHitVisitor = std::function<Status(PrefixHit&& hit, bool* stop)>;
+
+    // Ordered term enumeration: every term with `prefix`, in lexicographic order,
+    // by seeking the start DICT block via the SampledTermIndex and scanning
+    // forward across contiguous blocks until the terms pass the prefix range.
+    // Empty prefix enumerates all terms. This is the contiguous-DICT-block design
+    // the term-anchor layout was built for (MATCH_PHRASE_PREFIX / prefix / range
+    // queries). The visitor form avoids materializing all hits when callers only
+    // need a bounded expansion.
+    Status visit_prefix_terms(std::string_view prefix, const PrefixHitVisitor& visitor) const;
+    Status prefix_terms(std::string_view prefix, std::vector<PrefixHit>* const out,
+                        int32_t max_terms = 0) const;
+
+    // Resolves a pod_ref entry's absolute .frq / .prx window byte range,
+    // validating the locator against the posting_region length (defends against
+    // corrupt entries: prelude_len > frq_len underflow, or off_delta+len past the
+    // region). Both windows resolve against the single posting_region. *abs_off
+    // is the absolute file offset of the window (after prelude); *len its byte
+    // length.
+    Status resolve_frq_window(const snii::format::DictEntry& entry, uint64_t frq_base,
+                              uint64_t* abs_off, uint64_t* len) const;
+    Status resolve_prx_window(const snii::format::DictEntry& entry, uint64_t prx_base,
+                              uint64_t* abs_off, uint64_t* len) const;
+
+    const snii::format::SectionRefs& section_refs() const { return meta_.section_refs(); }
+    const snii::format::StatsBlock& stats() const { return meta_.stats(); }
+    snii::format::IndexTier tier() const { return tier_; }
+    bool has_positions() const { return has_positions_; }
+    snii::io::FileReader* reader() const { return reader_; }
+
+private:
+    snii::io::FileReader* reader_ = nullptr;
+    snii::format::IndexTier tier_ = snii::format::IndexTier::kT1;
+    bool has_positions_ = false;
+    snii::format::PerIndexMetaReader meta_;
+    snii::format::SampledTermIndexReader sti_;
+    snii::format::DictBlockDirectoryReader dbd_;
+    snii::format::BsbfHeader bsbf_header_; // resident header (from section ref)
+    bool has_bsbf_ = false;
+    // L0 tiering: when the bsbf section is small (<= kBsbfResidentMaxBytes) its
+    // whole bitset is loaded here at open -> in-memory probe, no per-lookup
+    // round. Empty => L1 (on-demand single-block probe via bsbf_probe).
+    bool bsbf_resident_ = false;
+    std::vector<uint8_t> bsbf_resident_bitset_;
+
+    // Small DICT blocks are opened once with the index so exact lookups avoid an
+    // otherwise serial S3 round for the term dictionary. Empty means the
+    // dictionary exceeded the resident threshold and lookup/prefix enumeration
+    // read blocks on demand. Each DictBlockReader holds a Slice into the owning
+    // bytes.
+    struct ResidentDictBlock {
+        std::vector<uint8_t> bytes;
+        snii::format::DictBlockReader reader;
+    };
+    struct OnDemandDictBlock {
+        std::vector<uint8_t> bytes;
+        snii::format::DictBlockReader reader;
+    };
+    Status load_resident_dict_blocks();
+    Status dict_block_reader_for_ordinal(uint32_t ordinal, OnDemandDictBlock* on_demand,
+                                         const snii::format::DictBlockReader** out) const;
+    std::vector<ResidentDictBlock> resident_dict_blocks_;
+};
+
+} // namespace snii::reader
diff --git a/be/src/snii/reader/snii_segment_reader.h b/be/src/snii/reader/snii_segment_reader.h
new file mode 100644
index 00000000000000..fc725889a03f94
--- /dev/null
+++ b/be/src/snii/reader/snii_segment_reader.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/format/tail_meta_region.h"
+#include "snii/io/file_reader.h"
+#include "snii/reader/logical_index_reader.h"
+
+// SniiSegmentReader -- entry point for the SNII segment read path. It opens a
+// single .idx container through a (possibly metered) io::FileReader and exposes
+// its logical indexes. open() performs the minimal bootstrap reads:
+//   1. the fixed bootstrap header (front of the file),
+//   2. the fixed tail pointer (last tail_pointer_size() bytes), and
+//   3. the tail meta region (one range read located via the tail pointer).
+// The meta region bytes are held resident by the reader so per-index meta blocks
+// (returned as sub-views) remain valid for the reader's lifetime.
+//
+// open_index() then materializes one LogicalIndexReader from the per-index meta
+// block of a given (index_id, suffix); query functions operate on that reader.
+namespace snii::reader {
+
+class SniiSegmentReader {
+public:
+    SniiSegmentReader() = default;
+
+    // Reads bootstrap header + tail pointer + tail meta region from reader.
+    // reader must outlive the returned SniiSegmentReader and every
+    // LogicalIndexReader opened from it. reader == nullptr / out == nullptr ->
+    // InvalidArgument; structural problems -> Corruption / Unsupported.
+    static Status open(snii::io::FileReader* reader, SniiSegmentReader* out);
+
+    uint32_t n_logical_indexes() const { return region_reader_.n_logical_indexes(); }
+
+    // Loads the per-index meta block for (index_id, suffix) and builds a
+    // LogicalIndexReader bound to the same FileReader. Absent index -> NotFound.
+    Status open_index(uint64_t index_id, std::string_view suffix, LogicalIndexReader* out) const;
+
+    snii::io::FileReader* reader() const { return reader_; }
+
+private:
+    snii::io::FileReader* reader_ = nullptr;
+    std::vector<uint8_t> meta_region_; // owned resident copy of the tail meta region
+    snii::format::TailMetaRegionReader region_reader_;
+};
+
+} // namespace snii::reader
diff --git a/be/src/snii/reader/windowed_posting.h b/be/src/snii/reader/windowed_posting.h
new file mode 100644
index 00000000000000..e02e6e2831e05b
--- /dev/null
+++ b/be/src/snii/reader/windowed_posting.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/reader/logical_index_reader.h"
+
+// WindowedPostingReader -- shared read-side decode of a windowed term's posting
+// from its two-level frq_prelude + GROUPED dd-block / freq-block (design 1.6).
+//
+// A windowed pod_ref entry's .frq payload is laid out
+//   [prelude][dd-block][freq-block]
+// where the dd-block concatenates every window's dd_region and the freq-block
+// every window's freq_region. The docs-only prefix [prelude][dd-block] is ONE
+// contiguous run. This helper:
+//   1. range-fetches the prelude (prelude_len bytes) and parses the directory,
+//   2. range-fetches the WHOLE dd-block in ONE contiguous range
+//      (and, for scoring, the whole freq-block in one more range),
+//   3. decodes each window's dd region (and freq region) from the
+//      in-memory blocks via the prelude metadata
+//      (dd_off/dd_disk_len, freq_off/freq_disk_len),
+//      and concatenates the per-window docids / freqs /
+//      positions.
+//
+// The slim/inline single-window path is handled by the term/phrase/scoring
+// callers directly; this helper is for enc=windowed entries only.
+namespace snii::reader {
+
+// Coalesce gap (bytes) used when batch-fetching MULTIPLE dd sub-ranges of the
+// SAME term (the phrase window-skip path): dd regions of one term are
+// contiguous in the dd-block, so merging reads separated by <= this gap into
+// one physical Range GET trades a little over-read for fewer remote GETs (the
+// design's higher-priority metric). Only applied to same-term multi-window
+// batches, never to cross-term. At the current value (0) only abutting ranges merge.
+inline constexpr uint64_t kSameTermCoalesceGap = 0;
+
+// Full decoded posting for one windowed term (docids ascending across windows).
+struct DecodedPosting {
+    std::vector<uint32_t> docids;
+    std::vector<uint32_t> freqs;                  // aligned with docids
+    std::vector<std::vector<uint32_t>> positions; // aligned; empty when no prx
+};
+
+// Decodes the entire windowed posting. want_positions requires the index to
+// have positions (and the entry to carry prx). want_freq selects whether the
+// freq-block is fetched + decoded: when false ONLY the contiguous
+// [prelude][dd-block] prefix is fetched (docid-only / phrase callers) and
+// DecodedPosting.freqs stays empty; when true the freq-block is additionally
+// fetched (scoring). Returns Corruption on any prelude/block inconsistency
+// (doc-count mismatch, out-of-range offsets).
+Status read_windowed_posting(const LogicalIndexReader& idx, const snii::format::DictEntry& entry,
+                             uint64_t frq_base, uint64_t prx_base, bool want_positions,
+                             bool want_freq, DecodedPosting* out);
+
+// --- Sub-block (window) skipping helpers ---
+// (shared with phrase / selective WAND)
+//
+// These expose the per-window dd/freq/prx addressing within the grouped blocks
+// so the skip path can fetch ONLY the windows covering candidate docids (their
+// dd sub-ranges within the dd-block, near-contiguous and coalesce-friendly)
+// instead of the whole posting, without duplicating the offset arithmetic.
+
+// Absolute file byte ranges of one window's regions. dd is always valid; freq
+// is valid only when want_freq; prx is valid only when want_positions (and
+// has_prx).
+struct WindowAbsRange {
+    uint64_t dd_off = 0;
+    uint64_t dd_len = 0;
+    uint64_t freq_off = 0;
+    uint64_t freq_len = 0;
+    uint64_t prx_off = 0;
+    uint64_t prx_len = 0;
+};
+
+// Fetches + parses the two-level prelude of a windowed entry (one batched
+// read).
+Status fetch_windowed_prelude(const LogicalIndexReader& idx, const snii::format::DictEntry& entry,
+                              uint64_t frq_base, snii::format::FrqPreludeReader* prelude);
+
+// Computes the absolute file ranges of window w's dd region (and freq region
+// when want_freq, and .prx window when want_positions), fully validated against
+// the POD sections (anti-DoS: rejects out-of-range offsets and overflowing
+// locators).
+Status windowed_window_range(const LogicalIndexReader& idx, const snii::format::DictEntry& entry,
+                             uint64_t frq_base, uint64_t prx_base,
+                             const snii::format::FrqPreludeReader& prelude, uint32_t w,
+                             bool want_positions, bool want_freq, WindowAbsRange* out);
+
+// Decodes one window's docids (and per-doc positions when want_positions, and
+// per-doc freqs when want_freq) from already-fetched byte slices: dd_region is
+// the window's dd sub-slice; freq_region its freq sub-slice (ignored when
+// !want_freq); prx_window its .prx bytes. The decoded docids are absolute
+// (win_base applied). Returns Corruption on any doc-count mismatch between the
+// prelude, dd/freq and prx.
+Status decode_window_slices(const snii::format::WindowMeta& meta, Slice dd_region,
+                            Slice freq_region, Slice prx_window, bool want_positions,
+                            bool want_freq, std::vector<uint32_t>* docids,
+                            std::vector<uint32_t>* freqs,
+                            std::vector<std::vector<uint32_t>>* positions);
+
+} // namespace snii::reader
diff --git a/be/src/snii/stats/snii_stats_provider.h b/be/src/snii/stats/snii_stats_provider.h
new file mode 100644
index 00000000000000..12fdfa607bf0bd
--- /dev/null
+++ b/be/src/snii/stats/snii_stats_provider.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+
+#include "snii/common/status.h"
+#include "snii/format/norms_pod.h"
+#include "snii/reader/logical_index_reader.h"
+
+// SniiStatsProvider -- exposes the native SNII scoring statistics required by
+// BM25, sourced directly from the on-disk structures of one logical index:
+//   - segment-level counts (doc_count, indexed_doc_count, sum_total_term_freq)
+//     from the StatsBlock embedded in the per-index meta block.
+//   - per-term df / ttf from the term's DictEntry (resolved through the reader's
+//     lookup flow). The LogicalIndexWriter stores ttf directly in ttf_delta for
+//     tier>=T2 entries, so total_term_freq returns entry.ttf_delta.
+//   - per-doc length normalization byte (encoded_norm) from the norms POD,
+//     range-read once at open via section_refs().norms and parsed with
+//     NormsPodReader.
+//
+// avgdl() = sum_total_term_freq / max(1, indexed_doc_count): the average document
+// length used by BM25 length normalization. The provider performs no scoring; it
+// only surfaces the statistics so snii::query::Bm25Scorer can combine them.
+namespace snii::stats {
+
+class SniiStatsProvider {
+public:
+    SniiStatsProvider() = default;
+
+    // Binds to idx and materializes the norms POD (one range read) when the index
+    // carries scoring norms. idx must outlive this provider. A scoring index
+    // without a norms section, or a corrupt norms POD, returns a non-OK Status.
+    static Status open(const snii::reader::LogicalIndexReader* idx, SniiStatsProvider* out);
+
+    // Segment-level counts (direct StatsBlock fields).
+    uint64_t doc_count() const { return doc_count_; }
+    uint64_t indexed_doc_count() const { return indexed_doc_count_; }
+    uint64_t sum_total_term_freq() const { return sum_total_term_freq_; }
+
+    // Average document length: sum_total_term_freq / max(1, indexed_doc_count).
+    double avgdl() const;
+
+    // Per-term document frequency. Absent term -> *df = 0 (OK status).
+    Status doc_freq(std::string_view term, uint64_t* df) const;
+
+    // Per-term total term frequency (read directly from the entry's ttf_delta
+    // at tier>=T2). Absent term -> *ttf = 0 (OK status).
+    Status total_term_freq(std::string_view term, uint64_t* ttf) const;
+
+    // 1-byte encoded doc-length norm for docid (raw byte from the norms POD).
+    // Out-of-range docid -> InvalidArgument; index without norms -> InvalidArgument.
+    Status encoded_norm(uint32_t docid, uint8_t* out) const;
+
+    bool has_norms() const { return has_norms_; }
+
+private:
+    const snii::reader::LogicalIndexReader* idx_ = nullptr;
+    uint64_t doc_count_ = 0;
+    uint64_t indexed_doc_count_ = 0;
+    uint64_t sum_total_term_freq_ = 0;
+    bool has_norms_ = false;
+    // Owned copy of the framed norms section bytes; norms_reader_ borrows from it.
+    std::vector<uint8_t> norms_bytes_;
+    snii::format::NormsPodReader norms_reader_;
+};
+
+} // namespace snii::stats
diff --git a/be/src/snii/version.h b/be/src/snii/version.h
new file mode 100644
index 00000000000000..dd2bdef2af8e3e
--- /dev/null
+++ b/be/src/snii/version.h
@@ -0,0 +1,4 @@
+#pragma once
+#define SNII_VERSION_MAJOR 0
+#define SNII_VERSION_MINOR 1
+#define SNII_VERSION_STRING "0.1.0"
diff --git a/be/src/snii/writer/compact_posting_pool.h b/be/src/snii/writer/compact_posting_pool.h
new file mode 100644
index 00000000000000..ceeb150faffc4f
--- /dev/null
+++ b/be/src/snii/writer/compact_posting_pool.h
@@ -0,0 +1,180 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace snii::writer {
+
+// SEGMENTED BYTE ARENA with per-term SLICED runs (a ByteBlockPool, after Lucene).
+//
+// WHY: the SPIMI accumulator's bulk memory is the per-term posting bytes. Backing
+// each term with its own std::vector<uint8_t> pays two taxes that dominate peak
+// RSS at scale: (1) geometric-growth doubling slack (~1.17x of the live payload),
+// and (2) a 24-32 B vector/struct header per term (hundreds of thousands of
+// terms). This pool removes both: all term bytes live in a few large fixed-size
+// blocks (so slack is ~one block, amortized to ~1.05x), and a term needs only two
+// 32-bit cursors of live state (chain head for reads + write head for appends).
+//
+// HOW (slices): a term's bytes are not stored contiguously. They live in a chain
+// of SLICES of geometrically growing payload capacity (the kSliceSizes schedule:
+// 4, 8, 16, ... bytes of payload). Each slice is laid out as
+//   [ payload bytes ... ][ 4-byte forward pointer ]
+// The forward pointer holds the absolute offset of the next slice's first payload
+// byte (0 while the slice is still the tail of the chain). When a slice's payload
+// region fills, the writer allocates a larger slice, stores its head into the old
+// slice's 4 pointer bytes, and keeps appending. A reader walks the chain by
+// reading payload bytes until a slice boundary, then following the pointer.
+//
+// Both writer and reader recompute each slice's capacity from the chain's slice
+// INDEX (0, 1, 2, ...) via the deterministic schedule, so neither needs to store
+// per-slice sizes. The writer carries the current slice's end offset in its
+// SliceWriter handle; the reader recomputes capacities as it advances.
+//
+// Offsets are GLOBAL absolute byte indices into the logical concatenation of all
+// blocks: offset = block_index * kBlockSize + byte_in_block. kBlockSize is a power
+// of two, so offset -> (block, byte) is a shift/mask.
+class CompactPostingPool {
+public:
+    // Block size (power of two). 32 KiB blocks keep per-block tail waste tiny (it
+    // matters at the smaller 1M scale where the whole arena is only tens of MiB) and
+    // bound the outer vector<block> header cost; at the 5M scale a few thousand
+    // blocks is still cheap. Empirically the lowest peak across both scales.
+    static constexpr uint32_t kBlockShift = 15;
+    static constexpr uint32_t kBlockSize = 1u << kBlockShift; // 32 KiB
+    static constexpr uint32_t kBlockMask = kBlockSize - 1;
+
+    // Per-slice forward-pointer width (absolute uint32 next-slice offset).
+    static constexpr uint32_t kPtrBytes = 4;
+
+    // Geometric slice payload-capacity schedule and the level transition. Level i
+    // slices hold kSliceSizes[i] payload bytes; on overflow the chain advances to
+    // kNextLevel[i] (capping at the largest level). A GENTLE (~1.5x) many-level
+    // schedule starting small minimizes the over-allocated final slice (the
+    // dominant arena overhead) while keeping the per-slice forward-pointer count
+    // bounded for high-df chains.
+    static constexpr int kLevelCount = 16;
+
+    CompactPostingPool();
+
+    CompactPostingPool(const CompactPostingPool&) = delete;
+    CompactPostingPool& operator=(const CompactPostingPool&) = delete;
+
+    // Payload capacity (bytes) of a fresh level-0 slice. Exposed for tests that need
+    // to fill exactly one slice without hardcoding the schedule.
+    static uint32_t kSliceSizes_level0();
+
+    // Payload capacity of the slice at `level`, and the level a chain advances to when
+    // that slice overflows. Exposed (like kSliceSizes_level0) so tests can simulate the
+    // arena's bump allocator exactly -- e.g. to construct an EXACT block-boundary fill --
+    // without hardcoding the private schedule. `level` must be in [0, kLevelCount).
+    static uint32_t kSliceSize_at(int level);
+    static uint8_t kNextLevel_at(int level);
+
+    // Live append handle for one term's chain. POD, 8 bytes: the absolute write
+    // cursor and the absolute end of the current slice's payload region. The chain's
+    // current slice LEVEL is kept by the caller (a uint8, packed alongside its other
+    // flags) so this handle stays 8 bytes -- shaving the per-term accumulator. `head`
+    // (the chain's first payload offset) is also stored by the CALLER (the read entry
+    // point); start_chain returns it.
+    struct SliceWriter {
+        uint32_t cur = 0;       // next byte to write (absolute)
+        uint32_t slice_end = 0; // one-past-last payload byte of the current slice
+    };
+
+    // Begins a fresh chain, initializing `w` to its first (level-0) slice and
+    // *level to 0, and returns the chain head (absolute first payload offset).
+    uint32_t start_chain(SliceWriter* w, uint8_t* level);
+
+    // Appends one payload byte to the chain described by `w` / `*level`, growing the
+    // chain with a new linked slice (and advancing *level) when the current slice's
+    // payload region is exhausted.
+    void append_byte(SliceWriter* w, uint8_t* level, uint8_t value);
+
+    // Total live payload bytes ever written across all chains (excludes slice
+    // forward-pointer overhead). Drives the spill-threshold estimate only.
+    uint64_t payload_bytes() const { return payload_bytes_; }
+
+    // Bytes the arena currently occupies (block_count * kBlockSize), computed as
+    // blocks_.size() << kBlockShift in 64-bit arithmetic. The pool addresses bytes with a
+    // uint32 offset (next_offset_), so the arena MUST stay below 4 GiB or alloc_run wraps
+    // and silently aliases block 0. The accumulator watches this to force a safety spill
+    // before the wrap; alloc_run also enforces it directly (throws std::overflow_error on a
+    // would-be wrap) so a direct user of the pool fails loudly rather than silently corrupting.
+    // Hard invariant: a single CompactPostingPool never exceeds UINT32_MAX bytes.
+    uint64_t arena_bytes() const { return static_cast<uint64_t>(blocks_.size()) << kBlockShift; }
+
+    // Releases ALL blocks back to the OS. Called after the accumulator is fully
+    // drained (or before a spill's next fill) so no input-side bytes stay resident.
+    void reset();
+
+    // ---- Reader ----------------------------------------------------------------
+    // Forward cursor over one term's chain, yielding its payload bytes in write
+    // order by walking the slice forward pointers.
+    //
+    // CONTRACT of the `budget` ctor argument (single, unambiguous meaning):
+    //   `budget` is an UPPER BOUND on the number of bytes this cursor may yield. It
+    //   is NOT required to equal the exact payload length: passing the exact length
+    //   is fine, and so is passing any value >= it (the production caller passes the
+    //   chain's write-head offset, which always bounds the payload from above). The
+    //   cursor is SELF-TERMINATING: once it walks off the last written byte it sees
+    //   the tail slice's zero forward pointer and stops, regardless of how much
+    //   budget remains. So an over-large budget can never make next() read past the
+    //   chain (no aliasing of block 0, no off-chain access) -- the budget is purely a
+    //   secondary cap. has_next() is therefore a reliable "more bytes remain"
+    //   predicate for ANY budget >= the true length: it becomes false at the smaller
+    //   of (budget exhausted, chain tail reached).
+    class Cursor {
+    public:
+        Cursor(const CompactPostingPool* pool, uint32_t head, uint64_t budget);
+
+        // True while a REAL payload byte remains: the budget is not spent AND the chain
+        // tail is not reached. At a slice boundary it peeks the forward pointer (zero ==
+        // tail) so no phantom trailing byte is reported. NOTE(review): confirm how unwritten
+        // bytes INSIDE a partly filled tail slice are excluded when budget > payload length.
+        bool has_next() const;
+        // Yields the next payload byte, or 0 once the chain tail is reached or the budget
+        // is spent (never past the chain). 0 is also a valid payload byte, so test has_next().
+        uint8_t next();
+
+    private:
+        const CompactPostingPool* pool_;
+        uint32_t cur_;       // absolute read cursor
+        uint32_t slice_end_; // one-past-last payload byte of the current slice
+        uint32_t level_;     // current slice level (selects the slice size / next level)
+        uint64_t budget_;    // remaining byte budget (upper bound on bytes to yield)
+    };
+
+    // Builds a cursor over the chain at `head`. `budget` is an UPPER BOUND on bytes to
+    // read (see Cursor's contract): the exact payload length or anything larger. The
+    // production caller passes the write-head offset, which always bounds the payload
+    // from above; the cursor self-terminates at the chain tail regardless.
+    Cursor cursor(uint32_t head, uint64_t budget) const { return Cursor(this, head, budget); }
+
+private:
+    static const uint32_t kSliceSizes[kLevelCount];
+    static const uint8_t kNextLevel[kLevelCount];
+    // Unchecked absolute-offset -> byte map: block off >> kBlockShift, byte off & kBlockMask.
+    uint8_t* at(uint32_t off) { return &blocks_[off >> kBlockShift][off & kBlockMask]; }
+    const uint8_t* at(uint32_t off) const { return &blocks_[off >> kBlockShift][off & kBlockMask]; }
+
+    // Reads/writes the 4-byte forward pointer at the END of a slice whose payload
+    // region ends at `slice_end` (pointer occupies [slice_end, slice_end+4)).
+    uint32_t read_ptr(uint32_t slice_end) const;
+    void write_ptr(uint32_t slice_end, uint32_t next_head);
+
+    // Reserves `bytes` contiguous bytes from the arena tail (a fresh block if the
+    // current tail cannot hold them) and returns the first reserved absolute offset.
+    // `bytes` must be <= kBlockSize.
+    uint32_t alloc_run(uint32_t bytes);
+
+    // Allocates a slice at `level` (payload region + 4 pointer bytes), zeroes its
+    // forward pointer, and returns the first payload offset; sets *slice_end.
+    uint32_t alloc_slice(int level, uint32_t* slice_end);
+
+    std::vector<std::vector<uint8_t>> blocks_; // fixed kBlockSize blocks
+    uint32_t next_offset_ = 0;                 // global bump pointer (absolute) into the tail block
+    uint64_t payload_bytes_ = 0;
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/logical_index_writer.h b/be/src/snii/writer/logical_index_writer.h
new file mode 100644
index 00000000000000..03fbe7994918a7
--- /dev/null
+++ b/be/src/snii/writer/logical_index_writer.h
@@ -0,0 +1,238 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/dict_block.h"
+#include "snii/format/dict_block_directory.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/per_index_meta.h"
+#include "snii/format/sampled_term_index.h"
+#include "snii/format/stats_block.h"
+#include "snii/io/file_writer.h"
+#include "snii/writer/memory_reporter.h"
+#include "snii/writer/spillable_byte_buffer.h"
+#include "snii/writer/spimi_term_buffer.h"
+
+// LogicalIndexWriter -- builds the per-logical-index section bytes (interleaved
+// posting region + DICT block region) and the meta sub-sections (SampledTermIndex,
+// DICT block directory, StatsBlock, XFilter) for ONE logical index. It streams the
+// posting region straight to the output, stages the DICT region (dict_buf_), and owns
+// the remaining section bytes + metadata the container orchestrator (SniiCompoundWriter)
+// needs to resolve absolute offsets and emit the per-index meta block.
+//
+// This module deliberately produces ONLY relative bytes/structures: it has no
+// knowledge of the absolute file position where the sections will land. The
+// orchestrator stitches the absolute offsets in afterward (append-only, no
+// seek-back). See snii_compound_writer.h for the precise offset contract.
+//
+// POSTING REGION (single interleaved sink): the former separate .frq POD and .prx
+// POD are merged into ONE posting region. For each pod_ref term, in term order, the
+// writer appends its prx span FIRST then its frq span, contiguously:
+//   posting region = concat over pod_ref terms of [prx span][frq span].
+// The prx span is empty when !has_prx (docs-only / keyword tier). INLINE terms
+// append NOTHING to the posting region.
+//
+// Per-term encoding policy (v1):
+//   df >= kSlimDfThreshold (512): WINDOWED pod_ref. The term's [prx windows] are
+//     appended to the posting region first, then its [prelude][dd-block][freq-block]
+//     frq span. The DictEntry records frq/prx off_delta+len relative to
+//     frq_base/prx_base (see below).
+//   df < kSlimDfThreshold: SLIM. The postings are encoded as a single .frq
+//     window (and .prx window). If the encoded .frq bytes are small
+//     (<= kDefaultInlineThreshold), they are stored INLINE inside the DictEntry
+//     (kind=inline); otherwise the term's [prx][frq] spans are appended to the
+//     posting region as a slim pod_ref (kind=pod_ref, enc=slim, no prelude).
+//
+// frq_base / prx_base convention (DOCUMENTED CONTRACT):
+//   For each DICT block, frq_base == prx_base == the running byte offset into THIS
+//   index's posting region at the moment the block opens (the posting-region size
+//   when the block's first POD-backed entry is appended). A windowed/slim pod_ref
+//   entry then sets frq_off_delta = (offset of its frq span within the posting
+//   region) - frq_base, so the reader computes the absolute file offset as
+//     section_refs.posting_region.offset + frq_base + frq_off_delta.
+//   prx_base / prx_off_delta follow the identical rule against the SAME region.
+//   Because [prx][frq] are written contiguously per term, a writer-side property
+//   holds when has_prx: frq_off_delta == prx_off_delta + prx_len. The reader does
+//   NOT rely on it -- each delta is resolved independently.
+//   Inline entries carry no off_delta (bytes live in the entry).
+namespace snii::writer {
+
+// Inputs describing one logical index to be written.
+struct SniiIndexInput {
+    uint64_t index_id = 0;
+    std::string index_suffix;
+    snii::format::IndexConfig config = snii::format::IndexConfig::kDocsPositions;
+    uint32_t doc_count = 0;
+    std::vector<uint32_t> null_docids;
+    // Per-doc 1-byte encoded norm (length doc_count); only consumed when the config has
+    // scoring, else may be empty. LogicalIndexWriter keeps a reference member (encoded_norms_).
+    std::vector<uint8_t> encoded_norms;
+    // Lexicographically sorted terms with ascending-docid postings. Used when `term_source`
+    // is null (e.g. unit tests holding a materialized vector). Not copied: LogicalIndexWriter
+    // keeps a reference member (terms_), presumably bound here: keep this input alive for build().
+    std::vector<TermPostings> terms;
+    // Optional streaming term source. When non-null, the writer DRAINS it via
+    // SpimiTermBuffer::for_each_term_sorted so that only one term's postings is
+    // materialized at a time (avoiding the full TermPostings vector and its
+    // second-copy peak). `terms` is ignored when this is set. The buffer is
+    // consumed (emptied) by build(); the caller must keep it alive until build()
+    // returns and must not reuse it afterwards.
+    SpimiTermBuffer* term_source = nullptr;
+    // Target DICT block size in bytes; a block is cut once its estimate reaches
+    // this. 0 uses kDefaultTargetDictBlockBytes. Smaller values yield more blocks
+    // (and a finer-grained sampled-term index).
+    uint32_t target_dict_block_bytes = 0;
+    // Optional writer-level build-RAM reporter (one per SniiCompoundWriter = one
+    // segment inverted index). When non-null, the dict buffer reports its REAL
+    // resident-byte deltas (positive on grow, negative on spill). The SPIMI side
+    // (arena + slot index) reports through the SAME reporter, injected directly at
+    // the term_source's construction by the caller. null in bench / unit tests -> no
+    // reporting. NEVER report live_bytes_ (a gated estimate); report
+    // arena_bytes()+slot_of_+dict ram_bytes_.
+    MemoryReporter* mem_reporter = nullptr;
+};
+
+// Builds and holds the section bytes + meta sub-sections for one logical index.
+class LogicalIndexWriter {
+public:
+    explicit LogicalIndexWriter(const SniiIndexInput& in);
+
+    // Builds DICT blocks, the interleaved posting region, sampled-term index, dict
+    // directory, stats and bsbf. The posting region is written STRAIGHT into
+    // `posting_out` as terms are produced (no temp round-trip for the bulk); the
+    // orchestrator captures its absolute offset/length from posting_out->bytes_written()
+    // around this call. Must be called once before the accessors below. Returns
+    // InvalidArgument on a null sink or inconsistent input (e.g. norms/doc_count
+    // mismatch when scoring is enabled, or non-ascending docids).
+    Status build(snii::io::FileWriter* posting_out);
+
+    // DICT region byte length (relative; orchestrator decides its absolute offset). The
+    // DICT region (zstd-compressed blocks) is built into a tiered buffer during build()
+    // -- it must land contiguously AFTER the posting region (streamed concurrently), so
+    // it cannot stream directly. The buffer stays in RAM while small (spill-only build)
+    // and spills to a temp once it crosses the RAM cap (bounded peak RSS for a huge
+    // dict). Its bytes are emitted via stream_dict_region_into below. The posting region
+    // went straight to the output during build(), so it has no length accessor here --
+    // the orchestrator measures it directly. norms stays in RAM (1 byte/doc).
+    uint64_t dict_region_size() const { return dict_buf_.size(); }
+    const std::vector<uint8_t>& norms_bytes() const { return norms_section_; }
+    const std::vector<uint8_t>& null_bitmap_bytes() const { return null_bitmap_section_; }
+    // Block-split bloom XFilter blob ([28B header][bitset]); empty when no terms.
+    const std::vector<uint8_t>& bsbf_bytes() const { return bsbf_bytes_; }
+    bool has_bsbf() const { return !bsbf_bytes_.empty(); }
+    bool has_null_bitmap() const { return !null_bitmap_section_.empty(); }
+
+    // Streams the DICT region (RAM or spilled temp) into the append-only container
+    // after its posting region.
+    Status stream_dict_region_into(snii::io::FileWriter* out) const {
+        return dict_buf_.stream_into(out);
+    }
+
+    bool has_prx() const { return has_prx_; }
+    bool has_norms() const { return has_norms_; }
+    snii::format::IndexTier tier() const { return tier_; }
+    uint64_t index_id() const { return index_id_; }
+    const std::string& index_suffix() const { return index_suffix_; }
+
+    // Builds the per-index meta block bytes given the resolved ABSOLUTE section
+    // refs (filled by the orchestrator), appending them to out. The DICT block
+    // directory entries are rebased to absolute offsets using dict_region_offset.
+    Status finish_meta(const snii::format::SectionRefs& abs_refs, uint64_t dict_region_offset,
+                       ByteSink* out) const;
+
+private:
+    // One DICT block's directory record. The block's serialized bytes are appended to
+    // dict_buf_ (RAM or spilled temp) as soon as the block is cut; only this compact summary
+    // (offset within the dict region + length + flags + entry count + checksum + first
+    // term) is kept to build the DICT block directory at finish_meta time. The absolute
+    // file offset is computed as dict_region_offset + rel_offset.
+    struct BlockRecord {
+        uint64_t rel_offset = 0; // byte offset of this block within the dict region
+        uint64_t length = 0;     // ON-DISK block length (compressed when flags&kZstd)
+        uint32_t n_entries = 0;
+        uint32_t checksum = 0;   // crc32c of the UNCOMPRESSED block bytes
+        uint8_t flags = 0;       // block_ref_flags::* (kZstd when block is compressed)
+        uint64_t uncomp_len = 0; // uncompressed block length (when flags&kZstd)
+        std::string first_term;
+    };
+
+    // Validates one term's shape (parallel lengths, strictly ascending docids).
+    Status validate_term(const TermPostings& tp) const;
+    // Iterates terms (from the streaming source or the materialized vector),
+    // splitting DICT blocks by target size and filling the posting region + blocks_.
+    Status build_blocks();
+    // Running state of the currently open DICT block; only forward-declared here.
+    struct BlockState;
+    // Per-term driver shared by both the streaming and materialized paths: validates the
+    // term, opens a block if needed, builds its DictEntry, and cuts the block once it
+    // reaches the target size, mutating *st. `tp` is a mutable reference because the
+    // encode FREES the term's large flat arrays (docids/freqs/positions_flat) as soon as
+    // they are consumed, so the widest term's source never co-exists with its output at peak RSS.
+    Status process_term(TermPostings& tp, BlockState* st);
+    // Region-relative byte count of the posting bytes written so far (the offset basis
+    // for frq_base/prx_base + frq_off_delta/prx_off_delta). During build() the only
+    // writes to posting_out_ are this index's posting region, so the count is the output
+    // offset advanced since the region began. Dereferences posting_out_, so it must be set first.
+    uint64_t posting_size() const { return posting_out_->bytes_written() - posting_off0_; }
+    // Builds one DictEntry (inline or pod_ref), growing the posting region as needed.
+    Status build_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                       snii::format::DictEntry* e);
+    // Builds a windowed (df >= kSlimDfThreshold) entry: multi-window + two-level
+    // prelude. The term's [prx span][frq span] is appended to the posting region.
+    Status build_windowed_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                                snii::format::DictEntry* e);
+    // Builds a slim (df < kSlimDfThreshold) entry: single window, inline or
+    // pod_ref, no prelude.
+    Status build_slim_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                            snii::format::DictEntry* e);
+    // Serializes the current open block, appends its bytes to the DICT staging buffer
+    // (dict_buf_), and records a compact BlockRecord (no block bytes retained here).
+    Status flush_block(snii::format::DictBlockBuilder* block, std::string first_term);
+
+    uint64_t index_id_;
+    std::string index_suffix_;
+    snii::format::IndexTier tier_;
+    bool has_prx_;
+    bool has_freq_; // tier >= T2: a freq region is encoded per window
+    bool has_norms_;
+    uint32_t doc_count_;
+    std::vector<uint32_t> null_docids_;
+    const std::vector<TermPostings>& terms_; // borrowed, materialized fallback (may be empty)
+    SpimiTermBuffer* term_source_;           // streaming source (null => use terms_)
+    uint64_t term_count_ = 0;                // distinct terms actually consumed
+    const std::vector<uint8_t>& encoded_norms_;
+
+    uint32_t target_dict_block_bytes_;
+    // The DICT region (zstd-compressed blocks) is staged here as blocks flush. It must
+    // land contiguously AFTER the posting region (which streams concurrently to the
+    // output), so it cannot stream directly; the orchestrator streams it into the
+    // container right after the posting region. It has NO independent local cap -- it
+    // spills to a temp via the writer's UNIFIED gate-2 cap (the MemoryReporter from
+    // SniiIndexInput, null off-Doris), the same single cap the SPIMI arena uses, so one
+    // threshold bounds the writer's total build RAM. The dict self-reports its ram_bytes_
+    // deltas; the SPIMI term_source self-reports its arena+slot deltas (its reporter is
+    // injected at the source's own construction by the caller).
+    SpillableByteBuffer dict_buf_;
+    // The interleaved [prx][frq] posting region streams STRAIGHT into the container
+    // output during build() -- no temp. posting_out_ is the container writer (borrowed
+    // for the duration of build); posting_off0_ is its absolute offset when this index's
+    // region began, so posting_size() = bytes_written() - posting_off0_.
+    snii::io::FileWriter* posting_out_ = nullptr;
+    uint64_t posting_off0_ = 0;
+    std::vector<uint8_t> norms_section_;
+    std::vector<uint8_t> null_bitmap_section_;
+
+    std::vector<BlockRecord> blocks_;
+    // One 8-byte XXH64 (seed 0) filter key per term, collected during the build pass
+    // so the whole-vocabulary string copy is never retained.
+    std::vector<uint64_t> term_hashes_;
+    snii::format::StatsBlock stats_;
+    std::vector<uint8_t> bsbf_bytes_; // serialized block-split bloom XFilter section
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/memory_reporter.h b/be/src/snii/writer/memory_reporter.h
new file mode 100644
index 00000000000000..e9352d43d18e61
--- /dev/null
+++ b/be/src/snii/writer/memory_reporter.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <utility>
+
+namespace snii::writer {
+
+// Per-WRITER byte counter for build-time RAM (one per SniiCompoundWriter = one per
+// segment's inverted index). Modules report their own resident-byte deltas, so
+// current_bytes() is the writer's live usage; over_cap() compares it with cap_bytes as the
+// unified gate-2 spill trigger (gate 1 belongs to Doris). consume_release mirrors each
+// delta into Doris's LOAD MemTracker so MemTableMemoryLimiter's pressure decision counts
+// the inverted-index RAM; it is null off-Doris (bench / unit tests): local atomic only.
+class MemoryReporter {
+public:
+    using ConsumeReleaseFn = std::function<void(int64_t delta)>; // null off-Doris
+    // cap_bytes is the UNIFIED gate-2 buffer cap for the WHOLE writer (e.g. Doris's
+    // 512 MiB inverted-index buffer config); 0 = unlimited. Every build buffer of this
+    // writer (SPIMI arena + dict) self-spills when over_cap() is true -- one threshold on
+    // the unified total, not a separate per-buffer threshold.
+    explicit MemoryReporter(ConsumeReleaseFn consume_release = nullptr, uint64_t cap_bytes = 0)
+            : consume_release_(std::move(consume_release)), cap_bytes_(cap_bytes) {}
+
+    MemoryReporter(const MemoryReporter&) = delete;
+    MemoryReporter& operator=(const MemoryReporter&) = delete;
+
+    // delta > 0 grows, delta < 0 shrinks/frees. Exactly one report per change site.
+    void report(int64_t delta) {
+        current_.fetch_add(delta, std::memory_order_relaxed);
+        if (consume_release_) consume_release_(delta); // mirror into Doris load tracker
+    }
+
+    int64_t current_bytes() const { return current_.load(std::memory_order_relaxed); }
+
+    // True once the writer's UNIFIED total build RAM reaches the cap: the single gate-2
+    // trigger shared by every buffer. Compared unsigned so a cap above INT64_MAX cannot wrap.
+    bool over_cap() const {
+        const int64_t cur = current_bytes();
+        return cap_bytes_ != 0 && cur > 0 && static_cast<uint64_t>(cur) >= cap_bytes_;
+    }
+    uint64_t cap_bytes() const { return cap_bytes_; }
+
+private:
+    std::atomic<int64_t> current_ {0};
+    ConsumeReleaseFn consume_release_;
+    uint64_t cap_bytes_ = 0;
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/snii_compound_writer.h b/be/src/snii/writer/snii_compound_writer.h
new file mode 100644
index 00000000000000..bd3a7c454026ad
--- /dev/null
+++ b/be/src/snii/writer/snii_compound_writer.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/io/file_writer.h"
+#include "snii/writer/logical_index_writer.h"
+
+// SniiCompoundWriter -- orchestrates a single-segment SNII container for one or
+// more logical indexes, written front-to-back through an append-only
+// io::FileWriter (no seek-back). It resolves all back-references by writing the
+// tail meta region and the fixed tail pointer LAST.
+//
+// CONTAINER LAYOUT PRODUCED (this is the on-disk contract the reader matches):
+//   [bootstrap_header]                          (kBootstrapHeaderSize bytes)
+//   for each logical index, in add order:
+//     [posting region]       interleaved [prx][frq] per pod_ref term, term order
+//                            (prx span empty when !has_prx)
+//     [DICT blocks region]   concatenated DICT blocks, split by
+//                            target_dict_block_bytes
+//   for each logical index, in add order:
+//     [norms POD]            NormsPodWriter::finish (scoring only; else absent)
+//     [null bitmap POD]      NullBitmapWriter::finish (when nulls exist)
+//   [tail_meta_region]       one per_index_meta block per index + directory
+//   [tail_pointer]           encode_tail_pointer at EOF
+//
+// (The posting region is streamed BEFORE the DICT region per index: postings are
+// the large append-only term-ordered stream; the DICT region is the compact
+// compressed trailer.)
+//
+// OFFSET CONVENTIONS (ABSOLUTE file offsets unless stated otherwise):
+//   - SectionRefs in each per_index_meta record ABSOLUTE file offset+length of
+//     that index's posting_region, dict_region, norms. Absent regions are (0,0)
+//     (e.g. norms for a docs-positions index; null_bitmap is always (0,0) in v1).
+//     A present-but-empty posting_region (all-INLINE index) is (off, 0).
+//   - DictBlockDirectory entries record each DICT block's ABSOLUTE file offset +
+//     length.
+//   - A windowed/slim pod_ref entry's absolute .frq offset =
+//       section_refs.posting_region.offset + frq_base + frq_off_delta
+//     where frq_base is the posting-region-relative running offset captured at the
+//     block's open (see logical_index_writer.h). prx follows the identical rule
+//     against the SAME region (prx_base == frq_base).
+//   - tail_pointer.meta_region_offset/length point at the tail_meta_region;
+//     hot_off = 0 (no hot region in v1).
+namespace snii::writer {
+
+class SniiCompoundWriter {
+public:
+    explicit SniiCompoundWriter(snii::io::FileWriter* out);
+
+    // Builds one logical index. Its posting region streams into `out` DURING this call
+    // (see placements_). NOTE(review): confirm whether the DICT region is emitted here too.
+    Status add_logical_index(const SniiIndexInput& in);
+
+    // Writes every section not already streamed by add_logical_index, ending with the tail
+    // meta region + tail pointer, then finalizes the underlying writer. May be called once.
+    Status finish();
+
+private:
+    // Absolute placement of one index's sections, resolved during finish().
+    struct Placement {
+        uint64_t dict_off = 0;
+        uint64_t dict_len = 0;
+        uint64_t post_off = 0; // interleaved [prx][frq] posting region (was frq + prx)
+        uint64_t post_len = 0;
+        uint64_t norms_off = 0;
+        uint64_t norms_len = 0;
+        uint64_t null_off = 0;
+        uint64_t null_len = 0;
+        uint64_t bsbf_off = 0;
+        uint64_t bsbf_len = 0;
+    };
+
+    Status ensure_bootstrap();
+    Status write_bootstrap();
+    Status write_norms();
+    Status write_tail();
+    Status append(const std::vector<uint8_t>& bytes);
+
+    snii::io::FileWriter* out_;
+    std::vector<std::unique_ptr<LogicalIndexWriter>> indexes_;
+    // Per-index placement; post_off/post_len are filled as each index's posting region
+    // streams in during add_logical_index, the rest during finish(). The absolute write
+    // offset is out_->bytes_written() (the single source of truth -- no separate cursor).
+    std::vector<Placement> placements_;
+    bool bootstrap_written_ = false;
+    bool finished_ = false;
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/spill_run_codec.h b/be/src/snii/writer/spill_run_codec.h
new file mode 100644
index 00000000000000..d79381aa67184f
--- /dev/null
+++ b/be/src/snii/writer/spill_run_codec.h
@@ -0,0 +1,181 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/writer/spimi_term_buffer.h"
+
+namespace snii::writer {
+
+// On-disk SPIMI "run" codec for the spill / k-way-merge out-of-core build path.
+//
+// A RUN is a self-describing file holding a sequence of terms keyed by TERM-ID,
+// each followed by its postings, in this exact wire layout. The file is produced
+// and consumed by THIS module only (a private temp file -- the on-disk INDEX is
+// unaffected), so the format is chosen for cheap I/O: docids, freqs and positions
+// are ALL RAW fixed-width little-endian u32 BLOCKS (bulk memcpy on both ends,
+// ~10x cheaper than per-value varint -- which cost ~1.5s of encode CPU over the
+// 5M build's ~60M docids and compressed those streams poorly anyway). Decode
+// still validates every length against the file size.
+//
+//   run := record*                       (term-ids ordered by vocab string,
+//                                          strictly ascending within a run)
+//   record :=
+//     VInt term_id                       (index into the shared vocabulary; the
+//                                          string is NOT stored -- smaller runs,
+//                                          no per-record string IO)
+//     VInt n_docs
+//     u32  docid   * n_docs              (RAW LE block, memcpy; ABSOLUTE ascending
+//                                          docids -- the merge concatenates across
+//                                          runs and re-deltas at index encode time)
+//     u32  freq    * n_docs              (RAW LE block, memcpy; each >= 1)
+//     VInt n_pos                         (== sum(freqs) when has_positions, else 0)
+//     u32  position * n_pos              (RAW LE block, document-order, partitioned
+//                                          by freqs)
+//
+// Decode is fully STREAMED: a RunReader reads a small fixed buffer at a time and
+// materializes only the CURRENT term's postings, never the whole run. The k-way
+// merge keeps one heap slot per run (each holding only its current term-id +
+// that term's postings), so peak memory is bounded by the widest single term
+// summed across the runs that contain it -- not by total postings. The merge
+// orders runs by the term-id's VOCAB STRING (resolved via the shared vocabulary)
+// so the merged stream is lexicographic.
+
+// Writes a sequence of terms (keyed by term-id) to one run file. Term-ids must be handed
+// to write_term in vocab-string ascending order (the spill caller sorts before spilling).
+// close() flushes and closes the file (NOTE(review): confirm ~RunWriter also closes fd_);
+// a partial file is left for the owning SpimiTermBuffer to delete via its temp-path list.
+class RunWriter {
+public:
+    RunWriter() = default;
+    ~RunWriter();
+
+    RunWriter(const RunWriter&) = delete;
+    RunWriter& operator=(const RunWriter&) = delete;
+
+    // Opens `path` for writing (truncating). Returns IoError on failure.
+    Status open(const std::string& path);
+
+    // Appends one term's postings under `term_id`. `tp.positions_flat` must be empty
+    // iff !has_positions (and otherwise hold sum(freqs) entries in doc order).
+    // Caller guarantees ascending docids and parallel docids/freqs lengths.
+    Status write_term(uint32_t term_id, const TermPostings& tp);
+
+    // Flushes the buffer and closes the file. Idempotent: repeated calls are safe.
+    Status close();
+
+private:
+    Status flush();
+
+    int fd_ = -1;
+    std::vector<uint8_t> buf_; // staging buffer; flushed in fixed-size chunks
+};
+
+// Streamed reader over one run file. After open() the first term is loaded;
+// current()/current_id() expose it; advance() loads the next (or marks
+// exhausted). Only the current term's postings live in memory at a time. The
+// current record's `term` string is left EMPTY -- runs store only the id; the
+// owner resolves the string via the shared vocabulary.
+//
+// LAZY POSITIONS (peak-RSS optimization for the widest merged term): advance()
+// loads term_id / docids / freqs and the position-block COUNT, but does NOT read
+// the position bytes -- it leaves the decode window cursor parked at the start of
+// the position block. The owner then chooses, per term:
+//   * materialize_positions(): bulk-reads the block into current().positions_flat
+//     (the default; behaves exactly as the old eager reader).
+//   * stream_positions(dst, n): pulls the next n positions straight from the
+//     window in 64 KiB chunks, never materializing the whole block -- used by the
+//     k-way merge's wide-term position pump so the widest term's tens-of-MiB
+//     positions buffer is never resident.
+// advance() drains any positions left unread from the previous term before the
+// next record, so a partly-streamed (or skipped) term still lands at the right
+// record boundary. The yielded byte sequence is identical either way.
+class RunReader {
+public:
+    RunReader() = default; // starts closed (fd_ == -1); nothing is loaded until open()
+    ~RunReader();          // out-of-line; NOTE(review): presumably closes fd_ -- confirm in .cpp
+
+    RunReader(const RunReader&) = delete;            // non-copyable (holds a raw fd_)
+    RunReader& operator=(const RunReader&) = delete; // no move ops declared: non-movable too
+
+    // Opens `path`, loading the first record (if any). has_positions must match
+    // the writer's setting so n_pos is interpreted consistently.
+    Status open(const std::string& path, bool has_positions);
+
+    bool exhausted() const { return exhausted_; }
+    const TermPostings& current() const { return current_; } // .term is EMPTY (see class note)
+    uint32_t current_id() const { return current_id_; }
+
+    // Number of positions in the current term's (lazily-loaded) position block.
+    uint64_t current_pos_count() const { return pos_count_; }
+    // True when no positions of the current term remain unread (pos_remaining_ == 0):
+    // materialized, fully streamed, or there was nothing to read in the first place.
+    bool positions_drained() const { return pos_remaining_ == 0; }
+
+    // Materializes the current term's position block into current().positions_flat
+    // (bulk read). Idempotent within a term: a no-op once positions are drained.
+    Status materialize_positions();
+    // Streams the next `n` positions of the current term into dst[0..n) directly
+    // from the decode window (64 KiB chunks topped up on demand). Caller must not
+    // request more than positions_remaining(); each call advances the cursor.
+    Status stream_positions(uint32_t* dst, size_t n);
+    uint64_t positions_remaining() const { return pos_remaining_; }
+
+    // Loads the next record into current(); sets exhausted() at end of file. Any
+    // positions of the current term left unread are skipped first.
+    Status advance();
+
+private:
+    size_t available() const;        // buffered bytes from pos_ to window end
+    Status fill();                   // tops up the decode window from disk
+    Status ensure(size_t n);         // guarantees >= n buffered bytes (or eof)
+    Status read_varint(uint64_t* v); // bounds-checked streamed varint
+    // Bulk-reads `count` RAW little-endian u32s from the window into `out` (resized
+    // to count). Bounds-checked against the run's true length (Corruption on EOF).
+    Status read_raw_u32(size_t count, std::vector<uint32_t>* out);
+    // Streams `count` raw u32s from the window into dst (caller-owned; dst is a BYTE
+    // pointer, so presumably it must hold count * 4 bytes -- TODO confirm); shared by both.
+    Status pull_raw_u32(uint8_t* dst, size_t count);
+    // Drains (and discards) any remaining positions of the current term so the
+    // window cursor lands at the next record boundary.
+    Status skip_remaining_positions();
+
+    int fd_ = -1;
+    bool has_positions_ = false;
+    bool exhausted_ = false;
+    uint64_t file_size_ = 0;      // total run byte size (fstat at open); bounds lengths
+    std::vector<uint8_t> window_; // sliding decode window
+    size_t pos_ = 0;              // consumed offset within window_
+    bool eof_ = false;            // no more bytes on disk
+    uint32_t current_id_ = 0;     // current record's term-id
+    uint64_t pos_count_ = 0;      // current term's total position count (from n_pos)
+    uint64_t pos_remaining_ = 0;  // positions still unread in the current block
+    TermPostings current_;
+};
+
+// K-way merges the given run files into a single term stream ordered by the
+// term-id's VOCAB STRING (lexicographic), invoking `fn` once per distinct
+// term-id with its postings concatenated across all runs that contain it (in
+// run order -> docids stay ascending) and its `term` resolved from `vocab`.
+// Only one merged term is materialized at a time. Returns IoError/Corruption on
+// bad run data. has_positions must match how the runs were written. `vocab`
+// maps term-id -> string and is borrowed.
+//
+// allow_stream_positions (peak-RSS optimization): when true (the streaming-writer
+// path), a WIDE merged term's positions are NOT materialized into positions_flat;
+// instead the TermPostings carries a pos_pump that streams positions in document
+// order straight from the run readers (which stay parked at this term's blocks
+// for the duration of the fn() call). `fn` MUST therefore consume each term
+// SYNCHRONOUSLY and must NOT retain the TermPostings past the call (the pump
+// references live readers freed when the merge advances). NOTE: the parameter
+// DEFAULTS TO true, so callers that retain the term (e.g. finalize_sorted) MUST pass
+// false explicitly to get fully materialized positions. Same bytes either way.
+Status MergeRuns(const std::vector<std::string>& run_paths, const std::vector<std::string>& vocab,
+                 bool has_positions, const std::function<void(TermPostings&&)>& fn,
+                 bool allow_stream_positions = true);
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/spillable_byte_buffer.h b/be/src/snii/writer/spillable_byte_buffer.h
new file mode 100644
index 00000000000000..0f5737e2bdd2f1
--- /dev/null
+++ b/be/src/snii/writer/spillable_byte_buffer.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/local_file.h"
+#include "snii/writer/memory_reporter.h"
+#include "snii/writer/temp_dir.h"
+
+namespace snii::writer {
+
+// A tiered append buffer for one build-time section. While resident it holds the
+// bytes as a CHAIN OF CHUNKS (one per append) rather than a single growing vector:
+// each append owns a right-sized allocation, so there is NO geometric-doubling
+// realloc transient and NO power-of-two capacity slack -- the resident cost is
+// exactly the bytes appended, for any section size. Once the resident size reaches
+// `cap_bytes` (or an attached reporter signals over_cap()) the buffer SPILLS to a temp
+// file (resolve_temp_dir()) and routes later appends there, so a huge section stays
+// RSS-bounded at ~cap_bytes while a small one is RAM-only (zero disk, spill-only build).
+// Append order/bytes are identical wherever they land; stream_into() reproduces the section
+// in order. The dtor removes the temp. (cap_bytes == UINT64_MAX disables only the local cap.)
+class SpillableByteBuffer {
+public:
+    // `reporter` is an OPTIONAL writer-level build-RAM reporter (null off-Doris /
+    // unit tests). When non-null, every change to ram_bytes_ (the RESIDENT tier) is
+    // mirrored to it as a signed delta: a positive delta per RAM append, and a single
+    // negative delta == prior ram_bytes_ when the buffer spills (the resident chunks
+    // are dropped and the bytes move to disk, so they must NOT be counted as RSS).
+    // Spilled bytes live on disk and are never reported.
+    SpillableByteBuffer(uint64_t cap_bytes, std::string tag, MemoryReporter* reporter = nullptr)
+            : cap_bytes_(cap_bytes), tag_(std::move(tag)), reporter_(reporter) {}
+    ~SpillableByteBuffer() {
+        // Balance the reporter: on the common un-spilled path the resident ram_bytes_ was
+        // reported as positive on append but never released, so release it now (a missed
+        // negative would leak into Doris's MemTracker). After a spill, spill_to_disk()
+        // already reported the negative and ram_bytes_ no longer counts as resident.
+        if (reporter_ && !spilled_ && ram_bytes_ > 0) {
+            reporter_->report(-static_cast<int64_t>(ram_bytes_));
+        }
+        if (!temp_path_.empty()) std::remove(temp_path_.c_str()); // best-effort; result ignored
+    }
+    SpillableByteBuffer(const SpillableByteBuffer&) = delete; // non-copyable
+    SpillableByteBuffer& operator=(const SpillableByteBuffer&) = delete;
+
+    // Total bytes appended so far (the offset basis for callers recording sub-offsets).
+    uint64_t size() const { return spilled_ ? spilled_bytes_ : ram_bytes_; }
+
+    // Copying append: a fresh chunk while resident; written to the temp file once spilled.
+    Status append(Slice bytes) {
+        if (spilled_) {
+            SNII_RETURN_IF_ERROR(temp_.append(bytes));
+            spilled_bytes_ += bytes.size();
+            return Status::OK();
+        }
+        if (!bytes.empty()) {
+            chunks_.emplace_back(bytes.data(), bytes.data() + bytes.size());
+            ram_bytes_ += bytes.size();
+            if (reporter_) reporter_->report(static_cast<int64_t>(bytes.size()));
+        }
+        if (over_cap()) return spill_to_disk(); // evaluated even for an empty append
+        return Status::OK();
+    }
+
+    // Move append: while resident the section ADOPTS the caller's vector (no copy, no slack);
+    // once spilled it is written to the temp instead. The common dict path hands off by move.
+    Status append_move(std::vector<uint8_t>&& v) {
+        if (spilled_) {
+            SNII_RETURN_IF_ERROR(temp_.append(Slice(v)));
+            spilled_bytes_ += v.size();
+            return Status::OK();
+        }
+        if (!v.empty()) {
+            ram_bytes_ += v.size(); // read the size BEFORE the move below
+            if (reporter_) reporter_->report(static_cast<int64_t>(v.size()));
+            chunks_.push_back(std::move(v));
+        }
+        if (over_cap()) return spill_to_disk(); // evaluated even for an empty append
+        return Status::OK();
+    }
+
+    // Must be called once after the last append, before stream_into(): flushes the temp
+    // (if spilled) so it can be read back. A no-op for a RAM-resident buffer; idempotent.
+    Status seal() {
+        if (spilled_ && !sealed_) {
+            SNII_RETURN_IF_ERROR(temp_.finalize());
+            sealed_ = true;
+        }
+        return Status::OK();
+    }
+
+    // Streams the whole section (RAM chunks or sealed temp) into `out`, in append order.
+    Status stream_into(snii::io::FileWriter* out) const {
+        if (!spilled_) {
+            for (const auto& c : chunks_) {
+                if (!c.empty()) SNII_RETURN_IF_ERROR(out->append(Slice(c)));
+            }
+            return Status::OK();
+        }
+        snii::io::LocalFileReader r; // NOTE: sealed_ is NOT checked here; caller must seal()
+        SNII_RETURN_IF_ERROR(r.open(temp_path_));
+        constexpr uint64_t kChunk = 1u << 20; // fixed copy window (no whole-section reload)
+        std::vector<uint8_t> buf;             // reused for every chunk
+        for (uint64_t off = 0; off < spilled_bytes_; off += kChunk) {
+            const uint64_t n = std::min(kChunk, spilled_bytes_ - off); // short final chunk
+            SNII_RETURN_IF_ERROR(r.read_at(off, n, &buf)); // presumably sizes buf to n -- confirm
+            SNII_RETURN_IF_ERROR(out->append(Slice(buf)));
+        }
+        return Status::OK();
+    }
+
+    bool spilled() const { return spilled_; }
+
+private:
+    // Gate-2 spill condition (UNIFIED): spill when the writer's TOTAL build RAM crosses
+    // the one shared cap (reporter_->over_cap()), with the local cap_bytes_ kept only as
+    // a defensive per-buffer hard ceiling (e.g. when no reporter is attached).
+    bool over_cap() const {
+        return (reporter_ != nullptr && reporter_->over_cap()) || ram_bytes_ >= cap_bytes_;
+    }
+    Status spill_to_disk() { // copies the resident chunks to a fresh temp file, then frees them
+        temp_path_ = resolve_temp_dir() + "/snii_" + tag_ + "_" + std::to_string(::getpid()) + "_" +
+                     std::to_string(reinterpret_cast<uintptr_t>(this)) + ".tmp";
+        SNII_RETURN_IF_ERROR(temp_.open(temp_path_)); // on error: spilled_ stays false
+        for (const auto& c : chunks_) {
+            if (!c.empty()) SNII_RETURN_IF_ERROR(temp_.append(Slice(c)));
+        }
+        spilled_bytes_ = ram_bytes_;
+        // The resident tier is freed: report the full negative delta == prior ram_bytes_
+        // so the writer-level RAM counter (and Doris's LOAD tracker) no longer counts
+        // these bytes as RSS -- they now live on disk. This single negative balances the
+        // sum of all prior positive append deltas (net-zero RAM after spill).
+        if (reporter_) reporter_->report(-static_cast<int64_t>(ram_bytes_));
+        std::vector<std::vector<uint8_t>>().swap(chunks_); // reclaim the RAM immediately
+        spilled_ = true;
+        return Status::OK();
+    }
+
+    uint64_t cap_bytes_;                       // local per-buffer ceiling (see over_cap())
+    std::string tag_;                          // section label embedded in the temp file name
+    MemoryReporter* reporter_ = nullptr;       // optional build-RAM reporter (null off-Doris)
+    std::vector<std::vector<uint8_t>> chunks_; // resident tier: one chunk per append
+    uint64_t ram_bytes_ = 0;                   // bytes appended while resident (kept after spill)
+    bool spilled_ = false;                     // true once spill_to_disk() has succeeded
+    bool sealed_ = false;                      // true once seal() finalized the temp
+    snii::io::LocalFileWriter temp_;           // spill target (opened by spill_to_disk())
+    std::string temp_path_;                    // empty until a spill is attempted
+    uint64_t spilled_bytes_ = 0;               // total size once spilled (seeded from ram_bytes_)
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/spimi_term_buffer.h b/be/src/snii/writer/spimi_term_buffer.h
new file mode 100644
index 00000000000000..d2b617ccfb4c69
--- /dev/null
+++ b/be/src/snii/writer/spimi_term_buffer.h
@@ -0,0 +1,362 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/writer/compact_posting_pool.h"
+#include "snii/writer/memory_reporter.h"
+
+namespace snii::writer {
+
+// One term's posting list: docids ascending, with parallel freqs and (when
+// positions are enabled) a single FLAT positions buffer.
+//
+// positions_flat holds every position for the term in document order, partitioned
+// by freqs: doc i owns the next freqs[i] entries. This is the SAME layout the
+// accumulator stores natively, so no per-doc vector-of-vectors is ever built on
+// the build/merge hot path (that vector-of-vectors was the dominant peak-RSS
+// driver for high-df terms). doc_positions(i) returns a non-owning span view of
+// doc i's positions for consumers that want per-doc access (e.g. the prx window
+// builder, tests). positions_flat is empty when positions are disabled.
+struct TermPostings {
+    std::string term;
+    std::vector<uint32_t> docids;         // ascending
+    std::vector<uint32_t> freqs;          // parallel to docids
+    std::vector<uint32_t> positions_flat; // empty when positions disabled
+
+    // OPTIONAL streamed-positions source (peak-RSS optimization for very-high-df
+    // terms). When set, positions_flat is left EMPTY and the writer pulls positions
+    // SEQUENTIALLY in document order via pos_pump(dst, n) -- filling `dst[0..n)` with
+    // the next n positions -- one window at a time, so the term's full flat positions
+    // buffer (tens of MiB for the widest term) is never materialized. The yielded
+    // bytes are byte-identical to building from positions_flat (same values, same
+    // order). pos_total is the total number of positions the pump will yield (==
+    // sum(freqs)); it lets the writer validate without a flat buffer. When pos_pump
+    // is null, positions come from positions_flat as before. Only the writer's prx
+    // builders consume this; all other consumers use positions_flat.
+    //
+    // OWNERSHIP CONTRACT (synchronous-consume-once): a streamed pos_pump captures
+    // references into the producer's stack and its parked run readers/arena, valid ONLY
+    // for the duration of the synchronous fn(TermPostings&&) call that delivered this
+    // TermPostings. The consumer MUST pull all positions inside fn() and MUST NOT store
+    // the TermPostings or invoke pos_pump after fn() returns. Callers that retain the
+    // TermPostings pass allow_stream_positions=false, which materializes positions into
+    // positions_flat instead (no pump). As a safety net, a deferred call to a streamed
+    // pump throws std::logic_error rather than dereferencing freed state.
+    std::function<void(uint32_t*, size_t)> pos_pump;
+    uint64_t pos_total = 0;
+
+    // ELEMENT offset (an index in uint32 units, NOT bytes) of doc i's first position within
+    // positions_flat (prefix sum of freqs). O(i) -- iterate with a running offset instead.
+    size_t pos_offset(size_t doc_index) const {
+        size_t off = 0;
+        for (size_t i = 0; i < doc_index; ++i) off += freqs[i]; // no bounds check on doc_index
+        return off;
+    }
+    // Non-owning view of doc i's positions (length freqs[i]) into positions_flat; unchecked.
+    std::span<const uint32_t> doc_positions(size_t doc_index) const {
+        const size_t off = pos_offset(doc_index); // needs positions_flat materialized (no pump)
+        return std::span<const uint32_t>(positions_flat.data() + off, freqs[doc_index]);
+    }
+
+    // Rebuilds the per-doc position lists (for callers/tests wanting per-doc access) from
+    // positions_flat partitioned by freqs. O(total positions); allocates. Unchecked: needs
+    // positions_flat.size() >= sum(freqs) (i.e. positions enabled and not pump-streamed).
+    std::vector<std::vector<uint32_t>> positions_per_doc() const {
+        std::vector<std::vector<uint32_t>> out(freqs.size());
+        size_t off = 0; // running element offset into positions_flat
+        for (size_t i = 0; i < freqs.size(); ++i) {
+            out[i].assign(positions_flat.begin() + off, positions_flat.begin() + off + freqs[i]);
+            off += freqs[i];
+        }
+        return out;
+    }
+
+    // Sets the flat positions from per-doc lists (convenience for tests / callers
+    // that produce per-doc positions). Does NOT touch freqs; the caller is expected
+    // to keep freqs[i] == per_doc[i].size() consistent (the writer validates this).
+    void set_positions_per_doc(const std::vector<std::vector<uint32_t>>& per_doc) {
+        positions_flat.clear(); // replaces, does not append to, any previous contents
+        for (const auto& d : per_doc)
+            positions_flat.insert(positions_flat.end(), d.begin(), d.end());
+    }
+};
+
+// In-memory SPIMI (Single-Pass In-Memory Indexing) accumulator for one logical
+// index. Records term occurrences and produces lexicographically sorted terms
+// with ascending-docid posting lists.
+//
+// TERM-ID ACCUMULATION (no per-token string work): tokens are accumulated by an
+// INTEGER term-id, not by hashing/constructing a std::string per token. The
+// caller supplies a VOCABULARY mapping term-id -> term string; the buffer keeps a
+// vocab-sized slot index (slot_of_) into a pool of live Terms (slots_), so the hot
+// add_token path is a vector index + a couple of pushes -- no hashing, no allocation per
+// token. The vocabulary is resolved to strings only once per distinct term at finalize.
+//
+// Two construction modes:
+//   * BORROWED vocab (the fast path): pass a non-null `vocab` that the caller
+//     owns and keeps alive; add_token(term_id, ...) indexes straight into it.
+//   * OWNED vocab (compatibility): pass a null `vocab`; the string-keyed
+//     add_token(string_view, ...) interns each new term into an internal owned
+//     vocabulary (assigning ids in first-seen order) and forwards to the id
+//     path. Existing callers that feed strings keep working unchanged.
+//
+// SPILL / K-WAY MERGE (out-of-core, bounds input RAM): when a non-zero
+// spill_threshold_bytes is set, the REAL resident accumulator size (the posting
+// arena + the vocab-sized slot index, pool_.arena_bytes() + slot_of_.capacity()*4)
+// is compared against the threshold as tokens arrive; once it crosses the
+// threshold the buffer SORTS its current terms,
+// writes a self-describing sorted RUN to a temp file, and CLEARS memory. Each
+// run record is keyed by the TERM-ID (varint); the k-way merge orders runs by
+// the id's VOCAB STRING so the merged stream stays lexicographic. Because
+// tokens arrive in globally ascending docid order, a term that reappears in a
+// later run only covers strictly-later docids, so concatenating its postings in
+// run order during the final merge keeps docids ascending. for_each_term_sorted
+// flushes the residual buffer as a final run, then k-way merges all runs
+// materializing only ONE merged term at a time -> peak memory stays bounded by
+// the threshold (plus the widest single term), NOT by total postings. With the
+// default threshold 0 (unlimited) the path is exactly the in-memory behavior.
+//
+// Internal representation is a COMPACT TAGGED VARINT byte stream per term, held in
+// a shared SEGMENTED ARENA (CompactPostingPool), NOT per-term uint32 vectors. Each
+// term owns ONE arena chain holding a stream of per-TOKEN entries in arrival
+// order: every token contributes varint((pos << 1) | new_doc_bit); when new_doc_bit
+// is set, the token's doc differs from the previous one, so a zigzag-varint(docid -
+// prev_docid) immediately follows. Frequencies are NOT stored -- a doc's freq is
+// the count of consecutive same-doc tokens, recovered while decoding. This drops
+// the entire freq stream and the second (positions) chain versus a freq/prox split,
+// so the payload is ~3.4x smaller than raw uint32 docids/freqs/positions, and the
+// shared arena removes per-vector doubling slack and per-term vector headers. Each
+// append writes straight into the chain (no deferred per-doc flush): the only live
+// per-term state is the current doc id (to detect a doc change) and the delta base.
+// to_postings() decodes a term's chain back to the SAME flat TermPostings the
+// writer consumes, so the produced .idx is BYTE-IDENTICAL. positions_flat stays
+// empty (and pos is tagged as 0) when positions are disabled; freq still counts.
+//
+// Duplicate vocab strings: the vocab is assumed to map each id to a DISTINCT
+// string (a dense vocabulary). If two ids share a string they sort adjacently
+// but are emitted as two separate terms; callers must not rely on coalescing.
+class SpimiTermBuffer {
+public:
+    // BORROWED-vocab constructor: `vocab` maps term-id -> term string and is
+    // borrowed (NOT owned) -- the caller must keep it alive for the buffer's
+    // lifetime. add_token(term_id, ...) accumulates by id with no string work.
+    // spill_threshold_bytes is the gate-2 internal buffer cap (e.g. 512 MiB),
+    // sourced from config; == 0 means unlimited (pure in-memory, default). A
+    // positive value caps the REAL resident accumulator size (pool_.arena_bytes() +
+    // slot_of_.capacity()*4), triggering a spill when that crosses the cap -- NOT the
+    // old per-token estimate.
+    // `reporter` is the OPTIONAL writer-level build-RAM reporter (null off-Doris /
+    // unit tests). When non-null, the accumulator reports its REAL resident-byte
+    // deltas -- pool_.arena_bytes() + slot_of_.capacity()*4 -- positive on grow,
+    // negative on every reset/free, exactly once. It never reports the old gated
+    // live_bytes_ estimate (no such member remains; see resident_bytes() below).
+    explicit SpimiTermBuffer(const std::vector<std::string>* vocab, bool has_positions,
+                             size_t spill_threshold_bytes = 0, MemoryReporter* reporter = nullptr);
+
+    // OWNED-vocab (compatibility) constructor: no external vocab. The string-keyed
+    // add_token interns terms into an internal vocabulary on first occurrence.
+    explicit SpimiTermBuffer(bool has_positions, size_t spill_threshold_bytes = 0,
+                             MemoryReporter* reporter = nullptr);
+
+    ~SpimiTermBuffer();
+
+    SpimiTermBuffer(const SpimiTermBuffer&) = delete; // non-copyable
+    SpimiTermBuffer& operator=(const SpimiTermBuffer&) = delete;
+
+    // Records one token by TERM-ID: term `term_id` occurs in `docid` at `pos`.
+    // `term_id` must be in [0, vocab_size). An out-of-range id latches an
+    // InvalidArgument into status() and is ignored. For a given term, docids are
+    // expected to arrive in non-decreasing order, and positions within a docid in
+    // ascending order; out-of-order docids (INCLUDING a REVISITED docid -- the same
+    // docid appearing again after a different one) are tolerated and reordered at
+    // finalize: SortByDocid stably sorts by docid and COALESCES same-docid groups
+    // (summing freqs, concatenating positions in document order), so the emitted
+    // postings have exactly ONE strictly-ascending entry per docid -- matching the
+    // k-way merge path and the writer's strictly-ascending precondition.
+    void add_token(uint32_t term_id, uint32_t docid, uint32_t pos);
+
+    // Compatibility overload: records one token by TERM STRING. Valid ONLY on an
+    // OWNED-vocab buffer (constructed without an external vocab); interns `term`
+    // into the internal vocabulary on first occurrence, then forwards by id. Called
+    // on a BORROWED-vocab buffer it is REJECTED (latches InvalidArgument, token
+    // ignored) -- interning would grow the owned vocab out of step with the borrowed
+    // one and corrupt the build. It also allocates a std::string per call, so the
+    // hot path is the id overload; prefer that and reserve this for tests / legacy
+    // string-fed callers.
+    void add_token(std::string_view term, uint32_t docid, uint32_t pos);
+
+    // Number of DISTINCT terms accumulated so far (touched ids still resident).
+    size_t unique_terms() const;
+    uint64_t total_tokens() const { return total_tokens_; }
+    bool has_positions() const { return has_positions_; }
+
+    // OK unless an add_token validation error (out-of-range term-id, wrong vocab
+    // mode) was latched. for_each_term_sorted now returns its own I/O Status
+    // directly; callers that use add_token's latch-and-report pattern MUST check
+    // this after draining to surface input-side validation errors.
+    [[nodiscard]] Status status() const { return spill_status_; }
+
+    // TEST-ONLY: number of spill run files written so far (== 0 in pure in-memory
+    // mode). Lets tests assert that a gate-2 spill actually fired once the REAL
+    // resident size crossed the configured cap. Not part of the production API.
+    size_t run_count_for_test() const { return run_paths_.size(); }
+
+    // Materializes all terms sorted lexicographically; each term's docids are
+    // ascending. Convenience wrapper around for_each_term_sorted that keeps the
+    // whole result alive at once. Prefer for_each_term_sorted for low peak memory.
+    // MUST be called at most once: it drains internal state. A SECOND drain (a
+    // repeat call, or a finalize_sorted after a for_each_term_sorted, or vice versa)
+    // returns EMPTY and latches an error into status() rather than re-emitting.
+    std::vector<TermPostings> finalize_sorted();
+
+    // Streams terms to `fn` in lexicographic order, building ONE transient
+    // TermPostings at a time and freeing that term's accumulated arrays before
+    // moving to the next. This keeps at most a single term's postings duplicated,
+    // avoiding the input+output coexistence peak. MUST be called at most once: it
+    // drains internal state. A SECOND drain invokes `fn` zero times and returns
+    // an Internal error (a re-merge of the still-present run files would otherwise
+    // re-emit every term). Returns non-OK on spill/merge I/O or corruption errors,
+    // or if a prior add_token latched a validation error into status().
+    Status for_each_term_sorted(const std::function<void(TermPostings&&)>& fn);
+
+private:
+    // Compact per-term accumulator: ONE tagged-varint arena chain plus a few cursors.
+    // Every token is appended immediately (no deferred flush), so the only running
+    // state is the current doc id and the delta base. A sentinel chain head of
+    // kNoChain marks a term that has not started its chain yet (so an all-empty term
+    // costs no arena bytes). ntok bounds the decode loop and sizes reserves (there is no
+    // ndocs field). NOTE(review): fields sum to 23 B before padding; "~36 B" looks stale.
+    static constexpr uint32_t kNoChain = 0xFFFFFFFFu;
+    struct Term {
+        uint32_t head = kNoChain;          // chain read entry point
+        CompactPostingPool::SliceWriter w; // append cursor for the chain (8 B)
+        uint32_t ntok = 0;                 // total tokens (entries) in the chain
+        uint32_t cur_docid = 0;            // most-recent doc id: detects doc change AND
+                                           // is the zigzag delta base for the next doc
+        uint8_t level = 0;                 // current slice level of w (packed here, not in w)
+        bool started = false;              // false until the first token is appended
+        bool sorted = true;                // false if a docid arrived out of ascending order
+    };
+    static_assert(sizeof(CompactPostingPool::SliceWriter) == 8,
+                  "SliceWriter must stay 8 bytes to keep Term compact");
+
+    // The active vocabulary (term-id -> string): either the borrowed pointer or,
+    // in owned mode, &owned_vocab_. Always non-null after construction.
+    const std::vector<std::string>& vocab() const { return *vocab_; }
+
+    // Accumulates one already-validated token into the per-id Term.
+    void accumulate(uint32_t term_id, uint32_t docid, uint32_t pos);
+
+    // Decodes `t`'s compact chain into a TermPostings (the exact docids/freqs/
+    // positions the writer consumes), sorting by docid first if `t.sorted` is false.
+    // When `allow_stream_positions` is true (the in-memory drain path), a large
+    // sorted term's positions are provided via TermPostings::pos_pump instead of a
+    // materialized positions_flat (peak-RSS win). The spill path passes false so the
+    // run codec always sees a fully-materialized positions_flat.
+    TermPostings to_postings(std::string term, Term&& t, bool allow_stream_positions) const;
+
+    // Returns the touched term-ids sorted by their vocab string (lexicographic).
+    // Sorts by a PRECOMPUTED integer string-rank (term-id -> lexicographic rank),
+    // not by full std::string compare: a single std::string sort over the whole
+    // vocabulary is amortized across every spill, so each spill's sort is an
+    // integer compare instead of paying a fresh O(touched * strcmp) on every spill.
+    std::vector<uint32_t> sorted_ids() const;
+    // Builds string_rank_ (term-id -> lexicographic rank) once, lazily. Idempotent.
+    void ensure_string_rank() const;
+    // Streams the in-memory terms in sorted order, draining the slot pool (the
+    // in-memory single-pass path). When `allow_stream_positions` is true, large
+    // sorted terms stream positions via pos_pump (valid only because the callback
+    // consumes each term synchronously while the arena is still resident); callers
+    // that RETAIN the TermPostings past the drain (finalize_sorted) must pass false.
+    Status drain_sorted(const std::function<void(TermPostings&&)>& fn, bool allow_stream_positions);
+    // Spills the current buffer to a fresh sorted run file and clears memory.
+    Status spill_to_run();
+    // Writes all current terms (sorted) to an already-open RunWriter, draining.
+    Status drain_to_writer(class RunWriter* w);
+    // REAL resident accumulator bytes: pool_.arena_bytes() + slot_of_.capacity()*4.
+    // The single source of truth for both the gate-2 spill trigger and the spill
+    // space-precheck -- replaces the old gated live_bytes_ estimate.
+    uint64_t resident_bytes() const;
+    // Reports the signed change in REAL resident bytes (pool_.arena_bytes() +
+    // slot_of_.capacity()*4) to mem_reporter_ since the previous call, then caches the
+    // new total. Single-source diff: every grow/reset/free emits EXACTLY ONE delta
+    // (self-balancing -> impossible to double-count or miss a negative). No-op when
+    // mem_reporter_ is null.
+    void report_arena_delta();
+    // Final k-way merge over the spilled runs (+ the residual flushed as a run).
+    // When `allow_stream_positions` is true (the streaming for_each path), a wide
+    // merged term streams positions via pos_pump (valid only because fn consumes
+    // synchronously while the run readers stay parked); callers that RETAIN the
+    // TermPostings past the merge (finalize_sorted) MUST pass false.
+    Status merge_runs(const std::function<void(TermPostings&&)>& fn, bool allow_stream_positions);
+    // Deletes every temp run file; called from the destructor (RAII cleanup).
+    void cleanup_runs();
+    // Frees a drained term's accumulator (id leaves the touched set).
+    void release_term(uint32_t term_id);
+
+    const std::vector<std::string>* vocab_; // active vocab (borrowed or &owned_)
+    std::vector<std::string> owned_vocab_;  // owned mode: interned term strings
+    // Owned mode only: term string -> term-id, for interning on first occurrence.
+    std::unordered_map<std::string, uint32_t> intern_;
+
+    bool has_positions_;
+    size_t spill_threshold_bytes_; // 0 => unlimited (no spilling)
+    uint64_t total_tokens_ = 0;
+
+    // POOLED accumulators (replaces a dense vocab-sized std::vector<Term>, which
+    // cost ~80 B per vocab id even for the ~empty majority -- the single largest
+    // input-phase memory line). slot_of_ is the only vocab-sized array: a 4 B index
+    // per id (0 == no live Term; otherwise slot index + 1). slots_ holds ONE Term
+    // per CURRENTLY-LIVE id, so its size tracks the live touched count, not the
+    // vocabulary. On first touch an id claims a slot (reusing a freed one from
+    // free_slots_ when available, else appending). release_term frees the slot back
+    // to the pool and clears slot_of_[id]. touched_ids_ lists every live id so
+    // finalize/spill iterate touched ids without scanning the whole vocabulary.
+    // present_[id] is now (slot_of_[id] != 0). The hot add path is still a vector
+    // index + a couple of pushes: no hashing, no per-token allocation.
+    std::vector<uint32_t> slot_of_;    // vocab-sized: id -> slot index + 1 (0=empty)
+    std::vector<Term> slots_;          // live Term pool (size ~ live touched count)
+    std::vector<uint32_t> free_slots_; // recycled slot indices (drained terms)
+    std::vector<uint32_t> touched_ids_;
+    size_t live_term_count_ = 0; // present (non-drained) terms; == unique_terms()
+
+    // Shared arena backing every live term's single tagged-varint chain. Holds
+    // the bulk of the accumulator's memory in a few large blocks (no per-term vector
+    // headers, no per-vector doubling slack) -- the compact-RSS win.
+    CompactPostingPool pool_;
+
+    // Optional writer-level build-RAM reporter (null off-Doris / unit tests) and the
+    // last resident-byte total it was told about. report_arena_delta() diffs the live
+    // total (arena_bytes() + slot_of_.capacity()*4) against reported_resident_.
+    MemoryReporter* mem_reporter_ = nullptr;
+    int64_t reported_resident_ = 0;
+
+    // Returns the live Term for `term_id`, claiming a pool slot on first touch.
+    Term& term_slot(uint32_t term_id, bool* new_term);
+
+    // Appends one byte / one varint to a term's tagged chain, lazily starting the
+    // chain on first use (so an untouched term costs no arena bytes).
+    void put_byte(Term* t, uint8_t b);
+    void put_varint(Term* t, uint64_t v);
+
+    std::vector<std::string> run_paths_; // spilled run temp files (deleted in dtor)
+    Status spill_status_;                // first spill / range error, at finalize
+    bool drained_ = false;               // set once finalize_sorted/for_each_term_sorted has run;
+                                         // a second drain would (spilled path) re-merge the run
+                                         // files and re-emit every term, or (in-memory path) emit
+                                         // nothing -- both wrong. Guard against the double-drain.
+
+    // Lazily-built vocab-sized map: term-id -> its lexicographic rank among all
+    // vocab strings. Computed once (one full std::string sort of the vocabulary)
+    // on the first sorted_ids() call, then reused by every spill's id sort. mutable
+    // so the const sorted_ids() can fill it on demand.
+    mutable std::vector<uint32_t> string_rank_;
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/temp_dir.h b/be/src/snii/writer/temp_dir.h
new file mode 100644
index 00000000000000..36d51d578a5e2a
--- /dev/null
+++ b/be/src/snii/writer/temp_dir.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <sys/statvfs.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+
+namespace snii::writer {
+
+// Scratch directory for spill runs and section temp files. Resolution order:
+//   SNII_TEMP_DIR (explicit config) -> TMPDIR (POSIX default) -> /tmp (fallback).
+//
+// Point SNII_TEMP_DIR / TMPDIR at a REAL disk (SSD/NVMe). /tmp is often tmpfs (a
+// RAM-backed filesystem) on modern systems, where spilling does NOT reduce RSS --
+// it just moves bytes from heap to tmpfs, defeating the purpose of spilling.
+inline std::string resolve_temp_dir() {
+    // The first variable that is set and non-empty wins; SNII_TEMP_DIR is consulted first.
+    for (const char* name : {"SNII_TEMP_DIR", "TMPDIR"}) {
+        const char* value = std::getenv(name);
+        if (value == nullptr || *value == '\0') continue;
+        std::string dir(value);
+        while (dir.size() > 1 && dir.back() == '/') dir.pop_back(); // never strips a lone "/"
+        return dir;
+    }
+    return "/tmp";
+}
+
+// Best-effort free bytes on the filesystem backing `dir`. Returns UINT64_MAX when
+// statvfs fails, so a caller's space pre-check never false-positives on an
+// unstattable path. CAVEATS: this is best-effort only -- it is subject to TOCTOU
+// (free space can drop before/while the write runs), and on tmpfs it reports
+// RAM-backed space (use the temp-dir config to avoid tmpfs in the first place).
+inline uint64_t temp_dir_available_bytes(const std::string& dir) {
+    struct statvfs vfs; // filled by statvfs(); only f_bavail and f_frsize are read below
+    if (::statvfs(dir.c_str(), &vfs) != 0) return UINT64_MAX; // stat failed: report "no limit"
+    return static_cast<uint64_t>(vfs.f_bavail) * static_cast<uint64_t>(vfs.f_frsize);
+}
+
+} // namespace snii::writer
diff --git a/be/src/storage/CMakeLists.txt b/be/src/storage/CMakeLists.txt
index e7a82b486dbe63..3aee9b6a87bae2 100644
--- a/be/src/storage/CMakeLists.txt
+++ b/be/src/storage/CMakeLists.txt
@@ -28,6 +28,7 @@ file(GLOB_RECURSE SRC_FILES CONFIGURE_DEPENDS *.cpp)
 # files in the ann_index directory. They are compiled separately as a .a library
 # and linked by Storage.
 list(FILTER SRC_FILES EXCLUDE REGEX ".*/storage/index/ann/.*\\.cpp$")
+list(FILTER SRC_FILES EXCLUDE REGEX ".*/storage/index/snii/core/src/io/s3_object_store\\.cpp$")
 
 if (ENABLE_VARIANT_NESTED_GROUP)
     list(REMOVE_ITEM SRC_FILES
diff --git a/be/src/storage/compaction/compaction.cpp b/be/src/storage/compaction/compaction.cpp
index df2fee8b1146d8..5f040fae3ac00f 100644
--- a/be/src/storage/compaction/compaction.cpp
+++ b/be/src/storage/compaction/compaction.cpp
@@ -1221,6 +1221,12 @@ static bool check_rowset_has_inverted_index(const RowsetSharedPtr& src_rs, int32
 }
 
 void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) {
+    if (_cur_tablet_schema->get_inverted_index_storage_format() ==
+        InvertedIndexStorageFormatPB::SNII) {
+        LOG(INFO) << "tablet[" << _tablet->tablet_id()
+                  << "] uses SNII inverted index storage format, skip CLucene index compaction";
+        return;
+    }
     for (const auto& index : _cur_tablet_schema->inverted_indexes()) {
         auto col_unique_ids = index->col_unique_ids();
         // check if column unique ids is empty to avoid crash
diff --git a/be/src/storage/index/index_file_reader.cpp b/be/src/storage/index/index_file_reader.cpp
index 348e1399421e5a..e90d642b56c57b 100644
--- a/be/src/storage/index/index_file_reader.cpp
+++ b/be/src/storage/index/index_file_reader.cpp
@@ -20,6 +20,8 @@
 #include <memory>
 #include <utility>
 
+#include "common/cast_set.h"
+#include "common/config.h"
 #include "storage/index/inverted/inverted_index_compound_reader.h"
 #include "storage/index/inverted/inverted_index_fs_directory.h"
 #include "storage/tablet/tablet_schema.h"
@@ -31,7 +33,9 @@ Status IndexFileReader::init(int32_t read_buffer_size, const io::IOContext* io_c
     std::unique_lock<std::shared_mutex> lock(_mutex); // Lock for writing
     if (!_inited) {
         _read_buffer_size = read_buffer_size;
-        if (_storage_format >= InvertedIndexStorageFormatPB::V2) {
+        if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+            RETURN_IF_ERROR(_init_snii(io_ctx));
+        } else if (_storage_format >= InvertedIndexStorageFormatPB::V2) {
             RETURN_IF_ERROR(_init_from(read_buffer_size, io_ctx));
         }
         _inited = true;
@@ -136,7 +140,35 @@ Status IndexFileReader::_init_from(int32_t read_buffer_size, const io::IOContext
     return Status::OK();
 }
 
+Status IndexFileReader::_init_snii(const io::IOContext* io_ctx) {
+    auto index_file_full_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix);
+    const int64_t recorded = _idx_file_info.has_index_size() ? _idx_file_info.index_size() : 0;
+
+    io::FileReaderOptions opts;
+    opts.cache_type = config::enable_file_cache ? io::FileCachePolicy::FILE_BLOCK_CACHE
+                                                : io::FileCachePolicy::NO_CACHE;
+    opts.is_doris_table = true;
+    opts.file_size = recorded == 0 ? -1 : recorded; // absent or zero recorded size is sent as -1
+    opts.tablet_id = _tablet_id;
+    io::FileReaderSPtr reader;
+    RETURN_IF_ERROR(_fs->open_file(index_file_full_path, &reader, &opts));
+    // Publish the readers only after open() succeeds: open_snii_index()/index_file_exist() only
+    // null-check _snii_segment_reader, so a failed init must not leave an unopened one behind.
+    auto file_reader = std::make_shared<snii_doris::DorisSniiFileReader>(std::move(reader));
+    auto segment_reader = std::make_unique<snii::reader::SniiSegmentReader>();
+    snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(io_ctx);
+    RETURN_IF_ERROR(snii_doris::to_doris_status(
+            snii::reader::SniiSegmentReader::open(file_reader.get(), segment_reader.get())));
+    _snii_file_reader = std::move(file_reader);
+    _snii_segment_reader = std::move(segment_reader);
+    return Status::OK();
+}
+
 Result<InvertedIndexDirectoryMap> IndexFileReader::get_all_directories() {
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        return ResultError(Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "SNII format does not expose CLucene directories"));
+    }
     InvertedIndexDirectoryMap res;
     std::shared_lock<std::shared_mutex> lock(_mutex); // Lock for reading
     for (auto& [index, _] : _indices_entries) {
@@ -155,6 +187,11 @@ Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>> IndexFileReader::
         int64_t index_id, const std::string& index_suffix, const io::IOContext* io_ctx) const {
     std::unique_ptr<DorisCompoundReader, DirectoryDeleter> compound_reader;
 
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        return ResultError(Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "SNII format does not open CLucene compound readers"));
+    }
+
     if (_storage_format == InvertedIndexStorageFormatPB::V1) {
         auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1(
                 _index_path_prefix, index_id, index_suffix);
@@ -229,6 +266,26 @@ Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>> IndexFileReader::
     return compound_reader;
 }
 
+Result<std::unique_ptr<snii::reader::LogicalIndexReader>> IndexFileReader::open_snii_index(
+        const TabletIndex* index_meta) const {
+    DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII);
+    std::shared_lock<std::shared_mutex> read_guard(_mutex); // Lock for reading
+    if (!_snii_segment_reader) {
+        return ResultError(Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
+                "SNII index file {} is not opened",
+                InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)));
+    }
+    // Look the logical index up by (index id, suffix); the reader is returned to the caller.
+    const auto index_id = cast_set<uint64_t>(index_meta->index_id());
+    auto reader = std::make_unique<snii::reader::LogicalIndexReader>();
+    auto open_status = snii_doris::to_doris_status(_snii_segment_reader->open_index(
+            index_id, index_meta->get_index_suffix(), reader.get()));
+    if (!open_status.ok()) {
+        return ResultError(open_status);
+    }
+    return reader;
+}
+
 Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>> IndexFileReader::open(
         const TabletIndex* index_meta, const io::IOContext* io_ctx) const {
     auto index_id = index_meta->index_id();
@@ -254,6 +311,28 @@ Status IndexFileReader::index_file_exist(const TabletIndex* index_meta, bool* re
         auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1(
                 _index_path_prefix, index_meta->index_id(), index_meta->get_index_suffix());
         return _fs->exists(index_file_path, res);
+    } else if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix);
+        RETURN_IF_ERROR(_fs->exists(index_file_path, res));
+        if (!*res) {
+            return Status::OK();
+        }
+        std::shared_lock<std::shared_mutex> lock(_mutex);
+        if (_snii_segment_reader == nullptr) {
+            *res = false;
+            return Status::OK();
+        }
+        auto logical_reader = std::make_unique<snii::reader::LogicalIndexReader>();
+        auto status = _snii_segment_reader->open_index(cast_set<uint64_t>(index_meta->index_id()),
+                                                       index_meta->get_index_suffix(),
+                                                       logical_reader.get());
+        if (status.code() == snii::StatusCode::kNotFound) {
+            *res = false;
+            return Status::OK();
+        }
+        RETURN_IF_ERROR(snii_doris::to_doris_status(status));
+        *res = true;
+        return Status::OK();
     } else {
         std::shared_lock<std::shared_mutex> lock(_mutex); // Lock for reading
         if (_stream == nullptr) {
@@ -279,6 +358,11 @@ Status IndexFileReader::has_null(const TabletIndex* index_meta, bool* res) const
         *res = true;
         return Status::OK();
     }
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        auto logical_reader = DORIS_TRY(open_snii_index(index_meta));
+        *res = logical_reader->section_refs().null_bitmap.length > 0;
+        return Status::OK();
+    }
     std::shared_lock<std::shared_mutex> lock(_mutex); // Lock for reading
     if (_stream == nullptr) {
         return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
diff --git a/be/src/storage/index/index_file_reader.h b/be/src/storage/index/index_file_reader.h
index fb4ec2b9a62fe3..896c8bd51745ff 100644
--- a/be/src/storage/index/index_file_reader.h
+++ b/be/src/storage/index/index_file_reader.h
@@ -33,8 +33,11 @@
 #include "common/be_mock_util.h"
 #include "common/config.h"
 #include "io/fs/file_system.h"
+#include "snii/reader/logical_index_reader.h"
+#include "snii/reader/snii_segment_reader.h"
 #include "storage/index/index_file_writer.h"
 #include "storage/index/inverted/inverted_index_desc.h"
+#include "storage/index/snii/snii_doris_adapter.h"
 
 namespace doris {
 class TabletIndex;
@@ -60,7 +63,7 @@ class IndexFileReader {
             : _fs(std::move(fs)),
               _index_path_prefix(std::move(index_path_prefix)),
               _storage_format(storage_format),
-              _idx_file_info(idx_file_info),
+              _idx_file_info(std::move(idx_file_info)),
               _tablet_id(tablet_id) {}
     virtual ~IndexFileReader() = default;
 
@@ -68,6 +71,8 @@ class IndexFileReader {
                               const io::IOContext* io_ctx = nullptr);
     MOCK_FUNCTION Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>> open(
             const TabletIndex* index_meta, const io::IOContext* io_ctx = nullptr) const;
+    Result<std::unique_ptr<snii::reader::LogicalIndexReader>> open_snii_index(
+            const TabletIndex* index_meta) const;
     void debug_file_entries();
     std::string get_index_file_cache_key(const TabletIndex* index_meta) const;
     std::string get_index_file_path(const TabletIndex* index_meta) const;
@@ -75,12 +80,19 @@ class IndexFileReader {
     Status has_null(const TabletIndex* index_meta, bool* res) const;
     Result<InvertedIndexDirectoryMap> get_all_directories();
     // open file v2, init _stream
-    int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 : _stream->length(); }
+    int64_t get_inverted_file_size() const {
+        // An unopened file (null reader / stream) is reported as size 0.
+        const bool is_snii = _storage_format == InvertedIndexStorageFormatPB::SNII;
+        if (is_snii) return _snii_file_reader ? _snii_file_reader->size() : 0;
+        return _stream ? _stream->length() : 0;
+    }
     const std::string& get_index_path_prefix() const { return _index_path_prefix; }
+    InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; }
     friend IndexFileWriter;
 
 protected:
     Status _init_from(int32_t read_buffer_size, const io::IOContext* io_ctx);
+    Status _init_snii(const io::IOContext* io_ctx);
     Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>> _open(
             int64_t index_id, const std::string& index_suffix,
             const io::IOContext* io_ctx = nullptr) const;
@@ -88,6 +100,8 @@ class IndexFileReader {
 private:
     IndicesEntriesMap _indices_entries;
     std::unique_ptr<CL_NS(store)::IndexInput> _stream = nullptr;
+    std::shared_ptr<snii_doris::DorisSniiFileReader> _snii_file_reader;
+    std::unique_ptr<snii::reader::SniiSegmentReader> _snii_segment_reader;
     const io::FileSystemSPtr _fs;
     std::string _index_path_prefix;
     int32_t _read_buffer_size = -1;
@@ -99,4 +113,4 @@ class IndexFileReader {
 };
 
 } // namespace segment_v2
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/be/src/storage/index/index_file_writer.cpp b/be/src/storage/index/index_file_writer.cpp
index afd09c84620bb5..665cb185d4aae7 100644
--- a/be/src/storage/index/index_file_writer.cpp
+++ b/be/src/storage/index/index_file_writer.cpp
@@ -22,6 +22,7 @@
 #include <atomic>
 #include <filesystem>
 
+#include "common/cast_set.h"
 #include "common/status.h"
 #include "io/fs/packed_file_writer.h"
 #include "io/fs/s3_file_writer.h"
@@ -34,6 +35,7 @@
 #include "storage/index/inverted/inverted_index_desc.h"
 #include "storage/index/inverted/inverted_index_fs_directory.h"
 #include "storage/index/inverted/inverted_index_reader.h"
+#include "storage/index/snii/snii_doris_adapter.h"
 #include "storage/tablet/tablet_schema.h"
 
 namespace doris::segment_v2 {
@@ -56,7 +58,7 @@ IndexFileWriter::IndexFileWriter(io::FileSystemSPtr fs, std::string index_path_p
     _tmp_dir = tmp_file_dir.native();
     if (_storage_format == InvertedIndexStorageFormatPB::V1) {
         _index_storage_format = std::make_unique<IndexStorageFormatV1>(this);
-    } else {
+    } else if (_storage_format != InvertedIndexStorageFormatPB::SNII) {
         _index_storage_format = std::make_unique<IndexStorageFormatV2>(this);
     }
 }
@@ -84,6 +86,10 @@ Status IndexFileWriter::_insert_directory_into_map(int64_t index_id,
 }
 
 Result<std::shared_ptr<DorisFSDirectory>> IndexFileWriter::open(const TabletIndex* index_meta) {
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        return ResultError(Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "SNII format does not open CLucene directories"));
+    }
     auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path(
             _tmp_dir, _rowset_id, _seg_id, index_meta->index_id(), index_meta->get_index_suffix());
     auto dir = std::shared_ptr<DorisFSDirectory>(DorisFSDirectoryFactory::getDirectory(
@@ -97,6 +103,43 @@ Result<std::shared_ptr<DorisFSDirectory>> IndexFileWriter::open(const TabletInde
     return dir;
 }
 
+Status IndexFileWriter::add_snii_index(const TabletIndex* index_meta, uint32_t doc_count,
+                                       std::vector<uint32_t> null_docids,
+                                       snii::writer::SpimiTermBuffer* const term_buffer,
+                                       snii::format::IndexConfig index_config,
+                                       snii::writer::MemoryReporter* const mem_reporter) {
+    DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII);
+    DCHECK(index_meta != nullptr);
+    DCHECK(term_buffer != nullptr);
+    if (!_idx_v2_writer) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
+                "SNII index file writer is null for {}", _index_path_prefix);
+    }
+    // Lazily create the file adapter and the compound writer on the first logical index.
+    if (!_snii_file_writer) {
+        _snii_file_writer = std::make_unique<snii_doris::DorisSniiFileWriter>(_idx_v2_writer.get());
+        _snii_compound_writer =
+                std::make_unique<snii::writer::SniiCompoundWriter>(_snii_file_writer.get());
+    }
+    snii::writer::SniiIndexInput entry;
+    entry.term_source = term_buffer;
+    entry.mem_reporter = mem_reporter;
+    entry.doc_count = doc_count;
+    entry.null_docids = std::move(null_docids);
+    entry.config = index_config;
+    entry.index_id = cast_set<uint64_t>(index_meta->index_id());
+    entry.index_suffix = index_meta->get_index_suffix();
+    RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->add_logical_index(entry)));
+    ++_snii_index_count;
+    return Status::OK();
+}
+
+void IndexFileWriter::retain_snii_memory_reporter(
+        std::unique_ptr<snii::writer::MemoryReporter> mem_reporter) {
+    DCHECK(mem_reporter != nullptr);
+    _snii_memory_reporters.emplace_back(std::move(mem_reporter)); // writer now owns the reporter
+}
+
 Status IndexFileWriter::delete_index(const TabletIndex* index_meta) {
     DBUG_EXECUTE_IF("IndexFileWriter::delete_index_index_meta_nullptr", { index_meta = nullptr; });
     if (!index_meta) {
@@ -123,6 +166,9 @@ Status IndexFileWriter::delete_index(const TabletIndex* index_meta) {
 }
 
 Status IndexFileWriter::add_into_searcher_cache() {
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        return Status::OK();
+    }
     auto index_file_reader = std::make_unique<IndexFileReader>(
             _fs, _index_path_prefix, _storage_format, InvertedIndexFileInfo(), _tablet_id);
     auto st = index_file_reader->init();
@@ -196,6 +242,21 @@ Result<std::unique_ptr<IndexSearcherBuilder>> IndexFileWriter::_construct_index_
 Status IndexFileWriter::begin_close() {
     DCHECK(!_closed) << debug_string();
     _closed = true;
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        if (_snii_compound_writer == nullptr) {
+            if (_idx_v2_writer == nullptr) {
+                return Status::OK();
+            }
+            _snii_file_writer =
+                    std::make_unique<snii_doris::DorisSniiFileWriter>(_idx_v2_writer.get());
+            _snii_compound_writer =
+                    std::make_unique<snii::writer::SniiCompoundWriter>(_snii_file_writer.get());
+        }
+        RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->finish()));
+        _total_file_size = _idx_v2_writer == nullptr ? 0 : _idx_v2_writer->bytes_appended();
+        _file_info.set_index_size(_total_file_size);
+        return Status::OK();
+    }
     if (_indices_dirs.empty()) {
         // An empty file must still be created even if there are no indexes to write
         if (dynamic_cast<io::StreamSinkFileWriter*>(_idx_v2_writer.get()) != nullptr ||
@@ -238,6 +299,12 @@ Status IndexFileWriter::begin_close() {
 
 Status IndexFileWriter::finish_close() {
     DCHECK(_closed) << debug_string();
+    if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        if (_idx_v2_writer != nullptr && _idx_v2_writer->state() != io::FileWriter::State::CLOSED) {
+            RETURN_IF_ERROR(_idx_v2_writer->close(false));
+        }
+        return Status::OK();
+    }
     if (_indices_dirs.empty()) {
         // An empty file must still be created even if there are no indexes to write
         if (dynamic_cast<io::StreamSinkFileWriter*>(_idx_v2_writer.get()) != nullptr ||
diff --git a/be/src/storage/index/index_file_writer.h b/be/src/storage/index/index_file_writer.h
index a303de8b68c156..7f16d19cb90e74 100644
--- a/be/src/storage/index/index_file_writer.h
+++ b/be/src/storage/index/index_file_writer.h
@@ -24,21 +24,34 @@
 
 #include <string>
 #include <utility>
+#include <vector>
 
 #include "common/be_mock_util.h"
 #include "io/fs/file_system.h"
 #include "io/fs/file_writer.h"
 #include "io/fs/local_file_system.h"
+#include "snii/format/format_constants.h"
+#include "snii/writer/snii_compound_writer.h"
 #include "storage/index/index_storage_format.h"
 #include "storage/index/inverted/inverted_index_common.h"
 #include "storage/index/inverted/inverted_index_compound_reader.h"
 #include "storage/index/inverted/inverted_index_searcher.h"
+#include "storage/index/snii/snii_doris_adapter.h"
+
+namespace snii::writer {
+class MemoryReporter;
+class SpimiTermBuffer;
+class SniiCompoundWriter;
+} // namespace snii::writer
 
 namespace doris {
 class TabletIndex;
 
 namespace segment_v2 {
 class DorisFSDirectory;
+namespace snii_doris {
+class DorisSniiFileWriter;
+} // namespace snii_doris
 
 using InvertedIndexDirectoryMap =
         std::map<std::pair<int64_t, std::string>, std::shared_ptr<lucene::store::Directory>>;
@@ -55,6 +68,12 @@ class IndexFileWriter {
     virtual ~IndexFileWriter() = default;
 
     MOCK_FUNCTION Result<std::shared_ptr<DorisFSDirectory>> open(const TabletIndex* index_meta);
+    Status add_snii_index(const TabletIndex* index_meta, uint32_t doc_count,
+                          std::vector<uint32_t> null_docids,
+                          snii::writer::SpimiTermBuffer* const term_buffer,
+                          snii::format::IndexConfig config,
+                          snii::writer::MemoryReporter* const mem_reporter);
+    void retain_snii_memory_reporter(std::unique_ptr<snii::writer::MemoryReporter> mem_reporter);
     Status delete_index(const TabletIndex* index_meta);
     Status initialize(InvertedIndexDirectoryMap& indices_dirs);
     Status add_into_searcher_cache();
@@ -113,6 +132,10 @@ class IndexFileWriter {
 
     IndexStorageFormatPtr _index_storage_format;
     int64_t _tablet_id = -1;
+    std::unique_ptr<snii_doris::DorisSniiFileWriter> _snii_file_writer;
+    std::vector<std::unique_ptr<snii::writer::MemoryReporter>> _snii_memory_reporters;
+    std::unique_ptr<snii::writer::SniiCompoundWriter> _snii_compound_writer;
+    size_t _snii_index_count = 0;
 
     friend class IndexStorageFormatV1;
     friend class IndexStorageFormatV2;
diff --git a/be/src/storage/index/index_writer.cpp b/be/src/storage/index/index_writer.cpp
index 2325d280471337..6fb23c3c107e51 100644
--- a/be/src/storage/index/index_writer.cpp
+++ b/be/src/storage/index/index_writer.cpp
@@ -18,6 +18,7 @@
 #include "common/exception.h"
 #include "storage/index/ann/ann_index_writer.h"
 #include "storage/index/inverted/inverted_index_writer.h"
+#include "storage/index/snii/snii_index_writer.h"
 #include "storage/tablet/tablet_schema.h"
 #include "storage/types.h"
 
@@ -80,6 +81,22 @@ Status IndexColumnWriter::create(const TabletColumn* column,
             }
         }
 
+        if (storage_format == InvertedIndexStorageFormatPB::SNII) {
+            if (!is_string_type(type)) {
+                return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                        "SNII inverted index storage format does not support BKD index type {}",
+                        type);
+            }
+            *res = std::make_unique<SniiIndexColumnWriter>(index_file_writer, index_meta,
+                                                           single_field);
+            auto st = (*res)->init();
+            if (!st.ok()) {
+                (*res)->close_on_error();
+                return st;
+            }
+            return Status::OK();
+        }
+
         DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_unsupported_type_for_inverted_index",
                         { type = FieldType::OLAP_FIELD_TYPE_JSONB; })
         switch (type) {
diff --git a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp
index e65025b25a4fc7..d03cdf38a9abf1 100644
--- a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp
+++ b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp
@@ -179,16 +179,15 @@ void DorisFSDirectory::FSIndexInput::close() {
 }
 
 void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) {
+    const bool is_index_data = _io_ctx.is_index_data;
     if (io_ctx) {
         const auto& ctx = static_cast<const io::IOContext*>(io_ctx);
-        _io_ctx.reader_type = ctx->reader_type;
-        _io_ctx.query_id = ctx->query_id;
-        _io_ctx.file_cache_stats = ctx->file_cache_stats;
+        _io_ctx = *ctx;
     } else {
-        _io_ctx.reader_type = ReaderType::UNKNOWN;
-        _io_ctx.query_id = nullptr;
-        _io_ctx.file_cache_stats = nullptr;
+        _io_ctx = io::IOContext {};
     }
+    _io_ctx.is_index_data = is_index_data;
+    _io_ctx.is_inverted_index = true;
 }
 
 const void* DorisFSDirectory::FSIndexInput::getIoContext() {
@@ -247,6 +246,10 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len)
 
     if (_io_ctx.file_cache_stats != nullptr) {
         _io_ctx.file_cache_stats->inverted_index_io_timer += inverted_index_io_timer;
+        _io_ctx.file_cache_stats->inverted_index_request_bytes += len;
+        _io_ctx.file_cache_stats->inverted_index_read_bytes += len;
+        ++_io_ctx.file_cache_stats->inverted_index_range_read_count;
+        ++_io_ctx.file_cache_stats->inverted_index_serial_read_rounds;
     }
 }
 
diff --git a/be/src/storage/index/inverted/inverted_index_reader.h b/be/src/storage/index/inverted/inverted_index_reader.h
index 0e2f6a120d41e3..a2aa0533f2bf7b 100644
--- a/be/src/storage/index/inverted/inverted_index_reader.h
+++ b/be/src/storage/index/inverted/inverted_index_reader.h
@@ -230,9 +230,9 @@ class InvertedIndexReader : public IndexReader {
                              const Field& query_value, InvertedIndexQueryType query_type,
                              size_t* count) = 0;
 
-    Status read_null_bitmap(const IndexQueryContextPtr& context,
-                            InvertedIndexQueryCacheHandle* cache_handle,
-                            lucene::store::Directory* dir = nullptr);
+    virtual Status read_null_bitmap(const IndexQueryContextPtr& context,
+                                    InvertedIndexQueryCacheHandle* cache_handle,
+                                    lucene::store::Directory* dir = nullptr);
 
     virtual InvertedIndexReaderType type() = 0;
 
@@ -335,7 +335,6 @@ class InvertedIndexVisitor : public lucene::util::bkd::bkd_reader::intersect_vis
     std::string query_min;
     std::string query_max;
 
-public:
     InvertedIndexVisitor(const void* io_ctx, lucene::util::bkd::bkd_reader* r,
                          roaring::Roaring* hits, bool only_count = false);
     ~InvertedIndexVisitor() override = default;
diff --git a/be/src/storage/index/snii/core/src/common/status.cpp b/be/src/storage/index/snii/core/src/common/status.cpp
new file mode 100644
index 00000000000000..d8f66b4a68cd98
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/common/status.cpp
@@ -0,0 +1,24 @@
+#include "snii/common/status.h"
+
+#include <array>
+#include <cstddef>
+
+namespace snii {
+namespace {
+
+// Name table in the same order as the StatusCode enum, to avoid a long switch chain in to_string.
+constexpr std::array<const char*, 7> kCodeNames = {
+        "OK", "Corruption", "NotFound", "InvalidArgument", "IoError", "Unsupported", "Internal"};
+
+} // namespace
+
+std::string Status::to_string() const {
+    // code_ indexes kCodeNames; anything outside the table (for example an enumerator added
+    // without extending the table) is rendered as "Unknown" instead of reading out of bounds.
+    const auto idx = static_cast<std::size_t>(code_);
+    std::string out = idx < kCodeNames.size() ? kCodeNames[idx] : "Unknown";
+    if (!message_.empty()) out.append(": ").append(message_);
+    return out;
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp b/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp
new file mode 100644
index 00000000000000..fc5c70d6b5569d
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp
@@ -0,0 +1,39 @@
+#include "snii/encoding/byte_sink.h"
+
+#include "snii/encoding/varint.h"
+
+namespace snii {
+
+void ByteSink::put_fixed16(uint16_t v) {
+    for (int b = 0; b < 16; b += 8) buf_.push_back(static_cast<uint8_t>(v >> b)); // low byte first
+}
+
+void ByteSink::put_fixed32(uint32_t v) {
+    for (int b = 0; b < 32; b += 8) buf_.push_back(static_cast<uint8_t>(v >> b)); // low byte first
+}
+
+void ByteSink::put_fixed64(uint64_t v) {
+    for (int b = 0; b < 64; b += 8) buf_.push_back(static_cast<uint8_t>(v >> b)); // low byte first
+}
+
+void ByteSink::put_varint32(uint32_t v) {
+    uint8_t scratch[5]; // staging area filled by encode_varint32
+    uint8_t* const scratch_end = scratch + encode_varint32(v, scratch);
+    buf_.insert(buf_.end(), scratch, scratch_end);
+}
+
+void ByteSink::put_varint64(uint64_t v) {
+    uint8_t scratch[10]; // staging area filled by encode_varint64
+    uint8_t* const scratch_end = scratch + encode_varint64(v, scratch);
+    buf_.insert(buf_.end(), scratch, scratch_end);
+}
+
+void ByteSink::put_zigzag(int64_t v) {
+    put_varint64(zigzag_encode(v)); // zigzag_encode the signed value, then write it as a varint64
+}
+
+void ByteSink::put_bytes(Slice s) {
+    buf_.insert(buf_.end(), s.data(), s.data() + s.size()); // append s.size() bytes unchanged
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/byte_source.cpp b/be/src/storage/index/snii/core/src/encoding/byte_source.cpp
new file mode 100644
index 00000000000000..d75d4945ff7f9d
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/byte_source.cpp
@@ -0,0 +1,70 @@
+#include "snii/encoding/byte_source.h"
+
+#include "snii/encoding/varint.h"
+
+namespace snii {
+
+Status ByteSource::get_u8(uint8_t* v) {
+    if (remaining() < 1) return Status::Corruption("get_u8 overrun"); // cursor is left unchanged
+    *v = s_[pos_++]; // read the byte at the cursor, then step past it
+    return Status::OK();
+}
+
+Status ByteSource::get_fixed16(uint16_t* v) {
+    if (remaining() < 2) return Status::Corruption("get_fixed16 overrun");
+    const auto lo = static_cast<uint16_t>(s_[pos_]); // the byte at the cursor is the low byte
+    const auto hi = static_cast<uint16_t>(s_[pos_ + 1]);
+    *v = static_cast<uint16_t>(lo | (hi << 8));
+    pos_ += 2;
+    return Status::OK();
+}
+
+Status ByteSource::get_fixed32(uint32_t* v) {
+    if (remaining() < 4) return Status::Corruption("get_fixed32 overrun");
+    uint32_t word = 0; // folded high byte first, so the byte at the cursor ends up lowest
+    for (int i = 3; i >= 0; --i) word = (word << 8) | static_cast<uint32_t>(s_[pos_ + i]);
+    *v = word;
+    pos_ += 4;
+    return Status::OK();
+}
+
+Status ByteSource::get_fixed64(uint64_t* v) {
+    if (remaining() < 8) return Status::Corruption("get_fixed64 overrun");
+    uint64_t word = 0; // folded high byte first, so the byte at the cursor ends up lowest
+    for (int i = 7; i >= 0; --i) word = (word << 8) | static_cast<uint64_t>(s_[pos_ + i]);
+    *v = word;
+    pos_ += 8;
+    return Status::OK();
+}
+
+Status ByteSource::get_varint64(uint64_t* v) {
+    const uint8_t* const base = s_.data();
+    const uint8_t* after = nullptr; // out-param of decode_varint64; becomes the new cursor below
+    SNII_RETURN_IF_ERROR(decode_varint64(base + pos_, base + s_.size(), v, &after));
+    pos_ = static_cast<size_t>(after - base); // only reached when decoding succeeded
+    return Status::OK();
+}
+
+Status ByteSource::get_varint32(uint32_t* v) {
+    uint64_t wide = 0;
+    SNII_RETURN_IF_ERROR(get_varint64(&wide));
+    if ((wide >> 32) != 0) return Status::Corruption("varint32 overflow"); // > UINT32_MAX
+    *v = static_cast<uint32_t>(wide);
+    return Status::OK();
+}
+
+Status ByteSource::get_zigzag(int64_t* v) {
+    uint64_t encoded = 0; // varint payload before zigzag_decode is applied
+    SNII_RETURN_IF_ERROR(get_varint64(&encoded));
+    *v = zigzag_decode(encoded);
+    return Status::OK();
+}
+
+Status ByteSource::get_bytes(size_t n, Slice* out) {
+    if (remaining() < n) return Status::Corruption("get_bytes overrun");
+    *out = s_.subslice(pos_, n); // taken at the current cursor, before it is advanced
+    pos_ += n;                   // step the cursor past the n bytes just handed out
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/crc32c.cpp b/be/src/storage/index/snii/core/src/encoding/crc32c.cpp
new file mode 100644
index 00000000000000..811ef86a697152
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/crc32c.cpp
@@ -0,0 +1,111 @@
+#include "snii/encoding/crc32c.h"
+
+#include <array>
+#include <cstddef>
+#include <cstring>
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define SNII_CRC32C_X86 1
+#include <cpuid.h>
+#include <nmmintrin.h> // _mm_crc32_u8/u32/u64 (SSE4.2)
+#endif
+
+namespace snii {
+namespace {
+
+// Bit-reflected Castagnoli polynomial (CRC32C / iSCSI).
+constexpr uint32_t kPoly = 0x82F63B78u;
+
+// Builds the slice-by-8 lookup tables. Column 0 is the classic byte table; each
+// successive column folds in one more byte of look-ahead, letting the inner loop
+// consume 8 bytes per iteration with 8 table reads + XORs instead of 8 dependent
+// shift/lookup steps. The checksum value is identical to the byte-at-a-time loop.
+//
+// The function is constexpr so that kSlice8 below is constant-initialized. With
+// a dynamic initializer the table would still be all zeros if a checksum were
+// requested from another translation unit's static initializer that happens to
+// run first, and the software path would silently return a wrong CRC.
+constexpr std::array<std::array<uint32_t, 256>, 8> make_slice8_table() {
+    std::array<std::array<uint32_t, 256>, 8> t {};
+    for (uint32_t i = 0; i < 256; ++i) {
+        // Column 0: run the 8 bit-steps of the reflected CRC for byte value i.
+        uint32_t c = i;
+        for (int k = 0; k < 8; ++k) {
+            c = (c & 1) ? (kPoly ^ (c >> 1)) : (c >> 1);
+        }
+        t[0][i] = c;
+    }
+    for (uint32_t i = 0; i < 256; ++i) {
+        // Column s is column s-1 advanced by one more zero byte.
+        uint32_t c = t[0][i];
+        for (int s = 1; s < 8; ++s) {
+            c = t[0][c & 0xFF] ^ (c >> 8);
+            t[s][i] = c;
+        }
+    }
+    return t;
+}
+
+// Computed at compile time; no dynamic initialization, so it is usable from any
+// point of program start-up.
+constexpr std::array<std::array<uint32_t, 256>, 8> kSlice8 = make_slice8_table();
+
+// Assembles a 32-bit value from four bytes stored least-significant first.
+// Works byte by byte, so `p` needs no particular alignment.
+inline uint32_t load_le32(const uint8_t* p) {
+    uint32_t value = 0;
+    for (int byte = 3; byte >= 0; --byte) {
+        value = (value << 8) | p[byte];
+    }
+    return value;
+}
+
+// Pure software slice-by-8 (used as the portable path and the hardware fallback).
+// `crc` is the running (already inverted) CRC state; the updated state after
+// absorbing the `n` bytes at `p` is returned.
+uint32_t crc32c_slice8(uint32_t crc, const uint8_t* p, size_t n) {
+    // Bulk phase: 8 input bytes per step, one lookup per byte in its own column.
+    for (; n >= 8; n -= 8, p += 8) {
+        const uint32_t lo = crc ^ load_le32(p);
+        const uint32_t hi = load_le32(p + 4);
+        const uint32_t from_lo = kSlice8[7][lo & 0xFF] ^ kSlice8[6][(lo >> 8) & 0xFF] ^
+                                 kSlice8[5][(lo >> 16) & 0xFF] ^ kSlice8[4][lo >> 24];
+        const uint32_t from_hi = kSlice8[3][hi & 0xFF] ^ kSlice8[2][(hi >> 8) & 0xFF] ^
+                                 kSlice8[1][(hi >> 16) & 0xFF] ^ kSlice8[0][hi >> 24];
+        crc = from_lo ^ from_hi;
+    }
+    // Remainder (< 8 bytes): classic one-byte-at-a-time table step.
+    for (; n != 0; --n, ++p) {
+        crc = kSlice8[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+    }
+    return crc;
+}
+
+#if SNII_CRC32C_X86
+// Hardware CRC32C via the SSE4.2 crc32 instruction. The intrinsics operate on the
+// same bit-reflected Castagnoli polynomial as the tables, so the result is
+// byte-identical. This TU is compiled without -msse4.2, so gate the intrinsics
+// behind a function-level target attribute and a runtime CPUID check.
+// Consumes 8-byte chunks first, then at most one 4-byte chunk, then single bytes.
+__attribute__((target("sse4.2"))) uint32_t crc32c_hw(uint32_t crc, const uint8_t* p, size_t n) {
+    for (; n >= 8; n -= 8, p += 8) {
+        uint64_t chunk;
+        std::memcpy(&chunk, p, sizeof(chunk)); // unaligned-safe; x86 folds to a plain load
+        crc = static_cast<uint32_t>(_mm_crc32_u64(crc, chunk));
+    }
+    if (n >= 4) {
+        crc = _mm_crc32_u32(crc, load_le32(p));
+        n -= 4;
+        p += 4;
+    }
+    for (; n != 0; --n, ++p) {
+        crc = _mm_crc32_u8(crc, *p);
+    }
+    return crc;
+}
+
+// Queries CPUID leaf 1 and reports whether the SSE4.2 feature bit is set in ECX.
+// Returns false when the leaf cannot be queried.
+bool detect_sse42() {
+    unsigned eax = 0;
+    unsigned ebx = 0;
+    unsigned ecx = 0;
+    unsigned edx = 0;
+    const bool leaf_ok = __get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0;
+    return leaf_ok && (ecx & bit_SSE4_2) != 0;
+}
+
+// Result of the probe, evaluated once when this translation unit is initialized.
+// Before that point the flag reads as false, which selects the software path.
+const bool kHasSse42 = detect_sse42();
+#endif
+
+} // namespace
+
+// Extends a finalized CRC32C value with the bytes of `data` and returns the new
+// finalized value. The state is inverted on entry and again on exit; the
+// hardware kernel is used when SSE4.2 was detected, the table kernel otherwise.
+uint32_t crc32c_extend(uint32_t crc, Slice data) {
+    const uint8_t* const bytes = data.data();
+    const size_t len = data.size();
+    const uint32_t state = ~crc;
+#if SNII_CRC32C_X86
+    if (kHasSse42) {
+        return ~crc32c_hw(state, bytes, len);
+    }
+#endif
+    return ~crc32c_slice8(state, bytes, len);
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/pfor.cpp b/be/src/storage/index/snii/core/src/encoding/pfor.cpp
new file mode 100644
index 00000000000000..5cdf8fdb57f9d6
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/pfor.cpp
@@ -0,0 +1,360 @@
+#include "snii/encoding/pfor.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "snii/common/slice.h"
+
+namespace snii {
+namespace {
+
+// Loads eight bytes from `p` as a little-endian 64-bit integer. memcpy keeps
+// the access valid for any alignment; on big-endian targets the bytes are
+// swapped so the numeric result is the same on every host.
+inline uint64_t load_u64_le(const uint8_t* p) {
+    uint64_t raw = 0;
+    std::memcpy(&raw, p, sizeof(raw));
+#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return __builtin_bswap64(raw);
+#else
+    return raw;
+#endif
+}
+
+// Number of bits needed to represent `v`: 0 for zero, otherwise the position of
+// the highest set bit plus one (so at most 32).
+uint8_t bits_for(uint32_t v) {
+    uint8_t width = 0;
+    for (; v != 0; v >>= 1) {
+        ++width;
+    }
+    return width;
+}
+
+// Choose the bit_width that minimizes total bytes (packed + exceptions).
+// Exception cost estimated at ~6 bytes each.
+//
+// The widths of all values are bucketed once into a histogram; the exception
+// count for each candidate width then follows from a running subtraction, so
+// the input is scanned a single time instead of once per candidate width.
+// Ties keep the smallest width, because only a strictly lower cost replaces
+// the current best.
+uint8_t choose_width(const uint32_t* v, size_t n) {
+    size_t hist[33] = {0}; // hist[b] = number of values needing exactly b bits
+    uint8_t maxw = 0;
+    for (size_t i = 0; i < n; ++i) {
+        const uint8_t b = bits_for(v[i]);
+        ++hist[b];
+        maxw = std::max(maxw, b);
+    }
+    uint8_t best = maxw;
+    size_t best_cost = SIZE_MAX;
+    size_t exc = n - hist[0]; // values that do not fit in width 0
+    for (uint8_t w = 0; w <= maxw; ++w) {
+        const size_t cost = (static_cast<size_t>(w) * n + 7) / 8 + exc * 6;
+        if (cost < best_cost) {
+            best_cost = cost;
+            best = w;
+        }
+        if (w < maxw) {
+            // Values needing exactly w + 1 bits stop being exceptions next round.
+            exc -= hist[w + 1];
+        }
+    }
+    return best;
+}
+
+// Mask with the lowest `w` bits set. Widths of 32 or more yield all ones, which
+// also avoids shifting a 32-bit value by its full width.
+uint32_t low_mask(uint8_t w) {
+    if (w >= 32) {
+        return 0xFFFFFFFFU;
+    }
+    return (uint32_t {1} << w) - 1U;
+}
+
+// Appends the low `w` bits of each of the `n` values to `out` as one contiguous
+// bit stream, least-significant bit first. Whole bytes are flushed as soon as
+// they are complete; a final partial byte is written with its unused high bits
+// zero. Nothing is written when `w` is 0.
+void bitpack(const uint32_t* v, size_t n, uint8_t w, ByteSink* out) {
+    if (w == 0) {
+        return;
+    }
+    const uint32_t keep = low_mask(w);
+    uint64_t pending = 0;  // bits not yet flushed, lowest bit is the oldest
+    int pending_bits = 0;  // always < 8 between values
+    for (const uint32_t* it = v; it != v + n; ++it) {
+        pending |= static_cast<uint64_t>(*it & keep) << pending_bits;
+        for (pending_bits += w; pending_bits >= 8; pending_bits -= 8) {
+            out->put_u8(static_cast<uint8_t>(pending));
+            pending >>= 8;
+        }
+    }
+    if (pending_bits > 0) {
+        out->put_u8(static_cast<uint8_t>(pending));
+    }
+}
+
+// Bounds-safe unpacker for values [i, n): for each value it gathers up to eight
+// bytes starting at the byte that holds the value's first bit, never reading at
+// or beyond `base + packed`, then shifts and masks the value out.
+void bitunpack_tail(const uint8_t* base, size_t packed, size_t n, uint8_t w, size_t i,
+                    uint64_t mask, uint32_t* out) {
+    while (i < n) {
+        const size_t first_bit = static_cast<size_t>(w) * i;
+        const size_t first_byte = first_bit / 8;
+        const size_t stop = std::min(packed, first_byte + 8);
+        uint64_t window = 0;
+        for (size_t b = first_byte; b < stop; ++b) {
+            window |= static_cast<uint64_t>(base[b]) << (8 * (b - first_byte));
+        }
+        out[i] = static_cast<uint32_t>((window >> (first_bit % 8)) & mask);
+        ++i;
+    }
+}
+
+// Width-1 unpacker: every input byte expands to eight outputs, bit 0 first.
+// A trailing group of fewer than eight values reads exactly one more byte.
+void bitunpack_w1(const uint8_t* base, size_t n, uint32_t* out) {
+    const size_t whole = n / 8;
+    for (size_t byte = 0; byte < whole; ++byte) {
+        const uint32_t bits = base[byte];
+        uint32_t* dst = out + byte * 8;
+        for (unsigned k = 0; k < 8; ++k) {
+            dst[k] = (bits >> k) & 1U;
+        }
+    }
+    const size_t rest = n % 8;
+    if (rest != 0) {
+        const uint32_t bits = base[whole];
+        uint32_t* dst = out + whole * 8;
+        for (size_t k = 0; k < rest; ++k) {
+            dst[k] = (bits >> k) & 1U;
+        }
+    }
+}
+
+// Width-2 unpacker: every input byte expands to four outputs, lowest bit pair
+// first. A trailing group of fewer than four values reads one more byte.
+void bitunpack_w2(const uint8_t* base, size_t n, uint32_t* out) {
+    const size_t whole = n / 4;
+    for (size_t byte = 0; byte < whole; ++byte) {
+        const uint32_t bits = base[byte];
+        uint32_t* dst = out + byte * 4;
+        for (unsigned k = 0; k < 4; ++k) {
+            dst[k] = (bits >> (2 * k)) & 3U;
+        }
+    }
+    const size_t rest = n % 4;
+    if (rest != 0) {
+        const uint32_t bits = base[whole];
+        uint32_t* dst = out + whole * 4;
+        for (size_t k = 0; k < rest; ++k) {
+            dst[k] = (bits >> (2 * k)) & 3U;
+        }
+    }
+}
+
+// Width-3 unpacker: eight values occupy exactly three bytes. Each full group is
+// assembled into one 24-bit little-endian word and split into 3-bit lanes; the
+// values left over after the last full group go through bitunpack_tail.
+void bitunpack_w3(const uint8_t* base, size_t packed, size_t n, uint32_t* out) {
+    size_t done = 0;
+    for (const uint8_t* group = base; done + 8 <= n; done += 8, group += 3) {
+        const uint32_t bits = static_cast<uint32_t>(group[0]) |
+                              (static_cast<uint32_t>(group[1]) << 8) |
+                              (static_cast<uint32_t>(group[2]) << 16);
+        for (unsigned lane = 0; lane < 8; ++lane) {
+            out[done + lane] = (bits >> (3 * lane)) & 7U;
+        }
+    }
+    bitunpack_tail(base, packed, n, 3, done, 7U, out);
+}
+
+// Width-4 unpacker: each byte holds two values, low nibble first. An odd final
+// value is taken from the low nibble of one more byte.
+void bitunpack_w4(const uint8_t* base, size_t n, uint32_t* out) {
+    const size_t pairs = n / 2;
+    for (size_t byte = 0; byte < pairs; ++byte) {
+        const uint32_t bits = base[byte];
+        out[2 * byte] = bits & 15U;
+        out[2 * byte + 1] = bits >> 4;
+    }
+    if ((n % 2) != 0) {
+        out[n - 1] = base[pairs] & 15U;
+    }
+}
+
+// Width-5 unpacker: eight values occupy exactly five bytes. Each full group is
+// assembled into one 40-bit little-endian word and split into 5-bit lanes; the
+// values left over after the last full group go through bitunpack_tail.
+void bitunpack_w5(const uint8_t* base, size_t packed, size_t n, uint32_t* out) {
+    size_t done = 0;
+    for (const uint8_t* group = base; done + 8 <= n; done += 8, group += 5) {
+        uint64_t bits = 0;
+        for (unsigned b = 0; b < 5; ++b) {
+            bits |= static_cast<uint64_t>(group[b]) << (8 * b);
+        }
+        for (unsigned lane = 0; lane < 8; ++lane) {
+            out[done + lane] = static_cast<uint32_t>((bits >> (5 * lane)) & 31U);
+        }
+    }
+    bitunpack_tail(base, packed, n, 5, done, 31U, out);
+}
+
+// Width-6 unpacker: four values occupy exactly three bytes. Each full group is
+// assembled into one 24-bit little-endian word and split into 6-bit lanes; the
+// values left over after the last full group go through bitunpack_tail.
+void bitunpack_w6(const uint8_t* base, size_t packed, size_t n, uint32_t* out) {
+    size_t done = 0;
+    for (const uint8_t* group = base; done + 4 <= n; done += 4, group += 3) {
+        const uint32_t bits = static_cast<uint32_t>(group[0]) |
+                              (static_cast<uint32_t>(group[1]) << 8) |
+                              (static_cast<uint32_t>(group[2]) << 16);
+        for (unsigned lane = 0; lane < 4; ++lane) {
+            out[done + lane] = (bits >> (6 * lane)) & 63U;
+        }
+    }
+    bitunpack_tail(base, packed, n, 6, done, 63U, out);
+}
+
+// Width-7 unpacker: eight values occupy exactly seven bytes. Each full group is
+// assembled into one 56-bit little-endian word and split into 7-bit lanes; the
+// values left over after the last full group go through bitunpack_tail.
+void bitunpack_w7(const uint8_t* base, size_t packed, size_t n, uint32_t* out) {
+    size_t done = 0;
+    for (const uint8_t* group = base; done + 8 <= n; done += 8, group += 7) {
+        uint64_t bits = 0;
+        for (unsigned b = 0; b < 7; ++b) {
+            bits |= static_cast<uint64_t>(group[b]) << (8 * b);
+        }
+        for (unsigned lane = 0; lane < 8; ++lane) {
+            out[done + lane] = static_cast<uint32_t>((bits >> (7 * lane)) & 127U);
+        }
+    }
+    bitunpack_tail(base, packed, n, 7, done, 127U, out);
+}
+
+// Width-8 unpacker: each input byte is one value, widened to 32 bits.
+void bitunpack_w8(const uint8_t* base, size_t n, uint32_t* out) {
+    const uint8_t* const end = base + n;
+    for (const uint8_t* in = base; in != end; ++in, ++out) {
+        *out = *in;
+    }
+}
+
+// Unpacker for widths without a dedicated kernel. While a full 8-byte load
+// starting at the value's first byte still fits inside the packed run, each
+// value is extracted with one unaligned 64-bit load; the remaining values are
+// handled by the bounds-checked bitunpack_tail.
+void bitunpack_generic(const uint8_t* base, size_t packed, size_t n, uint8_t w, uint32_t* out) {
+    const uint64_t mask = low_mask(w);
+    size_t done = 0;
+    if (packed >= 8) {
+        const size_t last_full_load = packed - 8; // highest offset with 8 readable bytes
+        while (done < n) {
+            const size_t bit = static_cast<size_t>(w) * done;
+            const size_t byte = bit >> 3;
+            if (byte > last_full_load) {
+                break;
+            }
+            out[done] = static_cast<uint32_t>((load_u64_le(base + byte) >> (bit & 7)) & mask);
+            ++done;
+        }
+    }
+    bitunpack_tail(base, packed, n, w, done, mask, out);
+}
+
+// Reads `n` values of `w` bits each from `src` into `out`.
+//
+// `w` comes straight from the encoded stream, so it is validated first: values
+// are 32-bit, hence any width above 32 cannot have been produced by bitpack and
+// is reported as Corruption instead of being decoded into meaningless output.
+// A width of 0 consumes no bytes and yields all zeros. Otherwise the packed run
+// of ceil(w * n / 8) bytes is fetched in one get_bytes call (which fails if the
+// source is too short) and dispatched to the kernel for that width.
+Status bitunpack(ByteSource* src, size_t n, uint8_t w, uint32_t* out) {
+    if (w > 32) {
+        return Status::Corruption("pfor bit width out of range");
+    }
+    if (w == 0) {
+        std::memset(out, 0, n * sizeof(uint32_t));
+        return Status::OK();
+    }
+    // Pull the packed run once and unpack from the contiguous slice; this keeps
+    // the hot decode path free of per-byte ByteSource calls.
+    const size_t packed = (static_cast<size_t>(w) * n + 7) / 8;
+    Slice buf;
+    SNII_RETURN_IF_ERROR(src->get_bytes(packed, &buf));
+    const uint8_t* base = buf.data();
+
+    switch (w) {
+    case 1:
+        bitunpack_w1(base, n, out);
+        break;
+    case 2:
+        bitunpack_w2(base, n, out);
+        break;
+    case 3:
+        bitunpack_w3(base, packed, n, out);
+        break;
+    case 4:
+        bitunpack_w4(base, n, out);
+        break;
+    case 5:
+        bitunpack_w5(base, packed, n, out);
+        break;
+    case 6:
+        bitunpack_w6(base, packed, n, out);
+        break;
+    case 7:
+        bitunpack_w7(base, packed, n, out);
+        break;
+    case 8:
+        bitunpack_w8(base, n, out);
+        break;
+    default:
+        // Widths 9..32.
+        bitunpack_generic(base, packed, n, w, out);
+        break;
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Encodes `n` values as: width byte, varint exception count, the bit-packed
+// run, then one (index delta, full value) varint pair per exception. Values
+// wider than the chosen width are packed as 0 and carried in the exception
+// list; index deltas are relative to the previous exception (the first one is
+// relative to 0).
+void pfor_encode(const uint32_t* values, size_t n, ByteSink* out) {
+    const uint8_t width = choose_width(values, n);
+    std::vector<uint32_t> packed_vals(values, values + n);
+    std::vector<std::pair<uint32_t, uint32_t>> patches; // (index, full value)
+    for (size_t pos = 0; pos < n; ++pos) {
+        if (bits_for(values[pos]) <= width) {
+            continue;
+        }
+        patches.emplace_back(static_cast<uint32_t>(pos), values[pos]);
+        packed_vals[pos] = 0; // placeholder; the real value lives in the patch list
+    }
+
+    out->put_u8(width);
+    out->put_varint32(static_cast<uint32_t>(patches.size()));
+    bitpack(packed_vals.data(), n, width, out);
+
+    uint32_t last_index = 0;
+    for (const auto& [index, value] : patches) {
+        out->put_varint32(index - last_index);
+        out->put_varint32(value);
+        last_index = index;
+    }
+}
+
+// Decodes one block written by pfor_encode into `out[0, n)`: width byte,
+// exception count, the bit-packed run, then the exception list, whose entries
+// overwrite the placeholder at their index.
+//
+// Returns Corruption when the stream is inconsistent with `n`:
+//  - more exceptions than values (the encoder emits at most one per value);
+//  - an exception index at or beyond `n`. The running index is kept in 64 bits
+//    so that a large delta cannot wrap around and land back inside the range.
+// Errors from the underlying reads are propagated unchanged.
+Status pfor_decode(ByteSource* src, size_t n, uint32_t* out) {
+    uint8_t w = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&w));
+    uint32_t n_exc = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&n_exc));
+    if (n_exc > n) {
+        return Status::Corruption("pfor exception count out of range");
+    }
+    SNII_RETURN_IF_ERROR(bitunpack(src, n, w, out));
+    uint64_t idx = 0;
+    for (uint32_t i = 0; i < n_exc; ++i) {
+        uint32_t d = 0;
+        uint32_t val = 0;
+        SNII_RETURN_IF_ERROR(src->get_varint32(&d));
+        SNII_RETURN_IF_ERROR(src->get_varint32(&val));
+        idx += d;
+        if (idx >= n) {
+            return Status::Corruption("pfor exception index out of range");
+        }
+        out[idx] = val;
+    }
+    return Status::OK();
+}
+
+// Advances `src` past one block written by pfor_encode for `n` values without
+// materializing them, applying the same structural checks a decode would:
+//  - the width byte must be at most 32 (values are 32-bit);
+//  - there cannot be more exceptions than values;
+//  - every exception index must stay below `n`. The running index is kept in
+//    64 bits so that a large delta cannot wrap around and pass the check.
+// Errors from the underlying reads are propagated unchanged.
+Status pfor_skip(ByteSource* src, size_t n) {
+    uint8_t w = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&w));
+    if (w > 32) {
+        return Status::Corruption("pfor bit width out of range");
+    }
+    uint32_t n_exc = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&n_exc));
+    if (n_exc > n) {
+        return Status::Corruption("pfor exception count out of range");
+    }
+    const size_t packed = (static_cast<size_t>(w) * n + 7) / 8;
+    Slice unused;
+    SNII_RETURN_IF_ERROR(src->get_bytes(packed, &unused));
+    uint64_t idx = 0;
+    for (uint32_t i = 0; i < n_exc; ++i) {
+        uint32_t d = 0;
+        uint32_t val = 0;
+        SNII_RETURN_IF_ERROR(src->get_varint32(&d));
+        SNII_RETURN_IF_ERROR(src->get_varint32(&val));
+        idx += d;
+        if (idx >= n) {
+            return Status::Corruption("pfor exception index out of range");
+        }
+    }
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/section_framer.cpp b/be/src/storage/index/snii/core/src/encoding/section_framer.cpp
new file mode 100644
index 00000000000000..99d086c79e705c
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/section_framer.cpp
@@ -0,0 +1,37 @@
+#include "snii/encoding/section_framer.h"
+
+#include "snii/encoding/crc32c.h"
+
+namespace snii {
+
+// Writes one framed section to `sink`: type byte, varint payload length, the
+// payload, then a fixed32 crc32c computed over everything before it.
+void SectionFramer::write(ByteSink& sink, uint8_t section_type, Slice payload) {
+    // Stage type + length + payload so the checksum can be taken over the exact
+    // bytes that are emitted.
+    ByteSink frame;
+    frame.put_u8(section_type);
+    frame.put_varint64(payload.size());
+    frame.put_bytes(payload);
+
+    const uint32_t checksum = crc32c(frame.view());
+    sink.put_bytes(frame.view());
+    sink.put_fixed32(checksum);
+}
+
+// Reads one framed section from `src` (type byte, varint length, payload,
+// fixed32 crc32c). The checksum is recomputed over the bytes from the type byte
+// through the end of the payload; on a mismatch Corruption is returned and
+// `out` is left untouched. On success `out->payload` is the slice returned by
+// get_bytes for the payload.
+Status SectionFramer::read(ByteSource& src, FramedSection* out) {
+    const size_t frame_begin = src.position();
+
+    uint8_t type;
+    SNII_RETURN_IF_ERROR(src.get_u8(&type));
+    uint64_t payload_len;
+    SNII_RETURN_IF_ERROR(src.get_varint64(&payload_len));
+    Slice payload;
+    SNII_RETURN_IF_ERROR(src.get_bytes(static_cast<size_t>(payload_len), &payload));
+
+    // Everything consumed so far is what the writer checksummed.
+    const size_t covered_len = src.position() - frame_begin;
+    uint32_t expected_crc;
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&expected_crc));
+    const uint32_t actual_crc = crc32c(src.slice_from(frame_begin, covered_len));
+    if (actual_crc != expected_crc) {
+        return Status::Corruption("section crc mismatch");
+    }
+
+    out->type = type;
+    out->payload = payload;
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/varint.cpp b/be/src/storage/index/snii/core/src/encoding/varint.cpp
new file mode 100644
index 00000000000000..12877f972cb089
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/varint.cpp
@@ -0,0 +1,53 @@
+#include "snii/encoding/varint.h"
+
+namespace snii {
+
+// Number of bytes encode_varint64 emits for `v`: one byte per started group of
+// seven bits, and at least one byte (so 1..10).
+size_t varint_len(uint64_t v) {
+    size_t len = 1;
+    for (uint64_t rest = v >> 7; rest != 0; rest >>= 7) {
+        ++len;
+    }
+    return len;
+}
+
+// Writes `v` to `out` as a base-128 varint, least-significant group first, with
+// the high bit of every byte except the last set. Returns the number of bytes
+// written; the caller must provide room for up to 10 bytes.
+size_t encode_varint64(uint64_t v, uint8_t* out) {
+    uint8_t* cursor = out;
+    for (; v >= 0x80; v >>= 7) {
+        *cursor++ = static_cast<uint8_t>((v & 0x7F) | 0x80);
+    }
+    *cursor++ = static_cast<uint8_t>(v);
+    return static_cast<size_t>(cursor - out);
+}
+
+// 32-bit convenience wrapper: the value is widened and encoded exactly like a
+// 64-bit varint. Returns the number of bytes written to `out`.
+size_t encode_varint32(uint32_t v, uint8_t* out) {
+    const uint64_t widened = v;
+    return encode_varint64(widened, out);
+}
+
+// Decodes one base-128 varint from [p, end).
+//
+// On success stores the value in `*v`, sets `*next` to the byte after the
+// varint and returns OK. Returns Corruption when:
+//  - the input ends before a byte without the continuation bit is seen
+//    ("varint truncated");
+//  - the encoding does not fit in 64 bits ("varint64 overflow"): either more
+//    than 10 bytes, or a 10th byte that carries more than the single bit that
+//    is still free at shift 63. Without the latter check those extra payload
+//    bits would be shifted out and dropped silently.
+// `*v` and `*next` are not written on failure.
+Status decode_varint64(const uint8_t* p, const uint8_t* end, uint64_t* v, const uint8_t** next) {
+    uint64_t result = 0;
+    int shift = 0;
+    while (p < end) {
+        const uint8_t b = *p++;
+        const uint64_t payload = b & 0x7F;
+        if (shift == 63 && payload > 1) {
+            return Status::Corruption("varint64 overflow");
+        }
+        result |= payload << shift;
+        if ((b & 0x80) == 0) {
+            *v = result;
+            *next = p;
+            return Status::OK();
+        }
+        shift += 7;
+        if (shift >= 64) return Status::Corruption("varint64 overflow");
+    }
+    return Status::Corruption("varint truncated");
+}
+
+// Decodes a varint with decode_varint64 and narrows it to 32 bits. A decoded
+// value with any of the upper 32 bits set is reported as Corruption; note that
+// `*next` has already been advanced by the inner decode in that case.
+Status decode_varint32(const uint8_t* p, const uint8_t* end, uint32_t* v, const uint8_t** next) {
+    uint64_t wide;
+    SNII_RETURN_IF_ERROR(decode_varint64(p, end, &wide, next));
+    if ((wide >> 32) != 0) {
+        return Status::Corruption("varint32 overflow");
+    }
+    *v = static_cast<uint32_t>(wide);
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp b/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp
new file mode 100644
index 00000000000000..abb01981d63450
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp
@@ -0,0 +1,32 @@
+#include "snii/encoding/zstd_codec.h"
+
+#include <zstd.h>
+
+#include <string>
+
+namespace snii {
+
+// Compresses `input` at the given zstd `level` into `*out`. The vector is first
+// grown to ZSTD_compressBound(input.size()) and, on success, shrunk to the
+// number of bytes actually produced. On a zstd error an Internal status with
+// the zstd error name is returned and `*out` keeps the bound-sized buffer.
+Status zstd_compress(Slice input, int level, std::vector<uint8_t>* out) {
+    const size_t capacity = ZSTD_compressBound(input.size());
+    out->resize(capacity);
+    const size_t written =
+            ZSTD_compress(out->data(), capacity, input.data(), input.size(), level);
+    if (ZSTD_isError(written)) {
+        return Status::Internal(std::string("zstd compress: ") + ZSTD_getErrorName(written));
+    }
+    out->resize(written);
+    return Status::OK();
+}
+
+// Decompresses `input` into `*out`, which is resized to `expected_uncomp_len`
+// up front. Returns Corruption when zstd reports an error or when the number of
+// bytes produced differs from `expected_uncomp_len`.
+Status zstd_decompress(Slice input, size_t expected_uncomp_len, std::vector<uint8_t>* out) {
+    out->resize(expected_uncomp_len);
+    const size_t produced =
+            ZSTD_decompress(out->data(), expected_uncomp_len, input.data(), input.size());
+    if (ZSTD_isError(produced)) {
+        return Status::Corruption(std::string("zstd decompress: ") + ZSTD_getErrorName(produced));
+    }
+    if (produced == expected_uncomp_len) {
+        return Status::OK();
+    }
+    return Status::Corruption("zstd decompressed length mismatch");
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp b/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp
new file mode 100644
index 00000000000000..e65c4817d1c6dc
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp
@@ -0,0 +1,91 @@
+#include "snii/format/bootstrap_header.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+
+namespace snii::format {
+
+namespace {
+
+// Number of bytes covered by header_checksum: the whole fixed header minus the
+// trailing 4-byte crc32c.
+constexpr size_t kChecksumCoverage = kBootstrapHeaderSize - sizeof(uint32_t);
+
+// Serializes every fixed field except the trailing checksum. The order of the
+// put_* calls is the on-disk layout: magic, packed version pair, flags,
+// header_length, tail_pointer_size. All multi-byte fields go through the
+// ByteSink fixed-width primitives rather than hand-assembled bytes.
+void encode_fields(const BootstrapHeader& header, ByteSink* sink) {
+    // min_reader_version occupies the upper 16 bits, format_version the lower.
+    const uint32_t version_pair =
+            (static_cast<uint32_t>(header.min_reader_version) << 16) | header.format_version;
+
+    sink->put_fixed32(header.magic);
+    sink->put_fixed32(version_pair);
+    sink->put_fixed32(header.flags);
+    // header.header_length is ignored on write; the constant size is emitted.
+    sink->put_fixed32(kBootstrapHeaderSize);
+    sink->put_u8(header.tail_pointer_size);
+}
+
+} // namespace
+
+// Appends the encoded bootstrap header to `sink`: the fixed fields followed by
+// a fixed32 crc32c over exactly those field bytes. Returns InvalidArgument for
+// a null sink; no field values are validated here.
+Status encode_bootstrap_header(const BootstrapHeader& header, ByteSink* sink) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("bootstrap_header: null sink");
+    }
+    // Stage the fields separately so the checksum covers precisely what is
+    // written in front of it.
+    ByteSink staged;
+    encode_fields(header, &staged);
+    sink->put_bytes(staged.view());
+    sink->put_fixed32(crc32c(staged.view()));
+    return Status::OK();
+}
+
+// Parses and validates a bootstrap header.
+//
+// Checks, in this order: `out` non-null (InvalidArgument), exact input size,
+// container magic, checksum over the first kChecksumCoverage bytes (all three
+// Corruption), then format_version equal to kFormatVersion and
+// min_reader_version not above kFormatVersion (both Unsupported). `out` is only
+// written once every check has passed.
+//
+// NOTE(review): header_length and tail_pointer_size are copied to `out` without
+// being range-checked here (the writer always stores kBootstrapHeaderSize as
+// header_length) -- confirm that callers validate them or that this is intended.
+Status decode_bootstrap_header(Slice data, BootstrapHeader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("bootstrap_header: null out");
+    }
+    // Only the exact fixed size is accepted: shorter input is truncated, longer
+    // input carries trailing bytes that would otherwise go unnoticed.
+    if (data.size() != kBootstrapHeaderSize) {
+        return Status::Corruption("bootstrap_header: wrong header size");
+    }
+
+    // Raw fields, read back in the order encode_fields wrote them.
+    uint32_t magic = 0;
+    uint32_t version_pair = 0;
+    uint32_t flags = 0;
+    uint32_t header_length = 0;
+    uint8_t tail_pointer_size = 0;
+    uint32_t stored_checksum = 0;
+    ByteSource reader(data);
+    SNII_RETURN_IF_ERROR(reader.get_fixed32(&magic));
+    SNII_RETURN_IF_ERROR(reader.get_fixed32(&version_pair));
+    SNII_RETURN_IF_ERROR(reader.get_fixed32(&flags));
+    SNII_RETURN_IF_ERROR(reader.get_fixed32(&header_length));
+    SNII_RETURN_IF_ERROR(reader.get_u8(&tail_pointer_size));
+    SNII_RETURN_IF_ERROR(reader.get_fixed32(&stored_checksum));
+
+    if (magic != kContainerMagic) {
+        return Status::Corruption("bootstrap_header: bad container magic");
+    }
+    if (crc32c(data.subslice(0, kChecksumCoverage)) != stored_checksum) {
+        return Status::Corruption("bootstrap_header: checksum mismatch");
+    }
+
+    // Upper half: minimum reader version; lower half: format version.
+    const auto format_version = static_cast<uint16_t>(version_pair & 0xFFFFu);
+    const auto min_reader_version = static_cast<uint16_t>(version_pair >> 16);
+    if (format_version != kFormatVersion) {
+        return Status::Unsupported("bootstrap_header: unsupported container format_version");
+    }
+    if (min_reader_version > kFormatVersion) {
+        return Status::Unsupported("bootstrap_header: container requires a newer reader version");
+    }
+
+    out->magic = magic;
+    out->format_version = format_version;
+    out->min_reader_version = min_reader_version;
+    out->flags = flags;
+    out->header_length = header_length;
+    out->tail_pointer_size = tail_pointer_size;
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/bsbf.cpp b/be/src/storage/index/snii/core/src/format/bsbf.cpp
new file mode 100644
index 00000000000000..adfe5e445c2dce
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/bsbf.cpp
@@ -0,0 +1,218 @@
+#include "snii/format/bsbf.h"
+
+#include <cmath>
+
+#include "snii/encoding/crc32c.h"
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+#define SNII_BSBF_X86 1
+#endif
+
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+
+namespace snii::format {
+
+// Per-word salt multipliers. The kernels below multiply the 32-bit key by each
+// salt and use the top five bits of the product as the bit position to set or
+// test in the corresponding 32-bit word of a block.
+// NOTE(review): these look like the salts of the Parquet split-block Bloom
+// filter -- confirm against the format spec before changing them, since they
+// determine the on-disk bit positions.
+const uint32_t kBsbfSalt[kBsbfBitsSetPerBlock] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU,
+                                                  0xa2b7289dU, 0x705495c7U, 0x2df1424bU,
+                                                  0x9efc4947U, 0x5c6bfb31U};
+
+namespace {
+
+// Writes `v` to p[0..3], least-significant byte first.
+void store_le32(uint8_t* p, uint32_t v) {
+    for (int byte = 0; byte < 4; ++byte) {
+        p[byte] = static_cast<uint8_t>(v >> (8 * byte));
+    }
+}
+// Reads p[0..3] as a 32-bit value stored least-significant byte first.
+uint32_t load_le32(const uint8_t* p) {
+    uint32_t value = 0;
+    for (int byte = 3; byte >= 0; --byte) {
+        value = (value << 8) | p[byte];
+    }
+    return value;
+}
+
+// Reports whether the AVX2 kernels may be used. On x86-64 builds the compiler
+// builtin is queried on the first call and the answer is cached; every other
+// build returns false.
+bool cpu_has_avx2() {
+#if !defined(SNII_BSBF_X86)
+    return false;
+#else
+    static const bool supported = __builtin_cpu_supports("avx2");
+    return supported;
+#endif
+}
+
+// --- scalar kernels ---
+// Fills m[0..7] with one single-bit mask per block word: the bit index for word
+// i is the top five bits of key * kBsbfSalt[i] (32-bit wrapping multiply).
+inline void masks_scalar(uint32_t key, uint32_t m[8]) {
+    for (int word = 0; word != 8; ++word) {
+        const uint32_t bit = (key * kBsbfSalt[word]) >> 27;
+        m[word] = 1u << bit;
+    }
+}
+// Scalar membership test against one serialized 32-byte block. The low 32 bits
+// of `hash` select one bit per word; the block matches only if all eight bits
+// are set.
+//
+// Words are read with load_le32 at byte offsets 4 * i directly from `block`.
+// The buffer comes from a byte read and may have any alignment, so no
+// `uint32_t*` is ever formed from it.
+bool block_contains_scalar(uint64_t hash, const uint8_t* block) {
+    uint32_t m[8];
+    masks_scalar(static_cast<uint32_t>(hash), m);
+    for (int i = 0; i < 8; ++i) {
+        if ((load_le32(block + 4 * i) & m[i]) != m[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+// Scalar insert: ORs the eight per-word masks derived from `key` into the eight
+// words of block number `block`.
+void insert_scalar(uint32_t* words, uint32_t block, uint32_t key) {
+    uint32_t masks[8];
+    masks_scalar(key, masks);
+    for (int word = 0; word != 8; ++word) {
+        words[block * 8 + word] |= masks[word];
+    }
+}
+// Scalar lookup in the in-memory word array: true only if every one of the
+// eight mask bits derived from `key` is set in block number `block`.
+bool find_scalar(const uint32_t* words, uint32_t block, uint32_t key) {
+    uint32_t masks[8];
+    masks_scalar(key, masks);
+    for (int word = 0; word != 8; ++word) {
+        const uint32_t present = words[block * 8 + word] & masks[word];
+        if (present != masks[word]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+#if defined(SNII_BSBF_X86)
+// --- AVX2 kernels: a 256-bit block is one YMM register ---
+// Vector counterpart of masks_scalar: lane i holds 1 << ((key * kBsbfSalt[i]) >> 27).
+__attribute__((target("avx2"))) __m256i mask_avx2(uint32_t key) {
+    // Lane i receives kBsbfSalt[i]; an unaligned load keeps the array order.
+    const __m256i salt = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(kBsbfSalt));
+    const __m256i key_lanes = _mm256_set1_epi32(static_cast<int>(key));
+    const __m256i product = _mm256_mullo_epi32(key_lanes, salt);
+    const __m256i bit_index = _mm256_srli_epi32(product, 27); // top 5 bits -> 0..31
+    const __m256i ones = _mm256_set1_epi32(1);
+    return _mm256_sllv_epi32(ones, bit_index);
+}
+// AVX2 membership test against one serialized 32-byte block (unaligned load).
+// testc is non-zero exactly when (~block & mask) == 0, i.e. every mask bit is
+// present in the block.
+__attribute__((target("avx2"))) bool block_contains_avx2(uint64_t hash, const uint8_t* block) {
+    const __m256i wanted = mask_avx2(static_cast<uint32_t>(hash));
+    const __m256i stored = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(block));
+    const int all_present = _mm256_testc_si256(stored, wanted);
+    return all_present != 0;
+}
+// AVX2 insert: load the eight words of block number `block`, OR in the mask
+// derived from `key`, store them back (unaligned load/store).
+__attribute__((target("avx2"))) void insert_avx2(uint32_t* words, uint32_t block, uint32_t key) {
+    auto* slot = reinterpret_cast<__m256i*>(words + block * 8);
+    const __m256i current = _mm256_loadu_si256(slot);
+    const __m256i updated = _mm256_or_si256(current, mask_avx2(key));
+    _mm256_storeu_si256(slot, updated);
+}
+// AVX2 lookup in the in-memory word array: true only if every mask bit derived
+// from `key` is set in block number `block`.
+__attribute__((target("avx2"))) bool find_avx2(const uint32_t* words, uint32_t block,
+                                               uint32_t key) {
+    const auto* slot = reinterpret_cast<const __m256i*>(words + block * 8);
+    const __m256i stored = _mm256_loadu_si256(slot);
+    const __m256i wanted = mask_avx2(key);
+    return _mm256_testc_si256(stored, wanted) != 0;
+}
+#endif
+
+} // namespace
+
+// Hash used for every filter key: XXH64 over the raw bytes of `term`, seed 0.
+uint64_t bsbf_hash(std::string_view term) {
+    const uint64_t seed = 0;
+    return XXH64(term.data(), term.size(), seed);
+}
+
+// Sizes the filter for `ndv` distinct values at false-positive probability
+// `fpp`. Returns a byte count that is a power of two within
+// [kBsbfMinBytes, kBsbfMaxBytes].
+//
+// The estimate m = -8 * ndv / ln(1 - fpp^(1/8)) is in bits. It is clamped to
+// the maximum whenever it is negative, too large, or NaN. NaN arises from a NaN
+// `fpp` and fails every ordinary comparison, so it is caught by negating
+// `m >= 0`; converting NaN to an integer would be undefined behaviour.
+uint32_t bsbf_optimal_num_bytes(uint32_t ndv, double fpp) {
+    // Parquet OptimalNumOfBits, then >>3 for bytes.
+    const double m = -8.0 * ndv / std::log(1 - std::pow(fpp, 1.0 / 8));
+    uint32_t num_bits;
+    if (!(m >= 0) || m > static_cast<double>(kBsbfMaxBytes) * 8) {
+        num_bits = kBsbfMaxBytes << 3;
+    } else {
+        num_bits = static_cast<uint32_t>(m);
+    }
+    if (num_bits < (kBsbfMinBytes << 3)) num_bits = kBsbfMinBytes << 3;
+    if (num_bits & (num_bits - 1)) { // next power of 2
+        uint32_t p = 1;
+        while (p < num_bits) p <<= 1;
+        num_bits = p;
+    }
+    if (num_bits > (kBsbfMaxBytes << 3)) num_bits = kBsbfMaxBytes << 3;
+    return num_bits >> 3;
+}
+
+// Tests one serialized block for `hash`, using the AVX2 kernel when the CPU
+// supports it and the scalar kernel otherwise.
+bool bsbf_block_contains(uint64_t hash, const uint8_t block[kBsbfBytesPerBlock]) {
+#if defined(SNII_BSBF_X86)
+    if (cpu_has_avx2()) {
+        return block_contains_avx2(hash, block);
+    }
+#endif
+    return block_contains_scalar(hash, block);
+}
+
+// Initializes `*out` as an empty filter sized for `ndv` distinct values at
+// false-positive probability `fpp`. Returns InvalidArgument for a null `out`
+// or an `fpp` outside the open interval (0, 1) (NaN included, since both
+// comparisons are then false). An `ndv` of 0 is treated as 1.
+Status BsbfBuilder::create(uint32_t ndv, double fpp, BsbfBuilder* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("bsbf: null out");
+    }
+    const bool fpp_in_range = fpp > 0.0 && fpp < 1.0;
+    if (!fpp_in_range) {
+        return Status::InvalidArgument("bsbf: fpp out of (0,1)");
+    }
+    const uint32_t effective_ndv = (ndv == 0) ? 1 : ndv;
+    out->num_bytes_ = bsbf_optimal_num_bytes(effective_ndv, fpp);
+    out->num_blocks_ = out->num_bytes_ / kBsbfBytesPerBlock;
+    out->ndv_ = effective_ndv;
+    out->words_.assign(out->num_bytes_ / 4, 0u); // one 32-bit word per 4 bytes, all clear
+    return Status::OK();
+}
+
+// Adds `hash` to the filter: bsbf_block_index picks the block, the low 32 bits
+// of the hash pick the bits inside it. Uses the AVX2 kernel when available.
+void BsbfBuilder::insert(uint64_t hash) {
+    const uint32_t target_block = bsbf_block_index(hash, num_blocks_);
+    const auto low_bits = static_cast<uint32_t>(hash);
+#if defined(SNII_BSBF_X86)
+    if (cpu_has_avx2()) {
+        insert_avx2(words_.data(), target_block, low_bits);
+        return;
+    }
+#endif
+    insert_scalar(words_.data(), target_block, low_bits);
+}
+
+// Looks `hash` up in the in-memory filter using the same block and bit
+// selection as insert(). Returns false if any of the selected bits is clear.
+bool BsbfBuilder::maybe_contains(uint64_t hash) const {
+    const uint32_t target_block = bsbf_block_index(hash, num_blocks_);
+    const auto low_bits = static_cast<uint32_t>(hash);
+#if defined(SNII_BSBF_X86)
+    if (cpu_has_avx2()) {
+        return find_avx2(words_.data(), target_block, low_bits);
+    }
+#endif
+    return find_scalar(words_.data(), target_block, low_bits);
+}
+
+// Appends the filter to `sink` as a kBsbfHeaderSize-byte header followed by the
+// raw bitset. Header layout as written here:
+//   [0,4)   magic "BSBF"
+//   [4]     version (1)
+//   [5]     hash strategy (0: XXH64 seed 0)
+//   [6]     index strategy (0: fastrange)
+//   [7]     pad
+//   [8,12)  num_bytes   [12,16) num_blocks   [16,20) ndv
+//   [20,24) crc32c over header bytes [0,20)
+//   [24,28) crc32c over the bitset
+// Any remaining header bytes stay zero. Returns InvalidArgument for a null sink
+// or a builder that has no bitset yet.
+Status BsbfBuilder::serialize(ByteSink* sink) const {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("bsbf: null sink");
+    }
+    if (num_bytes_ == 0) {
+        return Status::InvalidArgument("bsbf: not built");
+    }
+
+    // Magic, version, hash strategy, index strategy, pad; the rest is zeroed.
+    uint8_t hdr[kBsbfHeaderSize] = {'B', 'S', 'B', 'F', 1, 0, 0, 0};
+    store_le32(hdr + 8, num_bytes_);
+    store_le32(hdr + 12, num_blocks_);
+    store_le32(hdr + 16, ndv_);
+    store_le32(hdr + 20, crc32c(Slice(hdr, 20))); // header crc over [0,20)
+
+    // The bitset is emitted as the in-memory bytes of the word array.
+    const uint8_t* bits = reinterpret_cast<const uint8_t*>(words_.data());
+    store_le32(hdr + 24, crc32c(Slice(bits, num_bytes_))); // bitset crc
+
+    sink->put_bytes(Slice(hdr, kBsbfHeaderSize));
+    sink->put_bytes(Slice(bits, num_bytes_)); // contiguous, uncompressed, LE
+    return Status::OK();
+}
+
+// Validates a serialized filter header and fills `*out`.
+//
+// `h` must hold at least kBsbfHeaderSize bytes; `section_base` is the absolute
+// offset of the header, so the bitset starts at section_base + kBsbfHeaderSize.
+// Checked in order: magic, version, hash strategy, index strategy, header crc
+// over bytes [0,20), num_bytes within [kBsbfMinBytes, kBsbfMaxBytes] and a
+// power of two, num_blocks consistent with num_bytes. Every failure is
+// Corruption (null `out` is InvalidArgument). The stored ndv is not read here,
+// and the bitset crc is only recorded, not verified.
+Status BsbfHeader::parse(Slice h, uint64_t section_base, BsbfHeader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("bsbf: null out");
+    }
+    if (h.size() < kBsbfHeaderSize) {
+        return Status::Corruption("bsbf: short header");
+    }
+    const uint8_t* p = h.data();
+
+    const bool magic_ok = p[0] == 'B' && p[1] == 'S' && p[2] == 'B' && p[3] == 'F';
+    if (!magic_ok) {
+        return Status::Corruption("bsbf: bad magic");
+    }
+    if (p[4] != 1) {
+        return Status::Corruption("bsbf: bad version");
+    }
+    if (p[5] != 0) {
+        return Status::Corruption("bsbf: unsupported hash strategy");
+    }
+    if (p[6] != 0) {
+        return Status::Corruption("bsbf: unsupported index strategy");
+    }
+    if (crc32c(Slice(p, 20)) != load_le32(p + 20)) {
+        return Status::Corruption("bsbf: header crc mismatch");
+    }
+
+    const uint32_t nb = load_le32(p + 8);
+    const uint32_t nblk = load_le32(p + 12);
+    const bool size_in_range = nb >= kBsbfMinBytes && nb <= kBsbfMaxBytes;
+    const bool size_is_pow2 = (nb & (nb - 1)) == 0;
+    if (!size_in_range || !size_is_pow2) {
+        return Status::Corruption("bsbf: num_bytes out of range or not power of 2");
+    }
+    if (nblk != nb / kBsbfBytesPerBlock) {
+        return Status::Corruption("bsbf: num_blocks mismatch");
+    }
+
+    out->num_bytes = nb;
+    out->num_blocks = nblk;
+    out->bitset_crc = load_le32(p + 24);
+    out->bitset_base = section_base + kBsbfHeaderSize;
+    return Status::OK();
+}
+
+// Probes the on-disk filter for `hash` by reading only the one block that
+// header.block_offset(hash) points at, then testing it in memory. Read errors
+// are propagated; a read that returns fewer than kBsbfBytesPerBlock bytes is
+// Corruption. The bitset crc from the header is not checked by this path.
+Status bsbf_probe(snii::io::FileReader* reader, const BsbfHeader& header, uint64_t hash,
+                  bool* maybe_present) {
+    if (reader == nullptr || maybe_present == nullptr) {
+        return Status::InvalidArgument("bsbf: null arg");
+    }
+    std::vector<uint8_t> block_bytes;
+    SNII_RETURN_IF_ERROR(
+            reader->read_at(header.block_offset(hash), kBsbfBytesPerBlock, &block_bytes));
+    if (block_bytes.size() < kBsbfBytesPerBlock) {
+        return Status::Corruption("bsbf: short block read");
+    }
+    *maybe_present = bsbf_block_contains(hash, block_bytes.data());
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/dict_block.cpp b/be/src/storage/index/snii/core/src/format/dict_block.cpp
new file mode 100644
index 00000000000000..375414df96f264
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/dict_block.cpp
@@ -0,0 +1,293 @@
+#include "snii/format/dict_block.h"
+
+#include <algorithm>
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/encoding/varint.h"
+
+namespace snii::format {
+
+namespace {
+
+constexpr size_t kFooterBytes = sizeof(uint32_t);    // trailing crc32c, written as a fixed32 after the covered region
+constexpr size_t kNAnchorsBytes = sizeof(uint32_t);  // n_anchors, written as a fixed32 right before the crc
+constexpr size_t kAnchorOffBytes = sizeof(uint32_t); // one anchor offset, written as a fixed32
+
+// Upper bound on the encoded size of one entry, computed without encoding it
+// (accumulated by add_entry() and reported through estimated_bytes()).
+// Neither the tier nor the amount of prefix sharing is known here, so every
+// optional field is assumed present, every fixed-cost varint is charged
+// kMaxVarint bytes, and the whole term is charged as suffix bytes.
+size_t estimate_entry_bytes(const DictEntry& e) {
+    constexpr size_t kMaxVarint = 10;              // widest varint charged per field
+    constexpr size_t kCrcBytes = sizeof(uint32_t); // one per-region crc (fixed32)
+    // Region codec metadata without crcs: win_mode byte + dd/freq uncomp_len.
+    constexpr size_t kRegionMetaNoCrc = 1 + 2 * kMaxVarint;
+
+    size_t body = 0;
+    body += varint_len(static_cast<uint32_t>(e.term.size())); // prefix_len upper bound
+    body += varint_len(static_cast<uint32_t>(e.term.size())); // suffix_len upper bound
+    body += e.term.size();                                    // suffix bytes upper bound
+    body += 1;                                                // flags
+    body += kMaxVarint;                                       // df
+    body += kMaxVarint;                                       // ttf_delta
+    body += kMaxVarint;                                       // max_freq
+    if (e.kind == DictEntryKind::kInline) {
+        body += kMaxVarint + e.frq_bytes.size(); // frq length prefix + payload
+        body += kMaxVarint;                      // inline_dd_disk_len
+        body += kRegionMetaNoCrc;                // inline regions store no crc
+        body += kMaxVarint + e.prx_bytes.size(); // prx length prefix + payload
+    } else {
+        // Charge the larger of the two pod_ref layouts. Slim writes five varints
+        // (frq_off/frq_len/frq_docs_len/prx_off/prx_len) plus region metadata with
+        // both crcs; windowed writes six varints and no region metadata, which is
+        // smaller.
+        body += 5 * kMaxVarint;
+        body += kRegionMetaNoCrc + 2 * kCrcBytes;
+    }
+    return varint_len(static_cast<uint64_t>(body)) + body; // entry_len + body
+}
+
+} // namespace
+
+// ---- DictBlockBuilder ----
+
+// Captures the block-level parameters. An anchor_interval of 0 is normalised
+// to 1; any other value is stored unchanged.
+DictBlockBuilder::DictBlockBuilder(IndexTier tier, bool has_positions, uint64_t frq_base,
+                                   uint64_t prx_base, uint32_t anchor_interval)
+        : tier_(tier),
+          has_positions_(has_positions),
+          frq_base_(frq_base),
+          prx_base_(prx_base),
+          anchor_interval_(std::max<uint32_t>(anchor_interval, 1u)) {}
+
+// Appends one entry to the block and updates the running anchor count and size
+// estimate.
+void DictBlockBuilder::add_entry(const DictEntry& entry) {
+    // The anchor test uses the index this entry is about to occupy.
+    const bool starts_segment = is_anchor(n_entries_);
+    if (starts_segment) {
+        n_anchors_ += 1;
+    }
+    const size_t entry_estimate = estimate_entry_bytes(entry);
+    entries_est_ += entry_estimate;
+    entries_.push_back(entry);
+    prev_term_ = entry.term;
+    n_entries_ += 1;
+}
+
+// Estimated size of the finished block: header + accumulated entry estimates +
+// anchor table + crc footer.
+size_t DictBlockBuilder::estimated_bytes() const {
+    // Header: n_entries varint, format version byte, flags byte, frq_base
+    // varint, and the prx_base varint when positions are stored.
+    size_t total = varint_len(static_cast<uint64_t>(n_entries_));
+    total += 2;
+    total += varint_len(frq_base_);
+    if (has_positions_) {
+        total += varint_len(prx_base_);
+    }
+
+    total += entries_est_;
+
+    // Anchor table: one fixed-width offset per anchor, then the anchor count.
+    total += n_anchors_ * kAnchorOffBytes;
+    total += kNAnchorsBytes;
+
+    total += kFooterBytes;
+    return total;
+}
+
+// Serialises the block into `sink`: header, entries, anchor offsets, anchor
+// count, and finally a crc32c computed over everything before it.
+void DictBlockBuilder::finish(ByteSink* sink) const {
+    // Everything staged here is the crc-covered region.
+    ByteSink covered;
+
+    // Block header.
+    const auto block_flags =
+            static_cast<uint8_t>(has_positions_ ? dict_block_flags::kHasPositions : 0u);
+    covered.put_varint64(static_cast<uint64_t>(n_entries_));
+    covered.put_u8(kDictBlockFormatVer);
+    covered.put_u8(block_flags);
+    covered.put_varint64(frq_base_);
+    if (has_positions_) {
+        covered.put_varint64(prx_base_);
+    }
+
+    // Entries. An anchor entry is encoded against an empty previous term and
+    // its starting byte offset within the block is remembered.
+    std::vector<uint32_t> anchor_starts;
+    anchor_starts.reserve(n_anchors_);
+    const std::string* previous = nullptr;
+    for (uint32_t idx = 0; idx < n_entries_; ++idx) {
+        const bool at_anchor = is_anchor(idx);
+        if (at_anchor) {
+            anchor_starts.push_back(static_cast<uint32_t>(covered.size()));
+        }
+        std::string_view base;
+        if (!at_anchor && previous != nullptr) {
+            base = std::string_view(*previous);
+        }
+        encode_dict_entry(entries_[idx], base, tier_, &covered);
+        previous = &entries_[idx].term;
+    }
+
+    // Anchor table followed by its element count.
+    for (const uint32_t start : anchor_starts) {
+        covered.put_fixed32(start);
+    }
+    covered.put_fixed32(static_cast<uint32_t>(anchor_starts.size()));
+
+    // Emit the covered region and then its crc footer.
+    sink->put_bytes(covered.view());
+    sink->put_fixed32(crc32c(covered.view()));
+}
+
+// ---- DictBlockReader ----
+
+namespace {
+
+// Checks that `block` is long enough to hold the n_anchors field plus the crc
+// footer, then compares the stored trailing crc32c with one computed over the
+// rest. *covered receives the crc-covered prefix (the block minus the footer).
+Status verify_crc(Slice block, Slice* covered) {
+    constexpr size_t kMinBlockBytes = kFooterBytes + kNAnchorsBytes;
+    if (block.size() < kMinBlockBytes) {
+        return Status::Corruption("dict_block: block too short to contain footer");
+    }
+
+    const size_t footer_pos = block.size() - kFooterBytes;
+    *covered = block.subslice(0, footer_pos);
+
+    uint32_t expected_crc = 0;
+    ByteSource footer(block.subslice(footer_pos, kFooterBytes));
+    SNII_RETURN_IF_ERROR(footer.get_fixed32(&expected_crc));
+
+    const bool crc_matches = crc32c(*covered) == expected_crc;
+    if (!crc_matches) {
+        return Status::Corruption("dict_block: crc32c checksum mismatch");
+    }
+    return Status::OK();
+}
+
+// Verifies that the kHasPositions bit of the block flags agrees with the
+// has_positions value the caller expects.
+Status check_flags(uint8_t flags, bool has_positions) {
+    const bool block_has_positions = (flags & dict_block_flags::kHasPositions) != 0;
+    if (block_has_positions == has_positions) {
+        return Status::OK();
+    }
+    return Status::InvalidArgument("dict_block: has_positions inconsistent with block_flags");
+}
+
+} // namespace
+
+// Opens a serialized dict block for reading.
+//
+// Steps: verify the trailing crc32c, parse the header (n_entries, format
+// version, flags, frq_base and, with positions, prx_base), locate and validate
+// the anchor table at the tail of the crc-covered region, and decode every
+// anchor entry once so its term is available for locate_anchor().
+//
+// `*out` is reset first; on failure it may be left partially populated and must
+// not be used. Returns InvalidArgument for a null `out` or a has_positions
+// mismatch, Unsupported for an unknown format version, and Corruption for any
+// structural inconsistency.
+Status DictBlockReader::open(Slice block, IndexTier tier, bool has_positions,
+                             DictBlockReader* out) {
+    if (out == nullptr) return Status::InvalidArgument("dict_block: out is null");
+    *out = DictBlockReader {};
+
+    Slice covered;
+    SNII_RETURN_IF_ERROR(verify_crc(block, &covered));
+    out->block_ = covered;
+    out->tier_ = tier;
+    out->has_positions_ = has_positions;
+
+    // header.
+    ByteSource src(covered);
+    uint64_t n_entries = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint64(&n_entries));
+    uint8_t ver = 0;
+    uint8_t flags = 0;
+    SNII_RETURN_IF_ERROR(src.get_u8(&ver));
+    SNII_RETURN_IF_ERROR(src.get_u8(&flags));
+    if (ver != kDictBlockFormatVer) {
+        return Status::Unsupported("dict_block: unsupported entry_format_ver");
+    }
+    SNII_RETURN_IF_ERROR(check_flags(flags, has_positions));
+    SNII_RETURN_IF_ERROR(src.get_varint64(&out->frq_base_));
+    if (has_positions) SNII_RETURN_IF_ERROR(src.get_varint64(&out->prx_base_));
+
+    out->entries_begin_ = src.position();
+
+    // The anchor table is at the tail of covered: [... anchor_offsets[n] n_anchors(u32)].
+    if (covered.size() < kNAnchorsBytes) {
+        return Status::Corruption("dict_block: missing n_anchors");
+    }
+    ByteSource na_src(covered.subslice(covered.size() - kNAnchorsBytes, kNAnchorsBytes));
+    uint32_t n_anchors = 0;
+    SNII_RETURN_IF_ERROR(na_src.get_fixed32(&n_anchors));
+
+    const size_t anchor_table_bytes = static_cast<size_t>(n_anchors) * kAnchorOffBytes;
+    if (covered.size() < kNAnchorsBytes + anchor_table_bytes ||
+        out->entries_begin_ + anchor_table_bytes + kNAnchorsBytes > covered.size()) {
+        return Status::Corruption("dict_block: anchor table out of range");
+    }
+    const size_t anchor_table_begin = covered.size() - kNAnchorsBytes - anchor_table_bytes;
+
+    // n_entries comes from untrusted bytes and later sizes a reserve() in
+    // decode_all(). Every encoded entry occupies at least 5 bytes (entry_len,
+    // prefix_len, suffix_len and df varints of >= 1 byte each, plus the flags
+    // byte), so a count the entries region cannot hold is corrupt. The bound
+    // also rules out silent truncation in the narrowing below.
+    constexpr size_t kMinEntryBytes = 5;
+    const size_t entries_region_bytes = anchor_table_begin - out->entries_begin_;
+    if (n_entries > 0xFFFFFFFFull || n_entries > entries_region_bytes / kMinEntryBytes) {
+        return Status::Corruption("dict_block: n_entries exceeds block capacity");
+    }
+    out->n_entries_ = static_cast<uint32_t>(n_entries);
+
+    ByteSource at_src(covered.subslice(anchor_table_begin, anchor_table_bytes));
+    out->anchor_offsets_.resize(n_anchors);
+    out->anchor_terms_.resize(n_anchors);
+    for (uint32_t i = 0; i < n_anchors; ++i) {
+        uint32_t off = 0;
+        SNII_RETURN_IF_ERROR(at_src.get_fixed32(&off));
+        if (off >= anchor_table_begin) {
+            return Status::Corruption("dict_block: anchor offset out of range");
+        }
+        // Anchor offsets must be strictly increasing, and the first anchor must sit
+        // exactly at the start of the entries region (entry 0 is always an anchor).
+        // Otherwise the seg_end - seg_begin computation in scan_from_anchor would
+        // underflow as size_t and cause an out-of-range read; this guards against
+        // non-monotonic offset tables carrying a re-stamped crc (remote on-demand
+        // read / cache misalignment scenarios).
+        if (i == 0) {
+            if (off != out->entries_begin_) {
+                return Status::Corruption(
+                        "dict_block: first anchor offset is not the start of entries");
+            }
+        } else if (off <= out->anchor_offsets_[i - 1]) {
+            return Status::Corruption("dict_block: anchor offsets are not strictly increasing");
+        }
+        out->anchor_offsets_[i] = off;
+        // Anchor entries are encoded with prev_term="" and can therefore be decoded
+        // on their own to recover the full term.
+        ByteSource e_src(covered.subslice(off, anchor_table_begin - off));
+        DictEntry probe;
+        SNII_RETURN_IF_ERROR(decode_dict_entry(&e_src, std::string_view {}, tier, &probe));
+        out->anchor_terms_[i] = std::move(probe.term);
+    }
+    return Status::OK();
+}
+
+// Binary-searches the anchor terms for the last anchor whose term is <= target
+// and stores its index in *anchor_idx. Returns false, leaving *anchor_idx
+// untouched, when there are no anchors or target sorts before the first one.
+bool DictBlockReader::locate_anchor(std::string_view target, size_t* anchor_idx) const {
+    if (anchor_terms_.empty() || target < std::string_view(anchor_terms_.front())) {
+        return false;
+    }
+
+    // Invariant: the answer lies in [below, above).
+    size_t below = 0;
+    size_t above = anchor_terms_.size();
+    while (above - below > 1) {
+        const size_t probe = below + (above - below) / 2;
+        const bool probe_is_past_target = target < std::string_view(anchor_terms_[probe]);
+        if (probe_is_past_target) {
+            above = probe;
+        } else {
+            below = probe;
+        }
+    }
+    *anchor_idx = below;
+    return true;
+}
+
+// Decodes every entry of the block into *out, one anchor segment at a time.
+// Returns Corruption when a segment range is invalid or when the number of
+// decoded entries differs from the n_entries recorded in the header.
+Status DictBlockReader::decode_all(std::vector<DictEntry>* out) const {
+    if (out == nullptr) return Status::InvalidArgument("dict_block: out is null");
+    out->clear();
+    out->reserve(n_entries_);
+
+    const size_t n_segments = anchor_offsets_.size();
+    for (size_t seg = 0; seg < n_segments; ++seg) {
+        // A segment runs from its anchor offset to the next anchor offset, or to
+        // the start of the anchor table for the last segment.
+        const size_t first_byte = anchor_offsets_[seg];
+        size_t past_last_byte = 0;
+        if (seg + 1 < n_segments) {
+            past_last_byte = anchor_offsets_[seg + 1];
+        } else {
+            past_last_byte = block_.size() - kNAnchorsBytes - n_segments * kAnchorOffBytes;
+        }
+        const bool range_ok = first_byte <= past_last_byte && past_last_byte <= block_.size();
+        if (!range_ok) {
+            return Status::Corruption("dict_block: anchor segment range invalid");
+        }
+
+        ByteSource seg_src(block_.subslice(first_byte, past_last_byte - first_byte));
+        std::string last_term; // a segment starts with an anchor, so prev_term=""
+        while (!seg_src.eof()) {
+            DictEntry entry;
+            SNII_RETURN_IF_ERROR(
+                    decode_dict_entry(&seg_src, std::string_view(last_term), tier_, &entry));
+            last_term = entry.term;
+            out->push_back(std::move(entry));
+        }
+    }
+
+    if (out->size() != n_entries_) {
+        return Status::Corruption("dict_block: decoded entry count mismatch");
+    }
+    return Status::OK();
+}
+
+// Linearly scans the anchor segment `anchor_idx` for `target`.
+//
+// On an OK return *found tells whether the term exists in that segment, and
+// *out holds the decoded entry when it does. Entries are sorted, so the scan
+// stops as soon as a term greater than `target` is seen. Returns
+// InvalidArgument for null outputs or an out-of-range anchor index, and
+// Corruption for an invalid segment range or an undecodable entry.
+Status DictBlockReader::scan_from_anchor(size_t anchor_idx, std::string_view target, bool* found,
+                                         DictEntry* out) const {
+    if (found == nullptr || out == nullptr) {
+        return Status::InvalidArgument("dict_block: found / out is null");
+    }
+    // Guard the anchor_offsets_ indexing below against a bad caller-supplied index.
+    if (anchor_idx >= anchor_offsets_.size()) {
+        return Status::InvalidArgument("dict_block: anchor index out of range");
+    }
+
+    // Byte range of this anchor segment: [anchor_offset, next anchor offset or anchor table start).
+    const size_t seg_begin = anchor_offsets_[anchor_idx];
+    const bool is_last = anchor_idx + 1 == anchor_offsets_.size();
+    const size_t seg_end =
+            is_last ? (block_.size() - kNAnchorsBytes - anchor_offsets_.size() * kAnchorOffBytes)
+                    : anchor_offsets_[anchor_idx + 1];
+
+    // Fallback: open() has already verified anchor monotonicity; this additionally
+    // guards against seg_end < seg_begin underflow and out-of-range reads.
+    if (seg_end < seg_begin || seg_end > block_.size()) {
+        return Status::Corruption("dict_block: anchor segment range invalid");
+    }
+    ByteSource src(block_.subslice(seg_begin, seg_end - seg_begin));
+    std::string prev; // the first entry in the segment is an anchor, prev_term=""
+    while (!src.eof()) {
+        DictEntry e;
+        SNII_RETURN_IF_ERROR(decode_dict_entry(&src, std::string_view(prev), tier_, &e));
+        if (e.term == target) {
+            *found = true;
+            *out = std::move(e);
+            return Status::OK();
+        }
+        if (std::string_view(e.term) > target) {
+            *found = false; // already past target; entries are sorted so it does not exist
+            return Status::OK();
+        }
+        prev = std::move(e.term);
+    }
+    *found = false;
+    return Status::OK();
+}
+
+// Looks up `target` in the block: picks the anchor segment that could contain
+// it, then scans that segment. *found is false when no segment qualifies.
+Status DictBlockReader::find_term(std::string_view target, bool* found, DictEntry* out) const {
+    const bool outputs_ok = found != nullptr && out != nullptr;
+    if (!outputs_ok) {
+        return Status::InvalidArgument("dict_block: found / out is null");
+    }
+
+    *found = false;
+    size_t segment = 0;
+    if (locate_anchor(target, &segment)) {
+        return scan_from_anchor(segment, target, found, out);
+    }
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp b/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp
new file mode 100644
index 00000000000000..05f73814c32d2d
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp
@@ -0,0 +1,89 @@
+#include "snii/format/dict_block_directory.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+namespace {
+
+// Serialises one block_ref in its fixed field order using the ByteSink
+// varint/fixed primitives (no hand-crafted bytes). uncomp_len trails only when
+// the kZstd flag is set, so directories of uncompressed blocks keep their
+// compact (v1-identical) per-ref byte layout.
+void encode_ref(const BlockRef& ref, ByteSink* payload) {
+    ByteSink& w = *payload;
+    w.put_varint64(ref.offset);
+    w.put_varint64(ref.length);
+    w.put_varint32(ref.n_entries);
+    w.put_u8(ref.flags);
+    w.put_fixed32(ref.checksum);
+
+    const bool compressed = (ref.flags & block_ref_flags::kZstd) != 0;
+    if (compressed) {
+        w.put_varint64(ref.uncomp_len);
+    }
+}
+
+// Parses one block_ref in the order encode_ref() writes it. uncomp_len is read
+// only when the kZstd flag is set; otherwise the field is left as the caller
+// initialised it.
+Status decode_ref(ByteSource* ps, BlockRef* ref) {
+    SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->offset));
+    SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->length));
+    SNII_RETURN_IF_ERROR(ps->get_varint32(&ref->n_entries));
+    SNII_RETURN_IF_ERROR(ps->get_u8(&ref->flags));
+    SNII_RETURN_IF_ERROR(ps->get_fixed32(&ref->checksum));
+
+    const bool compressed = (ref->flags & block_ref_flags::kZstd) != 0;
+    if (!compressed) {
+        return Status::OK();
+    }
+    SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->uncomp_len));
+    return Status::OK();
+}
+
+// Parses a directory payload (n_blocks varint followed by that many block_refs)
+// into *refs. The payload must be consumed exactly.
+Status decode_payload(Slice payload, std::vector<BlockRef>* refs) {
+    ByteSource reader(payload);
+    uint32_t declared = 0;
+    SNII_RETURN_IF_ERROR(reader.get_varint32(&declared));
+
+    // The count comes from untrusted bytes. Each BlockRef needs >= 8 bytes
+    // (flags u8 + checksum u32 + >= 1 byte for each of 3 varints), so reject an
+    // inflated count before reserving to avoid a huge allocation.
+    constexpr size_t kMinRefBytes = 8;
+    const size_t max_refs = reader.remaining() / kMinRefBytes;
+    if (declared > max_refs) {
+        return Status::Corruption("dict_block_directory: n_blocks exceeds payload capacity");
+    }
+
+    refs->clear();
+    refs->reserve(declared);
+    for (uint32_t left = declared; left != 0; --left) {
+        BlockRef parsed {};
+        SNII_RETURN_IF_ERROR(decode_ref(&reader, &parsed));
+        refs->push_back(parsed);
+    }
+
+    if (reader.eof()) {
+        return Status::OK();
+    }
+    return Status::Corruption("dict_block_directory: trailing bytes in payload");
+}
+
+} // namespace
+
+// Writes the directory as one framed section of type kDictBlockDirectory whose
+// payload is the ref count followed by every encoded block_ref.
+void DictBlockDirectoryBuilder::finish(ByteSink* sink) const {
+    ByteSink payload;
+    const auto n_refs = static_cast<uint32_t>(refs_.size());
+    payload.put_varint32(n_refs);
+    for (auto it = refs_.begin(); it != refs_.end(); ++it) {
+        encode_ref(*it, &payload);
+    }
+
+    const auto section_type = static_cast<uint8_t>(SectionType::kDictBlockDirectory);
+    SectionFramer::write(*sink, section_type, payload.view());
+}
+
+// Opens a framed dict-block-directory section.
+//
+// Reads the section frame, checks that its type is kDictBlockDirectory, and
+// decodes the payload into the reader's ref list. The refs are decoded into a
+// local vector and swapped in only on success, so a failed open leaves `*out`
+// unchanged. Returns InvalidArgument for a null `out` or an unexpected section
+// type; framing and payload errors are propagated.
+Status DictBlockDirectoryReader::open(Slice section, DictBlockDirectoryReader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("dict_block_directory: out is null");
+    }
+    ByteSource src(section);
+    FramedSection sec;
+    SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec));
+    if (sec.type != static_cast<uint8_t>(SectionType::kDictBlockDirectory)) {
+        return Status::InvalidArgument("dict_block_directory: unexpected section type");
+    }
+    std::vector<BlockRef> refs;
+    SNII_RETURN_IF_ERROR(decode_payload(sec.payload, &refs));
+    out->refs_.swap(refs);
+    return Status::OK();
+}
+
+// Copies the block_ref at position `ordinal` into *out, or returns NotFound
+// when the ordinal is past the end of the directory.
+Status DictBlockDirectoryReader::get(uint32_t ordinal, BlockRef* out) const {
+    const bool in_range = ordinal < refs_.size();
+    if (in_range) {
+        *out = refs_[ordinal];
+        return Status::OK();
+    }
+    return Status::NotFound("dict_block_directory: ordinal out of range");
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/dict_entry.cpp b/be/src/storage/index/snii/core/src/format/dict_entry.cpp
new file mode 100644
index 00000000000000..3b7a189e2c276b
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/dict_entry.cpp
@@ -0,0 +1,293 @@
+#include "snii/format/dict_entry.h"
+
+#include <algorithm>
+
+#include "snii/common/slice.h"
+
+namespace snii::format {
+
+namespace {
+
+// Builds the entry flags byte from the entry's kind, encoding and has_sb
+// fields. bit3 has_champion / bit4 offsets_ref are always 0 in v1.
+uint8_t pack_flags(const DictEntry& e) {
+    const bool is_inline = e.kind == DictEntryKind::kInline;
+    const bool is_windowed = e.enc == DictEntryEnc::kWindowed;
+    const bool has_sb = e.has_sb ? true : false;
+
+    const unsigned kind_bit = is_inline ? static_cast<unsigned>(dict_flags::kKind) : 0u;
+    const unsigned enc_bit = is_windowed ? static_cast<unsigned>(dict_flags::kEnc) : 0u;
+    const unsigned sb_bit = has_sb ? static_cast<unsigned>(dict_flags::kHasSb) : 0u;
+    return static_cast<uint8_t>(kind_bit | enc_bit | sb_bit);
+}
+
+// Inverse of pack_flags(): fills kind, enc and has_sb of *e from the flags
+// byte. Bits other than kKind / kEnc / kHasSb are ignored.
+void apply_flags(uint8_t f, DictEntry* e) {
+    const auto is_set = [f](auto mask) { return (f & mask) != 0; };
+
+    if (is_set(dict_flags::kKind)) {
+        e->kind = DictEntryKind::kInline;
+    } else {
+        e->kind = DictEntryKind::kPodRef;
+    }
+    if (is_set(dict_flags::kEnc)) {
+        e->enc = DictEntryEnc::kWindowed;
+    } else {
+        e->enc = DictEntryEnc::kSlim;
+    }
+    e->has_sb = is_set(dict_flags::kHasSb);
+}
+
+// Number of leading bytes that `term` and `prev` have in common.
+uint32_t common_prefix_len(std::string_view term, std::string_view prev) {
+    const uint32_t limit = static_cast<uint32_t>(std::min(term.size(), prev.size()));
+    const auto diverge = std::mismatch(term.begin(), term.begin() + limit, prev.begin());
+    return static_cast<uint32_t>(diverge.first - term.begin());
+}
+
+// True for tiers at or above kT2. The encoder and decoder use this to decide
+// whether the optional per-entry fields (ttf_delta, max_freq, freq region
+// metadata, prx locator) are present.
+bool tier_has_stats(IndexTier tier) {
+    const bool below_t2 = tier < IndexTier::kT2;
+    return !below_t2;
+}
+
+// ---- Encode entry body (excluding entry_len and trailing crc) ----
+
+// Writes the prefix-compressed term key: shared prefix length, suffix length,
+// then the suffix bytes that follow the prefix shared with `prev`.
+void write_term_key(const DictEntry& e, std::string_view prev, ByteSink* sink) {
+    const std::string_view term(e.term);
+    const uint32_t shared = common_prefix_len(term, prev);
+    const std::string_view tail = term.substr(shared);
+
+    sink->put_varint32(shared);
+    sink->put_varint32(static_cast<uint32_t>(tail.size()));
+    sink->put_bytes(Slice(tail));
+}
+
+// Writes df for every tier; ttf_delta and max_freq follow only for tiers that
+// carry stats.
+void write_stats(const DictEntry& e, IndexTier tier, ByteSink* sink) {
+    sink->put_varint32(e.df);
+    if (tier_has_stats(tier)) {
+        sink->put_varint64(e.ttf_delta);
+        sink->put_varint64(e.max_freq);
+    }
+}
+
+// Codec mode byte shared by slim/inline single-window regions:
+// bit0 = dd region is zstd-compressed, bit1 = freq region is zstd-compressed.
+uint8_t pack_win_mode(const DictEntry& e) {
+    const unsigned dd_bit = e.dd_meta.zstd ? (1u << 0) : 0u;     // dd_zstd
+    const unsigned freq_bit = e.freq_meta.zstd ? (1u << 1) : 0u; // freq_zstd
+    return static_cast<uint8_t>(dd_bit | freq_bit);
+}
+
+// Writes the slim/inline region codec metadata: the mode byte, then for the dd
+// region (always) and the freq region (tier >= T2) its uncomp_len and, when
+// store_crc is set, its crc32c.
+// store_crc=false (INLINE entries, format v2) omits the redundant per-region
+// crc: the inline bytes already sit inside the dict block, whose own
+// block-level crc32c covers them. POD-ref entries pass store_crc=true because
+// their regions live in the separately-fetched .frq POD, which the block crc
+// does not cover.
+void write_region_meta(const DictEntry& e, IndexTier tier, bool store_crc, ByteSink* sink) {
+    const auto put_region = [sink, store_crc](const auto& meta) {
+        sink->put_varint64(meta.uncomp_len);
+        if (store_crc) {
+            sink->put_fixed32(meta.crc);
+        }
+    };
+
+    sink->put_u8(pack_win_mode(e));
+    put_region(e.dd_meta);
+    if (tier_has_stats(tier)) {
+        put_region(e.freq_meta);
+    }
+}
+
+// Writes the locator of a POD-ref entry: frq offset delta and length, the
+// encoding-specific docs prefix fields, and (for tiers with stats) the prx
+// offset delta and length.
+void write_pod_ref(const DictEntry& e, IndexTier tier, ByteSink* sink) {
+    sink->put_varint64(e.frq_off_delta);
+    sink->put_varint64(e.frq_len);
+
+    const bool windowed = e.enc == DictEntryEnc::kWindowed;
+    if (windowed) {
+        sink->put_varint64(e.prelude_len);
+    }
+    // Written by both encodings; for slim pod_ref this is the dd region
+    // on-disk length.
+    sink->put_varint64(e.frq_docs_len);
+    if (!windowed) {
+        // POD-ref regions live in the .frq POD (not covered by the block crc):
+        // keep the per-region crc.
+        write_region_meta(e, tier, /*store_crc=*/true, sink);
+    }
+
+    if (tier_has_stats(tier)) {
+        sink->put_varint64(e.prx_off_delta);
+        sink->put_varint64(e.prx_len);
+    }
+}
+
+// Writes the locator of an INLINE entry: length-prefixed frq bytes, the dd
+// region on-disk length, crc-less region metadata, and (for tiers with stats)
+// length-prefixed prx bytes.
+void write_inline(const DictEntry& e, IndexTier tier, ByteSink* sink) {
+    const auto frq_size = static_cast<uint64_t>(e.frq_bytes.size());
+    sink->put_varint64(frq_size);
+    sink->put_bytes(Slice(e.frq_bytes));
+    sink->put_varint64(e.inline_dd_disk_len);
+    // INLINE bytes are covered by the dict block crc32c, so the redundant
+    // per-region crc is omitted.
+    write_region_meta(e, tier, /*store_crc=*/false, sink);
+
+    if (tier_has_stats(tier)) {
+        const auto prx_size = static_cast<uint64_t>(e.prx_bytes.size());
+        sink->put_varint64(prx_size);
+        sink->put_bytes(Slice(e.prx_bytes));
+    }
+}
+
+// Writes a complete entry body: term key, flags byte, stats, then the
+// kind-specific locator (inline payload or POD reference).
+void write_body(const DictEntry& e, std::string_view prev, IndexTier tier, ByteSink* sink) {
+    write_term_key(e, prev, sink);
+    sink->put_u8(pack_flags(e));
+    write_stats(e, tier, sink);
+
+    if (e.kind != DictEntryKind::kInline) {
+        write_pod_ref(e, tier, sink);
+        return;
+    }
+    write_inline(e, tier, sink);
+}
+
+// ---- Decode entry body ----
+
+// Reads a prefix-compressed term key and rebuilds the full term in out->term as
+// the first prefix_len bytes of `prev` followed by the stored suffix. Returns
+// Corruption when prefix_len is longer than `prev`.
+Status read_term_key(ByteSource* src, std::string_view prev, DictEntry* out) {
+    uint32_t shared_len = 0;
+    uint32_t tail_len = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&shared_len));
+    SNII_RETURN_IF_ERROR(src->get_varint32(&tail_len));
+
+    const bool prefix_fits = shared_len <= prev.size();
+    if (!prefix_fits) {
+        return Status::Corruption("dict_entry: prefix_len exceeds prev_term length");
+    }
+
+    Slice tail;
+    SNII_RETURN_IF_ERROR(src->get_bytes(tail_len, &tail));
+    const auto* tail_chars = reinterpret_cast<const char*>(tail.data());
+    out->term.assign(prev.substr(0, shared_len));
+    out->term.append(tail_chars, tail.size());
+    return Status::OK();
+}
+
+// Reads df for every tier; ttf_delta and max_freq are read only for tiers that
+// carry stats.
+Status read_stats(ByteSource* src, IndexTier tier, DictEntry* out) {
+    SNII_RETURN_IF_ERROR(src->get_varint32(&out->df));
+    if (tier_has_stats(tier)) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&out->ttf_delta));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&out->max_freq));
+    }
+    return Status::OK();
+}
+
+// Reads the slim/inline region codec metadata (mode / uncomp_len / [crc]) and
+// fills the dd/freq region disk_len from the lengths supplied by the caller.
+// has_crc=false (INLINE entries, format v2) means no per-region crc was stored:
+// the on-disk crc field is absent and verify_crc is set to false, since the
+// dict block's own crc32c already covers the inline bytes.
+// Returns Corruption for unknown mode bits, or for a freq-zstd bit on a tier
+// without stats.
+Status read_region_meta(ByteSource* src, IndexTier tier, bool has_crc, uint64_t dd_disk_len,
+                        uint64_t freq_disk_len, DictEntry* out) {
+    constexpr unsigned kDdZstdBit = 1u << 0;
+    constexpr unsigned kFreqZstdBit = 1u << 1;
+
+    uint8_t mode = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&mode));
+    const bool has_unknown_bits = (mode & ~(kDdZstdBit | kFreqZstdBit)) != 0;
+    if (has_unknown_bits) {
+        return Status::Corruption("dict_entry: unknown win_mode bits");
+    }
+    const bool freq_zstd = (mode & kFreqZstdBit) != 0;
+
+    // dd region: present for every tier.
+    out->dd_meta.zstd = (mode & kDdZstdBit) != 0;
+    out->dd_meta.disk_len = dd_disk_len;
+    out->dd_meta.verify_crc = has_crc;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&out->dd_meta.uncomp_len));
+    if (has_crc) {
+        SNII_RETURN_IF_ERROR(src->get_fixed32(&out->dd_meta.crc));
+    }
+
+    if (!tier_has_stats(tier)) {
+        if (freq_zstd) {
+            return Status::Corruption("dict_entry: freq mode set without freq tier");
+        }
+        return Status::OK();
+    }
+
+    // freq region: only for tiers that carry stats.
+    out->freq_meta.zstd = freq_zstd;
+    out->freq_meta.disk_len = freq_disk_len;
+    out->freq_meta.verify_crc = has_crc;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&out->freq_meta.uncomp_len));
+    if (has_crc) {
+        SNII_RETURN_IF_ERROR(src->get_fixed32(&out->freq_meta.crc));
+    }
+    return Status::OK();
+}
+
+// Reads the locator of a POD-ref entry (out->enc must already be set from the
+// flags byte) and validates the docs-prefix lengths against frq_len.
+Status read_pod_ref(ByteSource* src, IndexTier tier, DictEntry* out) {
+    SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_off_delta));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_len));
+
+    const bool windowed = out->enc == DictEntryEnc::kWindowed;
+    if (windowed) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&out->prelude_len));
+    }
+    SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_docs_len));
+
+    if (windowed) {
+        // Required ordering: 0 < prelude_len <= frq_docs_len <= frq_len.
+        const bool prefix_ok = out->prelude_len != 0 && out->prelude_len <= out->frq_docs_len &&
+                               out->frq_docs_len <= out->frq_len;
+        if (!prefix_ok) {
+            return Status::Corruption("dict_entry: invalid windowed docs prefix");
+        }
+    } else {
+        if (out->frq_docs_len > out->frq_len) {
+            return Status::Corruption("dict_entry: frq_docs_len exceeds frq_len");
+        }
+        // Slim: the dd region takes frq_docs_len bytes, the freq region the rest.
+        const uint64_t freq_disk_len = out->frq_len - out->frq_docs_len;
+        SNII_RETURN_IF_ERROR(read_region_meta(src, tier, /*has_crc=*/true, out->frq_docs_len,
+                                              freq_disk_len, out));
+    }
+
+    if (tier_has_stats(tier)) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&out->prx_off_delta));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&out->prx_len));
+    }
+    return Status::OK();
+}
+
+// Reads a varint64 length followed by that many bytes and copies them into
+// *out, replacing its previous contents.
+Status read_byte_blob(ByteSource* src, std::vector<uint8_t>* out) {
+    uint64_t n_bytes = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&n_bytes));
+
+    Slice blob;
+    SNII_RETURN_IF_ERROR(src->get_bytes(static_cast<size_t>(n_bytes), &blob));
+
+    const auto* first = blob.data();
+    out->assign(first, first + blob.size());
+    return Status::OK();
+}
+
+// Reads the locator of an INLINE entry: frq bytes, the dd region on-disk
+// length (which must not exceed the frq bytes), crc-less region metadata, and
+// (for tiers with stats) the prx bytes.
+Status read_inline(ByteSource* src, IndexTier tier, DictEntry* out) {
+    SNII_RETURN_IF_ERROR(read_byte_blob(src, &out->frq_bytes));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&out->inline_dd_disk_len));
+
+    const auto frq_total = static_cast<uint64_t>(out->frq_bytes.size());
+    if (frq_total < out->inline_dd_disk_len) {
+        return Status::Corruption("dict_entry: inline_dd_disk_len exceeds frq_bytes");
+    }
+
+    // INLINE entries store no per-region crc (the block crc covers them), so
+    // has_crc=false. The freq region is whatever follows the dd region.
+    SNII_RETURN_IF_ERROR(read_region_meta(src, tier, /*has_crc=*/false, out->inline_dd_disk_len,
+                                          frq_total - out->inline_dd_disk_len, out));
+
+    if (tier_has_stats(tier)) {
+        SNII_RETURN_IF_ERROR(read_byte_blob(src, &out->prx_bytes));
+    }
+    return Status::OK();
+}
+
+// Dispatches to the inline or POD-ref locator reader based on out->kind, which
+// must already be set from the flags byte.
+Status read_locator(ByteSource* src, IndexTier tier, DictEntry* out) {
+    const bool is_inline = out->kind == DictEntryKind::kInline;
+    return is_inline ? read_inline(src, tier, out) : read_pod_ref(src, tier, out);
+}
+
+// Reads entry_len (the body length) into *total and checks that `src` still
+// holds at least that many bytes.
+Status read_entry_len(ByteSource* src, uint64_t* total) {
+    SNII_RETURN_IF_ERROR(src->get_varint64(total));
+    const bool fits = *total <= src->remaining();
+    if (fits) {
+        return Status::OK();
+    }
+    return Status::Corruption("dict_entry: entry_len out of range");
+}
+
+} // namespace
+
+// Appends one entry to `sink` as entry_len (varint64) followed by the body,
+// which is prefix-compressed against `prev_term`.
+Status encode_dict_entry(const DictEntry& entry, std::string_view prev_term, IndexTier tier,
+                         ByteSink* sink) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("dict_entry: sink is null");
+    }
+
+    // The body is staged in a scratch buffer so its exact length is known
+    // before entry_len is written. There is no per-entry crc: verification is
+    // done once at the DICT block level (block header + all entries + anchor
+    // table), which keeps slim/inline low-frequency terms maximally compact
+    // (spec §DICT block/§dict entry).
+    ByteSink staged;
+    write_body(entry, prev_term, tier, &staged);
+
+    const auto body_len = static_cast<uint64_t>(staged.size());
+    sink->put_varint64(body_len);
+    sink->put_bytes(staged.view());
+    return Status::OK();
+}
+
+// Decodes one entry from `src` into *out (which is reset first), expanding the
+// prefix-compressed term against `prev_term`. Returns Corruption when the
+// fields consumed do not add up to the stored entry_len.
+Status decode_dict_entry(ByteSource* src, std::string_view prev_term, IndexTier tier,
+                         DictEntry* out) {
+    const bool args_ok = src != nullptr && out != nullptr;
+    if (!args_ok) {
+        return Status::InvalidArgument("dict_entry: src / out is null");
+    }
+    *out = DictEntry {};
+
+    uint64_t declared_len = 0;
+    SNII_RETURN_IF_ERROR(read_entry_len(src, &declared_len));
+    const size_t body_begin = src->position();
+
+    SNII_RETURN_IF_ERROR(read_term_key(src, prev_term, out));
+    uint8_t flag_byte = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&flag_byte));
+    apply_flags(flag_byte, out);
+    SNII_RETURN_IF_ERROR(read_stats(src, tier, out));
+    SNII_RETURN_IF_ERROR(read_locator(src, tier, out));
+
+    // The body must consume exactly entry_len bytes; otherwise the structure
+    // is inconsistent with the tier.
+    const size_t body_end = src->position();
+    if (body_end - body_begin == static_cast<size_t>(declared_len)) {
+        return Status::OK();
+    }
+    return Status::Corruption("dict_entry: body length does not match entry_len");
+}
+
+// Advances `src` past one entry without decoding it: reads entry_len and then
+// skips that many body bytes.
+Status skip_dict_entry(ByteSource* src) {
+    if (src == nullptr) {
+        return Status::InvalidArgument("dict_entry: src is null");
+    }
+
+    uint64_t body_len = 0;
+    SNII_RETURN_IF_ERROR(read_entry_len(src, &body_len));
+
+    Slice skipped;
+    return src->get_bytes(static_cast<size_t>(body_len), &skipped);
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/frq_pod.cpp b/be/src/storage/index/snii/core/src/format/frq_pod.cpp
new file mode 100644
index 00000000000000..1dc28fb9eea696
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/frq_pod.cpp
@@ -0,0 +1,196 @@
+#include "snii/format/frq_pod.h"
+
+#include <cstddef>
+#include <span>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/encoding/pfor.h"
+#include "snii/encoding/zstd_codec.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+namespace {
+
+// Auto-compression threshold: in auto mode (negative level) a region whose
+// plaintext is smaller than this byte count is stored raw, because the zstd
+// gain is negligible and the metadata overhead is relatively large. Consulted
+// by should_compress().
+inline constexpr size_t kAutoZstdMinBytes = 512;
+// Default zstd level used by emit_region() when auto mode decides to compress.
+inline constexpr int kDefaultZstdLevel = 3;
+// Maximum decompressed byte size accepted for a single region (256 MiB).
+// Guards against a corrupted uncomp_len (e.g. read back from S3) that was
+// inflated to a huge value: open_region() sanity-checks it before
+// decompressing to avoid GB-scale allocations. Windows are 256-doc aligned and
+// normally far smaller than this.
+inline constexpr uint32_t kMaxRegionUncompBytes = 256u * 1024 * 1024;
+// Maximum doc count accepted per .frq window (guards against a corrupted n).
+// The window baseline is 256 docs and the practical combined cap is 2048, so
+// this is a deliberately loose upper bound whose only job is to reject
+// absurdly large counts.
+inline constexpr uint32_t kMaxWindowDocs = 1u << 24;
+
+// Appends `values` to `out` as consecutive PFOR runs of at most kFrqBaseUnit
+// elements each (the final run may be shorter). Neither the element count nor
+// the run count is written; the decoder derives the run layout from the total
+// length and kFrqBaseUnit in the same way.
+void encode_pfor_runs(std::span<const uint32_t> values, ByteSink* out) {
+    const uint32_t* cursor = values.data();
+    size_t remaining = values.size();
+    while (remaining > 0) {
+        const size_t run_len = (remaining > kFrqBaseUnit) ? kFrqBaseUnit : remaining;
+        pfor_encode(cursor, run_len, out);
+        cursor += run_len;
+        remaining -= run_len;
+    }
+}
+
+// Reads `n` uint32 values from `src` into `out`, which is resized to exactly
+// `n` (zero-filled) first. The values are laid out as PFOR runs of at most
+// kFrqBaseUnit elements; the first failing run aborts the decode.
+Status decode_pfor_runs(ByteSource* src, size_t n, std::vector<uint32_t>* out) {
+    out->assign(n, 0);
+    uint32_t* dst = out->data();
+    for (size_t left = n; left > 0;) {
+        const size_t run_len = (left > kFrqBaseUnit) ? kFrqBaseUnit : left;
+        SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, dst));
+        dst += run_len;
+        left -= run_len;
+    }
+    return Status::OK();
+}
+
+// Checks builder input: an empty list is accepted; otherwise the first docid
+// must not be below `win_base` and no docid may be smaller than its
+// predecessor (equal neighbours pass this check).
+Status validate_docs(std::span<const uint32_t> docs, uint64_t win_base) {
+    if (docs.empty()) {
+        return Status::OK();
+    }
+    if (win_base > static_cast<uint64_t>(docs.front())) {
+        return Status::InvalidArgument("frq: first docid below win_base");
+    }
+    uint32_t previous = docs.front();
+    for (const uint32_t current : docs.subspan(1)) {
+        if (current < previous) {
+            return Status::InvalidArgument("frq: docids must be ascending");
+        }
+        previous = current;
+    }
+    return Status::OK();
+}
+
+// Compression policy for one region.
+//   level <  0 : auto - compress only when the plaintext reaches kAutoZstdMinBytes
+//   level == 0 : always store raw
+//   level >  0 : always compress
+bool should_compress(int level, size_t plain_len) {
+    if (level < 0) {
+        return plain_len >= kAutoZstdMinBytes;
+    }
+    return level != 0;
+}
+
+// Writes one region to `out` and describes it in `meta`.
+// The plaintext is stored either verbatim or zstd-compressed (per
+// should_compress); `meta` receives the mode, the plaintext length, the on-disk
+// length and the crc32c of the on-disk bytes. No header is written in front of
+// the region. When compressing, a non-positive `level` falls back to
+// kDefaultZstdLevel.
+Status emit_region(Slice plain, int level, ByteSink* out, FrqRegionMeta* meta) {
+    if (meta == nullptr || out == nullptr) {
+        return Status::InvalidArgument("frq: null region out");
+    }
+    const bool compress = should_compress(level, plain.size());
+    meta->uncomp_len = plain.size();
+    meta->zstd = compress;
+
+    std::vector<uint8_t> disk;
+    if (compress) {
+        const int effective_level = (level > 0) ? level : kDefaultZstdLevel;
+        SNII_RETURN_IF_ERROR(zstd_compress(plain, effective_level, &disk));
+    } else {
+        disk.assign(plain.data(), plain.data() + plain.size());
+    }
+
+    meta->disk_len = static_cast<uint64_t>(disk.size());
+    meta->crc = crc32c(Slice(disk));
+    out->put_bytes(Slice(disk));
+    return Status::OK();
+}
+
+// Produces the plaintext view of a region from its on-disk bytes.
+// Checks, in order: the slice length against meta.disk_len, meta.uncomp_len
+// against kMaxRegionUncompBytes, and (only when meta.verify_crc is set) the
+// crc32c of the on-disk bytes. A raw region is returned as a borrowed view of
+// `disk`; a zstd region is decompressed into `holder` and `plain` views it.
+Status open_region(Slice disk, const FrqRegionMeta& meta, std::vector<uint8_t>* holder,
+                   Slice* plain) {
+    if (static_cast<size_t>(meta.disk_len) != disk.size()) {
+        return Status::Corruption("frq: region slice length mismatch");
+    }
+    if (meta.uncomp_len > kMaxRegionUncompBytes) {
+        return Status::Corruption("frq: region uncomp_len exceeds sane cap");
+    }
+    // Inline entries (verify_crc=false) carry no per-region crc: their bytes are
+    // covered by the enclosing dict block's block-level crc32c. POD-ref regions
+    // keep their own crc check.
+    if (meta.verify_crc) {
+        if (crc32c(disk) != meta.crc) {
+            return Status::Corruption("frq: region crc mismatch");
+        }
+    }
+    if (meta.zstd) {
+        SNII_RETURN_IF_ERROR(
+                zstd_decompress(disk, static_cast<size_t>(meta.uncomp_len), holder));
+        *plain = Slice(*holder);
+        return Status::OK();
+    }
+    // Raw: on-disk bytes are the plaintext, so both lengths must agree.
+    if (meta.uncomp_len != meta.disk_len) {
+        return Status::Corruption("frq: raw region length inconsistent");
+    }
+    *plain = disk;
+    return Status::OK();
+}
+
+} // namespace
+
+// Builds the doc-delta (dd) region for one window and appends it to `out`.
+//
+// Plaintext layout: VInt n ++ PFOR_runs(doc_delta), where the first delta is
+// taken against `win_base` and each later delta against the previous docid.
+// `zstd_level_or_neg_for_auto` is forwarded to emit_region (0 = raw, >0 = zstd
+// at that level, <0 = auto). `meta` is filled by emit_region.
+//
+// Returns InvalidArgument for null outputs, for input rejected by
+// validate_docs, and for more than kMaxWindowDocs docids: decode_dd_region
+// refuses such a count, so writing it would produce an unreadable region (and
+// counts above 2^32 would be silently truncated by the varint32 prefix).
+Status build_dd_region(std::span<const uint32_t> docids_ascending, uint64_t win_base,
+                       int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta) {
+    if (out == nullptr || meta == nullptr) {
+        return Status::InvalidArgument("frq: null dd region out");
+    }
+    if (docids_ascending.size() > kMaxWindowDocs) {
+        return Status::InvalidArgument("frq: doc count exceeds sane cap");
+    }
+    SNII_RETURN_IF_ERROR(validate_docs(docids_ascending, win_base));
+    ByteSink plain; // VInt n ++ PFOR_runs(doc_delta)
+    std::vector<uint32_t> dd(docids_ascending.size());
+    uint64_t prev = win_base;
+    for (size_t i = 0; i < docids_ascending.size(); ++i) {
+        // validate_docs guarantees docid >= prev, so the difference fits in u32.
+        dd[i] = static_cast<uint32_t>(static_cast<uint64_t>(docids_ascending[i]) - prev);
+        prev = docids_ascending[i];
+    }
+    plain.put_varint32(static_cast<uint32_t>(docids_ascending.size()));
+    encode_pfor_runs(dd, &plain);
+    return emit_region(plain.view(), zstd_level_or_neg_for_auto, out, meta);
+}
+
+// Builds the freq region for one window and appends it to `out`.
+// The plaintext is just PFOR_runs(freqs) with no count prefix; the level
+// argument is forwarded to emit_region, which also fills `meta`.
+Status build_freq_region(std::span<const uint32_t> freqs, int zstd_level_or_neg_for_auto,
+                         ByteSink* out, FrqRegionMeta* meta) {
+    if (meta == nullptr || out == nullptr) {
+        return Status::InvalidArgument("frq: null freq region out");
+    }
+    ByteSink encoded_freqs;
+    encode_pfor_runs(freqs, &encoded_freqs);
+    return emit_region(encoded_freqs.view(), zstd_level_or_neg_for_auto, out, meta);
+}
+
+// Decodes one doc-delta region back into absolute docids.
+//
+// `dd_disk` holds the on-disk bytes described by `meta`; `win_base` is the value
+// the first delta is relative to. On success `docids` holds the absolute
+// docids in stored order.
+//
+// Returns InvalidArgument for a null output, and Corruption (or the error
+// from open_region / the PFOR decoder) when the region fails validation, the
+// doc count exceeds kMaxWindowDocs, bytes remain after the payload, or the
+// running docid would not fit in uint32.
+Status decode_dd_region(Slice dd_disk, const FrqRegionMeta& meta, uint64_t win_base,
+                        std::vector<uint32_t>* docids) {
+    if (docids == nullptr) return Status::InvalidArgument("frq: null docids out");
+    std::vector<uint8_t> holder;
+    Slice plain;
+    SNII_RETURN_IF_ERROR(open_region(dd_disk, meta, &holder, &plain));
+    ByteSource src(plain);
+    uint32_t n = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint32(&n));
+    // Cap before decode_pfor_runs allocates n elements.
+    if (n > kMaxWindowDocs) return Status::Corruption("frq: doc count exceeds sane cap");
+    SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, n, docids));
+    if (!src.eof()) {
+        return Status::Corruption("frq: trailing bytes after dd region payload");
+    }
+    // Prefix-sum the deltas. Docids are 32-bit, so a sum that leaves that range
+    // can only come from corrupt deltas (or a bad win_base); reject it instead of
+    // letting the narrowing cast wrap it into a valid-looking docid.
+    constexpr uint64_t kMaxDocid = 0xFFFFFFFFull;
+    uint64_t cur = win_base;
+    for (uint32_t& slot : *docids) {
+        if (cur > kMaxDocid || slot > kMaxDocid - cur) {
+            return Status::Corruption("frq: docid exceeds u32");
+        }
+        cur += slot;
+        slot = static_cast<uint32_t>(cur);
+    }
+    return Status::OK();
+}
+
+// Decodes one freq region into `freqs` (exactly `doc_count` values).
+//
+// The region carries no count prefix, so `doc_count` must come from the
+// caller. It is capped at kMaxWindowDocs - the same bound decode_dd_region
+// applies to the doc count - before anything is allocated, so a corrupt count
+// cannot trigger a multi-GB allocation in decode_pfor_runs.
+//
+// Returns InvalidArgument for a null output; Corruption when the count
+// exceeds the cap, when doc_count is 0 but the region is not empty, or when
+// bytes remain after the payload; otherwise the error from open_region or the
+// PFOR decoder.
+Status decode_freq_region(Slice freq_disk, const FrqRegionMeta& meta, size_t doc_count,
+                          std::vector<uint32_t>* freqs) {
+    if (freqs == nullptr) return Status::InvalidArgument("frq: null freqs out");
+    if (doc_count > kMaxWindowDocs) {
+        return Status::Corruption("frq: doc count exceeds sane cap");
+    }
+    std::vector<uint8_t> holder;
+    Slice plain;
+    SNII_RETURN_IF_ERROR(open_region(freq_disk, meta, &holder, &plain));
+    if (doc_count == 0) {
+        // No docs means no runs were written, so the plaintext must be empty.
+        if (meta.uncomp_len != 0) {
+            return Status::Corruption("frq: empty freq region expected");
+        }
+        freqs->clear();
+        return Status::OK();
+    }
+    ByteSource src(plain);
+    SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, freqs));
+    if (!src.eof()) {
+        return Status::Corruption("frq: trailing bytes after freq region payload");
+    }
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/frq_prelude.cpp b/be/src/storage/index/snii/core/src/format/frq_prelude.cpp
new file mode 100644
index 00000000000000..568fda00f2f854
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/frq_prelude.cpp
@@ -0,0 +1,470 @@
+#include "snii/format/frq_prelude.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+
+namespace snii::format {
+
+namespace {
+
+// Anti-DoS cap on window / super-block counts read from a prelude header.
+// A segment holds at most ~15M docs (>=1 doc/window), so 1<<24 windows is a
+// generous ceiling that still prevents multi-GB allocations from a crafted N.
+// The cap is needed in addition to the checksum: crc32c is not a MAC and cannot
+// defend against an inflated count that was re-stamped with a matching crc.
+constexpr uint64_t kMaxWindows = 1ull << 24;
+
+// Returns ceil(a / b). Precondition: b != 0 (callers validate group_size first).
+// Written as quotient-plus-remainder so it cannot wrap: the usual
+// (a + b - 1) / b overflows uint64 when b is large (e.g. a=2, b=UINT64_MAX
+// would yield 0 instead of 1).
+uint64_t ceil_div(uint64_t a, uint64_t b) {
+    return a / b + ((a % b != 0) ? 1 : 0);
+}
+
+// Packs the column-presence booleans of `cols` into the prelude flags byte.
+uint8_t make_flags(const FrqPreludeColumns& cols) {
+    const unsigned freq_bit = cols.has_freq ? frq_prelude_flags::kHasFreq : 0;
+    const unsigned prx_bit = cols.has_prx ? frq_prelude_flags::kHasPrx : 0;
+    return static_cast<uint8_t>(freq_bit | prx_bit);
+}
+
+// Builds the per-window codec mode byte. The freq-zstd bit is only emitted when
+// the prelude carries a freq column at all.
+uint8_t make_win_mode(const WindowMeta& m, bool has_freq) {
+    const unsigned dd_bit = m.dd_zstd ? frq_win_mode::kDdZstd : 0;
+    const unsigned freq_bit = (has_freq && m.freq_zstd) ? frq_win_mode::kFreqZstd : 0;
+    return static_cast<uint8_t>(dd_bit | freq_bit);
+}
+
+// Stores lhs + rhs in `*out`, or returns Corruption(message) - leaving `*out`
+// untouched - when the sum would not fit in uint64.
+Status checked_add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) {
+    const uint64_t headroom = std::numeric_limits<uint64_t>::max() - lhs;
+    if (rhs <= headroom) {
+        *out = lhs + rhs;
+        return Status::OK();
+    }
+    return Status::Corruption(message);
+}
+
+// Narrows `value` to uint32 into `*out`, or returns Corruption(message) -
+// leaving `*out` untouched - when the value does not fit.
+Status checked_u32(uint64_t value, const char* message, uint32_t* out) {
+    constexpr uint64_t kLimit = std::numeric_limits<uint32_t>::max();
+    if (value <= kLimit) {
+        *out = static_cast<uint32_t>(value);
+        return Status::OK();
+    }
+    return Status::Corruption(message);
+}
+
+// Checks that a window's doc_count fits the docid range it can cover.
+// The first window may start at docid 0; every later window starts at
+// win_base + 1. The window must end at or after its first possible docid and
+// may hold at most one doc per docid in [first, last_docid].
+Status validate_window_doc_count(bool first_window, uint64_t win_base, uint64_t last_docid,
+                                 uint64_t doc_count) {
+    uint64_t lowest_docid = 0;
+    if (!first_window) {
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                win_base, 1, "frq_prelude: window base exceeds docid range", &lowest_docid));
+    }
+    if (lowest_docid > last_docid) {
+        return Status::Corruption("frq_prelude: invalid window docid range");
+    }
+    const uint64_t capacity = (last_docid - lowest_docid) + 1;
+    if (capacity < doc_count) {
+        return Status::Corruption("frq_prelude: doc_count exceeds window width");
+    }
+    return Status::OK();
+}
+
+// Validates builder input: non-null sink, group_size >= 1, window count within
+// kMaxWindows, and strictly increasing absolute last_docid across windows.
+//
+// Strictness matters: the reader (validate_window_doc_count) requires every
+// non-first window to end at or after prev_last + 1, so two windows sharing a
+// last_docid would serialize fine but be rejected as corrupt on open. Catching
+// it here keeps the builder from emitting a prelude it cannot read back.
+Status validate_input(const FrqPreludeColumns& cols, ByteSink* out) {
+    if (out == nullptr) return Status::InvalidArgument("frq_prelude: null sink");
+    if (cols.group_size == 0) {
+        return Status::InvalidArgument("frq_prelude: group_size must be >= 1");
+    }
+    if (cols.windows.size() > kMaxWindows) {
+        return Status::InvalidArgument("frq_prelude: window count exceeds cap");
+    }
+    for (size_t w = 1; w < cols.windows.size(); ++w) {
+        if (cols.windows[w].last_docid <= cols.windows[w - 1].last_docid) {
+            return Status::InvalidArgument("frq_prelude: last_docid not monotonic");
+        }
+    }
+    return Status::OK();
+}
+
+// Encodes one window row into a per-block sink. last_docid_delta is the row's
+// absolute last_docid minus prev_last (the previous window's absolute last).
+// The field order below is the on-disk row layout and must stay in step with
+// decode_window_row. The freq and prx field groups are only written when the
+// corresponding column flag is set.
+void encode_window_row(const WindowMeta& m, bool has_freq, bool has_prx, uint64_t prev_last,
+                       ByteSink* block) {
+    // Delta against the previous window; callers validate monotonicity first,
+    // so the subtraction does not underflow.
+    block->put_varint64(static_cast<uint64_t>(m.last_docid) - prev_last);
+    block->put_varint64(m.doc_count);
+    block->put_u8(make_win_mode(m, has_freq));
+    // dd region locator: offset, on-disk length, plaintext length, crc.
+    block->put_varint64(m.dd_off);
+    block->put_varint64(m.dd_disk_len);
+    block->put_varint64(m.dd_uncomp_len);
+    block->put_fixed32(m.crc_dd);
+    if (has_freq) {
+        // freq region locator, same shape as the dd locator.
+        block->put_varint64(m.freq_off);
+        block->put_varint64(m.freq_disk_len);
+        block->put_varint64(m.freq_uncomp_len);
+        block->put_fixed32(m.crc_freq);
+    }
+    if (has_prx) {
+        // prx locator carries only offset + length (no mode / crc fields here).
+        block->put_varint64(m.prx_off);
+        block->put_varint64(m.prx_len);
+    }
+    // Per-window maxima close the row.
+    block->put_varint64(m.max_freq);
+    block->put_u8(m.max_norm);
+}
+
+// One super-block's serialized window block plus its directory fields.
+// Produced by encode_super_blocks and consumed by encode_super_block_dir and
+// build_frq_prelude.
+struct SuperBlock {
+    // Row-encoded windows of this super-block (output of encode_window_row).
+    ByteSink block;
+    uint64_t last_docid = 0; // absolute last docid of this super-block's last window
+};
+
+// Splits cols.windows into consecutive groups of cols.group_size windows (the
+// last group may be shorter) and row-encodes each group into its own
+// SuperBlock. The last-docid delta chain runs across block boundaries, and
+// each SuperBlock records the absolute last docid of its final window.
+std::vector<SuperBlock> encode_super_blocks(const FrqPreludeColumns& cols) {
+    const uint32_t per_block = cols.group_size;
+    const size_t total = cols.windows.size();
+    std::vector<SuperBlock> result;
+    result.reserve(static_cast<size_t>(ceil_div(total, per_block)));
+
+    uint64_t running_last = 0; // absolute last docid of the window encoded most recently
+    size_t next = 0;
+    while (next < total) {
+        const size_t stop = std::min(total, next + per_block);
+        SuperBlock& current = result.emplace_back();
+        for (; next < stop; ++next) {
+            const WindowMeta& win = cols.windows[next];
+            encode_window_row(win, cols.has_freq, cols.has_prx, running_last, &current.block);
+            running_last = win.last_docid;
+        }
+        current.last_docid = running_last;
+    }
+    return result;
+}
+
+// Writes the super_block_dir into `dir_sink`: one row per super-block holding
+// (last_docid delta vs. the previous super-block, byte offset of its window
+// block inside the window_dir region, byte length of that block). Offsets are
+// the running sum of the preceding block lengths.
+void encode_super_block_dir(const std::vector<SuperBlock>& blocks, ByteSink* dir_sink) {
+    uint64_t offset = 0;
+    uint64_t last_seen = 0;
+    for (const SuperBlock& sb : blocks) {
+        const uint64_t length = sb.block.size();
+        dir_sink->put_varint64(sb.last_docid - last_seen);
+        dir_sink->put_varint64(offset);
+        dir_sink->put_varint64(length);
+        offset += length;
+        last_seen = sb.last_docid;
+    }
+}
+
+} // namespace
+
+// Serializes a .frq prelude for `cols` into `out`.
+// Layout: header (flags, window count, group size, super-block count,
+// super_block_dir length) ++ super_block_dir ++ crc32c over everything before
+// it ++ the window blocks of every super-block, in order. The window blocks
+// themselves are not covered by that crc.
+// Returns whatever validate_input reports for bad input.
+Status build_frq_prelude(const FrqPreludeColumns& cols, ByteSink* out) {
+    SNII_RETURN_IF_ERROR(validate_input(cols, out));
+
+    const std::vector<SuperBlock> super_blocks = encode_super_blocks(cols);
+    ByteSink sbdir;
+    encode_super_block_dir(super_blocks, &sbdir);
+
+    // Header and directory are staged together because the crc spans exactly
+    // this region.
+    ByteSink head;
+    head.put_u8(make_flags(cols));
+    head.put_varint64(cols.windows.size());
+    head.put_varint64(cols.group_size);
+    head.put_varint64(super_blocks.size());
+    head.put_varint64(sbdir.size());
+    head.put_bytes(sbdir.view());
+
+    const auto head_bytes = head.view();
+    out->put_bytes(head_bytes);
+    out->put_fixed32(crc32c(head_bytes));
+    for (const SuperBlock& sb : super_blocks) {
+        out->put_bytes(sb.block.view());
+    }
+    return Status::OK();
+}
+
+namespace {
+
+// Decoded header fields shared between parse phases. Filled by parse_header in
+// the same order the fields appear on disk.
+struct Header {
+    bool has_freq = false;   // flags bit kHasFreq: windows carry a freq locator
+    bool has_prx = false;    // flags bit kHasPrx: windows carry a prx locator
+    uint64_t n = 0;          // total window count
+    uint64_t group_size = 0; // windows per super-block (last one may hold fewer)
+    uint64_t n_super = 0;    // super-block count; must equal ceil(n / group_size)
+    uint64_t sbdir_len = 0;  // byte length of the super_block_dir that follows
+};
+
+// Verifies the stored crc32c over [start of buffer .. end of super_block_dir].
+// covered_len = header bytes (up to and including sbdir_len) + sbdir_len; the
+// 4-byte crc is stored immediately after the covered region.
+//
+// `sbdir_len` comes straight from the (not yet verified) header, so every
+// bound is checked by subtraction: computing header_end + sbdir_len first
+// could wrap around and let a huge length slip past the size check.
+Status verify_covered_crc(Slice prelude, size_t header_end, uint64_t sbdir_len) {
+    constexpr size_t kCrcBytes = sizeof(uint32_t);
+    if (header_end > prelude.size() || prelude.size() - header_end < kCrcBytes ||
+        sbdir_len > prelude.size() - header_end - kCrcBytes) {
+        return Status::Corruption("frq_prelude: buffer too short for crc region");
+    }
+    // Safe now: covered + kCrcBytes <= prelude.size().
+    const size_t covered = header_end + static_cast<size_t>(sbdir_len);
+    uint32_t stored = 0;
+    ByteSource crc_src(prelude.subslice(covered, kCrcBytes));
+    SNII_RETURN_IF_ERROR(crc_src.get_fixed32(&stored));
+    if (crc32c(prelude.subslice(0, covered)) != stored) {
+        return Status::Corruption("frq_prelude: crc32c mismatch");
+    }
+    return Status::OK();
+}
+
+// Parses + validates the header (counts capped before any later reserve).
+//
+// Reads flags, n, group_size, n_super and sbdir_len from `src` into `h`, then
+// checks: both counts are within kMaxWindows, group_size is non-zero and fits
+// in uint32 (the reader stores it as uint32, so a larger value would be
+// silently truncated - 2^32 would even become 0), and n_super equals
+// ceil(n / group_size). Returns Corruption on any violation, or the read error.
+Status parse_header(ByteSource* src, Header* h) {
+    uint8_t flags = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&flags));
+    h->has_freq = (flags & frq_prelude_flags::kHasFreq) != 0;
+    h->has_prx = (flags & frq_prelude_flags::kHasPrx) != 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->n));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->group_size));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->n_super));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->sbdir_len));
+    if (h->n > kMaxWindows || h->n_super > kMaxWindows) {
+        return Status::Corruption("frq_prelude: window count exceeds sane cap");
+    }
+    if (h->group_size == 0) {
+        return Status::Corruption("frq_prelude: group_size is zero");
+    }
+    if (h->group_size > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("frq_prelude: group_size exceeds u32");
+    }
+    if (h->n_super != ceil_div(h->n, h->group_size)) {
+        return Status::Corruption("frq_prelude: n_super inconsistent with N/G");
+    }
+    return Status::OK();
+}
+
+// One super-block directory row, as decoded by decode_super_block_dir.
+struct SbDirRow {
+    uint64_t last_docid = 0; // absolute last docid (the on-disk delta is already resolved)
+    uint64_t block_off = 0;  // byte offset of the window block within the window region
+    uint64_t block_len = 0;  // byte length of the window block
+};
+
+// Decodes the super_block_dir region into absolute-last-docid rows, validating
+// monotonic last docids and contiguous block offsets.
+//
+// On success `rows` holds h.n_super entries and `*window_region_len` is the
+// total byte length of all window blocks (the end offset of the last one).
+// Returns Corruption when the declared row count cannot fit in `dir`, a last
+// docid overflows or exceeds uint32, a block does not start where the previous
+// one ended, the running offset overflows, or bytes remain after the last row.
+Status decode_super_block_dir(Slice dir, const Header& h, std::vector<SbDirRow>* rows,
+                              uint64_t* window_region_len) {
+    // Anti-DoS: a row is three varints, i.e. at least three bytes, so a count
+    // larger than dir.size() / 3 cannot be genuine. Reject it before reserving.
+    constexpr uint64_t kMinRowBytes = 3;
+    if (h.n_super > dir.size() / kMinRowBytes) {
+        return Status::Corruption("frq_prelude: super-block count exceeds dir capacity");
+    }
+    ByteSource src(dir);
+    rows->clear();
+    rows->reserve(static_cast<size_t>(h.n_super));
+    uint64_t prev_last = 0;
+    uint64_t expect_off = 0;
+    for (uint64_t s = 0; s < h.n_super; ++s) {
+        SbDirRow r;
+        uint64_t ldd = 0;
+        SNII_RETURN_IF_ERROR(src.get_varint64(&ldd));
+        SNII_RETURN_IF_ERROR(src.get_varint64(&r.block_off));
+        SNII_RETURN_IF_ERROR(src.get_varint64(&r.block_len));
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                prev_last, ldd, "frq_prelude: super-block last_docid overflow", &r.last_docid));
+        uint32_t checked_last = 0; // only the range check matters; the value is unused
+        SNII_RETURN_IF_ERROR(checked_u32(
+                r.last_docid, "frq_prelude: super-block last_docid exceeds u32", &checked_last));
+        if (r.last_docid < prev_last || r.block_off != expect_off) {
+            return Status::Corruption("frq_prelude: super-block dir inconsistent");
+        }
+        // The running end offset feeds the caller's bounds check, so it must not
+        // be allowed to wrap into a small, plausible-looking value.
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                expect_off, r.block_len, "frq_prelude: super-block offset overflow", &expect_off));
+        prev_last = r.last_docid;
+        rows->push_back(r);
+    }
+    if (!src.eof()) {
+        return Status::Corruption("frq_prelude: super-block dir has trailing bytes");
+    }
+    *window_region_len = expect_off;
+    return Status::OK();
+}
+
+// Rejects a per-window mode byte that has bits outside kKnownBits, or that
+// sets the freq-zstd bit although the prelude has no freq column.
+Status check_win_mode(uint8_t mode, bool has_freq) {
+    const auto unknown_bits = mode & ~frq_win_mode::kKnownBits;
+    if (unknown_bits != 0) {
+        return Status::Corruption("frq_prelude: unknown win_mode bits");
+    }
+    const bool freq_bit_set = (mode & frq_win_mode::kFreqZstd) != 0;
+    if (freq_bit_set && !has_freq) {
+        return Status::Corruption("frq_prelude: freq mode set without has_freq");
+    }
+    return Status::OK();
+}
+
+// Decodes one window row, advancing prev_last to this window's absolute last.
+// The read order mirrors encode_window_row. `first_window` selects the docid
+// range rule applied by validate_window_doc_count. On success `m` is filled
+// (win_base is set to the incoming *prev_last) and *prev_last is updated; on
+// failure *prev_last is left unchanged, though `m` may be partially written.
+Status decode_window_row(ByteSource* src, bool has_freq, bool has_prx, bool first_window,
+                         uint64_t* prev_last, WindowMeta* m) {
+    // Counts are read as u64 and narrowed with range checks further down.
+    uint64_t ldd = 0, doc_count = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&ldd));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&doc_count));
+    uint8_t mode = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&mode));
+    SNII_RETURN_IF_ERROR(check_win_mode(mode, has_freq));
+    m->dd_zstd = (mode & frq_win_mode::kDdZstd) != 0;
+    m->freq_zstd = has_freq && (mode & frq_win_mode::kFreqZstd) != 0;
+    // dd region locator.
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_off));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_disk_len));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_uncomp_len));
+    SNII_RETURN_IF_ERROR(src->get_fixed32(&m->crc_dd));
+    if (has_freq) {
+        // freq region locator (present only with the freq column).
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_off));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_disk_len));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_uncomp_len));
+        SNII_RETURN_IF_ERROR(src->get_fixed32(&m->crc_freq));
+    }
+    if (has_prx) {
+        // prx locator (present only with the prx column).
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->prx_off));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->prx_len));
+    }
+    uint64_t max_freq = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&max_freq));
+    SNII_RETURN_IF_ERROR(src->get_u8(&m->max_norm));
+    // Resolve the delta into an absolute last docid, guarding u64 overflow.
+    uint64_t last_docid = 0;
+    SNII_RETURN_IF_ERROR(checked_add_u64(*prev_last, ldd, "frq_prelude: window last_docid overflow",
+                                         &last_docid));
+    SNII_RETURN_IF_ERROR(
+            validate_window_doc_count(first_window, *prev_last, last_docid, doc_count));
+    // The previous window's last docid is this window's delta base.
+    m->win_base = *prev_last;
+    SNII_RETURN_IF_ERROR(
+            checked_u32(last_docid, "frq_prelude: window last_docid exceeds u32", &m->last_docid));
+    SNII_RETURN_IF_ERROR(
+            checked_u32(doc_count, "frq_prelude: window doc_count exceeds u32", &m->doc_count));
+    SNII_RETURN_IF_ERROR(
+            checked_u32(max_freq, "frq_prelude: window max_freq exceeds u32", &m->max_freq));
+    // Commit the chain state only after every check has passed.
+    *prev_last = last_docid;
+    return Status::OK();
+}
+
+// Decodes exactly `row_count` window rows from one super-block's window block
+// and appends them to `windows`. The delta chain continues from *prev_last; a
+// row is treated as the first window only while `windows` is still empty.
+// The block must be consumed completely, and the last decoded docid must
+// match the value recorded for this super-block in the directory.
+Status decode_one_block(Slice block, const Header& h, uint64_t sb_last_docid, size_t row_count,
+                        uint64_t* prev_last, std::vector<WindowMeta>* windows) {
+    ByteSource src(block);
+    size_t rows_left = row_count;
+    while (rows_left-- > 0) {
+        const bool is_first = windows->empty();
+        WindowMeta row;
+        SNII_RETURN_IF_ERROR(
+                decode_window_row(&src, h.has_freq, h.has_prx, is_first, prev_last, &row));
+        windows->push_back(row);
+    }
+    if (!src.eof()) {
+        return Status::Corruption("frq_prelude: window block has trailing bytes");
+    }
+    if (sb_last_docid != *prev_last) {
+        return Status::Corruption("frq_prelude: window block last docid mismatch");
+    }
+    return Status::OK();
+}
+
+// Walks the super_block_dir and decodes every window block it points to into
+// `windows` (cleared first). Each block must lie inside `window_region`; each
+// contributes min(group_size, windows still missing) rows. The final window
+// count must equal h.n.
+Status decode_all_blocks(Slice window_region, const Header& h, const std::vector<SbDirRow>& dir,
+                         std::vector<WindowMeta>* windows) {
+    windows->clear();
+    windows->reserve(static_cast<size_t>(h.n));
+    uint64_t running_last = 0;
+    for (const SbDirRow& entry : dir) {
+        const uint64_t block_end = entry.block_off + entry.block_len;
+        const bool wrapped = block_end < entry.block_off;
+        if (wrapped || block_end > window_region.size()) {
+            return Status::Corruption("frq_prelude: window block out of region");
+        }
+        const uint64_t decoded = static_cast<uint64_t>(windows->size());
+        const uint64_t row_count = std::min<uint64_t>(h.group_size, h.n - decoded);
+        Slice block = window_region.subslice(static_cast<size_t>(entry.block_off),
+                                             static_cast<size_t>(entry.block_len));
+        SNII_RETURN_IF_ERROR(decode_one_block(block, h, entry.last_docid,
+                                              static_cast<size_t>(row_count), &running_last,
+                                              windows));
+    }
+    if (windows->size() == h.n) {
+        return Status::OK();
+    }
+    return Status::Corruption("frq_prelude: decoded window count mismatch");
+}
+
+// Validates that the dd / freq region locators tile the dd-block / freq-block
+// contiguously (each region starts where the previous one ended) and returns
+// the resulting block lengths. Contiguity makes the docs-only prefix one solid
+// run and bounds the read range.
+//
+// A raw (non-zstd) dd region stores its plaintext verbatim, so its on-disk and
+// uncompressed lengths must be equal. That is the same rule the region reader
+// enforces when it opens a raw region; checking for inequality here (rather
+// than only disk_len > uncomp_len) surfaces the inconsistency at open time.
+// NOTE(review): the freq locator has no equivalent raw-length check here -
+// confirm whether that asymmetry is intended.
+Status validate_region_layout(const Header& h, const std::vector<WindowMeta>& windows,
+                              uint64_t* dd_block_len, uint64_t* freq_block_len) {
+    uint64_t dd_expect = 0;
+    uint64_t freq_expect = 0;
+    for (const WindowMeta& m : windows) {
+        if (m.dd_off != dd_expect) {
+            return Status::Corruption("frq_prelude: dd region not contiguous");
+        }
+        if (!m.dd_zstd && m.dd_disk_len != m.dd_uncomp_len) {
+            return Status::Corruption("frq_prelude: raw dd region length inconsistent");
+        }
+        // Unsigned wrap check on the running end offset.
+        if (dd_expect + m.dd_disk_len < dd_expect) {
+            return Status::Corruption("frq_prelude: dd block length overflow");
+        }
+        dd_expect += m.dd_disk_len;
+        if (h.has_freq) {
+            if (m.freq_off != freq_expect) {
+                return Status::Corruption("frq_prelude: freq region not contiguous");
+            }
+            if (freq_expect + m.freq_disk_len < freq_expect) {
+                return Status::Corruption("frq_prelude: freq block length overflow");
+            }
+            freq_expect += m.freq_disk_len;
+        }
+    }
+    *dd_block_len = dd_expect;
+    *freq_block_len = freq_expect;
+    return Status::OK();
+}
+
+} // namespace
+
+// Parses and validates a serialized .frq prelude into `out`.
+//
+// Steps: parse the header, bound-check and crc-verify header + super_block_dir,
+// decode the directory, bound-check the window region, decode every window
+// block, and finally validate the dd/freq region layout.
+//
+// All lengths come from untrusted bytes, so every bounds check is written as a
+// subtraction against the buffer size; the additive form (start + len > size)
+// can wrap for huge lengths and let an out-of-range slice through.
+//
+// Returns InvalidArgument for a null `out`; otherwise Corruption (or the
+// failing step's status) for any malformed input. `out` may be partially
+// written when an error is returned.
+Status FrqPreludeReader::open(Slice prelude, FrqPreludeReader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("frq_prelude: null reader out");
+    }
+    ByteSource src(prelude);
+    Header h;
+    SNII_RETURN_IF_ERROR(parse_header(&src, &h));
+    // group_size_ / n_super_ are stored as uint32; refuse values that would be
+    // truncated by the narrowing below (n_super is already capped by kMaxWindows).
+    if (h.group_size > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("frq_prelude: group_size exceeds u32");
+    }
+    const size_t header_end = src.position();
+
+    // Checked before the crc step so that step never sees a wrapping length.
+    if (header_end > prelude.size() || h.sbdir_len > prelude.size() - header_end) {
+        return Status::Corruption("frq_prelude: sbdir_len past buffer");
+    }
+    const size_t sbdir_len = static_cast<size_t>(h.sbdir_len);
+    SNII_RETURN_IF_ERROR(verify_covered_crc(prelude, header_end, h.sbdir_len));
+
+    Slice dir = prelude.subslice(header_end, sbdir_len);
+    std::vector<SbDirRow> rows;
+    uint64_t window_region_len = 0;
+    SNII_RETURN_IF_ERROR(decode_super_block_dir(dir, h, &rows, &window_region_len));
+
+    // The window region starts right after the 4-byte crc.
+    const size_t region_start = header_end + sbdir_len + sizeof(uint32_t);
+    if (region_start > prelude.size() || window_region_len > prelude.size() - region_start) {
+        return Status::Corruption("frq_prelude: window region past buffer");
+    }
+    Slice window_region = prelude.subslice(region_start, static_cast<size_t>(window_region_len));
+
+    out->has_freq_ = h.has_freq;
+    out->has_prx_ = h.has_prx;
+    out->group_size_ = static_cast<uint32_t>(h.group_size);
+    out->n_super_ = static_cast<uint32_t>(h.n_super);
+    out->sb_last_docid_.clear();
+    out->sb_last_docid_.reserve(rows.size());
+    for (const SbDirRow& r : rows) out->sb_last_docid_.push_back(r.last_docid);
+    SNII_RETURN_IF_ERROR(decode_all_blocks(window_region, h, rows, &out->windows_));
+    return validate_region_layout(h, out->windows_, &out->dd_block_len_, &out->freq_block_len_);
+}
+
+// Copies the metadata of window `w` into `out`.
+// Returns InvalidArgument for a null `out` or an index past the last window.
+Status FrqPreludeReader::window(uint32_t w, WindowMeta* out) const {
+    if (out == nullptr) {
+        return Status::InvalidArgument("frq_prelude: null window out");
+    }
+    if (w < windows_.size()) {
+        *out = windows_[w];
+        return Status::OK();
+    }
+    return Status::InvalidArgument("frq_prelude: window index out of range");
+}
+
+// Finds the first window whose last_docid is >= `docid`.
+//
+// Sets `*found` to false when there are no windows or `docid` lies beyond the
+// last window; otherwise sets `*found` to true and `*w` to the window index.
+// Returns InvalidArgument only for null output pointers.
+//
+// Two-level search: a binary search over the per-super-block last docids picks
+// the super-block, then a second binary search runs over that super-block's
+// windows. Both rely on last docids being non-decreasing, which decoding
+// guarantees because each one is the previous value plus an unsigned delta.
+Status FrqPreludeReader::locate_window(uint32_t docid, bool* found, uint32_t* w) const {
+    if (found == nullptr || w == nullptr) {
+        return Status::InvalidArgument("frq_prelude: null locate out");
+    }
+    *found = false;
+    if (windows_.empty()) return Status::OK();
+    if (docid > windows_.back().last_docid) return Status::OK();
+
+    // Level 1: first super-block whose absolute last docid >= docid.
+    const auto sb_it = std::lower_bound(sb_last_docid_.begin(), sb_last_docid_.end(),
+                                        static_cast<uint64_t>(docid));
+    const size_t sb = static_cast<size_t>(sb_it - sb_last_docid_.begin());
+    // Level 2: window binary search within [sb*G, min((sb+1)*G, N)).
+    const size_t lo = sb * group_size_;
+    const size_t hi = std::min<size_t>(lo + group_size_, windows_.size());
+    if (lo >= hi) {
+        return Status::OK(); // unreachable when invariants hold; defensive miss.
+    }
+    const auto first = windows_.begin() + static_cast<std::ptrdiff_t>(lo);
+    const auto last = windows_.begin() + static_cast<std::ptrdiff_t>(hi);
+    const auto it = std::lower_bound(
+            first, last, docid,
+            [](const WindowMeta& m, uint32_t target) { return m.last_docid < target; });
+    if (it != last) {
+        *found = true;
+        *w = static_cast<uint32_t>(it - windows_.begin());
+    }
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp b/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp
new file mode 100644
index 00000000000000..27ca75b8f6b9ec
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp
@@ -0,0 +1,116 @@
+#include "snii/format/logical_index_directory.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+namespace {
+
+// Minimum payload bytes any entry can occupy: index_id (>=1) + suffix_len (>=1, value 0) +
+// meta_off (>=1) + meta_len (>=1), i.e. four one-byte varints and an empty suffix. Used by
+// decode_payload as an anti-DoS lower bound: n_entries is checked against
+// remaining / kMinEntryBytes before reserving.
+constexpr size_t kMinEntryBytes = 4;
+
+// Appends one directory entry to `payload` in fixed field order:
+// index_id, suffix length, suffix bytes, meta_off, meta_len. Only ByteSink
+// varint / bytes primitives are used.
+void encode_entry(const LogicalIndexRef& ref, ByteSink* payload) {
+    const std::string_view suffix(ref.index_suffix);
+    payload->put_varint64(ref.index_id);
+    payload->put_varint32(static_cast<uint32_t>(suffix.size()));
+    payload->put_bytes(Slice(suffix));
+    payload->put_varint64(ref.meta_off);
+    payload->put_varint64(ref.meta_len);
+}
+
+// Reads one directory entry from `ps` into `ref`, in the order encode_entry
+// wrote it. The suffix length is checked against the bytes still available
+// before the suffix is copied.
+Status decode_entry(ByteSource* ps, LogicalIndexRef* ref) {
+    SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->index_id));
+
+    uint32_t name_len = 0;
+    SNII_RETURN_IF_ERROR(ps->get_varint32(&name_len));
+    // Anti-DoS: a length larger than what is left cannot be genuine; reject it
+    // before allocating.
+    if (ps->remaining() < name_len) {
+        return Status::Corruption("logical_index_directory: suffix_len exceeds payload");
+    }
+    Slice name_bytes;
+    SNII_RETURN_IF_ERROR(ps->get_bytes(name_len, &name_bytes));
+    const char* name_begin = reinterpret_cast<const char*>(name_bytes.data());
+    ref->index_suffix.assign(name_begin, name_bytes.size());
+
+    SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->meta_off));
+    SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->meta_len));
+    return Status::OK();
+}
+
+// Decodes a directory payload ([varint32 n_entries][entry]*) into `refs`,
+// which is cleared first. Entries decoded before a failure remain in `refs`.
+// The payload must be consumed exactly.
+Status decode_payload(Slice payload, std::vector<LogicalIndexRef>* refs) {
+    ByteSource reader(payload);
+    uint32_t declared = 0;
+    SNII_RETURN_IF_ERROR(reader.get_varint32(&declared));
+    // Anti-DoS: every entry needs at least kMinEntryBytes, so a corrupted,
+    // inflated count is rejected before it can drive a huge reserve().
+    const auto max_entries = reader.remaining() / kMinEntryBytes;
+    if (declared > max_entries) {
+        return Status::Corruption("logical_index_directory: n_entries exceeds payload capacity");
+    }
+    refs->clear();
+    refs->reserve(declared);
+    for (uint32_t left = declared; left > 0; --left) {
+        LogicalIndexRef entry {};
+        SNII_RETURN_IF_ERROR(decode_entry(&reader, &entry));
+        refs->push_back(std::move(entry));
+    }
+    if (reader.eof()) {
+        return Status::OK();
+    }
+    return Status::Corruption("logical_index_directory: trailing bytes in payload");
+}
+
+} // namespace
+
+// Serializes the collected refs ([varint32 count][entry]*) and writes them to
+// `sink` as one framed section of type kLogicalIndexDirectory.
+void LogicalIndexDirectoryBuilder::finish(ByteSink* sink) const {
+    ByteSink body;
+    const auto count = static_cast<uint32_t>(refs_.size());
+    body.put_varint32(count);
+    for (const auto& entry : refs_) {
+        encode_entry(entry, &body);
+    }
+    constexpr auto kType = static_cast<uint8_t>(SectionType::kLogicalIndexDirectory);
+    SectionFramer::write(*sink, kType, body.view());
+}
+
+// Opens a framed logical-index directory: unwraps the section via
+// SectionFramer::read, checks that its type is kLogicalIndexDirectory, and
+// decodes the payload into out->refs_.
+Status LogicalIndexDirectoryReader::open(Slice framed, LogicalIndexDirectoryReader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("logical_index_directory: out is null");
+    }
+    FramedSection section;
+    ByteSource reader(framed);
+    SNII_RETURN_IF_ERROR(SectionFramer::read(reader, &section));
+
+    const auto expected_type = static_cast<uint8_t>(SectionType::kLogicalIndexDirectory);
+    if (section.type == expected_type) {
+        return decode_payload(section.payload, &out->refs_);
+    }
+    return Status::InvalidArgument("logical_index_directory: unexpected section type");
+}
+
+// Copies the i-th directory entry into `out`.
+// Returns InvalidArgument for a null `out` and NotFound for an index past the end.
+Status LogicalIndexDirectoryReader::get(uint32_t i, LogicalIndexRef* out) const {
+    if (out == nullptr) {
+        return Status::InvalidArgument("logical_index_directory: out is null");
+    }
+    if (i < refs_.size()) {
+        *out = refs_[i];
+        return Status::OK();
+    }
+    return Status::NotFound("logical_index_directory: index out of range");
+}
+
+// Linear lookup of the first entry matching both `index_id` and `suffix`.
+// A miss is not an error: the call returns OK with `*found` == false and leaves
+// `*out` untouched. On a hit the entry is copied to `*out`.
+Status LogicalIndexDirectoryReader::find(uint64_t index_id, std::string_view suffix, bool* found,
+                                         LogicalIndexRef* out) const {
+    if (found == nullptr || out == nullptr) {
+        return Status::InvalidArgument("logical_index_directory: output pointer is null");
+    }
+    *found = false;
+    for (const auto& candidate : refs_) {
+        const bool same_id = candidate.index_id == index_id;
+        if (same_id && std::string_view(candidate.index_suffix) == suffix) {
+            *out = candidate;
+            *found = true;
+            return Status::OK();
+        }
+    }
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/norms_pod.cpp b/be/src/storage/index/snii/core/src/format/norms_pod.cpp
new file mode 100644
index 00000000000000..a6f80c03b1ebcd
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/norms_pod.cpp
@@ -0,0 +1,46 @@
+#include "snii/format/norms_pod.h"
+
+#include <limits>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+// Serializes the norms as one framed section appended to `sink`.
+// Inner payload layout: [varint64 doc_count][norms_ bytes], where doc_count is
+// norms_.size(). The outer envelope is produced by SectionFramer::write, so no
+// checksum is assembled by hand here.
+// NOTE(review): the frame is tagged SectionType::kStatsBlock, and
+// NormsPodReader::open does not inspect the tag -- confirm this reuse is
+// intended rather than a missing dedicated section type.
+void NormsPodWriter::finish(ByteSink* sink) const {
+    ByteSink body;
+    body.put_varint64(norms_.size());
+    body.put_bytes(Slice(norms_));
+    const auto section_tag = static_cast<uint8_t>(SectionType::kStatsBlock);
+    SectionFramer::write(*sink, section_tag, body.view());
+}
+
+// Opens a framed norms POD as produced by NormsPodWriter::finish.
+//
+// Inner payload layout: [varint64 doc_count][doc_count raw norm bytes].
+// On success out->doc_count_ holds the document count and out->norms_ points at
+// the norm bytes of the payload slice handed back by the framer; nothing is
+// copied here (presumably that slice is a view into `framed`, so the caller
+// must keep the underlying buffer alive -- confirm against SectionFramer).
+//
+// Returns InvalidArgument when `out` is null, Corruption when doc_count
+// overflows uint32 or disagrees with the remaining payload length; framer and
+// decoding errors are propagated unchanged.
+Status NormsPodReader::open(Slice framed, NormsPodReader* out) {
+    // Reject a null destination up front instead of dereferencing it below;
+    // this matches the other readers in this directory.
+    if (out == nullptr) {
+        return Status::InvalidArgument("norms POD: out is null");
+    }
+
+    // framer handles CRC verify, truncation detection, and payload slicing.
+    ByteSource src(framed);
+    FramedSection sec;
+    SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec));
+
+    // Parse inner payload: [varint64 doc_count][bytes].
+    ByteSource payload(sec.payload);
+    uint64_t doc_count = 0;
+    SNII_RETURN_IF_ERROR(payload.get_varint64(&doc_count));
+    if (doc_count > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("norms POD doc_count overflows uint32");
+    }
+    // doc_count must exactly equal the remaining byte count (1 byte per doc).
+    if (payload.remaining() != doc_count) {
+        return Status::Corruption("norms POD length mismatch");
+    }
+
+    Slice bytes;
+    SNII_RETURN_IF_ERROR(payload.get_bytes(static_cast<size_t>(doc_count), &bytes));
+    out->doc_count_ = static_cast<uint32_t>(doc_count);
+    out->norms_ = bytes.data();
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/null_bitmap.cpp b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp
new file mode 100644
index 00000000000000..d805cd2e945563
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp
@@ -0,0 +1,107 @@
+#include "snii/format/null_bitmap.h"
+
+#include <limits>
+#include <vector>
+
+#include "roaring/roaring.h"
+#include "roaring/roaring.hh"
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+
+namespace snii::format {
+
+// Allocates the empty Roaring bitmap that add_null() fills.
+// NOLINTNEXTLINE(modernize-use-equals-default): roaring type is incomplete in the header.
+NullBitmapWriter::NullBitmapWriter() : bitmap_(std::make_unique<roaring::Roaring>()) {}
+
+// Defaulted in this translation unit, which includes the full roaring headers.
+NullBitmapWriter::~NullBitmapWriter() = default;
+
+// Marks `docid` as null by adding it to the bitmap.
+void NullBitmapWriter::add_null(uint32_t docid) {
+    roaring::Roaring& nulls = *bitmap_;
+    nulls.add(docid);
+}
+
+// Number of docids marked null so far: the bitmap cardinality narrowed to uint32.
+uint32_t NullBitmapWriter::null_count() const {
+    const auto cardinality = bitmap_->cardinality();
+    return static_cast<uint32_t>(cardinality);
+}
+
+// Serializes the null bitmap as one framed section appended to `sink`.
+// Inner payload layout: [varint64 doc_count][varint64 roaring_size][roaring bytes].
+// `doc_count` is stored as given; it is not checked against the docids that
+// were added through add_null().
+void NullBitmapWriter::finish(uint32_t doc_count, ByteSink* sink) const {
+    // Dump the bitmap into a scratch buffer sized by getSizeInBytes().
+    std::vector<char> scratch(bitmap_->getSizeInBytes());
+    bitmap_->write(scratch.data());
+
+    ByteSink body;
+    body.put_varint64(doc_count);
+    body.put_varint64(scratch.size());
+    body.put_bytes(Slice(reinterpret_cast<const uint8_t*>(scratch.data()), scratch.size()));
+
+    // SectionFramer wraps the payload in the type + len + crc32c envelope.
+    SectionFramer::write(*sink, kNullBitmapSectionType, body.view());
+}
+
+// Starts with an empty bitmap, so is_null()/null_count() have an object to
+// query even before open() is called.
+// NOLINTNEXTLINE(modernize-use-equals-default): roaring type is incomplete in the header.
+NullBitmapReader::NullBitmapReader() : bitmap_(std::make_unique<roaring::Roaring>()) {}
+
+// Destructor and move operations are defaulted in this translation unit, which
+// includes the full roaring headers.
+// NOTE(review): a moved-from reader presumably ends up with a null bitmap_
+// (assuming bitmap_ is a std::unique_ptr) -- confirm before reusing one.
+NullBitmapReader::~NullBitmapReader() = default;
+
+NullBitmapReader::NullBitmapReader(NullBitmapReader&&) noexcept = default;
+NullBitmapReader& NullBitmapReader::operator=(NullBitmapReader&&) noexcept = default;
+
+// Opens a framed null bitmap as produced by NullBitmapWriter::finish.
+//
+// Inner payload layout: [varint64 doc_count][varint64 roaring_size][roaring bytes].
+// On success the reader's bitmap and doc_count_ are replaced by the decoded
+// values; the bitmap is deserialized into its own object.
+//
+// Returns InvalidArgument when `out` is null; Corruption when doc_count
+// overflows uint32, roaring_size exceeds the bytes present, or the roaring
+// container is malformed; framer and decoding errors are propagated unchanged.
+Status NullBitmapReader::open(Slice framed, NullBitmapReader* out) {
+    // Reject a null destination up front instead of dereferencing it below.
+    if (out == nullptr) {
+        return Status::InvalidArgument("null bitmap: out is null");
+    }
+
+    // SectionFramer handles CRC verification, truncation detection, and payload
+    // slicing.
+    ByteSource src(framed);
+    FramedSection sec;
+    SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec));
+
+    // Parse inner payload: [varint64 doc_count][varint64 roaring_size][bytes].
+    ByteSource payload(sec.payload);
+    uint64_t doc_count = 0;
+    SNII_RETURN_IF_ERROR(payload.get_varint64(&doc_count));
+    if (doc_count > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("null bitmap doc_count overflows uint32");
+    }
+
+    uint64_t roaring_size = 0;
+    SNII_RETURN_IF_ERROR(payload.get_varint64(&roaring_size));
+    // Anti-DoS: the declared roaring_size must not exceed the bytes actually
+    // present, otherwise readSafe could be told to walk past the payload.
+    if (roaring_size > payload.remaining()) {
+        return Status::Corruption("null bitmap roaring_size exceeds payload");
+    }
+    const size_t roaring_len = static_cast<size_t>(roaring_size);
+
+    Slice roaring_bytes;
+    SNII_RETURN_IF_ERROR(payload.get_bytes(roaring_len, &roaring_bytes));
+
+    // Validate the Roaring container BEFORE deserializing. A CRC-valid frame can
+    // still carry malformed roaring bytes; Roaring::readSafe / read would then hit
+    // CRoaring's terminate-or-throw path (NULL -> ROARING_TERMINATE). The safe,
+    // non-throwing C probe returns the exact byte count a valid container would
+    // consume, or 0 on malformed/insufficient input.
+    const char* rb = reinterpret_cast<const char*>(roaring_bytes.data());
+    const size_t probed = roaring_bitmap_portable_deserialize_size(rb, roaring_len);
+    if (probed == 0 || probed != roaring_len) {
+        return Status::Corruption("null bitmap: malformed roaring container");
+    }
+
+    // Install a freshly allocated bitmap rather than assigning through the
+    // existing pointer: a moved-from reader has no bitmap object left, and
+    // `*out->bitmap_ = ...` would dereference a null pointer in that case.
+    out->bitmap_ = std::make_unique<roaring::Roaring>(roaring::Roaring::readSafe(rb, roaring_len));
+    out->doc_count_ = static_cast<uint32_t>(doc_count);
+    return Status::OK();
+}
+
+// True when `docid` is present in the null bitmap.
+bool NullBitmapReader::is_null(uint32_t docid) const {
+    const roaring::Roaring& nulls = *bitmap_;
+    return nulls.contains(docid);
+}
+
+// Number of null docids: the bitmap cardinality narrowed to uint32.
+uint32_t NullBitmapReader::null_count() const {
+    const auto cardinality = bitmap_->cardinality();
+    return static_cast<uint32_t>(cardinality);
+}
+
+// Copy-assigns the reader's bitmap into *out. `out` must not be null.
+void NullBitmapReader::copy_to(roaring::Roaring* out) const {
+    const roaring::Roaring& nulls = *bitmap_;
+    *out = nulls;
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/per_index_meta.cpp b/be/src/storage/index/snii/core/src/format/per_index_meta.cpp
new file mode 100644
index 00000000000000..31bb6e42445404
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/per_index_meta.cpp
@@ -0,0 +1,191 @@
+#include "snii/format/per_index_meta.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/encoding/section_framer.h"
+
+namespace snii::format {
+
+namespace {
+
+// Hard ceiling (64 KiB) on the index_suffix length accepted from untrusted
+// bytes. decode_header() rejects anything larger before reading the suffix, so
+// a corrupt length can never drive an inflated allocation. A logical index
+// suffix is a short column/field name, far below this limit.
+constexpr uint32_t kMaxSuffixLen = 64u << 10;
+
+// Appends a RegionRef as two varint64 values: offset first, then length.
+void encode_region(const RegionRef& r, ByteSink* payload) {
+    ByteSink& dst = *payload;
+    dst.put_varint64(r.offset);
+    dst.put_varint64(r.length);
+}
+
+// Reads one RegionRef (varint64 offset, then varint64 length) from `ps`.
+// Stops at the first decoding error and returns it.
+Status decode_region(ByteSource* ps, RegionRef* r) {
+    ByteSource& in = *ps;
+    SNII_RETURN_IF_ERROR(in.get_varint64(&r->offset));
+    SNII_RETURN_IF_ERROR(in.get_varint64(&r->length));
+    return Status::OK();
+}
+
+// Writes the SectionRefs sub-section as a kSectionRefs frame. The payload is
+// five RegionRefs, each a varint64 (offset, length) pair, in this fixed order:
+// dict_region, posting_region, norms, null_bitmap, bsbf.
+void encode_section_refs(const SectionRefs& refs, ByteSink* sink) {
+    const RegionRef* const ordered[] = {&refs.dict_region, &refs.posting_region, &refs.norms,
+                                        &refs.null_bitmap, &refs.bsbf};
+    ByteSink body;
+    for (const RegionRef* region : ordered) {
+        encode_region(*region, &body);
+    }
+    SectionFramer::write(*sink, static_cast<uint8_t>(SectionType::kSectionRefs), body.view());
+}
+
+// Decodes the five RegionRefs of a SectionRefs payload in wire order
+// (dict_region, posting_region, norms, null_bitmap, bsbf). Any bytes left after
+// the fifth region are reported as Corruption.
+Status decode_section_refs(Slice payload, SectionRefs* out) {
+    RegionRef* const slots[] = {&out->dict_region, &out->posting_region, &out->norms,
+                                &out->null_bitmap, &out->bsbf};
+    ByteSource reader(payload);
+    for (RegionRef* slot : slots) {
+        SNII_RETURN_IF_ERROR(decode_region(&reader, slot));
+    }
+    if (reader.eof()) {
+        return Status::OK();
+    }
+    return Status::Corruption("per_index_meta: trailing bytes in section_refs");
+}
+
+// Writes the self-checksummed header prefix to `sink`:
+//   fixed16  kMetaFormatVersion
+//   varint64 index_id
+//   varint32 suffix length, followed by the suffix bytes
+//   fixed32  flags
+//   fixed32  crc32c over every byte listed above
+// NOTE(review): the suffix length is not checked against kMaxSuffixLen here,
+// although decode_header() rejects longer suffixes -- callers presumably keep
+// suffixes short.
+void encode_header(uint64_t index_id, const std::string& suffix, uint32_t flags, ByteSink* sink) {
+    ByteSink prefix;
+    prefix.put_fixed16(kMetaFormatVersion);
+    prefix.put_varint64(index_id);
+    const auto suffix_len = static_cast<uint32_t>(suffix.size());
+    prefix.put_varint32(suffix_len);
+    prefix.put_bytes(Slice(suffix));
+    prefix.put_fixed32(flags);
+
+    const auto covered = prefix.view();
+    sink->put_bytes(covered);
+    sink->put_fixed32(crc32c(covered));
+}
+
+// Parses the header prefix written by encode_header and verifies its crc32c,
+// leaving `src` positioned just past the crc field. `block` must be the buffer
+// `src` reads from: the checksum is recomputed over block[start, start+covered).
+// *index_id and *flags are written while parsing, i.e. before the crc has been
+// checked; *suffix is assigned only after the checksum matches.
+// Returns Corruption for an unknown format version, a suffix length above
+// kMaxSuffixLen or beyond the remaining bytes, or a checksum mismatch.
+Status decode_header(Slice block, ByteSource* src, uint64_t* index_id, std::string* suffix,
+                     uint32_t* flags) {
+    const size_t header_begin = src->position();
+
+    uint16_t format_version = 0;
+    SNII_RETURN_IF_ERROR(src->get_fixed16(&format_version));
+    if (format_version != kMetaFormatVersion) {
+        return Status::Corruption("per_index_meta: unsupported meta_format_version");
+    }
+    SNII_RETURN_IF_ERROR(src->get_varint64(index_id));
+
+    uint32_t declared_len = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&declared_len));
+    const bool len_ok = declared_len <= kMaxSuffixLen && declared_len <= src->remaining();
+    if (!len_ok) {
+        return Status::Corruption("per_index_meta: suffix_len exceeds bounds");
+    }
+    Slice suffix_bytes;
+    SNII_RETURN_IF_ERROR(src->get_bytes(declared_len, &suffix_bytes));
+    SNII_RETURN_IF_ERROR(src->get_fixed32(flags));
+
+    // Everything consumed so far is covered by the checksum that follows.
+    const size_t header_len = src->position() - header_begin;
+    uint32_t expected_crc = 0;
+    SNII_RETURN_IF_ERROR(src->get_fixed32(&expected_crc));
+    const auto actual_crc = crc32c(block.subslice(header_begin, header_len));
+    if (actual_crc != expected_crc) {
+        return Status::Corruption("per_index_meta: header crc mismatch");
+    }
+
+    suffix->assign(reinterpret_cast<const char*>(suffix_bytes.data()), suffix_bytes.size());
+    return Status::OK();
+}
+
+// Consumes one framed section from `src` and reports its type together with the
+// FULL frame (everything SectionFramer::read consumed) as a sub-slice of
+// `block`, so a sub-module reader can re-open it later. The frame is validated
+// by SectionFramer::read; its errors are propagated unchanged.
+Status read_frame(Slice block, ByteSource* src, uint8_t* type, Slice* frame) {
+    const size_t frame_begin = src->position();
+    FramedSection parsed;
+    SNII_RETURN_IF_ERROR(SectionFramer::read(*src, &parsed));
+    const size_t frame_len = src->position() - frame_begin;
+    *type = parsed.type;
+    *frame = block.subslice(frame_begin, frame_len);
+    return Status::OK();
+}
+
+// Routes an optional sub-section frame to its slot: kSampledTermIndex frames go
+// to *sampled, kDictBlockDirectory frames to *dict. Any other type leaves both
+// slots untouched (forward compatibility: unknown optional sections are
+// skipped). Nothing is returned; a repeated type overwrites the earlier frame.
+void dispatch_frame(uint8_t type, Slice frame, Slice* sampled, Slice* dict) {
+    constexpr auto kSampledTag = static_cast<uint8_t>(SectionType::kSampledTermIndex);
+    constexpr auto kDictTag = static_cast<uint8_t>(SectionType::kDictBlockDirectory);
+    if (type == kSampledTag) {
+        *sampled = frame;
+        return;
+    }
+    if (type == kDictTag) {
+        *dict = frame;
+    }
+}
+
+} // namespace
+
+// Captures the header fields (index id, suffix, flags) that finish() passes to
+// encode_header(). `index_suffix` is taken by value and moved into the member.
+PerIndexMetaBuilder::PerIndexMetaBuilder(uint64_t index_id, std::string index_suffix,
+                                         uint32_t flags)
+        : index_id_(index_id), index_suffix_(std::move(index_suffix)), flags_(flags) {}
+
+// Stores a copy of the stats block that finish() encodes right after the header.
+void PerIndexMetaBuilder::set_stats(const StatsBlock& stats) {
+    stats_ = stats;
+}
+
+// Copies an already-framed sampled-term-index section; finish() emits these
+// bytes verbatim, so the caller's buffer need not outlive this call.
+void PerIndexMetaBuilder::set_sampled_term_index(Slice framed_bytes) {
+    const auto* first = framed_bytes.data();
+    sampled_term_index_.assign(first, first + framed_bytes.size());
+}
+
+// Copies an already-framed dict-block-directory section; finish() emits these
+// bytes verbatim, so the caller's buffer need not outlive this call.
+void PerIndexMetaBuilder::set_dict_block_directory(Slice framed_bytes) {
+    const auto* first = framed_bytes.data();
+    dict_block_directory_.assign(first, first + framed_bytes.size());
+}
+
+// Stores a copy of the region references that finish() encodes as the
+// section-refs frame.
+void PerIndexMetaBuilder::set_section_refs(const SectionRefs& refs) {
+    section_refs_ = refs;
+}
+
+// Queues a copy of an extra, already-framed section. finish() appends the
+// queued sections verbatim, in insertion order, after the section-refs frame.
+void PerIndexMetaBuilder::add_raw_section(Slice framed_bytes) {
+    const auto* first = framed_bytes.data();
+    extra_sections_.emplace_back(first, first + framed_bytes.size());
+}
+
+// Serializes the whole per-index meta block into `sink`, in this order:
+// header, stats block, sampled term index, dict block directory, section refs,
+// then every section queued through add_raw_section(). The sampled-term-index
+// and dict-block-directory bytes are emitted exactly as they were handed to the
+// setters. Returns InvalidArgument when `sink` is null, OK otherwise.
+Status PerIndexMetaBuilder::finish(ByteSink* sink) const {
+    if (!sink) {
+        return Status::InvalidArgument("per_index_meta: null sink");
+    }
+
+    encode_header(index_id_, index_suffix_, flags_, sink);
+    encode_stats_block(stats_, sink);
+
+    // These two were framed by the caller; copy them through untouched.
+    sink->put_bytes(Slice(sampled_term_index_));
+    sink->put_bytes(Slice(dict_block_directory_));
+
+    encode_section_refs(section_refs_, sink);
+    for (auto it = extra_sections_.begin(); it != extra_sections_.end(); ++it) {
+        sink->put_bytes(Slice(*it));
+    }
+    return Status::OK();
+}
+
+// Parses a per-index meta block into `out`.
+//
+// The block is a self-checksummed header followed by framed sub-sections that
+// are read until the block is exhausted:
+//   - kStatsBlock         -> decoded into out->stats_            (required)
+//   - kSectionRefs        -> decoded into out->section_refs_     (required)
+//   - kSampledTermIndex   -> full frame kept in out->sampled_term_index_
+//   - kDictBlockDirectory -> full frame kept in out->dict_block_directory_
+//   - any other type      -> skipped (forward compatibility)
+// A repeated section type overwrites the earlier occurrence. The two kept
+// frames are sub-slices of `block`, so `block`'s memory must outlive the reader.
+//
+// Returns InvalidArgument when `out` is null, Corruption when a required
+// sub-section is missing; header/frame/decoder errors are propagated unchanged.
+Status PerIndexMetaReader::open(Slice block, PerIndexMetaReader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("per_index_meta: null reader");
+    }
+    // Clear the optional slots first: a reader that is opened a second time must
+    // not keep frame views that point into the previously opened block when the
+    // new block does not carry those sections.
+    out->sampled_term_index_ = Slice();
+    out->dict_block_directory_ = Slice();
+
+    ByteSource src(block);
+    SNII_RETURN_IF_ERROR(
+            decode_header(block, &src, &out->index_id_, &out->index_suffix_, &out->flags_));
+    bool have_stats = false;
+    bool have_refs = false;
+    while (!src.eof()) {
+        uint8_t type = 0;
+        Slice frame;
+        SNII_RETURN_IF_ERROR(read_frame(block, &src, &type, &frame));
+        if (type == static_cast<uint8_t>(SectionType::kStatsBlock)) {
+            // decode_stats_block consumes the full frame, envelope included.
+            ByteSource fs(frame);
+            SNII_RETURN_IF_ERROR(decode_stats_block(&fs, &out->stats_));
+            have_stats = true;
+        } else if (type == static_cast<uint8_t>(SectionType::kSectionRefs)) {
+            // Re-read the frame to obtain its payload slice for the decoder.
+            FramedSection sec;
+            ByteSource fs(frame);
+            SNII_RETURN_IF_ERROR(SectionFramer::read(fs, &sec));
+            SNII_RETURN_IF_ERROR(decode_section_refs(sec.payload, &out->section_refs_));
+            have_refs = true;
+        } else {
+            dispatch_frame(type, frame, &out->sampled_term_index_, &out->dict_block_directory_);
+        }
+    }
+    if (!have_stats || !have_refs) {
+        return Status::Corruption("per_index_meta: missing required sub-section");
+    }
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/prx_pod.cpp b/be/src/storage/index/snii/core/src/format/prx_pod.cpp
new file mode 100644
index 00000000000000..7d90cb3ead5df6
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/prx_pod.cpp
@@ -0,0 +1,738 @@
+#include "snii/format/prx_pod.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <span>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/encoding/pfor.h"
+#include "snii/encoding/zstd_codec.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+namespace {
+
+// Auto-compression threshold in bytes: use raw when the payload is smaller than
+// this (zstd gain is negligible and metadata overhead is relatively large).
+inline constexpr size_t kAutoZstdMinBytes = 512;
+// Default zstd level in auto mode.
+inline constexpr int kDefaultZstdLevel = 3;
+// Maximum decompressed byte size for a single .prx window (256 MiB). Guards
+// against a corrupted uncomp_len read from S3 inflated to a huge value:
+// sanity-check before allocating/decompressing to avoid GB-scale allocations.
+// Windows are 256-doc aligned and normally far below this limit.
+inline constexpr uint32_t kMaxWindowUncompBytes = 256u * 1024 * 1024;
+// Anti-DoS cap on the position count decoded from a single window, checked
+// before allocation.
+inline constexpr uint32_t kMaxWindowPositions = 1u << 26; // 64M positions/window
+// Anti-DoS cap on the doc count decoded from a single window, checked before
+// allocation. A corrupt doc_count is otherwise fed straight to
+// assign()/reserve() -> bad_alloc.
+inline constexpr uint32_t kMaxWindowDocs = 1u << 24; // 16M docs/window
+
+// Writer-side precondition shared by the FLAT builders: `freqs` must partition
+// `flat` exactly, i.e. sum(freqs) == flat.size().
+//  - sum > size would make a builder index flat[off + i] past the span end (an
+//    out-of-bounds read on caller-supplied data);
+//  - sum < size would silently drop trailing positions, which is also a writer
+//    bug.
+// Both cases are rejected as InvalidArgument before any indexing happens. The
+// sum is accumulated in 64 bits, so adding uint32 frequencies cannot overflow.
+Status check_flat_partition(std::span<const uint32_t> flat, std::span<const uint32_t> freqs) {
+    uint64_t expected_positions = 0;
+    for (size_t d = 0; d < freqs.size(); ++d) {
+        expected_positions += freqs[d];
+    }
+    if (expected_positions == flat.size()) {
+        return Status::OK();
+    }
+    return Status::InvalidArgument("prx: sum(freqs) does not match positions_flat size");
+}
+
+// Encodes per-doc position lists as a self-describing plain payload:
+//   varint32 doc_count
+//   per doc: varint32 pos_count, then pos_count varint32 values where the first
+//            is the absolute position and the rest are deltas to the previous.
+// Positions inside a doc must be ascending (equal neighbours are accepted);
+// otherwise InvalidArgument is returned. Bytes already appended to `out` are
+// not rolled back in that case.
+Status encode_payload(std::span<const std::vector<uint32_t>> per_doc, ByteSink* out) {
+    out->put_varint32(static_cast<uint32_t>(per_doc.size()));
+    for (const std::vector<uint32_t>& positions : per_doc) {
+        out->put_varint32(static_cast<uint32_t>(positions.size()));
+        bool is_first = true;
+        uint32_t last = 0;
+        for (const uint32_t pos : positions) {
+            if (!is_first && pos < last) {
+                return Status::InvalidArgument("prx: positions within a doc must be ascending");
+            }
+            out->put_varint32(is_first ? pos : pos - last);
+            last = pos;
+            is_first = false;
+        }
+    }
+    return Status::OK();
+}
+
+// FLAT variant of encode_payload: produces the same wire format, but reads the
+// positions from one flat span that `freqs` partitions per doc (doc d owns the
+// next freqs[d] entries), so no vector-of-vectors has to be built for the
+// window. freqs.size() is the doc count; check_flat_partition enforces
+// sum(freqs) == flat.size() before anything is written.
+Status encode_payload_flat(std::span<const uint32_t> flat, std::span<const uint32_t> freqs,
+                           ByteSink* out) {
+    SNII_RETURN_IF_ERROR(check_flat_partition(flat, freqs));
+    out->put_varint32(static_cast<uint32_t>(freqs.size()));
+    size_t cursor = 0;
+    for (const uint32_t freq : freqs) {
+        out->put_varint32(freq);
+        // View of this doc's positions inside the flat buffer.
+        const std::span<const uint32_t> doc = flat.subspan(cursor, freq);
+        uint32_t last = 0;
+        for (size_t i = 0; i < doc.size(); ++i) {
+            const uint32_t pos = doc[i];
+            if (i != 0 && pos < last) {
+                return Status::InvalidArgument("prx: positions within a doc must be ascending");
+            }
+            out->put_varint32(i == 0 ? pos : pos - last);
+            last = pos;
+        }
+        cursor += freq;
+    }
+    return Status::OK();
+}
+
+// Splits `values` into consecutive runs of at most kFrqBaseUnit elements and
+// PFOR-encodes each run into `out`. The number of runs is not stored; the
+// decoder derives it from the total element count.
+void encode_pfor_runs(std::span<const uint32_t> values, ByteSink* out) {
+    const uint32_t* cursor = values.data();
+    size_t left = values.size();
+    while (left > 0) {
+        const size_t run = (left < kFrqBaseUnit) ? left : kFrqBaseUnit;
+        pfor_encode(cursor, run, out);
+        cursor += run;
+        left -= run;
+    }
+}
+
+// Decodes `n` uint32 values, stored as consecutive PFOR runs of at most
+// kFrqBaseUnit elements, into *out. *out is resized to exactly n (zero-filled)
+// before decoding, so it keeps that size even when a run fails to decode.
+Status decode_pfor_runs(ByteSource* src, size_t n, std::vector<uint32_t>* out) {
+    out->assign(n, 0);
+    uint32_t* dst = out->data();
+    size_t done = 0;
+    while (done < n) {
+        const size_t run = (n - done < kFrqBaseUnit) ? (n - done) : kFrqBaseUnit;
+        SNII_RETURN_IF_ERROR(pfor_decode(src, run, dst + done));
+        done += run;
+    }
+    return Status::OK();
+}
+
+// PFOR window payload (self-describing; no entropy coding):
+//   VInt doc_count
+//   VInt total_pos                  # sum of all pos_counts
+//   PFOR_runs(pos_counts)           # doc_count values, bit-packed
+//   PFOR_runs(position_deltas)      # total_pos deltas, flat across docs (first
+//                                   #   per doc absolute, rest delta-within-doc)
+// Bit-packing the per-doc pos_counts (vs one varint each) is the size win: in a
+// uniform corpus most docs have freq 1, so the count column packs to ~1
+// bit/doc.
+//
+// Builds the payload from a flat positions span partitioned per-doc by `freqs`.
+// All validation runs BEFORE the first byte is appended, so `out` is left
+// untouched whenever InvalidArgument is returned:
+//   - sum(freqs) must equal flat.size() (check_flat_partition);
+//   - doc and position counts must not exceed kMaxWindowDocs /
+//     kMaxWindowPositions, the caps the PFOR decoders enforce -- a larger
+//     window would be written successfully but rejected as corrupt on read;
+//   - positions within a doc must be ascending (equal neighbours accepted).
+Status encode_pfor_payload_flat(std::span<const uint32_t> flat, std::span<const uint32_t> freqs,
+                                ByteSink* out) {
+    SNII_RETURN_IF_ERROR(check_flat_partition(flat, freqs));
+    if (freqs.size() > kMaxWindowDocs) {
+        return Status::InvalidArgument("prx: doc count exceeds sane cap");
+    }
+    if (flat.size() > kMaxWindowPositions) {
+        return Status::InvalidArgument("prx: position count exceeds sane cap");
+    }
+
+    // Pass 1: validate ordering and build the delta column without writing.
+    std::vector<uint32_t> deltas;
+    deltas.reserve(flat.size());
+    size_t off = 0;
+    for (uint32_t fc : freqs) {
+        uint32_t prev = 0;
+        for (uint32_t i = 0; i < fc; ++i) {
+            const uint32_t pos = flat[off + i];
+            if (i > 0 && pos < prev) {
+                return Status::InvalidArgument("prx: positions within a doc must be ascending");
+            }
+            deltas.push_back(i == 0 ? pos : pos - prev);
+            prev = pos;
+        }
+        off += fc;
+    }
+
+    // Pass 2: emit header, count column, delta column. Nothing below can fail.
+    out->put_varint32(static_cast<uint32_t>(freqs.size()));
+    out->put_varint32(static_cast<uint32_t>(flat.size()));
+    encode_pfor_runs(freqs, out);
+    encode_pfor_runs(deltas, out);
+    return Status::OK();
+}
+
+// Per-doc-list front end for the PFOR encoder: flattens `per_doc` into one
+// positions buffer plus a per-doc frequency column and hands both to
+// encode_pfor_payload_flat, whose Status is returned unchanged.
+Status encode_pfor_payload(std::span<const std::vector<uint32_t>> per_doc, ByteSink* out) {
+    std::vector<uint32_t> freqs;
+    std::vector<uint32_t> flat;
+    freqs.reserve(per_doc.size());
+    for (size_t d = 0; d < per_doc.size(); ++d) {
+        const std::vector<uint32_t>& doc = per_doc[d];
+        freqs.push_back(static_cast<uint32_t>(doc.size()));
+        flat.insert(flat.end(), doc.begin(), doc.end());
+    }
+    return encode_pfor_payload_flat(flat, freqs, out);
+}
+
+// Decodes a PFOR window payload into one position vector per doc.
+// Steps: read doc_count and total_pos, reject values above the anti-DoS caps,
+// decode the per-doc count column, require its sum to equal total_pos, decode
+// the flat delta column, then rebuild absolute positions doc by doc (prefix sum
+// restarting at every doc). Bytes left over after the delta column are
+// Corruption; that check runs last, so *out has already been filled by then.
+Status decode_pfor_payload(Slice plain, std::vector<std::vector<uint32_t>>* out) {
+    ByteSource src(plain);
+    uint32_t doc_count = 0;
+    uint32_t total_pos = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count));
+    SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos));
+    if (total_pos > kMaxWindowPositions) {
+        return Status::Corruption("prx: position count exceeds sane cap");
+    }
+    if (doc_count > kMaxWindowDocs) {
+        return Status::Corruption("prx: doc count exceeds sane cap");
+    }
+
+    std::vector<uint32_t> pos_counts;
+    SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts));
+    uint64_t count_sum = 0;
+    for (const uint32_t count : pos_counts) {
+        count_sum += count;
+    }
+    if (count_sum != total_pos) {
+        return Status::Corruption("prx: pos_count sum mismatch");
+    }
+
+    std::vector<uint32_t> deltas;
+    SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, &deltas));
+
+    out->clear();
+    out->reserve(doc_count);
+    const uint32_t* cursor = deltas.data();
+    for (uint32_t d = 0; d < doc_count; ++d) {
+        const uint32_t count = pos_counts[d];
+        std::vector<uint32_t>& doc = out->emplace_back();
+        doc.reserve(count);
+        // Running sum starts at 0, so the first delta is taken as absolute.
+        uint32_t position = 0;
+        for (uint32_t i = 0; i < count; ++i) {
+            position += cursor[i];
+            doc.push_back(position);
+        }
+        cursor += count;
+    }
+
+    if (!src.eof()) {
+        return Status::Corruption("prx: trailing bytes after pfor payload");
+    }
+    return Status::OK();
+}
+
+// Appends one PFOR window to `sink`:
+//   u8 codec (PrxCodec::kPfor), varint32 payload length, payload bytes,
+//   fixed32 crc32c computed over the three fields before it.
+void write_pfor(Slice payload, ByteSink* sink) {
+    ByteSink window;
+    window.put_u8(static_cast<uint8_t>(PrxCodec::kPfor));
+    window.put_varint32(static_cast<uint32_t>(payload.size()));
+    window.put_bytes(payload);
+
+    const auto covered = window.view();
+    sink->put_bytes(covered);
+    sink->put_fixed32(crc32c(covered));
+}
+
+// Decodes per-doc position lists from a plain (varint) payload:
+//   varint32 doc_count
+//   per doc: varint32 pos_count, then pos_count varint32 values (first is
+//            absolute, the rest are deltas to the previous position).
+//
+// Both counts come from untrusted bytes and are bounded BEFORE they reach
+// reserve(): doc_count by kMaxWindowDocs, and both doc_count and every
+// pos_count by the bytes still unread. Each doc needs at least one byte (its
+// pos_count varint) and each position at least one byte (its varint), so a
+// count larger than the remaining bytes can never be satisfied. Such inputs
+// already failed while reading varints; they now fail as Corruption without
+// first attempting an allocation of up to 4G entries.
+// Bytes left over after the last doc are Corruption.
+Status decode_payload(Slice plain, std::vector<std::vector<uint32_t>>* out) {
+    ByteSource src(plain);
+    uint32_t doc_count = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count));
+    if (doc_count > kMaxWindowDocs) {
+        return Status::Corruption("prx: doc count exceeds sane cap");
+    }
+    out->clear();
+    if (doc_count > src.remaining()) {
+        return Status::Corruption("prx: doc count exceeds payload");
+    }
+    out->reserve(doc_count);
+    for (uint32_t d = 0; d < doc_count; ++d) {
+        uint32_t pos_count = 0;
+        SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count));
+        // Cap the allocation by what the payload can actually hold.
+        if (pos_count > src.remaining()) {
+            return Status::Corruption("prx: position count exceeds payload");
+        }
+        std::vector<uint32_t> doc;
+        doc.reserve(pos_count);
+        uint32_t prev = 0;
+        for (uint32_t i = 0; i < pos_count; ++i) {
+            uint32_t delta = 0;
+            SNII_RETURN_IF_ERROR(src.get_varint32(&delta));
+            prev = (i == 0) ? delta : prev + delta;
+            doc.push_back(prev);
+        }
+        out->push_back(std::move(doc));
+    }
+    if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload");
+    return Status::OK();
+}
+
+// CSR decode of a PFOR payload: every doc's positions land in one flat buffer
+// and a per-doc offset table, with NO per-doc std::vector allocation. On
+// success `pos_off` has doc_count + 1 entries (pos_off[0] == 0) and doc d's
+// positions are pos_flat[pos_off[d] .. pos_off[d + 1]).
+//
+// The count column is decoded straight into *pos_off and the delta column
+// straight into *pos_flat; both are then rewritten in place (counts -> start
+// offsets, deltas -> absolute positions). Because the counts are verified to
+// sum to total_pos, which is capped by kMaxWindowPositions, the running offset
+// fits in 32 bits.
+Status decode_pfor_payload_csr(Slice plain, std::vector<uint32_t>* pos_flat,
+                               std::vector<uint32_t>* pos_off) {
+    ByteSource src(plain);
+    uint32_t doc_count = 0;
+    uint32_t total_pos = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count));
+    SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos));
+    if (total_pos > kMaxWindowPositions) {
+        return Status::Corruption("prx: position count exceeds sane cap");
+    }
+    if (doc_count > kMaxWindowDocs) {
+        return Status::Corruption("prx: doc count exceeds sane cap");
+    }
+
+    // Per-doc counts; capacity leaves room for the closing offset entry.
+    pos_off->clear();
+    pos_off->reserve(static_cast<size_t>(doc_count) + 1);
+    SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, pos_off));
+    uint64_t count_sum = 0;
+    for (const uint32_t count : *pos_off) {
+        count_sum += count;
+    }
+    if (count_sum != total_pos) {
+        return Status::Corruption("prx: pos_count sum mismatch");
+    }
+
+    pos_flat->reserve(total_pos);
+    SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, pos_flat));
+
+    uint32_t* values = pos_flat->data();
+    uint32_t cursor = 0;
+    for (uint32_t& slot : *pos_off) {
+        const uint32_t count = slot;
+        slot = cursor;
+        // Prefix sum restarting at 0, so the doc's first delta stays absolute.
+        uint32_t position = 0;
+        for (uint32_t i = 0; i < count; ++i) {
+            position += values[cursor + i];
+            values[cursor + i] = position;
+        }
+        cursor += count;
+    }
+    pos_off->push_back(cursor);
+
+    if (!src.eof()) {
+        return Status::Corruption("prx: trailing bytes after pfor payload");
+    }
+    return Status::OK();
+}
+
+// Checks a caller-supplied selection of doc ordinals against a window: every
+// ordinal must be < doc_count (else Corruption) and the sequence must be
+// strictly ascending (else InvalidArgument). An empty selection is valid.
+Status validate_doc_ordinals(std::span<const uint32_t> doc_ordinals, uint32_t doc_count) {
+    bool have_previous = false;
+    uint32_t previous = 0;
+    for (const uint32_t ordinal : doc_ordinals) {
+        if (ordinal >= doc_count) {
+            return Status::Corruption("prx: selected doc ordinal out of range");
+        }
+        if (have_previous && ordinal <= previous) {
+            return Status::InvalidArgument("prx: selected doc ordinals must be strictly ascending");
+        }
+        previous = ordinal;
+        have_previous = true;
+    }
+    return Status::OK();
+}
+
+// One selected doc's slice of the flat delta stream plus its output location.
+struct SelectedRange {
+    uint32_t begin;     // index of the doc's first delta in the full stream
+    uint32_t end;       // one past the doc's last delta in the full stream
+    uint32_t out_begin; // index of the doc's first position in the selected output
+
+    // Explicit constructor so ranges can be built through emplace_back().
+    SelectedRange(uint32_t begin_, uint32_t end_, uint32_t out_begin_)
+            : begin(begin_), end(end_), out_begin(out_begin_) {}
+};
+
+// Counts the distinct kFrqBaseUnit-sized PFOR runs of the delta stream that at
+// least one non-empty selected range touches. Ranges are walked in the given
+// order and a run is never counted twice: runs below `first_uncounted` were
+// already attributed to an earlier range. Returns 0 for an empty selection or
+// an empty stream.
+uint32_t count_covered_pfor_runs(std::span<const SelectedRange> selected, uint32_t total_pos) {
+    if (selected.empty() || total_pos == 0) {
+        return 0;
+    }
+    uint32_t covered = 0;
+    uint32_t first_uncounted = 0;
+    for (const SelectedRange& r : selected) {
+        if (r.begin == r.end) {
+            continue; // empty range touches no run
+        }
+        const uint32_t lo = std::max<uint32_t>(r.begin / kFrqBaseUnit, first_uncounted);
+        const uint32_t hi = (r.end - 1) / kFrqBaseUnit;
+        if (lo > hi) {
+            continue; // every run of this range was counted already
+        }
+        covered += hi - lo + 1;
+        first_uncounted = hi + 1;
+    }
+    return covered;
+}
+
+// Heuristic: should the whole delta column be decoded instead of decoding
+// sparsely? True when the selection holds at least half of all positions, or
+// when it touches at least three quarters of the PFOR runs. Always false for an
+// empty selection or an empty stream.
+bool should_decode_full_prx_positions(std::span<const SelectedRange> selected,
+                                      uint32_t selected_pos_count, uint32_t total_pos) {
+    if (selected.empty() || total_pos == 0) {
+        return false;
+    }
+    const bool selects_half = selected_pos_count * 2 >= total_pos;
+    if (selects_half) {
+        return true;
+    }
+    const uint32_t total_runs = (total_pos + kFrqBaseUnit - 1) / kFrqBaseUnit;
+    const uint32_t covered_runs = count_covered_pfor_runs(selected, total_pos);
+    const bool covers_three_quarters = covered_runs * 4 >= total_runs * 3;
+    return covers_three_quarters;
+}
+
+// In-place compaction after a FULL delta decode: keeps only the selected
+// ranges of `pos_flat`, converting each range's deltas to absolute positions on
+// the way, and rebuilds `pos_off` as the CSR offset table of the compacted
+// buffer (selected.size() + 1 entries, first entry 0). Each element is read
+// before the write cursor stores to a slot; this presumably relies on the
+// ranges being ascending and non-overlapping so the write cursor never passes
+// the read position -- confirm against the caller.
+void compact_selected_pfor_positions(std::span<const SelectedRange> selected,
+                                     std::vector<uint32_t>& pos_flat,
+                                     std::vector<uint32_t>& pos_off) {
+    pos_off.clear();
+    pos_off.reserve(selected.size() + 1);
+    pos_off.push_back(0);
+
+    size_t dst = 0;
+    for (const SelectedRange& r : selected) {
+        const uint32_t count = r.end - r.begin;
+        if (count == 1) {
+            // Single position: its delta already is the absolute value.
+            pos_flat[dst++] = pos_flat[r.begin];
+        } else {
+            uint32_t position = 0;
+            for (uint32_t i = 0; i < count; ++i) {
+                position += pos_flat[r.begin + i];
+                pos_flat[dst++] = position;
+            }
+        }
+        pos_off.push_back(static_cast<uint32_t>(dst));
+    }
+    pos_flat.resize(dst);
+}
+
+// Streams the per-doc count column (PFOR runs of kFrqBaseUnit counts) and, for
+// every doc listed in `doc_ordinals`, records
+//   - a SelectedRange: where the doc's deltas sit in the full delta stream and
+//     where its positions start in the selected output;
+//   - the running end offset in `pos_off` (CSR table, first entry 0).
+// Outputs: *total_pos_count is the 64-bit sum of ALL counts in the window,
+// *selected_pos_count the sum over the selected docs only. `doc_ordinals` is
+// matched with a single forward cursor, so it presumably has to be ascending
+// (see validate_doc_ordinals). If an ordinal was never reached, Corruption is
+// returned.
+// NOTE(review): the stream offset and *selected_pos_count are 32-bit sums of
+// untrusted counts; the caller presumably compares *total_pos_count against the
+// capped total before using the ranges -- confirm.
+Status decode_selected_pfor_count_ranges(ByteSource* src, uint32_t doc_count,
+                                         std::span<const uint32_t> doc_ordinals,
+                                         std::vector<SelectedRange>& selected,
+                                         std::vector<uint32_t>& pos_off, uint64_t* total_pos_count,
+                                         uint32_t* selected_pos_count) {
+    selected.clear();
+    selected.reserve(doc_ordinals.size());
+    pos_off.clear();
+    pos_off.reserve(doc_ordinals.size() + 1);
+    pos_off.push_back(0);
+    *total_pos_count = 0;
+    *selected_pos_count = 0;
+
+    std::array<uint32_t, kFrqBaseUnit> counts {};
+    uint32_t stream_off = 0; // offset of the current doc's first delta
+    size_t wanted_idx = 0;   // next entry of doc_ordinals to match
+    for (uint32_t run_begin = 0; run_begin < doc_count; run_begin += kFrqBaseUnit) {
+        const uint32_t run_len = std::min<uint32_t>(kFrqBaseUnit, doc_count - run_begin);
+        SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, counts.data()));
+        for (uint32_t i = 0; i < run_len; ++i) {
+            const uint32_t doc = run_begin + i;
+            const uint32_t count = counts[i];
+            *total_pos_count += count;
+            const bool is_selected =
+                    wanted_idx < doc_ordinals.size() && doc_ordinals[wanted_idx] == doc;
+            if (is_selected) {
+                selected.emplace_back(stream_off, stream_off + count, *selected_pos_count);
+                *selected_pos_count += count;
+                pos_off.push_back(*selected_pos_count);
+                ++wanted_idx;
+            }
+            stream_off += count;
+        }
+    }
+    if (wanted_idx == doc_ordinals.size()) {
+        return Status::OK();
+    }
+    return Status::Corruption("prx: selected doc ordinal was not decoded");
+}
+
+// Sparse decode of the delta column: walks the PFOR runs of the full stream
+// (total_pos values, kFrqBaseUnit per run) and decodes only runs that overlap
+// a selected range; all other runs are skipped with pfor_skip. The overlapping
+// part of each decoded run is copied, still as raw deltas, to the range's slot
+// in `pos_flat` (starting at range.out_begin). No delta-to-absolute conversion
+// happens here.
+// `selected` is scanned with a forward-only cursor, so it presumably must be
+// ordered by `begin` and non-overlapping -- confirm against the caller.
+// `pos_flat` must already be large enough for every out_begin-based write.
+Status decode_sparse_selected_pfor_positions(ByteSource* src, uint32_t total_pos,
+                                             std::span<const SelectedRange> selected,
+                                             std::span<uint32_t> pos_flat) {
+    std::array<uint32_t, kFrqBaseUnit> run_buf {};
+    size_t range_idx = 0;
+    for (uint32_t run_begin = 0; run_begin < total_pos; run_begin += kFrqBaseUnit) {
+        const uint32_t run_len = std::min<uint32_t>(kFrqBaseUnit, total_pos - run_begin);
+        const uint32_t run_end = run_begin + run_len;
+        // Drop ranges that end at or before this run; they are fully handled.
+        while (range_idx < selected.size() && selected[range_idx].end <= run_begin) {
+            ++range_idx;
+        }
+        // No remaining range starts inside this run: skip it without decoding.
+        if (range_idx == selected.size() || selected[range_idx].begin >= run_end) {
+            SNII_RETURN_IF_ERROR(pfor_skip(src, run_len));
+            continue;
+        }
+
+        SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, run_buf.data()));
+        // Copy the intersection of every range that starts before run_end. The
+        // scan uses a local index because a range may continue into later runs.
+        for (size_t ri = range_idx; ri < selected.size() && selected[ri].begin < run_end; ++ri) {
+            const SelectedRange& range = selected[ri];
+            const uint32_t copy_begin = std::max(range.begin, run_begin);
+            const uint32_t copy_end = std::min(range.end, run_end);
+            // Destination = the range's output start plus the part already
+            // copied from earlier runs.
+            const uint32_t dst_begin = range.out_begin + copy_begin - range.begin;
+            std::copy_n(run_buf.data() + copy_begin - run_begin, copy_end - copy_begin,
+                        pos_flat.data() + dst_begin);
+        }
+    }
+    return Status::OK();
+}
+
+// Turns each document's slice of pos_flat from deltas into running sums, in place.
+// Slice i spans [pos_off[i], pos_off[i + 1]); its first value is kept as-is and every
+// later value becomes the sum of itself and all earlier values of the same slice.
+void restore_selected_position_deltas(const std::vector<uint32_t>& pos_off,
+                                      std::span<uint32_t> pos_flat) {
+    for (size_t doc = 1; doc < pos_off.size(); ++doc) {
+        const uint32_t first = pos_off[doc - 1];
+        const uint32_t last = pos_off[doc];
+        uint32_t running = 0;
+        for (uint32_t at = first; at < last; ++at) {
+            if (at == first) {
+                running = pos_flat[at];
+            } else {
+                running += pos_flat[at];
+            }
+            pos_flat[at] = running;
+        }
+    }
+}
+
+// Selective CSR decode of a PFOR payload: fills pos_flat / pos_off with the positions
+// of only the documents named in doc_ordinals.
+// Steps: read doc_count and total_pos (both capped), validate the ordinals, decode the
+// per-document counts into selected ranges, then either decode every position and
+// compact, or decode sparsely and rebuild running sums -- whichever
+// should_decode_full_prx_positions picks. Trailing bytes after the payload are rejected.
+// Returns Corruption on any cap / sum / trailing-byte violation, or the first error
+// propagated from a helper.
+Status decode_pfor_payload_csr_selective(Slice plain, std::span<const uint32_t> doc_ordinals,
+                                         std::vector<uint32_t>* pos_flat,
+                                         std::vector<uint32_t>* pos_off) {
+    ByteSource reader(plain);
+    uint32_t n_docs = 0;
+    uint32_t n_positions = 0;
+    SNII_RETURN_IF_ERROR(reader.get_varint32(&n_docs));
+    SNII_RETURN_IF_ERROR(reader.get_varint32(&n_positions));
+    if (n_positions > kMaxWindowPositions) {
+        return Status::Corruption("prx: position count exceeds sane cap");
+    }
+    if (n_docs > kMaxWindowDocs) {
+        return Status::Corruption("prx: doc count exceeds sane cap");
+    }
+    SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, n_docs));
+
+    pos_flat->clear();
+
+    std::vector<SelectedRange> ranges;
+    uint64_t counted = 0;
+    uint32_t wanted = 0;
+    SNII_RETURN_IF_ERROR(decode_selected_pfor_count_ranges(&reader, n_docs, doc_ordinals, ranges,
+                                                           *pos_off, &counted, &wanted));
+    // The per-document counts must add up to the declared total.
+    if (counted != n_positions) {
+        return Status::Corruption("prx: pos_count sum mismatch");
+    }
+
+    if (should_decode_full_prx_positions(ranges, wanted, n_positions)) {
+        // Dense selection: decode everything, then keep only the selected slices.
+        SNII_RETURN_IF_ERROR(decode_pfor_runs(&reader, n_positions, pos_flat));
+        compact_selected_pfor_positions(ranges, *pos_flat, *pos_off);
+    } else {
+        // Sparse selection: decode only overlapping runs, then undo the delta coding.
+        pos_flat->resize(wanted);
+        const std::span<uint32_t> out_view(pos_flat->data(), pos_flat->size());
+        SNII_RETURN_IF_ERROR(
+                decode_sparse_selected_pfor_positions(&reader, n_positions, ranges, out_view));
+        restore_selected_position_deltas(*pos_off, out_view);
+    }
+
+    if (!reader.eof()) {
+        return Status::Corruption("prx: trailing bytes after pfor payload");
+    }
+    return Status::OK();
+}
+
+// CSR decode of a plain (raw varint) payload.
+// Layout read here: varint doc_count, then per document a varint pos_count followed by
+// pos_count varint deltas. pos_flat receives the running sums per document; pos_off
+// receives doc_count + 1 offsets into pos_flat, starting at 0.
+// Returns Corruption when doc_count or the accumulated position count exceeds its cap,
+// or when bytes remain after the last document.
+Status decode_payload_csr(Slice plain, std::vector<uint32_t>* pos_flat,
+                          std::vector<uint32_t>* pos_off) {
+    ByteSource reader(plain);
+    uint32_t n_docs = 0;
+    SNII_RETURN_IF_ERROR(reader.get_varint32(&n_docs));
+    if (n_docs > kMaxWindowDocs) {
+        return Status::Corruption("prx: doc count exceeds sane cap");
+    }
+
+    pos_flat->clear();
+    pos_off->clear();
+    pos_off->reserve(static_cast<size_t>(n_docs) + 1);
+    pos_off->push_back(0);
+
+    uint64_t seen_positions = 0;
+    for (uint32_t doc = 0; doc != n_docs; ++doc) {
+        uint32_t n_pos = 0;
+        SNII_RETURN_IF_ERROR(reader.get_varint32(&n_pos));
+        seen_positions += n_pos;
+        if (seen_positions > kMaxWindowPositions) {
+            return Status::Corruption("prx: position count exceeds sane cap");
+        }
+        // Starting from zero makes the first delta the first stored value.
+        uint32_t absolute = 0;
+        for (uint32_t left = n_pos; left != 0; --left) {
+            uint32_t delta = 0;
+            SNII_RETURN_IF_ERROR(reader.get_varint32(&delta));
+            absolute += delta;
+            pos_flat->push_back(absolute);
+        }
+        pos_off->push_back(static_cast<uint32_t>(pos_flat->size()));
+    }
+
+    if (!reader.eof()) return Status::Corruption("prx: trailing bytes after payload");
+    return Status::OK();
+}
+
+// Selective CSR decode of a plain (raw varint) payload: like decode_payload_csr, but
+// only documents whose ordinal appears in doc_ordinals are written to the outputs.
+//   plain        - varint doc_count, then per document a varint pos_count followed by
+//                  pos_count varint deltas.
+//   doc_ordinals - ordinals to keep; checked by validate_doc_ordinals and consumed
+//                  front to back as the document index advances.
+//   pos_flat     - running-sum positions of the selected documents, concatenated.
+//   pos_off      - one offset per selected document plus a leading 0.
+// Every delta is still read (the stream has no skip index); unselected ones are dropped.
+// Returns Corruption when a cap is exceeded, when bytes remain after the last document,
+// or when a requested ordinal was never matched -- the same guard the PFOR variant
+// applies, so callers can rely on pos_off having doc_ordinals.size() + 1 entries on OK.
+Status decode_payload_csr_selective(Slice plain, std::span<const uint32_t> doc_ordinals,
+                                    std::vector<uint32_t>* pos_flat,
+                                    std::vector<uint32_t>* pos_off) {
+    ByteSource src(plain);
+    uint32_t doc_count = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count));
+    if (doc_count > kMaxWindowDocs) {
+        return Status::Corruption("prx: doc count exceeds sane cap");
+    }
+    SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count));
+    pos_flat->clear();
+    pos_off->clear();
+    pos_off->reserve(doc_ordinals.size() + 1);
+    pos_off->push_back(0);
+    size_t next_doc = 0;
+    uint64_t total_pos = 0;
+    for (uint32_t d = 0; d < doc_count; ++d) {
+        uint32_t pos_count = 0;
+        SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count));
+        total_pos += pos_count;
+        if (total_pos > kMaxWindowPositions) {
+            return Status::Corruption("prx: position count exceeds sane cap");
+        }
+        const bool selected = next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == d;
+        uint32_t prev = 0;
+        for (uint32_t i = 0; i < pos_count; ++i) {
+            uint32_t delta = 0;
+            SNII_RETURN_IF_ERROR(src.get_varint32(&delta));
+            if (!selected) continue; // consumed only to advance the stream
+            prev = (i == 0) ? delta : prev + delta;
+            pos_flat->push_back(prev);
+        }
+        if (selected) {
+            pos_off->push_back(static_cast<uint32_t>(pos_flat->size()));
+            ++next_doc;
+        }
+    }
+    if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload");
+    // Never report success with fewer slices than the caller asked for.
+    if (next_doc != doc_ordinals.size()) {
+        return Status::Corruption("prx: selected doc ordinal was not decoded");
+    }
+    return Status::OK();
+}
+
+// Decides whether a legacy-path window is zstd-compressed.
+//   level == 0 -> never (forced raw)
+//   level  > 0 -> always (forced zstd)
+//   level  < 0 -> only when plain_len reaches kAutoZstdMinBytes (auto)
+bool should_compress(int level, size_t plain_len) {
+    if (level < 0) {
+        return plain_len >= kAutoZstdMinBytes;
+    }
+    return level != 0;
+}
+
+// Emits a raw window to sink. Bytes written, in order: u8 codec (kRaw),
+// varint uncomp_len, the payload itself, then a fixed32 crc32c computed over
+// everything before it (codec + length + payload).
+void write_raw(Slice plain, ByteSink* sink) {
+    ByteSink frame;
+    frame.put_u8(static_cast<uint8_t>(PrxCodec::kRaw));
+    frame.put_varint32(static_cast<uint32_t>(plain.size()));
+    frame.put_bytes(plain);
+
+    const Slice frame_bytes = frame.view();
+    sink->put_bytes(frame_bytes);
+    sink->put_fixed32(crc32c(frame_bytes));
+}
+
+// Emits a zstd window to sink. Bytes written, in order: u8 codec (kZstd),
+// varint uncomp_len, varint comp_len, the compressed payload, then a fixed32 crc32c
+// computed over everything before it.
+// A non-positive level falls back to kDefaultZstdLevel. Returns the zstd_compress
+// error unchanged; nothing is written to sink in that case.
+Status write_zstd(Slice plain, int level, ByteSink* sink) {
+    const auto effective_level = (level > 0) ? level : kDefaultZstdLevel;
+    std::vector<uint8_t> compressed;
+    SNII_RETURN_IF_ERROR(zstd_compress(plain, effective_level, &compressed));
+
+    ByteSink frame;
+    frame.put_u8(static_cast<uint8_t>(PrxCodec::kZstd));
+    frame.put_varint32(static_cast<uint32_t>(plain.size()));
+    frame.put_varint32(static_cast<uint32_t>(compressed.size()));
+    frame.put_bytes(Slice(compressed));
+
+    const Slice frame_bytes = frame.view();
+    sink->put_bytes(frame_bytes);
+    sink->put_fixed32(crc32c(frame_bytes));
+    return Status::OK();
+}
+
+// Reads one framed window from src: codec byte, varint uncomp_len, (zstd only) varint
+// comp_len, payload bytes, trailing fixed32 crc32c. The crc is checked after the
+// payload has been sliced, over the bytes from the codec byte through the payload.
+//   codec      - receives the codec byte; must be kRaw, kZstd or kPfor.
+//   uncomp_len - receives the declared uncompressed length (capped).
+//   payload    - receives a view of the stored payload (compressed for kZstd).
+// Returns Corruption for an unknown codec, an oversized uncomp_len or a crc mismatch,
+// or the ByteSource error if the input is truncated.
+Status read_framed(ByteSource* src, uint8_t* codec, uint32_t* uncomp_len, Slice* payload) {
+    const size_t frame_begin = src->position();
+    SNII_RETURN_IF_ERROR(src->get_u8(codec));
+    const bool is_raw = *codec == static_cast<uint8_t>(PrxCodec::kRaw);
+    const bool is_zstd = *codec == static_cast<uint8_t>(PrxCodec::kZstd);
+    const bool is_pfor = *codec == static_cast<uint8_t>(PrxCodec::kPfor);
+    if (!(is_raw || is_zstd || is_pfor)) {
+        return Status::Corruption("prx: unknown codec");
+    }
+
+    SNII_RETURN_IF_ERROR(src->get_varint32(uncomp_len));
+    if (*uncomp_len > kMaxWindowUncompBytes) {
+        return Status::Corruption("prx: uncomp_len exceeds sane window cap");
+    }
+
+    // Raw and PFOR store exactly uncomp_len bytes; zstd stores comp_len bytes.
+    size_t body_len = *uncomp_len;
+    if (is_zstd) {
+        uint32_t comp_len = 0;
+        SNII_RETURN_IF_ERROR(src->get_varint32(&comp_len));
+        body_len = comp_len;
+    }
+    SNII_RETURN_IF_ERROR(src->get_bytes(body_len, payload));
+
+    const size_t covered_len = src->position() - frame_begin;
+    uint32_t expected_crc = 0;
+    SNII_RETURN_IF_ERROR(src->get_fixed32(&expected_crc));
+    const auto actual_crc = crc32c(src->slice_from(frame_begin, covered_len));
+    if (actual_crc != expected_crc) {
+        return Status::Corruption("prx: window crc mismatch");
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Encodes one position window from per-document position lists and appends it to sink.
+// zstd_level_or_negative_for_auto selects the codec:
+//   < 0  -> PFOR payload (the auto path)
+//   == 0 -> legacy raw varint payload
+//   > 0  -> legacy varint payload, zstd-compressed at that level
+// The legacy codecs are kept so test/legacy paths can still exercise them.
+// Returns InvalidArgument for a null sink, otherwise the first encoder error or OK.
+Status build_prx_window(std::span<const std::vector<uint32_t>> per_doc_positions,
+                        int zstd_level_or_negative_for_auto, ByteSink* sink) {
+    if (sink == nullptr) return Status::InvalidArgument("prx: null sink");
+    const int level = zstd_level_or_negative_for_auto;
+
+    if (level < 0) {
+        ByteSink pfor_payload;
+        SNII_RETURN_IF_ERROR(encode_pfor_payload(per_doc_positions, &pfor_payload));
+        write_pfor(pfor_payload.view(), sink);
+        return Status::OK();
+    }
+
+    ByteSink varint_payload;
+    SNII_RETURN_IF_ERROR(encode_payload(per_doc_positions, &varint_payload));
+    const Slice plain_view = varint_payload.view();
+    if (should_compress(level, plain_view.size())) {
+        return write_zstd(plain_view, level, sink);
+    }
+    write_raw(plain_view, sink);
+    return Status::OK();
+}
+
+// Flat-input variant of build_prx_window: positions arrive as one concatenated array
+// plus a per-document frequency array, both handed to the flat encoders unchanged.
+// Codec selection by zstd_level_or_negative_for_auto: < 0 PFOR, 0 raw, > 0 zstd.
+// Returns InvalidArgument for a null sink, otherwise the first encoder error or OK.
+Status build_prx_window_flat(std::span<const uint32_t> positions_flat,
+                             std::span<const uint32_t> freqs, int zstd_level_or_negative_for_auto,
+                             ByteSink* sink) {
+    if (sink == nullptr) return Status::InvalidArgument("prx: null sink");
+    const int level = zstd_level_or_negative_for_auto;
+
+    if (level < 0) {
+        ByteSink pfor_payload;
+        SNII_RETURN_IF_ERROR(encode_pfor_payload_flat(positions_flat, freqs, &pfor_payload));
+        write_pfor(pfor_payload.view(), sink);
+        return Status::OK();
+    }
+
+    ByteSink varint_payload;
+    SNII_RETURN_IF_ERROR(encode_payload_flat(positions_flat, freqs, &varint_payload));
+    const Slice plain_view = varint_payload.view();
+    if (should_compress(level, plain_view.size())) {
+        return write_zstd(plain_view, level, sink);
+    }
+    write_raw(plain_view, sink);
+    return Status::OK();
+}
+
+// Reads one framed window from source and decodes it into one position vector per
+// document. PFOR payloads go to decode_pfor_payload; raw payloads are decoded directly;
+// zstd payloads are decompressed to uncomp_len bytes first and then decoded as raw.
+// Returns InvalidArgument for a null argument, otherwise the first framing /
+// decompression / decode error or OK.
+Status read_prx_window(ByteSource* source, std::vector<std::vector<uint32_t>>* per_doc_positions) {
+    if (source == nullptr || per_doc_positions == nullptr) {
+        return Status::InvalidArgument("prx: null arg");
+    }
+    uint8_t codec = 0;
+    uint32_t uncomp_len = 0;
+    Slice payload;
+    SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload));
+
+    if (codec == static_cast<uint8_t>(PrxCodec::kPfor)) {
+        return decode_pfor_payload(payload, per_doc_positions);
+    }
+
+    // read_framed accepted the codec, so anything that is not raw here is zstd.
+    std::vector<uint8_t> inflated;
+    Slice plain = payload;
+    if (codec != static_cast<uint8_t>(PrxCodec::kRaw)) {
+        SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &inflated));
+        plain = Slice(inflated);
+    }
+    return decode_payload(plain, per_doc_positions);
+}
+
+// CSR variant of read_prx_window: decodes the window into a flat position array
+// (pos_flat) and per-document offsets (pos_off) instead of nested vectors.
+// Codec dispatch matches read_prx_window (PFOR, raw, or zstd-then-raw).
+// Returns InvalidArgument for a null argument, otherwise the first framing /
+// decompression / decode error or OK.
+Status read_prx_window_csr(ByteSource* source, std::vector<uint32_t>* pos_flat,
+                           std::vector<uint32_t>* pos_off) {
+    if (source == nullptr || pos_flat == nullptr || pos_off == nullptr) {
+        return Status::InvalidArgument("prx: null arg");
+    }
+    uint8_t codec = 0;
+    uint32_t uncomp_len = 0;
+    Slice payload;
+    SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload));
+
+    if (codec == static_cast<uint8_t>(PrxCodec::kPfor)) {
+        return decode_pfor_payload_csr(payload, pos_flat, pos_off);
+    }
+
+    // read_framed accepted the codec, so anything that is not raw here is zstd.
+    std::vector<uint8_t> inflated;
+    Slice plain = payload;
+    if (codec != static_cast<uint8_t>(PrxCodec::kRaw)) {
+        SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &inflated));
+        plain = Slice(inflated);
+    }
+    return decode_payload_csr(plain, pos_flat, pos_off);
+}
+
+// Selective CSR variant of read_prx_window_csr: only the documents listed in
+// doc_ordinals are materialized into pos_flat / pos_off.
+// Codec dispatch matches read_prx_window (PFOR, raw, or zstd-then-raw).
+// Returns InvalidArgument for a null argument, otherwise the first framing /
+// decompression / decode error or OK.
+Status read_prx_window_csr_selective(ByteSource* source, std::span<const uint32_t> doc_ordinals,
+                                     std::vector<uint32_t>* pos_flat,
+                                     std::vector<uint32_t>* pos_off) {
+    if (source == nullptr || pos_flat == nullptr || pos_off == nullptr) {
+        return Status::InvalidArgument("prx: null arg");
+    }
+    uint8_t codec = 0;
+    uint32_t uncomp_len = 0;
+    Slice payload;
+    SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload));
+
+    if (codec == static_cast<uint8_t>(PrxCodec::kPfor)) {
+        return decode_pfor_payload_csr_selective(payload, doc_ordinals, pos_flat, pos_off);
+    }
+
+    // read_framed accepted the codec, so anything that is not raw here is zstd.
+    std::vector<uint8_t> inflated;
+    Slice plain = payload;
+    if (codec != static_cast<uint8_t>(PrxCodec::kRaw)) {
+        SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &inflated));
+        plain = Slice(inflated);
+    }
+    return decode_payload_csr_selective(plain, doc_ordinals, pos_flat, pos_off);
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp b/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp
new file mode 100644
index 00000000000000..1f7790e3aac84e
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp
@@ -0,0 +1,154 @@
+#include "snii/format/sampled_term_index.h"
+
+#include <algorithm>
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+
+namespace snii::format {
+
+namespace {
+
+// Number of leading bytes that term and prev have in common. This is the front-coding
+// primitive used when writing term keys.
+uint32_t common_prefix_len(std::string_view term, std::string_view prev) {
+    const uint32_t bound = static_cast<uint32_t>(std::min(term.size(), prev.size()));
+    uint32_t shared = 0;
+    for (; shared < bound; ++shared) {
+        if (term[shared] != prev[shared]) {
+            break;
+        }
+    }
+    return shared;
+}
+
+// Appends term to sink as a front-coded key relative to prev:
+// varint prefix_len (bytes shared with prev), varint suffix_len, then the suffix bytes.
+void write_term_key(std::string_view term, std::string_view prev, ByteSink* sink) {
+    const uint32_t shared = common_prefix_len(term, prev);
+    sink->put_varint32(shared);
+    sink->put_varint32(static_cast<uint32_t>(term.size() - shared));
+    sink->put_bytes(Slice(term.substr(shared)));
+}
+
+// Reads one front-coded key from src and rebuilds the full term in out as
+// prev[0, prefix_len) followed by the stored suffix.
+// Returns Corruption when prefix_len is longer than prev, or the ByteSource error when
+// the input is truncated. out is only modified once all reads have succeeded.
+Status read_term_key(ByteSource* src, std::string_view prev, std::string* out) {
+    uint32_t shared = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&shared));
+    uint32_t tail_len = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&tail_len));
+    if (shared > prev.size()) {
+        return Status::Corruption("sampled_term_index: prefix_len exceeds prev_term length");
+    }
+
+    Slice tail;
+    SNII_RETURN_IF_ERROR(src->get_bytes(tail_len, &tail));
+
+    out->assign(prev.substr(0, shared));
+    out->append(reinterpret_cast<const char*>(tail.data()), tail.size());
+    return Status::OK();
+}
+
+} // namespace
+
+void SampledTermIndexBuilder::add_block_first_term(std::string_view first_term) { // Records one block's first term.
+    first_terms_.emplace_back(first_term); // appended in call order; finish() writes them in this same order
+}
+
+// Serializes the collected first terms as a kSampledTermIndex section into sink.
+// Payload: varint n_blocks; when n_blocks > 0, the min term and max term (first and
+// last sample, each coded against an empty previous term), followed by every sample
+// term front-coded against its predecessor.
+void SampledTermIndexBuilder::finish(ByteSink* sink) {
+    ByteSink body;
+    body.put_varint32(static_cast<uint32_t>(first_terms_.size()));
+
+    if (!first_terms_.empty()) {
+        const std::string_view no_prev {};
+        // min_term / max_term are stored in full (no shared prefix).
+        write_term_key(first_terms_.front(), no_prev, &body);
+        write_term_key(first_terms_.back(), no_prev, &body);
+
+        std::string_view previous = no_prev;
+        for (const auto& sample : first_terms_) {
+            write_term_key(sample, previous, &body);
+            previous = sample;
+        }
+    }
+
+    SectionFramer::write(*sink, static_cast<uint8_t>(SectionType::kSampledTermIndex),
+                         body.view());
+}
+
+namespace {
+
+// Parses a sampled-term-index payload: varint n_blocks, then (when n_blocks > 0) the
+// min term, the max term and n_blocks front-coded sample terms.
+//   payload - section payload as written by SampledTermIndexBuilder::finish.
+//   terms   - receives the decoded sample terms; replaced only on success.
+// min/max are not used for lookups; they are consumed to keep the cursor aligned and
+// then cross-checked against the first and last sample term.
+// Returns Corruption on trailing bytes, an impossible n_blocks, a bad prefix length or
+// a min/max mismatch, or the ByteSource error when the payload is truncated.
+Status parse_payload(Slice payload, std::vector<std::string>* terms) {
+    ByteSource src(payload);
+    uint32_t n_blocks = 0;
+    SNII_RETURN_IF_ERROR(src.get_varint32(&n_blocks));
+    if (n_blocks == 0) {
+        if (!src.eof()) {
+            return Status::Corruption("sampled_term_index: empty index contains trailing bytes");
+        }
+        terms->clear();
+        return Status::OK();
+    }
+
+    // n_blocks comes straight from the payload. Every front-coded key occupies at least
+    // two bytes (prefix_len varint + suffix_len varint), so a count above
+    // payload.size() / 2 can never be satisfied. Rejecting it here keeps the reserve()
+    // below from sizing an allocation off a corrupt count.
+    if (n_blocks > payload.size() / 2) {
+        return Status::Corruption("sampled_term_index: n_blocks exceeds payload size");
+    }
+
+    // min_term / max_term (do not drive binary search directly; must be consumed to verify structural alignment).
+    std::string min_term;
+    std::string max_term;
+    SNII_RETURN_IF_ERROR(read_term_key(&src, std::string_view {}, &min_term));
+    SNII_RETURN_IF_ERROR(read_term_key(&src, std::string_view {}, &max_term));
+
+    std::vector<std::string> out;
+    out.reserve(n_blocks);
+    std::string prev;
+    for (uint32_t i = 0; i < n_blocks; ++i) {
+        std::string term;
+        SNII_RETURN_IF_ERROR(read_term_key(&src, prev, &term));
+        prev = term; // the next key is front-coded against this one
+        out.push_back(std::move(term));
+    }
+    if (!src.eof()) {
+        return Status::Corruption("sampled_term_index: payload contains trailing bytes");
+    }
+    if (out.front() != min_term || out.back() != max_term) {
+        return Status::Corruption("sampled_term_index: min/max inconsistent with sample_terms");
+    }
+    *terms = std::move(out);
+    return Status::OK();
+}
+
+} // namespace
+
+// Opens a sampled term index from one framed section.
+// Unframes section via SectionFramer::read, requires its type to be
+// kSampledTermIndex, resets *out to a default reader and parses the payload into it.
+// Returns InvalidArgument for a null out or a wrong section type, otherwise the
+// framing / parse error or OK.
+Status SampledTermIndexReader::open(Slice section, SampledTermIndexReader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("sampled_term_index: out is null");
+    }
+
+    FramedSection framed;
+    ByteSource reader(section);
+    SNII_RETURN_IF_ERROR(SectionFramer::read(reader, &framed));
+
+    const auto expected_type = static_cast<uint8_t>(SectionType::kSampledTermIndex);
+    if (framed.type != expected_type) {
+        return Status::InvalidArgument("sampled_term_index: not a kSampledTermIndex section");
+    }
+
+    *out = SampledTermIndexReader {};
+    return parse_payload(framed.payload, &out->sample_terms_);
+}
+
+// Finds the block that could hold target.
+//   maybe_present - set to false when the index is empty or target sorts before the
+//                   first sample term; true otherwise.
+//   block_ordinal - index of the last sample term that is <= target (0 when
+//                   maybe_present is false).
+// sample_terms_ holds each block's FIRST term, so a target greater than every sample
+// is still in range: it routes to the last block, whose contents may extend past its
+// first term.
+// Returns InvalidArgument when an output pointer is null, otherwise OK.
+Status SampledTermIndexReader::locate(std::string_view target, bool* maybe_present,
+                                      uint32_t* block_ordinal) const {
+    if (maybe_present == nullptr || block_ordinal == nullptr) {
+        return Status::InvalidArgument("sampled_term_index: output pointer is null");
+    }
+    *maybe_present = false;
+    *block_ordinal = 0;
+
+    // Empty index, or target before the first block's first term: no block can hold it.
+    const bool in_range =
+            !sample_terms_.empty() && !(target < std::string_view(sample_terms_.front()));
+    if (!in_range) {
+        return Status::OK();
+    }
+
+    // First sample strictly greater than target; the block just before it is the
+    // candidate. The range check above guarantees this iterator is past begin().
+    const auto first = sample_terms_.begin();
+    const auto past_target = std::upper_bound(
+            first, sample_terms_.end(), target,
+            [](std::string_view needle, const std::string& sample) {
+                return needle < std::string_view(sample);
+            });
+    const auto candidate = (past_target - first) - 1;
+    *block_ordinal = static_cast<uint32_t>(candidate);
+    *maybe_present = true;
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/stats_block.cpp b/be/src/storage/index/snii/core/src/format/stats_block.cpp
new file mode 100644
index 00000000000000..527f4f98d43d79
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/stats_block.cpp
@@ -0,0 +1,46 @@
+#include "snii/format/stats_block.h"
+
+namespace snii::format {
+
+namespace {
+
+// Writes the StatsBlock fields to payload as varint64 values in the fixed on-disk
+// order: doc_count, indexed_doc_count, term_count, sum_total_term_freq, null_count.
+// Uses the ByteSink varint primitive rather than hand-assembled bytes.
+void encode_payload(const StatsBlock& sb, ByteSink* payload) {
+    const uint64_t ordered_fields[] = {sb.doc_count, sb.indexed_doc_count, sb.term_count,
+                                       sb.sum_total_term_freq, sb.null_count};
+    for (const uint64_t value : ordered_fields) {
+        payload->put_varint64(value);
+    }
+}
+
+// Reads the five varint64 StatsBlock fields from payload in the same order
+// encode_payload writes them, then requires the payload to be fully consumed.
+// Fields are stored into out as they are read, so out may be partially updated when an
+// error is returned. Returns Corruption on trailing bytes or the ByteSource error on a
+// truncated payload.
+Status decode_payload(Slice payload, StatsBlock* out) {
+    ByteSource reader(payload);
+    uint64_t* const ordered_slots[] = {&out->doc_count, &out->indexed_doc_count,
+                                       &out->term_count, &out->sum_total_term_freq,
+                                       &out->null_count};
+    for (uint64_t* slot : ordered_slots) {
+        SNII_RETURN_IF_ERROR(reader.get_varint64(slot));
+    }
+    if (reader.eof()) {
+        return Status::OK();
+    }
+    return Status::Corruption("stats_block: trailing bytes in payload");
+}
+
+} // namespace
+
+// Serializes sb into a temporary payload and appends it to sink as a framed
+// kStatsBlock section.
+void encode_stats_block(const StatsBlock& sb, ByteSink* sink) {
+    ByteSink body;
+    encode_payload(sb, &body);
+    const auto section_type = static_cast<uint8_t>(SectionType::kStatsBlock);
+    SectionFramer::write(*sink, section_type, body.view());
+}
+
+// Reads one framed section from src, requires it to be a kStatsBlock section and
+// decodes its payload into out.
+// Returns InvalidArgument for any other section type, otherwise the framing / payload
+// error or OK.
+Status decode_stats_block(ByteSource* src, StatsBlock* out) {
+    FramedSection framed;
+    SNII_RETURN_IF_ERROR(SectionFramer::read(*src, &framed));
+    const auto expected_type = static_cast<uint8_t>(SectionType::kStatsBlock);
+    if (framed.type != expected_type) {
+        return Status::InvalidArgument("stats_block: unexpected section type");
+    }
+    return decode_payload(framed.payload, out);
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp b/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp
new file mode 100644
index 00000000000000..ed781c4d82e667
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp
@@ -0,0 +1,129 @@
+#include "snii/format/tail_meta_region.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+namespace {
+
+// Header field bytes (before header_crc): u32 ver + u32 flags + u64 meta_region_len
+// + u32 n + u64 directory_offset + u64 directory_length.
+constexpr size_t kHeaderFields = 4 + 4 + 8 + 4 + 8 + 8; // 36
+constexpr size_t kHeaderSize = kHeaderFields + 4;       // + header_crc32c
+constexpr size_t kRegionChecksumSize = 4; // fixed32 meta_region_checksum written last by finish()
+
+} // namespace
+
+// Queues one logical index for the region: its id, its suffix and a private copy of
+// its serialized meta bytes. Entries are laid out by finish() in the order added.
+void TailMetaRegionBuilder::add_index(uint64_t index_id, std::string index_suffix,
+                                      Slice per_index_meta_bytes) {
+    Entry entry;
+    entry.index_id = index_id;
+    entry.suffix = std::move(index_suffix);
+    // Copy the bytes so the caller's buffer does not have to outlive the builder.
+    const auto* first_byte = per_index_meta_bytes.data();
+    entry.bytes.assign(first_byte, first_byte + per_index_meta_bytes.size());
+    entries_.push_back(std::move(entry));
+}
+
+// Serializes the whole tail meta region into sink.
+// Region layout: header fields, header_crc32c, every per-index meta block in insertion
+// order, the logical index directory, then meta_region_checksum (crc32c of all
+// preceding region bytes). Directory entries record each block's offset and length
+// measured from the start of the region.
+void TailMetaRegionBuilder::finish(ByteSink* sink) const {
+    // Pass 1: assign in-region offsets, starting right after the header.
+    LogicalIndexDirectoryBuilder directory;
+    uint64_t cursor = kHeaderSize;
+    for (const Entry& entry : entries_) {
+        LogicalIndexRef ref;
+        ref.index_id = entry.index_id;
+        ref.index_suffix = entry.suffix;
+        ref.meta_off = cursor;
+        ref.meta_len = entry.bytes.size();
+        directory.add(ref);
+        cursor += entry.bytes.size();
+    }
+
+    ByteSink directory_bytes;
+    directory.finish(&directory_bytes);
+    const uint64_t directory_offset = cursor;
+    const uint64_t directory_length = directory_bytes.size();
+
+    // Header fields; meta_region_len counts the trailing region checksum too.
+    ByteSink header;
+    header.put_fixed32(kMetaFormatVersion);
+    header.put_fixed32(0); // flags
+    header.put_fixed64(directory_offset + directory_length + kRegionChecksumSize);
+    header.put_fixed32(static_cast<uint32_t>(entries_.size()));
+    header.put_fixed64(directory_offset);
+    header.put_fixed64(directory_length);
+
+    // Pass 2: assemble the region and seal it with its checksum.
+    ByteSink region;
+    region.put_bytes(header.view());
+    region.put_fixed32(crc32c(header.view())); // header_crc32c
+    for (const Entry& entry : entries_) {
+        region.put_bytes(Slice(entry.bytes));
+    }
+    region.put_bytes(directory_bytes.view());
+    const auto region_crc = crc32c(region.view());
+    region.put_fixed32(region_crc); // meta_region_checksum
+
+    sink->put_bytes(region.view());
+}
+
+// Validates a serialized tail meta region and opens its directory.
+//   region - the complete region bytes; the reader keeps this view, so the bytes must
+//            stay alive while the reader is used.
+//   out    - receives the directory, the region view and the index count.
+// Checks, in order: minimum size, trailing meta_region_checksum, header crc, format
+// version, declared length, directory bounds, directory entry count.
+// Returns InvalidArgument for a null out, Unsupported for an unknown version,
+// Corruption for every other failed check, or the directory reader's error.
+Status TailMetaRegionReader::open(Slice region, TailMetaRegionReader* out) {
+    if (out == nullptr) return Status::InvalidArgument("tail_meta_region: null out");
+    if (region.size() < kHeaderSize + kRegionChecksumSize) {
+        return Status::Corruption("tail_meta_region: region too short");
+    }
+
+    // Verify the trailing region checksum.
+    const size_t covered = region.size() - kRegionChecksumSize;
+    ByteSource cs(region.subslice(covered, kRegionChecksumSize));
+    uint32_t region_crc = 0;
+    SNII_RETURN_IF_ERROR(cs.get_fixed32(&region_crc));
+    if (crc32c(region.subslice(0, covered)) != region_crc) {
+        return Status::Corruption("tail_meta_region: meta_region_checksum mismatch");
+    }
+
+    // Parse + verify the header.
+    ByteSource hs(region.subslice(0, kHeaderFields));
+    uint32_t ver = 0, flags = 0, n = 0;
+    uint64_t meta_region_len = 0, directory_offset = 0, directory_length = 0;
+    SNII_RETURN_IF_ERROR(hs.get_fixed32(&ver));
+    SNII_RETURN_IF_ERROR(hs.get_fixed32(&flags));
+    SNII_RETURN_IF_ERROR(hs.get_fixed64(&meta_region_len));
+    SNII_RETURN_IF_ERROR(hs.get_fixed32(&n));
+    SNII_RETURN_IF_ERROR(hs.get_fixed64(&directory_offset));
+    SNII_RETURN_IF_ERROR(hs.get_fixed64(&directory_length));
+    ByteSource hc(region.subslice(kHeaderFields, 4));
+    uint32_t header_crc = 0;
+    SNII_RETURN_IF_ERROR(hc.get_fixed32(&header_crc));
+    if (crc32c(region.subslice(0, kHeaderFields)) != header_crc) {
+        return Status::Corruption("tail_meta_region: header crc mismatch");
+    }
+    if (ver != kMetaFormatVersion) {
+        return Status::Unsupported("tail_meta_region: unsupported meta_format_version");
+    }
+    if (meta_region_len != region.size()) {
+        return Status::Corruption("tail_meta_region: declared length mismatch");
+    }
+    // Bounds check written as a subtraction: directory_offset + directory_length is a
+    // sum of two stored u64 values and could wrap around, letting an out-of-range
+    // directory slip past a naive "offset + length > size" comparison.
+    if (directory_offset < kHeaderSize || directory_offset > region.size() ||
+        directory_length > region.size() - directory_offset) {
+        return Status::Corruption("tail_meta_region: directory out of range");
+    }
+
+    SNII_RETURN_IF_ERROR(LogicalIndexDirectoryReader::open(
+            region.subslice(directory_offset, directory_length), &out->dir_));
+    if (out->dir_.size() != n) {
+        return Status::Corruption("tail_meta_region: directory size mismatch");
+    }
+    out->region_ = region;
+    out->n_ = n;
+    return Status::OK();
+}
+
+// Looks up the per-index meta block for (index_id, suffix).
+//   found                - set by the directory lookup; false means no such index.
+//   per_index_meta_bytes - on a hit, receives a view into the region for that block;
+//                          left untouched on a miss.
+// Returns the directory lookup error, Corruption when the recorded block does not fit
+// inside the region, otherwise OK (also on a miss).
+Status TailMetaRegionReader::find(uint64_t index_id, std::string_view suffix, bool* found,
+                                  Slice* per_index_meta_bytes) const {
+    LogicalIndexRef ref;
+    SNII_RETURN_IF_ERROR(dir_.find(index_id, suffix, found, &ref));
+    if (!*found) return Status::OK();
+    // Compare by subtraction: meta_off + meta_len comes from stored values and could
+    // wrap around, which would let an out-of-range block pass an additive check.
+    if (ref.meta_off > region_.size() || ref.meta_len > region_.size() - ref.meta_off) {
+        return Status::Corruption("tail_meta_region: meta block out of range");
+    }
+    *per_index_meta_bytes = region_.subslice(ref.meta_off, ref.meta_len);
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/tail_pointer.cpp b/be/src/storage/index/snii/core/src/format/tail_pointer.cpp
new file mode 100644
index 00000000000000..bc17f5652d4f82
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/tail_pointer.cpp
@@ -0,0 +1,95 @@
+#include "snii/format/tail_pointer.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+namespace {
+
+// Byte widths of every fixed field, used to derive the constant on-disk size:
+// u32 magic + u16 version + 3*u64 + 2*u32 + u8 size + u32 tail_checksum.
+constexpr size_t kMagicBytes = 4; // written with put_fixed32
+constexpr size_t kVersionBytes = 2; // written with put_fixed16
+constexpr size_t kU64Bytes = 8;
+constexpr size_t kU32Bytes = 4;
+constexpr size_t kSizeByteBytes = 1; // the embedded footer size, written with put_u8
+
+constexpr size_t kFixedSize =
+        kMagicBytes + kVersionBytes + 3 * kU64Bytes + 2 * kU32Bytes + kSizeByteBytes + kU32Bytes; // = 43
+// tail_checksum is the trailing u32 and covers every byte before it.
+constexpr size_t kChecksumCoverage = kFixedSize - kU32Bytes; // = 39
+
+// Writes every checksum-covered footer field into covered, in on-disk order:
+// magic, format version, meta_region_offset, meta_region_length, hot_off,
+// meta_region_checksum, bootstrap_header_checksum, and the footer's own size byte.
+void serialize_covered(const TailPointer& tp, ByteSink* covered) {
+    ByteSink& sink = *covered;
+    // Identification.
+    sink.put_fixed32(kTailMagic);
+    sink.put_fixed16(kFormatVersion);
+    // Locations.
+    sink.put_fixed64(tp.meta_region_offset);
+    sink.put_fixed64(tp.meta_region_length);
+    sink.put_fixed64(tp.hot_off);
+    // Checksums of the structures this footer points at.
+    sink.put_fixed32(tp.meta_region_checksum);
+    sink.put_fixed32(tp.bootstrap_header_checksum);
+    // Self-describing size of the whole footer.
+    sink.put_u8(static_cast<uint8_t>(kFixedSize));
+}
+
+} // namespace
+
+size_t tail_pointer_size() { // Size in bytes of the encoded tail pointer footer.
+    return kFixedSize; // the exact length decode_tail_pointer requires of its input
+}
+
+// Appends the encoded tail pointer to sink: the checksum-covered fields followed by a
+// fixed32 crc32c over them.
+// Returns Internal when the serialized fields do not have the expected covered size
+// (nothing is written to sink in that case), otherwise OK.
+Status encode_tail_pointer(const TailPointer& tp, ByteSink* sink) {
+    ByteSink body;
+    serialize_covered(tp, &body);
+    const bool size_ok = body.size() == kChecksumCoverage;
+    if (!size_ok) {
+        return Status::Internal("tail_pointer: covered size mismatch");
+    }
+    sink->put_bytes(body.view());
+    sink->put_fixed32(crc32c(body.view())); // tail_checksum
+    return Status::OK();
+}
+
+// Parses and validates the fixed-size tail pointer footer.
+//   last_bytes - must be exactly tail_pointer_size() bytes.
+//   out        - receives the decoded fields. It is written only after every check,
+//                including tail_checksum, has passed, so a failed decode never leaves
+//                unverified values in the caller's struct.
+// Checks run in this order: exact size, magic, embedded size byte, tail_checksum.
+// Returns InvalidArgument for a null out, Corruption for a failed check, or the
+// ByteSource error from a failed read.
+Status decode_tail_pointer(Slice last_bytes, TailPointer* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("tail_pointer: null out");
+    }
+    // Anti-DoS / framing: the tail pointer is a fixed-size footer, so reject any
+    // input that is not exactly the fixed size before touching its contents.
+    if (last_bytes.size() != kFixedSize) {
+        return Status::Corruption("tail_pointer: input is not the fixed size");
+    }
+    // Region covered by the trailing tail_checksum; compared once the stored checksum
+    // has been read at the end of the footer.
+    const Slice covered = last_bytes.subslice(0, kChecksumCoverage);
+    ByteSource src(last_bytes);
+
+    uint32_t magic = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&magic));
+    if (magic != kTailMagic) {
+        return Status::Corruption("tail_pointer: bad magic");
+    }
+
+    uint16_t format_version = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed16(&format_version));
+    (void)format_version; // Read to advance the cursor; version policy lives in
+                          // the bootstrap header, not here.
+
+    // Stage the fields locally; they are untrusted until the checksum matches.
+    uint64_t meta_region_offset = 0;
+    uint64_t meta_region_length = 0;
+    uint64_t hot_off = 0;
+    uint32_t meta_region_checksum = 0;
+    uint32_t bootstrap_header_checksum = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed64(&meta_region_offset));
+    SNII_RETURN_IF_ERROR(src.get_fixed64(&meta_region_length));
+    SNII_RETURN_IF_ERROR(src.get_fixed64(&hot_off));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&meta_region_checksum));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&bootstrap_header_checksum));
+
+    uint8_t on_disk_size = 0;
+    SNII_RETURN_IF_ERROR(src.get_u8(&on_disk_size));
+    if (on_disk_size != kFixedSize) {
+        return Status::Corruption("tail_pointer: embedded size mismatch");
+    }
+
+    uint32_t tail_checksum = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&tail_checksum));
+    if (tail_checksum != crc32c(covered)) {
+        return Status::Corruption("tail_pointer: tail_checksum mismatch");
+    }
+
+    // Everything verified: publish the fields.
+    out->meta_region_offset = meta_region_offset;
+    out->meta_region_length = meta_region_length;
+    out->hot_off = hot_off;
+    out->meta_region_checksum = meta_region_checksum;
+    out->bootstrap_header_checksum = bootstrap_header_checksum;
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp b/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp
new file mode 100644
index 00000000000000..1292f8d4f09c2e
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp
@@ -0,0 +1,81 @@
+#include "snii/io/batch_range_fetcher.h"
+
+#include <algorithm>
+#include <limits>
+
+namespace snii::io {
+namespace {
+
+// Stores offset + len in *out; a sum that would wrap uint64_t is reported as Corruption.
+Status checked_end(uint64_t offset, uint64_t len, uint64_t* out) {
+    const uint64_t headroom = std::numeric_limits<uint64_t>::max() - offset;
+    if (len > headroom) return Status::Corruption("batch_range_fetcher: range end overflow");
+    *out = offset + len;
+    return Status::OK();
+}
+
+// Narrows a 64-bit length to size_t; a value size_t cannot hold is reported as Corruption.
+Status checked_size(uint64_t len, size_t* out) {
+    constexpr uint64_t kMax = static_cast<uint64_t>(std::numeric_limits<size_t>::max());
+    if (len > kMax) return Status::Corruption("batch_range_fetcher: physical range too large");
+    *out = static_cast<size_t>(len);
+    return Status::OK();
+}
+
+} // namespace
+
+BatchRangeFetcher::BatchRangeFetcher(FileReader* reader, uint64_t coalesce_gap)
+        : reader_(reader), coalesce_gap_(coalesce_gap) {} // a null reader is rejected by fetch()
+
+size_t BatchRangeFetcher::add(uint64_t offset, uint64_t len) {
+    reqs_.push_back(Req {offset, len}); // queued only; this function performs no I/O
+    return reqs_.size() - 1;            // the handle is the index of the request just appended
+}
+
+void BatchRangeFetcher::clear() {
+    reqs_.clear(); // drops every queued request, so handles returned by add() become invalid
+    phys_.clear(); // drops the fetched buffers that get() slices into
+}
+
+Status BatchRangeFetcher::fetch() {
+    // Plan: sort by offset, coalesce neighbours within coalesce_gap_, then one read_batch.
+    if (reader_ == nullptr) return Status::InvalidArgument("batch_range_fetcher: null reader");
+    phys_.clear();
+    if (reqs_.empty()) return Status::OK();
+
+    std::vector<size_t> by_offset; // request handles, visited in ascending offset order
+    for (size_t i = 0; i < reqs_.size(); ++i) by_offset.push_back(i);
+    std::sort(by_offset.begin(), by_offset.end(),
+              [this](size_t lhs, size_t rhs) { return reqs_[lhs].offset < reqs_[rhs].offset; });
+
+    std::vector<Range> segments;
+    uint64_t seg_begin = 0;
+    uint64_t seg_end = 0;
+    for (const size_t handle : by_offset) {
+        Req& req = reqs_[handle];
+        uint64_t req_end = 0;
+        SNII_RETURN_IF_ERROR(checked_end(req.offset, req.len, &req_end));
+        SNII_RETURN_IF_ERROR(checked_size(req.len, &req.len_size));
+        // Merge unless the request starts past the open segment by more than the gap.
+        const bool mergeable = req.offset <= seg_end || req.offset - seg_end <= coalesce_gap_;
+        if (!segments.empty() && mergeable) {
+            seg_end = std::max(seg_end, req_end);
+        } else {
+            segments.push_back(Range {req.offset, 0}); // len is filled in just below
+            seg_begin = req.offset;
+            seg_end = req_end;
+        }
+        req.phys_idx = segments.size() - 1;
+        SNII_RETURN_IF_ERROR(checked_size(req.offset - seg_begin, &req.sub_offset));
+        SNII_RETURN_IF_ERROR(checked_size(seg_end - seg_begin, &segments.back().len));
+    }
+    return reader_->read_batch(segments, &phys_);
+}
+
+Slice BatchRangeFetcher::get(size_t h) const {
+    const Req& req = reqs_[h]; // unchecked: h must be a handle returned by add()
+    const uint8_t* segment = phys_[req.phys_idx].data(); // buffer filled in by fetch()
+    return Slice(segment + req.sub_offset, req.len_size);
+}
+
+} // namespace snii::io
diff --git a/be/src/storage/index/snii/core/src/io/local_file.cpp b/be/src/storage/index/snii/core/src/io/local_file.cpp
new file mode 100644
index 00000000000000..af64664fe6ad30
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/local_file.cpp
@@ -0,0 +1,113 @@
+#include "snii/io/local_file.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <cstring>
+
+namespace snii::io {
+namespace {
+
+std::string errno_msg(const char* what) {
+    return std::string(what).append(": ").append(std::strerror(errno)); // "<what>: <errno text>"
+}
+
+} // namespace
+
+LocalFileReader::~LocalFileReader() {
+    if (fd_ >= 0) ::close(fd_); // releases the descriptor if one is held; close() result ignored
+}
+
+Status LocalFileReader::open(const std::string& path) {
+    fd_ = ::open(path.c_str(), O_RDONLY); // NOTE(review): an already-open fd_ is overwritten
+    if (fd_ < 0) return Status::IoError(errno_msg("open"));
+    struct stat st;
+    if (::fstat(fd_, &st) != 0) return Status::IoError(errno_msg("fstat")); // fd_ stays open
+    size_ = static_cast<uint64_t>(st.st_size); // size is a snapshot taken at open time
+    return Status::OK();
+}
+
+Status LocalFileReader::read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) {
+    if (fd_ < 0) return Status::IoError("read_at on unopened file");
+    // A null destination is a caller error; report it instead of dereferencing it below.
+    if (out == nullptr) return Status::InvalidArgument("read_at: null out");
+    // Non-wrapping bounds check (offset+len could overflow uint64 on a corrupt arg).
+    if (offset > size_ || len > size_ - offset) {
+        return Status::Corruption("read_at past end of file");
+    }
+    out->resize(len);
+    size_t done = 0;
+    while (done < len) { // pread may return a short count, so loop until len bytes arrived
+        ssize_t n = ::pread(fd_, out->data() + done, len - done, static_cast<off_t>(offset + done));
+        if (n < 0 && errno == EINTR) continue; // interrupted before any byte was read: retry
+        if (n < 0) return Status::IoError(errno_msg("pread"));
+        if (n == 0) return Status::Corruption("pread returned 0 before len");
+        done += static_cast<size_t>(n);
+    }
+    return Status::OK();
+}
+
+LocalFileWriter::~LocalFileWriter() {
+    if (fd_ >= 0) ::close(fd_); // best-effort close; buf_ is NOT flushed here (finalize() does)
+}
+
+Status LocalFileWriter::open(const std::string& path) {
+    fd_ = ::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); // truncates existing files
+    if (fd_ < 0) return Status::IoError(errno_msg("open"));
+    buf_.reserve(kBufCapacity); // NOTE(review): buf_/bytes_written_ are not reset on re-open
+    return Status::OK();
+}
+
+Status LocalFileWriter::write_all(const uint8_t* data, size_t len) {
+    // write(2) may accept fewer bytes than asked, so keep going until nothing remains;
+    // an EINTR failure is simply retried with the same arguments.
+    size_t remaining = len;
+    while (remaining > 0) {
+        const ssize_t wrote = ::write(fd_, data + (len - remaining), remaining);
+        if (wrote < 0 && errno == EINTR) continue;
+        if (wrote < 0) return Status::IoError(errno_msg("write"));
+        remaining -= static_cast<size_t>(wrote);
+    }
+    return Status::OK();
+}
+
+Status LocalFileWriter::flush_buffer() {
+    if (buf_.empty()) return Status::OK(); // nothing pending: no write call is made
+    SNII_RETURN_IF_ERROR(write_all(buf_.data(), buf_.size())); // buf_ is cleared only on success
+    buf_.clear();
+    return Status::OK();
+}
+
+Status LocalFileWriter::append(Slice data) {
+    if (fd_ < 0) return Status::IoError("append on unopened file");
+    const size_t n = data.size();
+    if (n == 0) return Status::OK();
+    if (n < kBufCapacity) {
+        // Small span: stage it in buf_, spilling the pending bytes first if it would not fit.
+        if (buf_.size() + n > kBufCapacity) SNII_RETURN_IF_ERROR(flush_buffer());
+        buf_.insert(buf_.end(), data.data(), data.data() + n);
+    } else {
+        // Span at least as large as the buffer: write the pending bytes, then the span
+        // itself, straight to the fd -- no pointless copy and no oversized buffer.
+        SNII_RETURN_IF_ERROR(flush_buffer());
+        SNII_RETURN_IF_ERROR(write_all(data.data(), n));
+    }
+    bytes_written_ += n;
+    return Status::OK();
+}
+
+Status LocalFileWriter::finalize() {
+    if (fd_ < 0) return Status::IoError("finalize on unopened file");
+    SNII_RETURN_IF_ERROR(flush_buffer());
+    if (::fsync(fd_) != 0) return Status::IoError(errno_msg("fsync"));
+    // fd_ is invalidated whether or not close() reports an error; the plain store to fd_
+    // does not touch errno, so errno_msg still sees the value set by close().
+    const int close_rc = ::close(fd_);
+    fd_ = -1;
+    if (close_rc != 0) return Status::IoError(errno_msg("close"));
+    return Status::OK();
+}
+
+} // namespace snii::io
diff --git a/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp b/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp
new file mode 100644
index 00000000000000..a643d8eca5aa3f
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp
@@ -0,0 +1,117 @@
+#include "snii/io/metered_file_reader.h"
+
+#include <algorithm>
+
+namespace snii::io {
+namespace {
+
+// Inclusive block ids [*first, *last] covering the validated, non-empty [offset, offset+len).
+void block_range(uint64_t offset, size_t len, size_t block_size, uint64_t* first, uint64_t* last) {
+    const uint64_t last_byte = offset + len - 1; // callers guard len == 0 before calling
+    *first = offset / block_size;
+    *last = last_byte / block_size;
+}
+
+} // namespace
+
+MeteredFileReader::MeteredFileReader(FileReader* inner, size_t block_size)
+        : inner_(inner), block_size_(block_size) {} // checked lazily by validate_range()
+
+void MeteredFileReader::reset_metrics() {
+    resident_.clear();       // forget which blocks were already accounted as fetched
+    metrics_ = IoMetrics {}; // counters go back to a value-initialized IoMetrics
+}
+
+Status MeteredFileReader::validate_range(uint64_t offset, size_t len) const {
+    if (inner_ == nullptr) return Status::InvalidArgument("metered: null inner reader");
+    if (block_size_ == 0) return Status::InvalidArgument("metered: zero block size");
+    // Overflow-free form of `offset + len <= size`: compare len against the bytes left.
+    const uint64_t file_size = inner_->size();
+    const bool in_bounds = offset <= file_size && len <= file_size - offset;
+    if (!in_bounds) return Status::Corruption("metered: read range past end");
+    return Status::OK();
+}
+
+// Models the FileCache for one contiguous span [offset, offset+len): every block that is
+// not yet resident is marked resident and charged to remote_bytes, and each maximal run of
+// consecutive missing blocks counts as one range GET. Returns true iff any block missed.
+bool MeteredFileReader::account_blocks(uint64_t offset, size_t len) {
+    if (len == 0) return false;
+    uint64_t first_block = 0;
+    uint64_t last_block = 0;
+    block_range(offset, len, block_size_, &first_block, &last_block);
+
+    const uint64_t file_size = inner_->size();
+    bool missed = false;
+    bool prev_missed = false; // did the previous block extend the current run of misses?
+    for (uint64_t block = first_block; block <= last_block; ++block) {
+        const bool hit = resident_.count(block) != 0;
+        if (!hit) {
+            resident_.insert(block);
+            missed = true;
+            // Charge at most one block, less for a block that extends past the file size.
+            const uint64_t block_start = block * block_size_;
+            metrics_.remote_bytes += std::min<uint64_t>(block_size_, file_size - block_start);
+            // A miss right after a hit (or at the span start) opens a new coalesced GET.
+            if (!prev_missed) ++metrics_.range_gets;
+        }
+        prev_missed = !hit;
+    }
+    return missed;
+}
+
+Status MeteredFileReader::read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) {
+    if (out == nullptr) return Status::InvalidArgument("metered: null out");
+    SNII_RETURN_IF_ERROR(validate_range(offset, len));
+    metrics_.read_at_calls += 1;
+    metrics_.total_request_bytes += len;
+    // One blocking read is one dependent step: if any block had to be fetched, the caller
+    // waited a full serial round before it could learn the next offset.
+    if (account_blocks(offset, len)) metrics_.serial_rounds += 1;
+    return inner_->read_at(offset, len, out);
+}
+
+Status MeteredFileReader::read_batch(const std::vector<Range>& ranges,
+                                     std::vector<std::vector<uint8_t>>* outs) {
+    if (outs == nullptr) return Status::InvalidArgument("metered: null batch out");
+    // An empty batch skips validate_range(), so guard inner_ before inner_->size() below.
+    if (inner_ == nullptr) return Status::InvalidArgument("metered: null inner reader");
+    for (const Range& r : ranges) {
+        SNII_RETURN_IF_ERROR(validate_range(r.offset, r.len));
+    }
+
+    // Gather the union of touched blocks so coalescing spans the whole batch, and
+    // the entire batch counts as at most one serial round.
+    std::vector<uint64_t> blocks;
+    for (const Range& r : ranges) {
+        metrics_.total_request_bytes += r.len;
+        if (r.len == 0) continue;
+        uint64_t first = 0, last = 0;
+        block_range(r.offset, r.len, block_size_, &first, &last);
+        for (uint64_t b = first; b <= last; ++b) blocks.push_back(b);
+    }
+    metrics_.read_at_calls += ranges.size();
+    std::sort(blocks.begin(), blocks.end());
+    blocks.erase(std::unique(blocks.begin(), blocks.end()), blocks.end());
+
+    bool any_miss = false;
+    const uint64_t total = inner_->size();
+    uint64_t prev_miss = 0;
+    bool have_prev = false;
+    for (uint64_t b : blocks) {
+        if (resident_.count(b)) continue;
+        resident_.insert(b);
+        any_miss = true;
+        metrics_.remote_bytes += std::min<uint64_t>(block_size_, total - b * block_size_);
+        if (!have_prev || b != prev_miss + 1) ++metrics_.range_gets; // new run
+        prev_miss = b;
+        have_prev = true;
+    }
+    if (any_miss) ++metrics_.serial_rounds;
+    // Delegate the actual byte fetch to the inner reader's batch path, so a backend
+    // that fetches a batch concurrently (e.g. S3FileReader) realizes the planned
+    // round as parallel GETs (matching the single serial round accounted above).
+    return inner_->read_batch(ranges, outs);
+}
+
+} // namespace snii::io
diff --git a/be/src/storage/index/snii/core/src/io/s3_object_store.cpp b/be/src/storage/index/snii/core/src/io/s3_object_store.cpp
new file mode 100644
index 00000000000000..6be72027ebe263
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/s3_object_store.cpp
@@ -0,0 +1,217 @@
+#include "snii/io/s3_object_store.h"
+
+// The whole implementation is compiled only when the S3 backend is enabled.
+// Without SNII_WITH_S3 this file is an empty translation unit and pulls in no
+// aws-sdk headers, keeping core aws-free by default.
+#ifdef SNII_WITH_S3
+
+#include <aws/core/Aws.h>
+#include <aws/core/auth/AWSAuthSigner.h>
+#include <aws/core/auth/AWSCredentials.h>
+#include <aws/core/client/ClientConfiguration.h>
+#include <aws/core/utils/memory/stl/AWSStringStream.h>
+#include <aws/s3/S3Client.h>
+#include <aws/s3/model/GetObjectRequest.h>
+#include <aws/s3/model/HeadObjectRequest.h>
+#include <aws/s3/model/PutObjectRequest.h>
+
+#include <algorithm>
+#include <atomic>
+#include <future>
+#include <mutex>
+#include <sstream>
+#include <utility>
+
+namespace snii::io {
+namespace {
+
+// Refcounted process-wide InitAPI/ShutdownAPI control, shared by AwsApiGuard.
+std::mutex g_api_mu;
+int g_api_refcount = 0;
+Aws::SDKOptions g_api_options;
+
+void api_acquire() {
+    std::lock_guard<std::mutex> lock(g_api_mu);
+    // Only the 0 -> 1 transition initializes the SDK; later holders just bump the count.
+    const bool first_holder = (g_api_refcount == 0);
+    if (first_holder) Aws::InitAPI(g_api_options);
+    ++g_api_refcount;
+}
+
+void api_release() {
+    std::lock_guard<std::mutex> lock(g_api_mu);
+    // An unbalanced release (count already 0) is ignored instead of going negative.
+    if (g_api_refcount <= 0) return;
+    --g_api_refcount;
+    if (g_api_refcount == 0) {
+        Aws::ShutdownAPI(g_api_options);
+    }
+}
+
+// Builds a new S3 client per call from the static credentials and endpoint in cfg.
+// Virtual-hosted addressing is mandatory because OSS rejects path-style requests
+// (SecondLevelDomainForbidden); payload signing is disabled (Never).
+std::shared_ptr<Aws::S3::S3Client> make_client(const S3Config& cfg) {
+    Aws::Auth::AWSCredentials creds(Aws::String(cfg.ak.c_str()), Aws::String(cfg.sk.c_str()));
+    Aws::Client::ClientConfigurationInitValues init;
+    init.shouldDisableIMDS = true; // credentials come from cfg, not from instance metadata
+    Aws::Client::ClientConfiguration client_cfg(init);
+    client_cfg.endpointOverride = Aws::String(cfg.endpoint.c_str()); // endpoint taken from cfg
+    client_cfg.region = Aws::String(cfg.region.c_str());
+    client_cfg.connectTimeoutMs = cfg.connect_timeout_ms; // all three timeouts are caller-set
+    client_cfg.requestTimeoutMs = cfg.request_timeout_ms;
+    client_cfg.httpRequestTimeoutMs = cfg.http_request_timeout_ms;
+    return std::make_shared<Aws::S3::S3Client>(
+            creds, client_cfg, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
+            /*useVirtualAddressing=*/true);
+}
+
+std::string join_key(const std::string& prefix, const std::string& key) {
+    // No normalization: an empty prefix yields the bare key, otherwise "<prefix>/<key>".
+    return prefix.empty() ? key : prefix + "/" + key;
+}
+
+} // namespace
+
+AwsApiGuard::AwsApiGuard() {
+    api_acquire(); // the first live guard in the process triggers Aws::InitAPI
+}
+AwsApiGuard::~AwsApiGuard() {
+    api_release(); // the last guard to go away triggers Aws::ShutdownAPI
+}
+
+// ---------------------------------------------------------------------------
+// S3FileReader
+// ---------------------------------------------------------------------------
+
+S3FileReader::~S3FileReader() = default;
+
+S3FileReader::S3FileReader(S3FileReader&&) noexcept = default;
+S3FileReader& S3FileReader::operator=(S3FileReader&&) noexcept = default;
+
+Status S3FileReader::open(const S3Config& cfg, const std::string& key, S3FileReader* out) {
+    if (out == nullptr) return Status::InvalidArgument("S3FileReader::open: null out");
+    out->client_ = make_client(cfg);
+    out->bucket_ = cfg.bucket;
+    out->object_key_ = join_key(cfg.prefix, key);
+    // A single HEAD request both confirms the object is reachable and yields its size.
+    Aws::S3::Model::HeadObjectRequest head;
+    head.SetBucket(Aws::String(out->bucket_.c_str()));
+    head.SetKey(Aws::String(out->object_key_.c_str()));
+    auto outcome = out->client_->HeadObject(head);
+    if (outcome.IsSuccess()) {
+        out->size_ = static_cast<uint64_t>(outcome.GetResult().GetContentLength());
+        return Status::OK();
+    }
+    return Status::IoError("HeadObject(" + out->object_key_ +
+                           "): " + outcome.GetError().GetMessage().c_str());
+}
+
+Status S3FileReader::read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) {
+    if (client_ == nullptr) return Status::IoError("read_at on unopened S3 object");
+    if (out == nullptr) return Status::InvalidArgument("read_at: null out");
+    // Overflow-safe form of `offset + len <= size_` (a corrupt arg could wrap uint64).
+    const bool in_bounds = offset <= size_ && len <= size_ - offset;
+    if (!in_bounds) return Status::Corruption("read_at past end of object");
+    out->resize(len);
+    if (len == 0) return Status::OK(); // nothing to fetch; also keeps `len - 1` below safe
+
+    // The range header names the first and the last byte wanted, hence the `- 1`.
+    std::ostringstream range_header;
+    range_header << "bytes=" << offset << "-" << (offset + len - 1);
+    Aws::S3::Model::GetObjectRequest request;
+    request.SetBucket(Aws::String(bucket_.c_str()));
+    request.SetKey(Aws::String(object_key_.c_str()));
+    request.SetRange(Aws::String(range_header.str().c_str()));
+
+    auto outcome = client_->GetObject(request);
+    if (!outcome.IsSuccess()) {
+        return Status::IoError("GetObject(" + object_key_ +
+                               "): " + outcome.GetError().GetMessage().c_str());
+    }
+    // Drain the body into *out; fewer than len bytes is treated as corruption.
+    auto& body = outcome.GetResult().GetBody();
+    body.read(reinterpret_cast<char*>(out->data()), static_cast<std::streamsize>(len));
+    if (static_cast<size_t>(body.gcount()) != len) {
+        return Status::Corruption("GetObject returned fewer bytes than requested");
+    }
+    return Status::OK();
+}
+
+Status S3FileReader::read_batch(const std::vector<Range>& ranges,
+                                std::vector<std::vector<uint8_t>>* outs) {
+    if (outs == nullptr) return Status::InvalidArgument("read_batch: null outs");
+    outs->resize(ranges.size());
+    if (ranges.empty()) return Status::OK();
+    // Issue GETs concurrently in bounded waves (S3Client is safe for parallel requests and
+    // each range writes its own buffer); stop launching new waves once one has failed.
+    constexpr size_t kMaxConcurrent = 16;
+    Status first_err;
+    for (size_t base = 0; base < ranges.size() && first_err.ok(); base += kMaxConcurrent) {
+        const size_t end = std::min(base + kMaxConcurrent, ranges.size());
+        std::vector<std::future<Status>> futs;
+        for (size_t i = base; i < end; ++i) {
+            futs.push_back(std::async(std::launch::async, [this, &ranges, outs, i]() {
+                return read_at(ranges[i].offset, ranges[i].len, &(*outs)[i]);
+            }));
+        }
+        for (auto& f : futs) { // always join the whole wave before deciding whether to go on
+            const Status s = f.get();
+            if (!s.ok() && first_err.ok()) first_err = s;
+        }
+    }
+    return first_err;
+}
+
+// ---------------------------------------------------------------------------
+// S3FileWriter
+// ---------------------------------------------------------------------------
+
+S3FileWriter::~S3FileWriter() = default;
+
+S3FileWriter::S3FileWriter(S3FileWriter&&) noexcept = default;
+S3FileWriter& S3FileWriter::operator=(S3FileWriter&&) noexcept = default;
+
+Status S3FileWriter::open(const S3Config& cfg, const std::string& key) {
+    client_ = make_client(cfg); // only builds the client; PutObject is issued by finalize()
+    bucket_ = cfg.bucket;
+    object_key_ = join_key(cfg.prefix, key);
+    buffer_.clear(); // re-opening discards bytes staged for a previous object
+    bytes_written_ = 0;
+    finalized_ = false; // append()/finalize() are permitted again after a re-open
+    return Status::OK();
+}
+
+Status S3FileWriter::append(Slice data) {
+    if (client_ == nullptr) return Status::IoError("append on unopened S3 writer");
+    if (finalized_) return Status::IoError("append after finalize");
+    buffer_.insert(buffer_.end(), data.data(), data.data() + data.size()); // staged in memory
+    bytes_written_ += data.size(); // counts staged bytes; the upload itself is in finalize()
+    return Status::OK();
+}
+
+Status S3FileWriter::finalize() {
+    if (client_ == nullptr) return Status::IoError("finalize on unopened S3 writer");
+    if (finalized_) return Status::IoError("finalize called twice");
+    // Copy the staged bytes into a stream body and upload them as a single PutObject.
+    auto body = Aws::MakeShared<Aws::StringStream>("S3FileWriter");
+    body->write(reinterpret_cast<const char*>(buffer_.data()),
+                static_cast<std::streamsize>(buffer_.size()));
+    Aws::S3::Model::PutObjectRequest put;
+    put.SetBucket(Aws::String(bucket_.c_str()));
+    put.SetKey(Aws::String(object_key_.c_str()));
+    put.SetBody(body);
+    put.SetContentLength(static_cast<long long>(buffer_.size()));
+
+    auto outcome = client_->PutObject(put);
+    if (outcome.IsSuccess()) {
+        finalized_ = true; // set only after a successful upload, so a failed call can be retried
+        return Status::OK();
+    }
+    return Status::IoError("PutObject(" + object_key_ +
+                           "): " + outcome.GetError().GetMessage().c_str());
+}
+
+} // namespace snii::io
+
+#endif // SNII_WITH_S3
diff --git a/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp b/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp
new file mode 100644
index 00000000000000..4987d788e6ed7d
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp
@@ -0,0 +1,42 @@
+#include "snii/query/bm25_scorer.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace snii::query {
+
+double decode_norm(uint8_t encoded) {
+    return encoded != 0 ? static_cast<double>(encoded) : 1.0; // a stored 0 decodes as 1
+}
+
+uint8_t encode_norm(uint64_t doc_length) {
+    // Saturate into [1, 255] so the value fits one byte and is never stored as 0.
+    return static_cast<uint8_t>(std::clamp<uint64_t>(doc_length, 1, 255));
+}
+
+ScorerContext ScorerContext::make(uint64_t n, uint64_t df) {
+    const double doc_count = static_cast<double>(n);
+    const double doc_freq = static_cast<double>(df);
+    // idf = log(1 + (N - df + 0.5) / (df + 0.5)) = log((N + 1) / (df + 0.5)); > 0 for df <= N.
+    ScorerContext ctx;
+    ctx.df_ = df;
+    ctx.idf_ = std::log(1.0 + (doc_count - doc_freq + 0.5) / (doc_freq + 0.5));
+    return ctx;
+}
+
+double ScorerContext::score(uint32_t tf, uint8_t encoded_norm, double avgdl,
+                            const Bm25Params& params) const {
+    const double freq = static_cast<double>(tf);
+    // Length normalization term: 1 - b + b * dl / avgdl (it is scaled by k1 below).
+    const double length_norm = 1.0 - params.b + params.b * decode_norm(encoded_norm) / avgdl;
+    return idf_ * (freq * (params.k1 + 1.0)) / (freq + params.k1 * length_norm);
+}
+
+double ScorerContext::max_score(uint32_t max_freq, uint8_t min_norm, double avgdl,
+                                const Bm25Params& params) const {
+    // score() rises with tf and falls with dl, so evaluating it at the window's largest
+    // tf and smallest encoded norm gives an upper bound for every posting in the window.
+    return score(max_freq, min_norm, avgdl, params);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/boolean_query.cpp b/be/src/storage/index/snii/core/src/query/boolean_query.cpp
new file mode 100644
index 00000000000000..e4befe6e316b4a
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/boolean_query.cpp
@@ -0,0 +1,99 @@
+#include "snii/query/boolean_query.h"
+
+#include <algorithm>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "snii/format/dict_entry.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/internal/docid_conjunction.h"
+#include "snii/query/internal/docid_posting_reader.h"
+#include "snii/query/internal/docid_union.h"
+
+namespace snii::query {
+
+namespace {
+
+std::vector<std::string_view> unique_terms(const std::vector<std::string>& terms) {
+    // The views borrow from `terms`, so the result must not outlive that vector.
+    std::vector<std::string_view> views(terms.begin(), terms.end());
+    std::sort(views.begin(), views.end());
+    const auto dup_begin = std::unique(views.begin(), views.end());
+    views.erase(dup_begin, views.end());
+    return views;
+}
+
+Status resolve_or_postings(const snii::reader::LogicalIndexReader& idx,
+                           const std::vector<std::string>& terms,
+                           std::vector<internal::ResolvedDocidPosting>* postings) {
+    postings->clear();
+    // Duplicate terms are looked up once; terms the lookup does not find are skipped.
+    const std::vector<std::string_view> distinct = unique_terms(terms);
+    for (const std::string_view term : distinct) {
+        snii::format::DictEntry entry;
+        uint64_t frq_base = 0;
+        uint64_t prx_base = 0;
+        bool found = false;
+        SNII_RETURN_IF_ERROR(idx.lookup(term, &found, &entry, &frq_base, &prx_base));
+        if (found) postings->push_back({std::move(entry), frq_base, prx_base});
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+Status boolean_or(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, std::vector<uint32_t>* docids) {
+    if (docids == nullptr) return Status::InvalidArgument("boolean_or: null out");
+    docids->clear(); // the output never carries over results from a previous call
+    if (terms.empty()) return Status::OK(); // no terms: *docids stays empty
+
+    std::vector<internal::ResolvedDocidPosting> postings; // one entry per distinct found term
+    SNII_RETURN_IF_ERROR(resolve_or_postings(idx, terms, &postings));
+    return internal::build_docid_union(idx, postings, docids);
+}
+
+Status boolean_or(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, std::vector<uint32_t>* docids,
+                  QueryProfile* profile) {
+    QueryProfileScope profile_scope(idx.reader(), profile); // lives until the call returns
+    return boolean_or(idx, terms, docids); // delegates to the unprofiled overload
+}
+
+Status boolean_or(const snii::reader::LogicalIndexReader& idx,
+                  const std::vector<std::string>& terms, DocIdSink* sink) {
+    if (sink == nullptr) return Status::InvalidArgument("boolean_or: null sink");
+    if (terms.empty()) return Status::OK(); // no terms: the sink is never touched
+
+    std::vector<internal::ResolvedDocidPosting> postings; // one entry per distinct found term
+    SNII_RETURN_IF_ERROR(resolve_or_postings(idx, terms, &postings));
+    return internal::emit_docid_union(idx, postings, sink);
+}
+
+Status boolean_and(const snii::reader::LogicalIndexReader& idx,
+                   const std::vector<std::string>& terms, std::vector<uint32_t>* docids) {
+    if (docids == nullptr) return Status::InvalidArgument("boolean_and: null out");
+    docids->clear(); // the output never carries over results from a previous call
+    if (terms.empty()) return Status::OK(); // no terms: *docids stays empty
+
+    snii::io::BatchRangeFetcher round1(idx.reader()); // handed to plan_terms() below
+    std::vector<internal::TermPlan> plans;
+    bool all_present = false;
+    SNII_RETURN_IF_ERROR(internal::plan_terms(idx, terms, &round1, &plans, &all_present,
+                                              /*need_positions=*/false));
+    if (!all_present) return Status::OK(); // presumably a term is missing, so AND is empty
+    if (round1.pending() > 0) SNII_RETURN_IF_ERROR(round1.fetch()); // skip I/O if nothing queued
+    SNII_RETURN_IF_ERROR(internal::open_preludes(round1, &plans,
+                                                 /*need_positions=*/false));
+    return internal::build_docid_only_conjunction(idx, round1, plans, docids);
+}
+
+Status boolean_and(const snii::reader::LogicalIndexReader& idx,
+                   const std::vector<std::string>& terms, std::vector<uint32_t>* docids,
+                   QueryProfile* profile) {
+    QueryProfileScope profile_scope(idx.reader(), profile); // lives until the call returns
+    return boolean_and(idx, terms, docids); // delegates to the unprofiled overload
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp
new file mode 100644
index 00000000000000..cfbafd3ca7c1bb
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp
@@ -0,0 +1,823 @@
+#include "snii/query/internal/docid_conjunction.h"
+
+#include <algorithm>
+#include <array>
+#include <iterator>
+#include <limits>
+
+#include "snii/format/frq_pod.h"
+#include "snii/query/internal/docid_set_ops.h"
+#include "snii/reader/windowed_posting.h"
+
+namespace snii::query::internal {
+
+using snii::format::DictEntry;
+using snii::format::DictEntryEnc;
+using snii::format::DictEntryKind;
+using snii::format::FrqPreludeReader;
+using snii::format::WindowMeta;
+using snii::reader::LogicalIndexReader;
+
+namespace {
+
+// Iterator type used to pass sub-ranges of a candidate docid vector.
+using CandidateIt = std::vector<uint32_t>::const_iterator;
+
+// Widest docid span (in docids) that intersect_bounded_span_with_ordinals will
+// handle with its fixed-size bitset.
+constexpr uint32_t kBoundedSpanBitsetDocs = 16 * 1024;
+// Number of 64-bit words needed to hold kBoundedSpanBitsetDocs bits.
+constexpr size_t kBoundedSpanBitsetWords = kBoundedSpanBitsetDocs / 64;
+// Minimum size required of both the candidate range and the term docid list
+// before the bitset path is attempted.
+constexpr size_t kBoundedSpanBitsetMinInput = 32;
+
+// Half-open index range [begin, end) into a candidate docid vector.
+// A default-constructed range is empty.
+struct CandidateRange {
+    size_t begin {0};
+    size_t end {0};
+};
+
+// Picks the length to fetch from a frq window: entry.frq_docs_len when it is
+// non-zero, otherwise the whole window length `win_len`.
+// Returns Corruption if entry.frq_docs_len is larger than the window.
+Status slim_frq_docs_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) {
+    const auto docs_len = entry.frq_docs_len;
+    if (docs_len > win_len) {
+        return Status::Corruption("docid_conjunction: slim frq_docs_len exceeds frq window");
+    }
+    if (docs_len > 0) {
+        *out = docs_len;
+    } else {
+        *out = win_len;
+    }
+    return Status::OK();
+}
+
+// Overflow-checked uint64_t addition: stores lhs + rhs in *out, or returns
+// Corruption(message) when the sum would not fit in 64 bits.
+Status add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) {
+    constexpr uint64_t kMax = std::numeric_limits<uint64_t>::max();
+    const uint64_t headroom = kMax - lhs;
+    if (headroom < rhs) {
+        return Status::Corruption(message);
+    }
+    *out = lhs + rhs;
+    return Status::OK();
+}
+
+// Computes posting_region.offset + base + delta with overflow checking on both
+// additions. `message` is the Corruption text used if either addition overflows.
+Status posting_abs_offset(const LogicalIndexReader& idx, uint64_t base, uint64_t delta,
+                          const char* message, uint64_t* out) {
+    const uint64_t region_start = idx.section_refs().posting_region.offset;
+    uint64_t region_plus_base = 0;
+    SNII_RETURN_IF_ERROR(add_u64(region_start, base, message, &region_plus_base));
+    return add_u64(region_plus_base, delta, message, out);
+}
+
+// Fills the derived fields of `*p` (df, pod_ref, windowed) from p->entry and
+// queues on `fetcher` the ranges the plan needs:
+//   - windowed pod-ref entries: the prelude;
+//   - other pod-ref entries: the frq window (trimmed by slim_frq_docs_len) and,
+//     when `need_positions` is set, the prx window;
+//   - entries that are not pod-refs: nothing.
+Status configure_term_plan(const LogicalIndexReader& idx, bool need_positions,
+                           snii::io::BatchRangeFetcher* fetcher, TermPlan* p) {
+    p->df = p->entry.df;
+    p->pod_ref = (p->entry.kind == DictEntryKind::kPodRef);
+    p->windowed = p->pod_ref && p->entry.enc == DictEntryEnc::kWindowed;
+
+    if (!p->pod_ref) {
+        return Status::OK();
+    }
+
+    if (p->windowed) {
+        uint64_t prelude_offset = 0;
+        SNII_RETURN_IF_ERROR(posting_abs_offset(idx, p->frq_base, p->entry.frq_off_delta,
+                                                "docid_conjunction: prelude offset overflow",
+                                                &prelude_offset));
+        p->prelude_handle = fetcher->add(prelude_offset, p->entry.prelude_len);
+        return Status::OK();
+    }
+
+    uint64_t frq_off = 0;
+    uint64_t frq_len = 0;
+    SNII_RETURN_IF_ERROR(idx.resolve_frq_window(p->entry, p->frq_base, &frq_off, &frq_len));
+    uint64_t docs_fetch_len = frq_len;
+    SNII_RETURN_IF_ERROR(slim_frq_docs_len(p->entry, frq_len, &docs_fetch_len));
+    p->frq_handle = fetcher->add(frq_off, docs_fetch_len);
+
+    if (need_positions) {
+        uint64_t prx_off = 0;
+        uint64_t prx_len = 0;
+        SNII_RETURN_IF_ERROR(idx.resolve_prx_window(p->entry, p->prx_base, &prx_off, &prx_len));
+        p->prx_handle = fetcher->add(prx_off, prx_len);
+    }
+    return Status::OK();
+}
+
+// Returns the ordinals 0 .. n_windows()-1 of every window in `prelude`.
+std::vector<uint32_t> all_windows(const FrqPreludeReader& prelude) {
+    const auto window_total = prelude.n_windows();
+    std::vector<uint32_t> ordinals;
+    ordinals.reserve(window_total);
+    for (uint32_t w = 0; w < window_total; ++w) {
+        ordinals.push_back(w);
+    }
+    return ordinals;
+}
+
+// Returns the indices of `plans` sorted by ascending df.
+std::vector<size_t> ascending_df_order(const std::vector<TermPlan>& plans) {
+    std::vector<size_t> order;
+    order.reserve(plans.size());
+    for (size_t idx = 0; idx < plans.size(); ++idx) {
+        order.push_back(idx);
+    }
+    const auto lower_df_first = [&plans](size_t lhs, size_t rhs) {
+        return plans[lhs].df < plans[rhs].df;
+    };
+    std::sort(order.begin(), order.end(), lower_df_first);
+    return order;
+}
+
+// Computes the smallest docid a window may contain. Window 0 is taken to start
+// at docid 0; any later window starts at meta.win_base + 1.
+// Returns Corruption if win_base + 1 does not fit in uint32_t or if it lies
+// beyond meta.last_docid.
+Status first_docid_in_window(const WindowMeta& meta, uint32_t window_ordinal, uint32_t* first) {
+    if (window_ordinal == 0) {
+        *first = 0;
+        return Status::OK();
+    }
+
+    constexpr uint32_t kMaxDocid = std::numeric_limits<uint32_t>::max();
+    if (meta.win_base >= kMaxDocid) {
+        return Status::Corruption("docid_conjunction: window base exceeds docid range");
+    }
+
+    const auto window_first = static_cast<uint32_t>(meta.win_base + 1);
+    *first = window_first;
+    if (window_first > meta.last_docid) {
+        return Status::Corruption("docid_conjunction: invalid window docid range");
+    }
+    return Status::OK();
+}
+
+// Sets *full to true when the window's doc_count equals the number of docids
+// in [first docid of the window, meta.last_docid], i.e. the window has no gaps.
+Status is_dense_full_window(const WindowMeta& meta, uint32_t window_ordinal, bool* full) {
+    uint32_t window_first = 0;
+    SNII_RETURN_IF_ERROR(first_docid_in_window(meta, window_ordinal, &window_first));
+    const uint64_t span = static_cast<uint64_t>(meta.last_docid) - window_first + 1;
+    *full = (meta.doc_count == span);
+    return Status::OK();
+}
+
+// Appends every docid in the closed range [first, last] to *out.
+// Returns Corruption when last < first or when the range would not fit in the
+// vector's remaining size_t headroom.
+Status append_docid_range(uint32_t first, uint32_t last, std::vector<uint32_t>* out) {
+    if (first > last) {
+        return Status::Corruption("docid_conjunction: invalid dense docid range");
+    }
+    const uint64_t range_len = static_cast<uint64_t>(last) - first + 1;
+    const auto headroom =
+            static_cast<uint64_t>(std::numeric_limits<size_t>::max() - out->size());
+    if (range_len > headroom) {
+        return Status::Corruption("docid_conjunction: dense docid range too large");
+    }
+    out->reserve(out->size() + static_cast<size_t>(range_len));
+    // A 64-bit cursor lets the loop terminate even when last == UINT32_MAX.
+    for (uint64_t docid = first; docid <= last; ++docid) {
+        out->push_back(static_cast<uint32_t>(docid));
+    }
+    return Status::OK();
+}
+
+// Binary-searches `candidates`, starting at index *search_begin, for the
+// elements whose value lies in [first, last] and returns their index range.
+// *search_begin is advanced to the end of the returned range so that the next
+// call resumes from there.
+CandidateRange find_candidate_range(const std::vector<uint32_t>& candidates, size_t* search_begin,
+                                    uint32_t first, uint32_t last) {
+    const auto base = candidates.begin();
+    const auto lo = std::lower_bound(base + *search_begin, candidates.end(), first);
+    const auto hi = std::upper_bound(lo, candidates.end(), last);
+
+    CandidateRange range;
+    range.begin = static_cast<size_t>(lo - base);
+    range.end = static_cast<size_t>(hi - base);
+    *search_begin = range.end;
+    return range;
+}
+
+// Appends the candidates in [begin, end) to the back of *out.
+void append_candidate_range(CandidateIt begin, CandidateIt end, std::vector<uint32_t>* out) {
+    std::vector<uint32_t>& dst = *out;
+    dst.insert(dst.cend(), begin, end);
+}
+
+// Drops chunk->prx_doc_ordinals when the chunk's docids have the same size,
+// the same first element and the same last element as `term_docids`.
+void clear_ordinals_if_all_term_docs_selected(const std::vector<uint32_t>& term_docids,
+                                              DocidChunk* chunk) {
+    const std::vector<uint32_t>& selected = chunk->docids;
+    if (selected.empty() || selected.size() != term_docids.size()) {
+        return;
+    }
+    if (selected.front() != term_docids.front() || selected.back() != term_docids.back()) {
+        return;
+    }
+    chunk->prx_doc_ordinals.clear();
+}
+
+// Fast path for the case where the candidates contain a run of consecutive
+// elements that starts at term_docids.front(), ends at term_docids.back() and
+// is exactly (back - front + 1) elements long. When that holds, all of
+// `term_docids` is appended to *out and to chunk->docids and true is returned;
+// otherwise nothing is modified and false is returned.
+// `term_docids` must be non-empty.
+bool append_term_docs_if_candidates_cover_span(CandidateIt begin, CandidateIt end,
+                                               const std::vector<uint32_t>& term_docids,
+                                               std::vector<uint32_t>* out, DocidChunk* chunk) {
+    const uint32_t span_first = term_docids.front();
+    const uint32_t span_last = term_docids.back();
+    const uint64_t span_width = static_cast<uint64_t>(span_last) - span_first + 1;
+
+    // Fewer candidates than the span is wide: they cannot cover it.
+    if (static_cast<size_t>(end - begin) < span_width) {
+        return false;
+    }
+
+    // Locate the candidate equal to the first term docid (cheap check first).
+    CandidateIt run_start = begin;
+    if (*run_start != span_first) {
+        run_start = std::lower_bound(begin, end, span_first);
+    }
+    if (run_start == end || *run_start != span_first) {
+        return false;
+    }
+
+    const auto remaining = static_cast<uint64_t>(end - run_start);
+    if (remaining < span_width) {
+        return false;
+    }
+
+    // The element span_width - 1 positions later must be the last term docid.
+    const CandidateIt run_last = run_start + static_cast<size_t>(span_width) - 1;
+    if (*run_last != span_last) {
+        return false;
+    }
+
+    out->insert(out->end(), term_docids.begin(), term_docids.end());
+    chunk->docids.insert(chunk->docids.end(), term_docids.begin(), term_docids.end());
+    return true;
+}
+
+// Emits the candidates in [begin, end) for a window spanning [first, last].
+// chunk->prx_doc_count is set to the width of [first, last]. Each candidate is
+// appended to *out and chunk->docids. Unless the candidates cover the whole
+// window (same count as the width, starting at `first` and ending at `last`),
+// the offset (docid - first) of each candidate is also appended to
+// chunk->prx_doc_ordinals.
+// Returns Corruption if the width of [first, last] does not fit in uint32_t.
+Status append_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, uint32_t first,
+                                            uint32_t last, std::vector<uint32_t>* out,
+                                            DocidChunk* chunk) {
+    const auto selected = static_cast<size_t>(end - begin);
+    chunk->docids.reserve(selected);
+
+    const uint64_t window_width = static_cast<uint64_t>(last) - first + 1;
+    if (window_width > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("docid_conjunction: dense window exceeds doc count range");
+    }
+    chunk->prx_doc_count = static_cast<uint32_t>(window_width);
+
+    const bool covers_whole_window =
+            selected == window_width && begin != end && *begin == first && *(end - 1) == last;
+    if (!covers_whole_window) {
+        chunk->prx_doc_ordinals.reserve(selected);
+        for (CandidateIt cur = begin; cur != end; ++cur) {
+            const uint32_t docid = *cur;
+            out->push_back(docid);
+            chunk->docids.push_back(docid);
+            chunk->prx_doc_ordinals.push_back(docid - first);
+        }
+        return Status::OK();
+    }
+
+    // Every docid of the window is selected, so no ordinals are recorded.
+    out->insert(out->end(), begin, end);
+    chunk->docids.insert(chunk->docids.end(), begin, end);
+    return Status::OK();
+}
+
+// Intersects the candidate range [begin, end) with `term_docids` for the case
+// where the term's docids fill, or nearly fill, the span [front, back].
+//
+// Returns false without emitting anything when this strategy does not apply:
+// term_docids has more entries than its span is wide, or there are gaps and
+// they exceed 1/8 of the span width or are at least as numerous as
+// `candidate_count`. Otherwise every candidate inside the span that is not a
+// gap is appended to *out and chunk->docids, its ordinal is appended to
+// chunk->prx_doc_ordinals, and true is returned.
+//
+// term_docids must be non-empty: front() and back() are read unconditionally.
+// NOTE(review): the ordinal arithmetic presumes term_docids is sorted and
+// duplicate-free -- confirm against the decoder's guarantees.
+bool intersect_dense_term_span_with_ordinals(CandidateIt begin, CandidateIt end,
+                                             const std::vector<uint32_t>& term_docids,
+                                             size_t candidate_count, std::vector<uint32_t>* out,
+                                             DocidChunk* chunk) {
+    const uint32_t first = term_docids.front();
+    const uint32_t last = term_docids.back();
+    const uint64_t width = static_cast<uint64_t>(last) - first + 1;
+    if (term_docids.size() > width) {
+        return false;
+    }
+    // Number of docids in [first, last] that the term does not contain.
+    const uint64_t missing_count = width - term_docids.size();
+    if (missing_count != 0 &&
+        (missing_count * 8 > width || missing_count >= candidate_count ||
+         missing_count > static_cast<uint64_t>(std::numeric_limits<size_t>::max()))) {
+        return false;
+    }
+
+    if (missing_count == 0) {
+        // No gaps: every candidate inside [first, last] matches, and its
+        // ordinal is simply its offset from `first`.
+        for (auto it = begin; it != end; ++it) {
+            if (*it < first) {
+                continue;
+            }
+            if (*it > last) {
+                break;
+            }
+            out->push_back(*it);
+            chunk->docids.push_back(*it);
+            chunk->prx_doc_ordinals.push_back(*it - first);
+        }
+        clear_ordinals_if_all_term_docs_selected(term_docids, chunk);
+        return true;
+    }
+
+    // Materialize the gaps: walk term_docids and record every docid skipped
+    // between consecutive entries.
+    std::vector<uint32_t> missing;
+    missing.reserve(static_cast<size_t>(missing_count));
+    uint32_t expect = first;
+    for (uint32_t docid : term_docids) {
+        while (expect < docid) {
+            missing.push_back(expect);
+            ++expect;
+        }
+        // Guard against wrapping `expect` past UINT32_MAX.
+        if (docid < std::numeric_limits<uint32_t>::max()) {
+            expect = docid + 1;
+        }
+    }
+    while (expect <= last) {
+        missing.push_back(expect);
+        if (expect == std::numeric_limits<uint32_t>::max()) {
+            break;
+        }
+        ++expect;
+    }
+
+    // Merge-walk the candidates against the gap list. `miss` ends up as the
+    // number of gaps below the current candidate.
+    size_t miss = 0;
+    for (auto it = begin; it != end; ++it) {
+        if (*it < first) {
+            continue;
+        }
+        if (*it > last) {
+            break;
+        }
+        while (miss < missing.size() && missing[miss] < *it) {
+            ++miss;
+        }
+        if (miss < missing.size() && missing[miss] == *it) {
+            // The candidate falls in a gap: the term does not contain it.
+            continue;
+        }
+        out->push_back(*it);
+        chunk->docids.push_back(*it);
+        // Ordinal = offset from `first` minus the gaps that precede it.
+        chunk->prx_doc_ordinals.push_back(static_cast<uint32_t>(*it - first - miss));
+    }
+    clear_ordinals_if_all_term_docs_selected(term_docids, chunk);
+    return true;
+}
+
+// Intersects the candidate range [begin, end) with `term_docids` using a
+// fixed-size stack bitset. Applies only when both inputs have at least
+// kBoundedSpanBitsetMinInput elements and their combined docid span is at most
+// kBoundedSpanBitsetDocs wide.
+//
+// Returns false without emitting anything when the strategy does not apply.
+// Otherwise each candidate whose bit is set is appended to *out and
+// chunk->docids, together with its ordinal (the number of term docids below
+// it) in chunk->prx_doc_ordinals, and true is returned.
+//
+// Both ranges must be non-empty. They are expected to be sorted ascending;
+// because the docids originate from decoded index data, every bitset offset is
+// range-checked so that a non-monotonic list cannot index outside `bits`.
+bool intersect_bounded_span_with_ordinals(CandidateIt begin, CandidateIt end,
+                                          const std::vector<uint32_t>& term_docids,
+                                          size_t candidate_count, std::vector<uint32_t>* out,
+                                          DocidChunk* chunk) {
+    if (candidate_count < kBoundedSpanBitsetMinInput ||
+        term_docids.size() < kBoundedSpanBitsetMinInput) {
+        return false;
+    }
+
+    const uint32_t first = std::min(*begin, term_docids.front());
+    const uint32_t last = std::max(*(end - 1), term_docids.back());
+    const uint64_t width = static_cast<uint64_t>(last) - first + 1;
+    if (width > kBoundedSpanBitsetDocs || term_docids.size() > width) {
+        return false;
+    }
+
+    // One bit per docid in [first, last]; bit (docid - first) is set for term docids.
+    std::array<uint64_t, kBoundedSpanBitsetWords> bits {};
+    for (uint32_t docid : term_docids) {
+        const uint32_t off = docid - first;
+        if (off >= width) {
+            // docid lies outside [first, last], so the list is not sorted.
+            // Nothing has been emitted yet; let the caller fall back.
+            return false;
+        }
+        bits[off >> 6] |= 1ULL << (off & 63);
+    }
+
+    // ordinal_base[w] = number of set bits in all words before word w.
+    const auto word_count = static_cast<size_t>((width + 63) >> 6);
+    std::array<uint32_t, kBoundedSpanBitsetWords> ordinal_base {};
+    uint32_t ordinal = 0;
+    for (size_t word = 0; word < word_count; ++word) {
+        ordinal_base[word] = ordinal;
+        ordinal += static_cast<uint32_t>(__builtin_popcountll(bits[word]));
+    }
+
+    for (auto it = begin; it != end; ++it) {
+        const uint32_t off = *it - first;
+        if (off >= width) {
+            // Outside the bitset's span: cannot match any term docid.
+            continue;
+        }
+        const size_t word = off >> 6;
+        const uint64_t mask = 1ULL << (off & 63);
+        if ((bits[word] & mask) == 0) {
+            continue;
+        }
+        out->push_back(*it);
+        chunk->docids.push_back(*it);
+        // Ordinal = set bits in earlier words + set bits below this one in its word.
+        chunk->prx_doc_ordinals.push_back(
+                ordinal_base[word] +
+                static_cast<uint32_t>(__builtin_popcountll(bits[word] & (mask - 1))));
+    }
+    clear_ordinals_if_all_term_docs_selected(term_docids, chunk);
+    return true;
+}
+
+// Returns the number of bits needed to represent n - 1, which equals
+// ceil(log2(n)) for n >= 2. For n <= 1 the result is 1 (never 0).
+size_t log2_ceil(size_t n) {
+    if (n <= 1) {
+        return 1;
+    }
+    size_t bit_width = 0;
+    for (size_t rest = n - 1; rest != 0; rest >>= 1) {
+        ++bit_width;
+    }
+    return bit_width;
+}
+
+// Appends to *out the candidates in [begin, end) that also occur in
+// `term_docids`. `first` and `last` bound the docid span used by the gap
+// strategy. Does nothing when either input is empty.
+//
+// Strategies, tried in order:
+//   1. term_docids nearly fills [first, last] (gaps are non-zero, at most 1/8
+//      of the width, and fewer than the candidates): build the gap list and
+//      keep every candidate that is not a gap.
+//   2. Candidates are few relative to term_docids: binary-search each one.
+//   3. Otherwise: std::set_intersection.
+//
+// Strategy 1 keeps any candidate that is not in the gap list, so it relies on
+// all candidates lying inside [first, last]; the caller visible in this file
+// passes a range produced by find_candidate_range for that same span.
+void intersect_window_candidate_range(CandidateIt begin, CandidateIt end,
+                                      const std::vector<uint32_t>& term_docids, uint32_t first,
+                                      uint32_t last, std::vector<uint32_t>* out) {
+    const size_t candidate_count = static_cast<size_t>(end - begin);
+    if (candidate_count == 0 || term_docids.empty()) return;
+
+    const uint64_t width = static_cast<uint64_t>(last) - first + 1;
+    // When term_docids is larger than the span, missing_count is set to width,
+    // but the size check in the condition below already rules strategy 1 out.
+    const uint64_t missing_count = term_docids.size() <= width ? width - term_docids.size() : width;
+    if (term_docids.size() <= width && missing_count != 0 && missing_count * 8 <= width &&
+        missing_count < candidate_count) {
+        // Collect every docid of [first, last] that term_docids skips.
+        std::vector<uint32_t> missing;
+        missing.reserve(static_cast<size_t>(missing_count));
+        uint32_t expect = first;
+        for (uint32_t docid : term_docids) {
+            while (expect < docid) {
+                missing.push_back(expect);
+                ++expect;
+            }
+            // Avoid wrapping `expect` past UINT32_MAX.
+            if (docid < std::numeric_limits<uint32_t>::max()) expect = docid + 1;
+        }
+        while (expect <= last) {
+            missing.push_back(expect);
+            if (expect == std::numeric_limits<uint32_t>::max()) break;
+            ++expect;
+        }
+        // Merge-walk: keep each candidate that is not one of the gaps.
+        size_t miss = 0;
+        for (auto it = begin; it != end; ++it) {
+            while (miss < missing.size() && missing[miss] < *it) ++miss;
+            if (miss == missing.size() || missing[miss] != *it) out->push_back(*it);
+        }
+        return;
+    }
+
+    // Binary search per candidate is chosen when the candidate count is below
+    // term_docids.size() / (log2_ceil(term_docids.size()) + 1).
+    const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1;
+    if (candidate_count < term_docids.size() / probes_per_candidate) {
+        for (auto it = begin; it != end; ++it) {
+            if (std::binary_search(term_docids.begin(), term_docids.end(), *it)) {
+                out->push_back(*it);
+            }
+        }
+        return;
+    }
+    std::set_intersection(begin, end, term_docids.begin(), term_docids.end(),
+                          std::back_inserter(*out));
+}
+
+// Intersects the candidate range [begin, end) with `term_docids` and records
+// the result in *out and *chunk.
+//
+// chunk->prx_doc_count is always set to term_docids.size(). Each matching
+// docid is appended to *out and chunk->docids. Where a strategy records
+// ordinals, the index of the match within term_docids is appended to
+// chunk->prx_doc_ordinals; the strategies clear the ordinals again when
+// clear_ordinals_if_all_term_docs_selected finds that everything matched.
+//
+// Strategies, tried in order:
+//   1. candidates are element-wise equal to term_docids: copy, no ordinals;
+//   2. candidates cover the whole term span: copy term_docids, no ordinals;
+//   3. dense term span (gap list);
+//   4. bounded span (bitset);
+//   5. few candidates: binary-search term_docids per candidate;
+//   6. few term docids: binary-search the candidates per term docid;
+//   7. linear merge.
+//
+// Returns Corruption if term_docids.size() does not fit in uint32_t.
+Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end,
+                                                      const std::vector<uint32_t>& term_docids,
+                                                      std::vector<uint32_t>* out,
+                                                      DocidChunk* chunk) {
+    if (term_docids.size() > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("docid_conjunction: prx doc count exceeds u32");
+    }
+    chunk->prx_doc_count = static_cast<uint32_t>(term_docids.size());
+    if (begin == end || term_docids.empty()) return Status::OK();
+
+    const size_t candidate_count = static_cast<size_t>(end - begin);
+    // The intersection cannot be larger than the smaller input.
+    const size_t max_matches = std::min(candidate_count, term_docids.size());
+    out->reserve(out->size() + max_matches);
+    chunk->docids.reserve(max_matches);
+    // Strategy 1: cheap size/front/back checks before the full comparison.
+    if (candidate_count == term_docids.size() && *begin == term_docids.front() &&
+        *(end - 1) == term_docids.back() && std::equal(begin, end, term_docids.begin())) {
+        out->insert(out->end(), begin, end);
+        chunk->docids.insert(chunk->docids.end(), begin, end);
+        return Status::OK();
+    }
+    // Strategy 2.
+    if (append_term_docs_if_candidates_cover_span(begin, end, term_docids, out, chunk)) {
+        return Status::OK();
+    }
+
+    // Every strategy from here on records ordinals.
+    chunk->prx_doc_ordinals.reserve(max_matches);
+    // Strategy 3.
+    if (intersect_dense_term_span_with_ordinals(begin, end, term_docids, candidate_count, out,
+                                                chunk)) {
+        return Status::OK();
+    }
+    // Strategy 4.
+    if (intersect_bounded_span_with_ordinals(begin, end, term_docids, candidate_count, out,
+                                             chunk)) {
+        return Status::OK();
+    }
+
+    // Strategy 5: probe term_docids once per candidate, resuming each search
+    // from the position of the previous hit.
+    const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1;
+    if (candidate_count < term_docids.size() / probes_per_candidate) {
+        size_t doc_index = 0;
+        for (auto it = begin; it != end; ++it) {
+            const auto found =
+                    std::lower_bound(term_docids.begin() + doc_index, term_docids.end(), *it);
+            if (found == term_docids.end()) break;
+            doc_index = static_cast<size_t>(found - term_docids.begin());
+            if (*found != *it) continue;
+            out->push_back(*it);
+            chunk->docids.push_back(*it);
+            chunk->prx_doc_ordinals.push_back(static_cast<uint32_t>(doc_index));
+            ++doc_index;
+        }
+        clear_ordinals_if_all_term_docs_selected(term_docids, chunk);
+        return Status::OK();
+    }
+
+    // Strategy 6: the mirror image, probing the candidates per term docid.
+    const size_t probes_per_term_doc = log2_ceil(candidate_count) + 1;
+    if (term_docids.size() < candidate_count / probes_per_term_doc) {
+        auto candidate_it = begin;
+        for (size_t doc_index = 0; doc_index < term_docids.size(); ++doc_index) {
+            const uint32_t docid = term_docids[doc_index];
+            candidate_it = std::lower_bound(candidate_it, end, docid);
+            if (candidate_it == end) break;
+            if (*candidate_it != docid) continue;
+            out->push_back(docid);
+            chunk->docids.push_back(docid);
+            chunk->prx_doc_ordinals.push_back(static_cast<uint32_t>(doc_index));
+            ++candidate_it;
+        }
+        clear_ordinals_if_all_term_docs_selected(term_docids, chunk);
+        return Status::OK();
+    }
+
+    // Strategy 7: linear two-pointer merge.
+    size_t doc_index = 0;
+    for (auto it = begin; it != end; ++it) {
+        while (doc_index < term_docids.size() && term_docids[doc_index] < *it) {
+            ++doc_index;
+        }
+        if (doc_index == term_docids.size()) break;
+        if (term_docids[doc_index] != *it) continue;
+        out->push_back(*it);
+        chunk->docids.push_back(*it);
+        chunk->prx_doc_ordinals.push_back(static_cast<uint32_t>(doc_index));
+        ++doc_index;
+    }
+    clear_ordinals_if_all_term_docs_selected(term_docids, chunk);
+    return Status::OK();
+}
+
+// Asks `prelude` which window each candidate docid falls in and writes the
+// located window ordinals to *windows, skipping an ordinal when it equals the
+// one recorded immediately before. Candidates for which no window is found
+// are ignored. *windows is only assigned on success.
+Status select_covering_windows(const FrqPreludeReader& prelude,
+                               const std::vector<uint32_t>& candidates,
+                               std::vector<uint32_t>* windows) {
+    std::vector<uint32_t> selected;
+    // UINT32_MAX acts as the "no window recorded yet" marker.
+    uint32_t previous = UINT32_MAX;
+    for (const uint32_t docid : candidates) {
+        bool located = false;
+        uint32_t ordinal = 0;
+        SNII_RETURN_IF_ERROR(prelude.locate_window(docid, &located, &ordinal));
+        if (located && ordinal != previous) {
+            selected.push_back(ordinal);
+            previous = ordinal;
+        }
+    }
+    *windows = std::move(selected);
+    return Status::OK();
+}
+
+// Decides whether to visit every window of the term instead of locating the
+// window of each candidate. True when there are more than 64 candidates per
+// window, or when the term's df is at least 90% of the index doc count and
+// there are more than 4 candidates per window.
+bool should_scan_all_windows(const LogicalIndexReader& idx, const TermPlan& p,
+                             size_t candidate_count) {
+    const size_t windows = p.prelude.n_windows();
+    if (candidate_count > windows * 64) {
+        return true;
+    }
+
+    const uint64_t total_docs = idx.stats().doc_count;
+    if (total_docs == 0) {
+        return false;
+    }
+    const bool near_full = static_cast<uint64_t>(p.df) * 10 >= total_docs * 9;
+    return near_full && candidate_count > windows * 4;
+}
+
+// Decodes the docids of a non-windowed term into *docids. The dd region comes
+// from the round-1 fetch for pod-ref entries and from the entry's inline bytes
+// otherwise.
+Status decode_flat_docids_only(const snii::io::BatchRangeFetcher& round1, const TermPlan& p,
+                               std::vector<uint32_t>* docids) {
+    Slice dd_region;
+    if (!p.pod_ref) {
+        SNII_RETURN_IF_ERROR(inline_dd_region(p.entry, &dd_region));
+    } else {
+        dd_region = round1.get(p.frq_handle);
+    }
+    return snii::format::decode_dd_region(dd_region, p.entry.dd_meta, /*win_base=*/0, docids);
+}
+
+// Per-window work item built by collect_windowed_docids_only in its planning
+// pass and consumed in its emit pass.
+struct WindowWork {
+    // Ordinal of the window within the term's prelude.
+    uint32_t ordinal = 0;
+    // Metadata read from the prelude for this window.
+    WindowMeta meta;
+    // Index range of the candidates that fall in this window; left at its
+    // default (empty) when no candidate filter is in use.
+    CandidateRange candidates;
+    // Fetcher handle for the window's dd bytes; only assigned for windows that
+    // are not dense_full.
+    size_t handle = 0;
+    // True when is_dense_full_window reported no gaps; such windows are emitted
+    // without fetching or decoding anything.
+    bool dense_full = false;
+};
+
+// Emits the docids of a gap-free window without decoding it.
+//   - No candidate filter: the whole range [window first, last_docid] is
+//     appended to `out` (and to a new chunk when `source` is set).
+//   - With a filter: the candidates in f.candidates are appended to `out`;
+//     when `source` is set this happens inside
+//     append_candidate_range_with_ordinals, which also fills the chunk.
+// When `source` is set, one chunk is pushed onto source->chunks.
+Status emit_dense_full_window_docids(const WindowWork& f, const std::vector<uint32_t>* candidates,
+                                     std::vector<uint32_t>& out, DocidSource* source) {
+    uint32_t window_first = 0;
+    SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &window_first));
+    const bool unfiltered = (candidates == nullptr);
+
+    if (source != nullptr) {
+        DocidChunk chunk;
+        chunk.windowed = true;
+        chunk.window = f.ordinal;
+        chunk.prx_doc_count = f.meta.doc_count;
+        if (unfiltered) {
+            SNII_RETURN_IF_ERROR(
+                    append_docid_range(window_first, f.meta.last_docid, &chunk.docids));
+        } else {
+            const auto base = candidates->begin();
+            SNII_RETURN_IF_ERROR(append_candidate_range_with_ordinals(
+                    base + f.candidates.begin, base + f.candidates.end, window_first,
+                    f.meta.last_docid, &out, &chunk));
+        }
+        source->chunks.push_back(std::move(chunk));
+    }
+
+    if (unfiltered) {
+        SNII_RETURN_IF_ERROR(append_docid_range(window_first, f.meta.last_docid, &out));
+    } else if (source == nullptr) {
+        // With a source, `out` was already filled while building the chunk.
+        const auto base = candidates->begin();
+        append_candidate_range(base + f.candidates.begin, base + f.candidates.end, &out);
+    }
+    return Status::OK();
+}
+
+// Decodes the docids of one fetched window and emits them.
+//   - No candidate filter: all decoded docids are appended to `out` (and
+//     copied into a new chunk when `source` is set).
+//   - With a filter and a source: the intersection with the window's candidate
+//     range is written to `out` and to a chunk by
+//     intersect_window_candidate_range_with_ordinals; the chunk is only kept
+//     when it is non-empty.
+//   - With a filter and no source: intersect_window_candidate_range appends
+//     the intersection to `out`.
+// `docs`, `freqs` and `positions` are caller-owned scratch buffers; they are
+// cleared on entry.
+Status emit_decoded_window_docids(const WindowWork& f, const snii::io::BatchRangeFetcher& fetcher,
+                                  const std::vector<uint32_t>* candidates,
+                                  std::vector<uint32_t>& out, DocidSource* source,
+                                  std::vector<uint32_t>& docs, std::vector<uint32_t>& freqs,
+                                  std::vector<std::vector<uint32_t>>& positions) {
+    docs.clear();
+    freqs.clear();
+    positions.clear();
+    // Only the dd slice is supplied; freq and position decoding are disabled.
+    SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices(
+            f.meta, fetcher.get(f.handle), Slice(), Slice(),
+            /*want_positions=*/false, /*want_freq=*/false, &docs, &freqs, &positions));
+    if (source != nullptr) {
+        DocidChunk chunk;
+        chunk.windowed = true;
+        chunk.window = f.ordinal;
+        if (candidates == nullptr) {
+            chunk.docids = docs;
+            if (docs.size() > std::numeric_limits<uint32_t>::max()) {
+                return Status::Corruption("docid_conjunction: prx doc count exceeds u32");
+            }
+            chunk.prx_doc_count = static_cast<uint32_t>(docs.size());
+            source->chunks.push_back(std::move(chunk));
+        } else {
+            const auto begin = candidates->begin() + f.candidates.begin;
+            const auto end = candidates->begin() + f.candidates.end;
+            SNII_RETURN_IF_ERROR(
+                    intersect_window_candidate_range_with_ordinals(begin, end, docs, &out, &chunk));
+            // Windows without any match contribute no chunk.
+            if (!chunk.docids.empty()) {
+                source->chunks.push_back(std::move(chunk));
+            }
+        }
+    }
+    if (candidates == nullptr) {
+        out.insert(out.end(), docs.begin(), docs.end());
+        return Status::OK();
+    }
+    if (source != nullptr) {
+        // `out` was already filled by the with-ordinals intersection above.
+        return Status::OK();
+    }
+    uint32_t first = 0;
+    SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first));
+    intersect_window_candidate_range(candidates->begin() + f.candidates.begin,
+                                     candidates->begin() + f.candidates.end, docs, first,
+                                     f.meta.last_docid, &out);
+    return Status::OK();
+}
+
+// Collects the docids of a windowed term from the given `windows`, optionally
+// restricted to `candidates` (nullptr means no filter).
+//
+// Pass 1 builds one WindowWork per window: windows with no candidate in range
+// are skipped, gap-free windows are marked dense_full and need no I/O, and
+// every other window has its dd range queued on a local fetcher. After a
+// single batched fetch, pass 2 emits each window in order into *out and, when
+// `source` is set, into source->chunks.
+Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPlan& p,
+                                    const std::vector<uint32_t>& windows,
+                                    const std::vector<uint32_t>* candidates,
+                                    std::vector<uint32_t>* out, DocidSource* source) {
+    snii::io::BatchRangeFetcher fetcher(idx.reader(), snii::reader::kSameTermCoalesceGap);
+    std::vector<WindowWork> work;
+    work.reserve(windows.size());
+    // Upper bound on the result size: the term's df, or the candidate count.
+    out->reserve(candidates == nullptr ? p.entry.df : candidates->size());
+    // Position in *candidates from which the next window's range search
+    // resumes; find_candidate_range advances it.
+    size_t candidate_search_begin = 0;
+    for (uint32_t w : windows) {
+        WindowMeta meta;
+        SNII_RETURN_IF_ERROR(p.prelude.window(w, &meta));
+        uint32_t first = 0;
+        SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first));
+        CandidateRange candidate_range;
+        if (candidates != nullptr) {
+            candidate_range = find_candidate_range(*candidates, &candidate_search_begin, first,
+                                                   meta.last_docid);
+            if (candidate_range.begin == candidate_range.end) {
+                // No candidate falls in this window: nothing to fetch or emit.
+                continue;
+            }
+        }
+        bool dense_full = false;
+        SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full));
+        if (dense_full) {
+            work.push_back(WindowWork {
+                    .ordinal = w, .meta = meta, .candidates = candidate_range, .dense_full = true});
+            continue;
+        }
+
+        snii::reader::WindowAbsRange range;
+        SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range(
+                idx, p.entry, p.frq_base, p.prx_base, p.prelude, w,
+                /*want_positions=*/false, /*want_freq=*/false, &range));
+        WindowWork f;
+        f.ordinal = w;
+        f.meta = meta;
+        f.candidates = candidate_range;
+        f.handle = fetcher.add(range.dd_off, range.dd_len);
+        work.push_back(f);
+    }
+    if (fetcher.pending() > 0) {
+        SNII_RETURN_IF_ERROR(fetcher.fetch());
+    }
+
+    // Scratch buffers shared by all decoded windows.
+    std::vector<uint32_t> docs;
+    std::vector<uint32_t> freqs;
+    std::vector<std::vector<uint32_t>> positions;
+    for (const WindowWork& f : work) {
+        if (f.dense_full) {
+            SNII_RETURN_IF_ERROR(emit_dense_full_window_docids(f, candidates, *out, source));
+            continue;
+        }
+        SNII_RETURN_IF_ERROR(emit_decoded_window_docids(f, fetcher, candidates, *out, source, docs,
+                                                        freqs, positions));
+    }
+    return Status::OK();
+}
+
+// Collects the docids of one term into *out, optionally intersected with
+// `candidates` (nullptr means no filter). When `source` is set, the matching
+// docids are also recorded as chunks in source->chunks.
+//
+// Windowed terms choose which windows to visit and delegate to
+// collect_windowed_docids_only. Other terms are decoded in full from the
+// round-1 fetch (or from the entry's inline bytes) and intersected in memory.
+Status collect_docids_only(const LogicalIndexReader& idx, const snii::io::BatchRangeFetcher& round1,
+                           const TermPlan& p, const std::vector<uint32_t>* candidates,
+                           std::vector<uint32_t>* out, DocidSource* source) {
+    if (p.windowed) {
+        std::vector<uint32_t> windows;
+        if (candidates == nullptr) {
+            windows = all_windows(p.prelude);
+        } else if (should_scan_all_windows(idx, p, candidates->size())) {
+            // Dense candidate sets cover most windows; for near-full terms this also
+            // avoids thousands-to-millions of locate_window probes with no byte win.
+            windows = all_windows(p.prelude);
+        } else {
+            SNII_RETURN_IF_ERROR(select_covering_windows(p.prelude, *candidates, &windows));
+        }
+        return collect_windowed_docids_only(idx, p, windows, candidates, out, source);
+    }
+
+    std::vector<uint32_t> term_docids;
+    SNII_RETURN_IF_ERROR(decode_flat_docids_only(round1, p, &term_docids));
+    if (source != nullptr) {
+        DocidChunk chunk;
+        if (term_docids.size() > std::numeric_limits<uint32_t>::max()) {
+            return Status::Corruption("docid_conjunction: prx doc count exceeds u32");
+        }
+        chunk.prx_doc_count = static_cast<uint32_t>(term_docids.size());
+        if (candidates == nullptr) {
+            chunk.docids = term_docids;
+        } else if (!term_docids.empty()) {
+            // Narrow the candidates to the term's [front, back] span before
+            // intersecting; this call also writes the matches to *out.
+            const auto begin = std::ranges::lower_bound(*candidates, term_docids.front());
+            const auto end = std::upper_bound(begin, candidates->end(), term_docids.back());
+            SNII_RETURN_IF_ERROR(intersect_window_candidate_range_with_ordinals(
+                    begin, end, term_docids, out, &chunk));
+        }
+        // Without a filter the chunk is always kept; with a filter an empty
+        // chunk is dropped.
+        if (candidates == nullptr || !chunk.docids.empty()) {
+            source->chunks.push_back(std::move(chunk));
+        }
+    }
+    if (candidates == nullptr) {
+        *out = std::move(term_docids);
+        return Status::OK();
+    }
+    if (source != nullptr) {
+        // *out was already filled while building the chunk.
+        return Status::OK();
+    }
+    *out = intersect_sorted(*candidates, term_docids);
+    return Status::OK();
+}
+
+// Intersects the docids of all `plans`, visiting terms in ascending df order.
+// The first term seeds *candidates; each later term narrows it. The loop stops
+// as soon as the candidate set becomes empty.
+// When `sources` is non-null it is reset to one DocidSource per plan, each
+// visited term fills its own entry, and the source of the last term in df
+// order is flagged docids_are_final_candidates once that term has been
+// collected.
+Status build_docid_only_conjunction_impl(const LogicalIndexReader& idx,
+                                         const snii::io::BatchRangeFetcher& round1,
+                                         const std::vector<TermPlan>& plans,
+                                         std::vector<uint32_t>* candidates,
+                                         std::vector<DocidSource>* sources) {
+    if (sources != nullptr) {
+        sources->assign(plans.size(), DocidSource {});
+    }
+
+    const std::vector<size_t> order = ascending_df_order(plans);
+    const size_t term_total = order.size();
+    for (size_t rank = 0; rank < term_total; ++rank) {
+        const size_t plan_index = order[rank];
+
+        DocidSource* source = nullptr;
+        if (sources != nullptr) {
+            source = &(*sources)[plan_index];
+        }
+        // The first term has nothing to intersect against yet.
+        const std::vector<uint32_t>* filter = (rank == 0) ? nullptr : candidates;
+
+        std::vector<uint32_t> narrowed;
+        SNII_RETURN_IF_ERROR(
+                collect_docids_only(idx, round1, plans[plan_index], filter, &narrowed, source));
+
+        const bool is_last_term = (rank + 1 == term_total);
+        if (is_last_term && source != nullptr) {
+            source->docids_are_final_candidates = true;
+        }
+
+        *candidates = std::move(narrowed);
+        if (candidates->empty()) {
+            break;
+        }
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Looks `term` up in the index. *found is reset to false before the lookup;
+// idx.lookup then writes *found and the entry, frq_base and prx_base fields of
+// *resolved.
+Status resolve_query_term(const LogicalIndexReader& idx, const std::string& term,
+                          ResolvedQueryTerm* resolved, bool* found) {
+    *found = false;
+    auto* entry_out = &resolved->entry;
+    auto* frq_base_out = &resolved->frq_base;
+    auto* prx_base_out = &resolved->prx_base;
+    SNII_RETURN_IF_ERROR(idx.lookup(term, found, entry_out, frq_base_out, prx_base_out));
+    return Status::OK();
+}
+
+// Resolves every term and configures one TermPlan per term, queueing the
+// ranges each plan needs on `fetcher`. `*plans` is resized to terms.size().
+// If a term is not found, *all_present is set to false and OK is returned
+// immediately, leaving the remaining plans unconfigured.
+Status plan_terms(const LogicalIndexReader& idx, const std::vector<std::string>& terms,
+                  snii::io::BatchRangeFetcher* fetcher, std::vector<TermPlan>* plans,
+                  bool* all_present, bool need_positions) {
+    *all_present = true;
+    const size_t term_total = terms.size();
+    plans->resize(term_total);
+
+    for (size_t pos = 0; pos < term_total; ++pos) {
+        ResolvedQueryTerm lookup;
+        bool present = false;
+        SNII_RETURN_IF_ERROR(resolve_query_term(idx, terms[pos], &lookup, &present));
+        if (!present) {
+            *all_present = false;
+            return Status::OK();
+        }
+
+        TermPlan& plan = (*plans)[pos];
+        plan.order = pos;
+        plan.frq_base = lookup.frq_base;
+        plan.prx_base = lookup.prx_base;
+        plan.entry = std::move(lookup.entry);
+        SNII_RETURN_IF_ERROR(configure_term_plan(idx, need_positions, fetcher, &plan));
+    }
+    return Status::OK();
+}
+
+// Same as plan_terms, but for terms that have already been resolved: copies
+// each ResolvedQueryTerm into its TermPlan and configures the plan on
+// `fetcher`. `*plans` is resized to terms.size().
+Status plan_resolved_terms(const LogicalIndexReader& idx,
+                           const std::vector<ResolvedQueryTerm>& terms,
+                           snii::io::BatchRangeFetcher* fetcher, std::vector<TermPlan>* plans,
+                           bool need_positions) {
+    const size_t term_total = terms.size();
+    plans->resize(term_total);
+
+    for (size_t pos = 0; pos < term_total; ++pos) {
+        const ResolvedQueryTerm& src = terms[pos];
+        TermPlan& plan = (*plans)[pos];
+        plan.order = pos;
+        plan.entry = src.entry;
+        plan.frq_base = src.frq_base;
+        plan.prx_base = src.prx_base;
+        SNII_RETURN_IF_ERROR(configure_term_plan(idx, need_positions, fetcher, &plan));
+    }
+    return Status::OK();
+}
+
+// Opens the prelude of every windowed plan from the bytes fetched under its
+// prelude_handle. Non-windowed plans are left untouched.
+// Returns Corruption if positions are required and a prelude has none.
+Status open_preludes(const snii::io::BatchRangeFetcher& fetcher, std::vector<TermPlan>* plans,
+                     bool need_positions) {
+    for (TermPlan& plan : *plans) {
+        if (plan.windowed) {
+            SNII_RETURN_IF_ERROR(
+                    FrqPreludeReader::open(fetcher.get(plan.prelude_handle), &plan.prelude));
+            const bool missing_positions = need_positions && !plan.prelude.has_prx();
+            if (missing_positions) {
+                return Status::Corruption("docid_conjunction: windowed prelude has no positions");
+            }
+        }
+    }
+    return Status::OK();
+}
+
+// Points *out at the first dd_meta.disk_len bytes of the entry's inline
+// frq_bytes. Returns Corruption if disk_len exceeds the available bytes.
+Status inline_dd_region(const DictEntry& entry, Slice* out) {
+    const auto dd_len = entry.dd_meta.disk_len;
+    if (dd_len > entry.frq_bytes.size()) {
+        return Status::Corruption("docid_conjunction: inline dd region exceeds frq bytes");
+    }
+    *out = Slice(entry.frq_bytes.data(), static_cast<size_t>(dd_len));
+    return Status::OK();
+}
+
+// Conjunction without per-term source tracking: forwards to the shared
+// implementation with a null `sources` pointer.
+Status build_docid_only_conjunction(const LogicalIndexReader& idx,
+                                    const snii::io::BatchRangeFetcher& round1,
+                                    const std::vector<TermPlan>& plans,
+                                    std::vector<uint32_t>* candidates) {
+    std::vector<DocidSource>* const no_sources = nullptr;
+    return build_docid_only_conjunction_impl(idx, round1, plans, candidates, no_sources);
+}
+
+// Conjunction that also records, per plan, where the matching docids came
+// from: forwards to the shared implementation with the caller's `sources`.
+Status build_docid_only_conjunction(const LogicalIndexReader& idx,
+                                    const snii::io::BatchRangeFetcher& round1,
+                                    const std::vector<TermPlan>& plans,
+                                    std::vector<uint32_t>* candidates,
+                                    std::vector<DocidSource>* sources) {
+    return build_docid_only_conjunction_impl(idx, round1, plans, candidates, sources);
+}
+
+} // namespace snii::query::internal
diff --git a/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp
new file mode 100644
index 00000000000000..206221ffc5dbbc
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp
@@ -0,0 +1,296 @@
+#include "snii/query/internal/docid_posting_reader.h"
+
+#include <limits>
+#include <utility>
+
+#include "snii/common/slice.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/frq_pod.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/io/batch_range_fetcher.h"
+#include "snii/reader/windowed_posting.h"
+
+namespace snii::query::internal {
+
+using snii::format::DictEntry;
+using snii::format::DictEntryEnc;
+using snii::format::DictEntryKind;
+using snii::format::FrqPreludeReader;
+using snii::format::WindowMeta;
+using snii::reader::LogicalIndexReader;
+
+namespace {
+
+// Decodes one whole dd region into `docids` using the entry's dd metadata.
+// Used for postings that are not split into windows, hence window base 0.
+Status decode_flat_docs(const DictEntry& entry, Slice dd_region, std::vector<uint32_t>* docids) {
+    const auto& dd_meta = entry.dd_meta;
+    return snii::format::decode_dd_region(dd_region, dd_meta, /*win_base=*/0, docids);
+}
+
+// Decodes the docids of an inline entry. The dd region is the first
+// `dd_meta.disk_len` bytes of `entry.frq_bytes`; a length that runs past those
+// bytes is reported as corruption.
+Status decode_inline_docs(const DictEntry& entry, std::vector<uint32_t>* docids) {
+    const auto dd_len = entry.dd_meta.disk_len;
+    if (dd_len > entry.frq_bytes.size()) {
+        return Status::Corruption("docid_posting_reader: inline dd region exceeds frq bytes");
+    }
+    const Slice dd_region(entry.frq_bytes.data(), static_cast<size_t>(dd_len));
+    return decode_flat_docs(entry, dd_region, docids);
+}
+
+// Chooses how many bytes of a frq window to fetch for a docid-only read:
+// `entry.frq_docs_len` when it is non-zero, otherwise the whole window
+// (`win_len`). A docs length larger than the window is corruption.
+Status slim_docs_fetch_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) {
+    const auto docs_len = entry.frq_docs_len;
+    if (docs_len > win_len) {
+        return Status::Corruption("docid_posting_reader: slim frq_docs_len exceeds frq window");
+    }
+    if (docs_len > 0) {
+        *out = docs_len;
+    } else {
+        *out = win_len;
+    }
+    return Status::OK();
+}
+
+// Overflow-checked `lhs + rhs`. On overflow returns Corruption carrying
+// `message` and leaves `*out` untouched.
+Status add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) {
+    // Unsigned addition wraps on overflow, and a wrapped sum is always smaller
+    // than either operand.
+    const uint64_t sum = lhs + rhs;
+    if (sum < lhs) {
+        return Status::Corruption(message);
+    }
+    *out = sum;
+    return Status::OK();
+}
+
+// Computes the absolute offset of an entry's prelude:
+//   posting_region.offset + frq_base + entry.frq_off_delta
+// with both additions checked for u64 overflow.
+Status prelude_abs(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                   uint64_t* out) {
+    constexpr const char* kOverflow = "docid_posting_reader: prelude offset overflow";
+    const uint64_t region_off = idx.section_refs().posting_region.offset;
+    uint64_t block_off = 0;
+    SNII_RETURN_IF_ERROR(add_u64(region_off, frq_base, kOverflow, &block_off));
+    return add_u64(block_off, entry.frq_off_delta, kOverflow, out);
+}
+
+// Sanity-checks the length fields of a windowed entry before its docs prefix is
+// fetched. Requires 0 < prelude_len <= frq_docs_len <= frq_len; the first
+// violated condition (in that order) decides the Corruption message.
+Status validate_windowed_docs_prefix(const DictEntry& entry) {
+    const auto prelude_len = entry.prelude_len;
+    const auto docs_len = entry.frq_docs_len;
+    const char* problem = nullptr;
+    if (prelude_len == 0) {
+        problem = "docid_posting_reader: windowed entry has no prelude";
+    } else if (prelude_len > docs_len) {
+        problem = "docid_posting_reader: prelude_len exceeds docs prefix";
+    } else if (docs_len > entry.frq_len) {
+        problem = "docid_posting_reader: docs prefix exceeds frq_len";
+    }
+    if (problem != nullptr) {
+        return Status::Corruption(problem);
+    }
+    return Status::OK();
+}
+
+// Read plan for a posting whose docids are decoded from a single dd region
+// fetched through a BatchRangeFetcher.
+struct FlatPlan {
+    size_t out_index {0};             // slot of this posting in the caller's output
+    const DictEntry* entry {nullptr}; // entry whose dd_meta drives decoding; not owned
+    size_t handle {0};                // fetcher handle of the requested docs bytes
+};
+
+// Read plan for a windowed posting: the fetched range is the entry's docs
+// prefix (prelude followed by the dd block).
+struct WindowPlan {
+    size_t out_index {0};                          // slot in the caller's output
+    const ResolvedDocidPosting* posting {nullptr}; // posting being read; not owned
+    size_t prefix_handle {0};                      // fetcher handle of the docs prefix
+};
+
+// Registers the docs bytes of a non-windowed, non-inline posting with `fetcher`
+// and stores the resulting handle in `plan->handle`. Only `handle` is written;
+// the caller fills in the other plan fields.
+Status plan_flat_docs(const LogicalIndexReader& idx, const ResolvedDocidPosting& posting,
+                      snii::io::BatchRangeFetcher* fetcher, FlatPlan* plan) {
+    uint64_t window_off = 0;
+    uint64_t window_len = 0;
+    SNII_RETURN_IF_ERROR(
+            idx.resolve_frq_window(posting.entry, posting.frq_base, &window_off, &window_len));
+
+    uint64_t fetch_len = 0;
+    SNII_RETURN_IF_ERROR(slim_docs_fetch_len(posting.entry, window_len, &fetch_len));
+
+    const size_t handle = fetcher->add(window_off, fetch_len);
+    plan->handle = handle;
+    return Status::OK();
+}
+
+// Validates the windowed entry behind `plan->posting`, then registers its docs
+// prefix (`frq_docs_len` bytes starting at the prelude's absolute offset) with
+// `fetcher`. The handle is stored in `plan->prefix_handle`.
+Status plan_window_prefix(const LogicalIndexReader& idx, WindowPlan* plan,
+                          snii::io::BatchRangeFetcher* fetcher) {
+    const DictEntry& entry = plan->posting->entry;
+    SNII_RETURN_IF_ERROR(validate_windowed_docs_prefix(entry));
+
+    uint64_t prefix_off = 0;
+    SNII_RETURN_IF_ERROR(prelude_abs(idx, entry, plan->posting->frq_base, &prefix_off));
+
+    plan->prefix_handle = fetcher->add(prefix_off, entry.frq_docs_len);
+    return Status::OK();
+}
+
+// Cuts one window's dd bytes, [dd_off, dd_off + dd_disk_len), out of
+// `dd_block`. A range that does not fit inside the block is corruption.
+Status window_dd_slice(Slice dd_block, const WindowMeta& meta, Slice* out) {
+    constexpr const char* kOutOfRange = "docid_posting_reader: window dd range out of prefix";
+    const auto block_len = dd_block.size();
+    if (meta.dd_off > block_len) {
+        return Status::Corruption(kOutOfRange);
+    }
+    // Compare against the remaining room so dd_off + dd_disk_len is never formed.
+    const auto room = block_len - meta.dd_off;
+    if (meta.dd_disk_len > room) {
+        return Status::Corruption(kOutOfRange);
+    }
+    *out = dd_block.subslice(static_cast<size_t>(meta.dd_off),
+                             static_cast<size_t>(meta.dd_disk_len));
+    return Status::OK();
+}
+
+// Computes the smallest docid a window may hold: 0 for the first window,
+// `win_base + 1` for any later one. Returns Corruption when win_base + 1 does
+// not fit in u32 or exceeds the window's `last_docid`.
+Status first_docid_in_window(const WindowMeta& meta, uint32_t window_ordinal, uint32_t* first) {
+    constexpr uint32_t kMaxDocid = std::numeric_limits<uint32_t>::max();
+    if (window_ordinal == 0) {
+        *first = 0;
+        return Status::OK();
+    }
+    if (meta.win_base >= kMaxDocid) {
+        return Status::Corruption("docid_posting_reader: window base exceeds docid range");
+    }
+    const uint32_t lowest = static_cast<uint32_t>(meta.win_base + 1);
+    // `*first` is written before the range check, so it is set even on failure.
+    *first = lowest;
+    if (lowest > meta.last_docid) {
+        return Status::Corruption("docid_posting_reader: invalid window docid range");
+    }
+    return Status::OK();
+}
+
+// Sets `*full` when the window's doc_count equals the width of its docid range
+// [first, last_docid], i.e. every docid in that range is present.
+Status is_dense_full_window(const WindowMeta& meta, uint32_t window_ordinal, bool* full) {
+    uint32_t lowest = 0;
+    SNII_RETURN_IF_ERROR(first_docid_in_window(meta, window_ordinal, &lowest));
+    // Widen before subtracting so the +1 cannot wrap in 32 bits.
+    const uint64_t highest = static_cast<uint64_t>(meta.last_docid);
+    const uint64_t span = highest - lowest + 1;
+    *full = (meta.doc_count == span);
+    return Status::OK();
+}
+
+// Decodes the fetched docs bytes of a flat plan into `out`.
+Status decode_flat_plan(const snii::io::BatchRangeFetcher& fetcher, const FlatPlan& plan,
+                        std::vector<uint32_t>* out) {
+    const Slice docs_bytes = fetcher.get(plan.handle);
+    return decode_flat_docs(*plan.entry, docs_bytes, out);
+}
+
+// Sink-based overload, declared here so the vector overload can delegate to it.
+Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan,
+                                 DocIdSink* sink);
+
+// Vector overload: wraps `out` in a VectorDocIdSink and delegates to the sink
+// overload. `out` is not cleared here.
+Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan,
+                                 std::vector<uint32_t>* out) {
+    VectorDocIdSink vector_sink(*out);
+    DocIdSink* const sink = &vector_sink;
+    return decode_window_prefix_plan(fetcher, plan, sink);
+}
+
+// Decodes the docids of a windowed posting from its fetched docs prefix and
+// streams them to `sink`, window by window.
+//
+// The prefix must be exactly `prelude_len` prelude bytes followed by the dd
+// block whose length the prelude reports; any other size is corruption. A
+// window whose doc_count covers its whole docid range is emitted through
+// `append_range` without decoding; every other window is decoded and emitted
+// through `append_sorted`.
+Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan,
+                                 DocIdSink* sink) {
+    const DictEntry& entry = plan.posting->entry;
+    const Slice prefix = fetcher.get(plan.prefix_handle);
+    if (prefix.size() < entry.prelude_len) {
+        return Status::Corruption("docid_posting_reader: short docs prefix");
+    }
+    const size_t prelude_len = static_cast<size_t>(entry.prelude_len);
+    FrqPreludeReader prelude;
+    SNII_RETURN_IF_ERROR(FrqPreludeReader::open(prefix.subslice(0, prelude_len), &prelude));
+
+    // Guard prelude_len + dd_block_len against size_t overflow before adding.
+    const uint64_t dd_block_len = prelude.dd_block_len();
+    const uint64_t max_dd_block_len =
+            static_cast<uint64_t>(std::numeric_limits<size_t>::max()) - prelude_len;
+    if (dd_block_len > max_dd_block_len) {
+        return Status::Corruption("docid_posting_reader: docs prefix length overflow");
+    }
+    if (prefix.size() != prelude_len + static_cast<size_t>(dd_block_len)) {
+        return Status::Corruption("docid_posting_reader: docs prefix length mismatch");
+    }
+    const Slice dd_block = prefix.subslice(prelude_len, prefix.size() - prelude_len);
+
+    // Scratch buffers reused across windows. Freqs and positions are not
+    // requested, but decode_window_slices still takes the out-parameters.
+    std::vector<uint32_t> docs;
+    std::vector<uint32_t> freqs;
+    std::vector<std::vector<uint32_t>> positions;
+    for (uint32_t w = 0; w < prelude.n_windows(); ++w) {
+        WindowMeta meta;
+        SNII_RETURN_IF_ERROR(prelude.window(w, &meta));
+        Slice dd_region;
+        SNII_RETURN_IF_ERROR(window_dd_slice(dd_block, meta, &dd_region));
+
+        bool dense_full = false;
+        SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full));
+        if (!dense_full) {
+            docs.clear();
+            freqs.clear();
+            positions.clear();
+            SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices(
+                    meta, dd_region, Slice(), Slice(), /*want_positions=*/false,
+                    /*want_freq=*/false, &docs, &freqs, &positions));
+            SNII_RETURN_IF_ERROR(sink->append_sorted(docs));
+            continue;
+        }
+
+        // Fully populated window: hand the sink the docid range directly.
+        uint32_t first = 0;
+        SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first));
+        const uint64_t range_end = static_cast<uint64_t>(meta.last_docid) + 1;
+        SNII_RETURN_IF_ERROR(sink->append_range(first, range_end));
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Reads all docids of one posting into `docids`. The vector is cleared first
+// and then filled through a VectorDocIdSink by the sink overload. A null
+// `docids` is an InvalidArgument.
+Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                          uint64_t prx_base, std::vector<uint32_t>* docids) {
+    if (docids != nullptr) {
+        docids->clear();
+        VectorDocIdSink sink(*docids);
+        return read_docid_posting(idx, entry, frq_base, prx_base, &sink);
+    }
+    return Status::InvalidArgument("docid_posting_reader: null out");
+}
+
+// Streams all docids of one posting to `sink`, dispatching on the entry:
+//   - inline:   decode straight from the entry's inline bytes, no I/O;
+//   - windowed: fetch the docs prefix, then decode window by window;
+//   - other:    fetch the docs bytes of the frq window and decode them flat.
+// A null `sink` is an InvalidArgument.
+Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                          uint64_t prx_base, DocIdSink* sink) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("docid_posting_reader: null sink");
+    }
+    ResolvedDocidPosting posting {entry, frq_base, prx_base};
+    std::vector<uint32_t> docs;
+    if (posting.entry.kind == DictEntryKind::kInline) {
+        SNII_RETURN_IF_ERROR(decode_inline_docs(posting.entry, &docs));
+        return sink->append_sorted(docs);
+    }
+
+    snii::io::BatchRangeFetcher docs_fetcher(idx.reader());
+    const bool windowed = posting.entry.enc == DictEntryEnc::kWindowed;
+    if (!windowed) {
+        FlatPlan flat;
+        flat.out_index = 0;
+        flat.entry = &posting.entry;
+        SNII_RETURN_IF_ERROR(plan_flat_docs(idx, posting, &docs_fetcher, &flat));
+        if (docs_fetcher.pending() > 0) {
+            SNII_RETURN_IF_ERROR(docs_fetcher.fetch());
+        }
+        SNII_RETURN_IF_ERROR(decode_flat_plan(docs_fetcher, flat, &docs));
+        return sink->append_sorted(docs);
+    }
+
+    WindowPlan window;
+    window.out_index = 0;
+    window.posting = &posting;
+    SNII_RETURN_IF_ERROR(plan_window_prefix(idx, &window, &docs_fetcher));
+    if (docs_fetcher.pending() > 0) {
+        SNII_RETURN_IF_ERROR(docs_fetcher.fetch());
+    }
+    return decode_window_prefix_plan(docs_fetcher, window, sink);
+}
+
+// Reads the docids of several postings with a single batched fetch.
+// `(*docids)[i]` receives the docids of `postings[i]`.
+//
+// Phase 1 walks the postings in order: inline entries are decoded on the spot,
+// windowed entries register their docs prefix with the fetcher, and remaining
+// entries are only recorded. Phase 2 registers the recorded flat entries (so
+// their ranges are added after all windowed ones), issues the fetch, then
+// decodes flat plans followed by windowed plans.
+Status read_docid_postings_batched(const LogicalIndexReader& idx,
+                                   const std::vector<ResolvedDocidPosting>& postings,
+                                   std::vector<std::vector<uint32_t>>* docids) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("docid_posting_reader: null batched out");
+    }
+    docids->clear();
+    docids->resize(postings.size());
+    std::vector<std::vector<uint32_t>>& results = *docids;
+
+    std::vector<FlatPlan> flat_plans;
+    std::vector<WindowPlan> window_plans;
+    snii::io::BatchRangeFetcher docs_fetcher(idx.reader());
+
+    size_t slot = 0;
+    for (const ResolvedDocidPosting& posting : postings) {
+        const size_t out_index = slot++;
+        if (posting.entry.kind == DictEntryKind::kInline) {
+            SNII_RETURN_IF_ERROR(decode_inline_docs(posting.entry, &results[out_index]));
+        } else if (posting.entry.enc == DictEntryEnc::kWindowed) {
+            WindowPlan window;
+            window.out_index = out_index;
+            // `postings` outlives the plans, so pointing at its element is safe.
+            window.posting = &posting;
+            SNII_RETURN_IF_ERROR(plan_window_prefix(idx, &window, &docs_fetcher));
+            window_plans.push_back(std::move(window));
+        } else {
+            FlatPlan flat;
+            flat.out_index = out_index;
+            flat.entry = &posting.entry;
+            flat_plans.push_back(flat);
+        }
+    }
+
+    for (FlatPlan& flat : flat_plans) {
+        SNII_RETURN_IF_ERROR(
+                plan_flat_docs(idx, postings[flat.out_index], &docs_fetcher, &flat));
+    }
+    if (docs_fetcher.pending() > 0) {
+        SNII_RETURN_IF_ERROR(docs_fetcher.fetch());
+    }
+
+    for (const FlatPlan& flat : flat_plans) {
+        SNII_RETURN_IF_ERROR(decode_flat_plan(docs_fetcher, flat, &results[flat.out_index]));
+    }
+    for (const WindowPlan& window : window_plans) {
+        SNII_RETURN_IF_ERROR(
+                decode_window_prefix_plan(docs_fetcher, window, &results[window.out_index]));
+    }
+    return Status::OK();
+}
+
+} // namespace snii::query::internal
diff --git a/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp b/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp
new file mode 100644
index 00000000000000..88b748e49e80b1
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp
@@ -0,0 +1,105 @@
+#include "snii/query/internal/docid_set_ops.h"
+
+#include <algorithm>
+#include <iterator>
+#include <queue>
+#include <utility>
+
+namespace snii::query::internal {
+
+// Returns the elements common to two ascending-sorted docid lists.
+std::vector<uint32_t> intersect_sorted(const std::vector<uint32_t>& a,
+                                       const std::vector<uint32_t>& b) {
+    // An intersection can never be longer than the shorter input, so size the
+    // buffer for that bound, write in place, and trim the unused tail.
+    std::vector<uint32_t> out(std::min(a.size(), b.size()));
+    const auto out_end =
+            std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), out.begin());
+    out.erase(out_end, out.end());
+    return out;
+}
+
+// Replaces `*acc` with the sorted union of `*acc` and `next`; both inputs must
+// be sorted ascending.
+void union_sorted_into(std::vector<uint32_t>* acc, const std::vector<uint32_t>& next) {
+    // The union holds at most |acc| + |next| elements: size for that bound,
+    // merge in place, and trim the unused tail.
+    std::vector<uint32_t> merged(acc->size() + next.size());
+    const auto merged_end =
+            std::set_union(acc->begin(), acc->end(), next.begin(), next.end(), merged.begin());
+    merged.erase(merged_end, merged.end());
+    *acc = std::move(merged);
+}
+
+// Returns the sorted, de-duplicated union of `lists`. Every list is expected
+// to be sorted ascending; empty lists are ignored.
+//
+// Strategy by number of non-empty lists:
+//   0     -> empty result;
+//   1     -> a copy of that list, returned as-is;
+//   <= 8  -> linear scan over the current list heads for each emitted docid;
+//   more  -> k-way merge driven by a min-heap of per-list cursors.
+//
+// Only non-empty lists are tracked, so empty inputs add no per-docid cost on
+// the linear path, and the heap is built only when the heap path is taken.
+std::vector<uint32_t> union_sorted_many(const std::vector<std::vector<uint32_t>>& lists) {
+    constexpr size_t kLinearFanInMax = 8;
+
+    std::vector<const std::vector<uint32_t>*> active;
+    active.reserve(lists.size());
+    size_t largest = 0;
+    for (const std::vector<uint32_t>& docs : lists) {
+        if (docs.empty()) continue;
+        largest = std::max(largest, docs.size());
+        active.push_back(&docs);
+    }
+    if (active.empty()) return {};
+    if (active.size() == 1) return *active.front();
+
+    // The union is at least as long as the longest input.
+    std::vector<uint32_t> out;
+    out.reserve(largest);
+
+    if (active.size() <= kLinearFanInMax) {
+        std::vector<size_t> offsets(active.size(), 0);
+        for (;;) {
+            // Smallest head among the lists that still have elements.
+            bool found = false;
+            uint32_t next = 0;
+            for (size_t i = 0; i < active.size(); ++i) {
+                const std::vector<uint32_t>& docs = *active[i];
+                if (offsets[i] >= docs.size()) continue;
+                const uint32_t docid = docs[offsets[i]];
+                if (!found || docid < next) {
+                    found = true;
+                    next = docid;
+                }
+            }
+            if (!found) break;
+            if (out.empty() || out.back() != next) {
+                out.push_back(next);
+            }
+            // Advance every list past the emitted docid.
+            for (size_t i = 0; i < active.size(); ++i) {
+                const std::vector<uint32_t>& docs = *active[i];
+                while (offsets[i] < docs.size() && docs[offsets[i]] == next) {
+                    ++offsets[i];
+                }
+            }
+        }
+        return out;
+    }
+
+    struct Cursor {
+        uint32_t docid = 0;
+        size_t list = 0; // index into `active`
+        size_t offset = 0;
+    };
+    struct GreaterDocId {
+        bool operator()(const Cursor& a, const Cursor& b) const { return a.docid > b.docid; }
+    };
+
+    std::vector<Cursor> heap_storage;
+    heap_storage.reserve(active.size());
+    std::priority_queue<Cursor, std::vector<Cursor>, GreaterDocId> heap(GreaterDocId {},
+                                                                        std::move(heap_storage));
+    for (size_t i = 0; i < active.size(); ++i) {
+        heap.push(Cursor {active[i]->front(), i, 0});
+    }
+    while (!heap.empty()) {
+        const Cursor cur = heap.top();
+        heap.pop();
+        if (out.empty() || out.back() != cur.docid) {
+            out.push_back(cur.docid);
+        }
+        const size_t next_offset = cur.offset + 1;
+        const std::vector<uint32_t>& docs = *active[cur.list];
+        if (next_offset < docs.size()) {
+            heap.push(Cursor {docs[next_offset], cur.list, next_offset});
+        }
+    }
+    return out;
+}
+
+} // namespace snii::query::internal
diff --git a/be/src/storage/index/snii/core/src/query/docid_union.cpp b/be/src/storage/index/snii/core/src/query/docid_union.cpp
new file mode 100644
index 00000000000000..da4665a63d1280
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/docid_union.cpp
@@ -0,0 +1,31 @@
+#include "snii/query/internal/docid_union.h"
+
+#include <vector>
+
+#include "snii/query/internal/docid_set_ops.h"
+
+namespace snii::query::internal {
+
+// Materializes the union of the docids of `postings` into `*out`, which is
+// cleared first. All postings are read in one batch and then merged. A null
+// `out` is an InvalidArgument; an empty `postings` yields an empty result.
+Status build_docid_union(const snii::reader::LogicalIndexReader& idx,
+                         const std::vector<ResolvedDocidPosting>& postings,
+                         std::vector<uint32_t>* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("docid_union: null out");
+    }
+    out->clear();
+    if (!postings.empty()) {
+        std::vector<std::vector<uint32_t>> per_posting;
+        SNII_RETURN_IF_ERROR(read_docid_postings_batched(idx, postings, &per_posting));
+        *out = union_sorted_many(per_posting);
+    }
+    return Status::OK();
+}
+
+// Builds the docid union of `postings` and forwards it to `sink` in a single
+// `append_sorted` call. The sink is not touched when the union is empty. A
+// null `sink` is an InvalidArgument.
+Status emit_docid_union(const snii::reader::LogicalIndexReader& idx,
+                        const std::vector<ResolvedDocidPosting>& postings, DocIdSink* sink) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("docid_union: null sink");
+    }
+    std::vector<uint32_t> merged;
+    SNII_RETURN_IF_ERROR(build_docid_union(idx, postings, &merged));
+    if (!merged.empty()) {
+        return sink->append_sorted(merged);
+    }
+    return Status::OK();
+}
+
+} // namespace snii::query::internal
diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp
new file mode 100644
index 00000000000000..72db2d628513e0
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp
@@ -0,0 +1,1194 @@
+#include "snii/query/phrase_query.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/frq_pod.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/format/prx_pod.h"
+#include "snii/io/batch_range_fetcher.h"
+#include "snii/query/internal/docid_conjunction.h"
+#include "snii/query/internal/docid_set_ops.h"
+#include "snii/query/internal/position_math.h"
+#include "snii/query/prefix_query.h"
+#include "snii/query/term_query.h"
+#include "snii/reader/windowed_posting.h"
+
+// phrase_query implements MATCH_PHRASE with WINDOW (sub-block) SKIPPING for
+// high-df windowed terms (design spec section 6.2):
+//   1. Resolve every term; reject if any is absent.
+//   2. Batch-read each windowed term's prelude + each slim/inline term's full
+//      docid posting in one round; open the two-level prelude readers.
+//   3. Pick the DRIVER = smallest-df term; materialize it fully -> the initial
+//      candidate docid set.
+//   4. For every other term in ascending-df order, narrow the candidate set:
+//        - slim/inline: intersect with its (already decoded) full posting.
+//        - windowed:    locate_window() the CURRENT candidates -> the SET of
+//                       windows covering them; batch-fetch ONLY those windows'
+//                       .frq docid regions; keep candidates present in some
+//                       covering window. A high-df term thus reads
+//                       O(candidates) windows instead of its whole O(df)
+//                       posting.
+//   5. Fetch PRX only for retained chunks and run the positional phrase check
+//      (term[0]@p, term[1]@p+1, ...) on the survivors.
+// The result is identical to a full-read intersection; only the bytes read for
+// high-df windowed terms shrink.
+namespace snii::query {
+
+using snii::query::internal::DocidChunk;
+using snii::query::internal::DocidSource;
+using snii::query::internal::ResolvedQueryTerm;
+using snii::query::internal::TermPlan;
+using snii::reader::LogicalIndexReader;
+
+namespace {
+
+// Per-document record kept in ExpectedTailPositionSet::docs.
+// NOTE(review): positions_begin/positions_end look like a [begin, end) index
+// range into ExpectedTailPositionSet::positions -- confirm against the code
+// that fills this struct.
+struct ExpectedTailPositions {
+    uint32_t docid {0};
+    size_t positions_begin {0};
+    size_t positions_end {0};
+};
+
+// Per-document records plus one shared `positions` buffer.
+struct ExpectedTailPositionSet {
+    std::vector<ExpectedTailPositions> docs;
+    std::vector<uint32_t> positions;
+
+    // Empties both vectors.
+    void clear() {
+        positions.clear();
+        docs.clear();
+    }
+
+    // Reserves room for `count` entries in both vectors.
+    void reserve_docs(size_t count) {
+        positions.reserve(count);
+        docs.reserve(count);
+    }
+};
+
+// One decoded chunk of a term's posting: either a covering window of a windowed
+// term, or the single posting of a slim/inline term. `docids` is decoded once,
+// in the conjunction phase, and reused by the streaming cursor; `prx` holds the
+// on-disk positions bytes, which the cursor decodes lazily (once per chunk)
+// during phrase verification.
+struct PosChunk {
+    std::vector<uint32_t> docids; // ascending, absolute
+    // When empty, the chunk keeps every PRX doc in on-disk order. Otherwise
+    // `docids[i]` corresponds to on-disk local document ordinal
+    // `prx_doc_ordinals[i]`, which lets PRX decode skip the positions of docs
+    // removed by the docid-only conjunction.
+    std::vector<uint32_t> prx_doc_ordinals;
+    uint32_t prx_doc_count {0};
+    Slice prx; // .prx window bytes (reference fetcher/round1/entry)
+    bool windowed {false};
+    uint32_t window {0};
+};
+
+// A term's retained posting as an ordered list of chunks (windowed: covering
+// windows in docid order; slim/inline: one). The prx bytes the chunks refer to
+// live in `round1` / the per-term fetchers kept alive in phrase_query::owners
+// for the whole query, so the cursor can decode positions during verification.
+struct PosSource {
+    std::vector<PosChunk> chunks; // only chunks that still hold at least one docid
+};
+
+// Working state of one phrase query execution.
+struct PhraseExecutionState {
+    // Position sources; presumably one per term plan -- confirm against the
+    // code that fills this struct.
+    std::vector<PosSource> srcs;
+    // Fetchers whose buffers back the `prx` slices stored in `srcs`; they must
+    // stay alive for as long as those slices are read.
+    std::vector<std::unique_ptr<snii::io::BatchRangeFetcher>> owners;
+    // Candidate docids; presumably the survivors of the docid-only conjunction
+    // -- TODO confirm.
+    std::vector<uint32_t> candidates;
+};
+
+// Result of de-duplicating the terms of a phrase (see BuildPhraseTermMapping).
+struct PhraseTermMapping {
+    // Distinct terms, in order of first appearance in the phrase.
+    std::vector<std::string> unique_terms;
+    // For phrase position i, the index into `unique_terms` of the term there.
+    std::vector<size_t> phrase_plan_index;
+};
+
+// De-duplicates the terms of a phrase. `unique_terms` lists each distinct term
+// once, in order of first appearance; `phrase_plan_index[i]` is the index in
+// `unique_terms` of the term at phrase position i.
+PhraseTermMapping BuildPhraseTermMapping(const std::vector<std::string>& terms) {
+    PhraseTermMapping mapping;
+    std::vector<std::string>& uniques = mapping.unique_terms;
+    mapping.phrase_plan_index.reserve(terms.size());
+    for (const std::string& term : terms) {
+        const auto hit = std::find(uniques.begin(), uniques.end(), term);
+        // Take the index before any push_back: growing the vector invalidates
+        // `hit`. For an unseen term this equals the index it is about to get.
+        const size_t slot = static_cast<size_t>(std::distance(uniques.begin(), hit));
+        if (hit == uniques.end()) {
+            uniques.push_back(term);
+        }
+        mapping.phrase_plan_index.push_back(slot);
+    }
+    return mapping;
+}
+
+// Appends `ordinal` to `out` as a u32; an ordinal that does not fit in 32 bits
+// is reported as corruption and nothing is appended.
+Status append_prx_doc_ordinal(size_t ordinal, std::vector<uint32_t>* out) {
+    constexpr size_t kMaxOrdinal = std::numeric_limits<uint32_t>::max();
+    if (ordinal <= kMaxOrdinal) {
+        out->push_back(static_cast<uint32_t>(ordinal));
+        return Status::OK();
+    }
+    return Status::Corruption("phrase_query: prx doc ordinal exceeds u32");
+}
+
+// Appends the PRX document ordinal of the doc at `doc_index`. With an explicit
+// ordinal table the ordinal is looked up there; with an empty table the doc's
+// own index is used as its ordinal.
+Status append_selected_ordinal(size_t doc_index, const std::vector<uint32_t>& prx_doc_ordinals,
+                               std::vector<uint32_t>* selected_ordinals) {
+    if (prx_doc_ordinals.empty()) {
+        return append_prx_doc_ordinal(doc_index, selected_ordinals);
+    }
+    selected_ordinals->push_back(prx_doc_ordinals[doc_index]);
+    return Status::OK();
+}
+
+// Records one selected document: pushes its docid, then its PRX ordinal. The
+// docid is pushed even when appending the ordinal fails.
+Status append_selected_doc(size_t doc_index, uint32_t docid,
+                           const std::vector<uint32_t>& prx_doc_ordinals,
+                           std::vector<uint32_t>* selected_docids,
+                           std::vector<uint32_t>* selected_ordinals) {
+    selected_docids->push_back(docid);
+    Status status = append_selected_ordinal(doc_index, prx_doc_ordinals, selected_ordinals);
+    return status;
+}
+
+// Copies the first `count` docs (docids and their PRX ordinals) into the
+// selected vectors, after reserving `capacity` slots in each.
+Status materialize_selected_prefix(size_t count, size_t capacity,
+                                   const std::vector<uint32_t>& docids,
+                                   const std::vector<uint32_t>& prx_doc_ordinals,
+                                   std::vector<uint32_t>* selected_docids,
+                                   std::vector<uint32_t>* selected_ordinals) {
+    selected_docids->reserve(capacity);
+    selected_ordinals->reserve(capacity);
+    std::copy_n(docids.begin(), count, std::back_inserter(*selected_docids));
+    size_t doc_index = 0;
+    while (doc_index < count) {
+        SNII_RETURN_IF_ERROR(
+                append_selected_ordinal(doc_index, prx_doc_ordinals, selected_ordinals));
+        ++doc_index;
+    }
+    return Status::OK();
+}
+
+// Leaves "everything so far was selected" mode on its first call: clears
+// `*selected_all` and copies the first `count` docs into the selected vectors.
+// Calls made while the flag is already false do nothing.
+Status materialize_selected_prefix_if_needed(bool* selected_all, size_t count, size_t capacity,
+                                             const std::vector<uint32_t>& docids,
+                                             const std::vector<uint32_t>& prx_doc_ordinals,
+                                             std::vector<uint32_t>* selected_docids,
+                                             std::vector<uint32_t>* selected_ordinals) {
+    const bool was_selecting_all = std::exchange(*selected_all, false);
+    if (was_selecting_all) {
+        return materialize_selected_prefix(count, capacity, docids, prx_doc_ordinals,
+                                           selected_docids, selected_ordinals);
+    }
+    return Status::OK();
+}
+
+// Narrows a decoded chunk to the docs that are also in `candidates` and stores
+// the result in `chunk` (docids plus their PRX ordinals). Both `docids` and
+// `candidates` are walked as ascending lists.
+//
+// `chunk->prx_doc_count` is `prx_doc_count`, or the number of decoded docids
+// when that argument is 0.
+//
+// The copy is lazy: as long as every doc seen so far is a candidate nothing is
+// copied. The first miss materializes the matched prefix, after which matches
+// are appended one by one. When every doc matches, `*docids` and
+// `*prx_doc_ordinals` are moved into the chunk wholesale and left empty.
+Status SelectCandidateDocsForPrx(std::vector<uint32_t>* docids,
+                                 std::vector<uint32_t>* prx_doc_ordinals, uint32_t prx_doc_count,
+                                 const std::vector<uint32_t>& candidates, PosChunk* chunk) {
+    chunk->docids.clear();
+    chunk->prx_doc_ordinals.clear();
+
+    const size_t doc_total = docids->size();
+    if (prx_doc_count != 0) {
+        chunk->prx_doc_count = prx_doc_count;
+    } else if (doc_total > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("phrase_query: prx doc count exceeds u32");
+    } else {
+        chunk->prx_doc_count = static_cast<uint32_t>(doc_total);
+    }
+    if (doc_total == 0 || candidates.empty()) {
+        return Status::OK();
+    }
+    const bool has_ordinals = !prx_doc_ordinals->empty();
+    if (has_ordinals && prx_doc_ordinals->size() != doc_total) {
+        return Status::Corruption("phrase_query: prx ordinal/docid count mismatch");
+    }
+
+    std::vector<uint32_t> kept_docids;
+    std::vector<uint32_t> kept_ordinals;
+    bool keeps_everything = true;
+    const size_t kept_capacity = std::min(doc_total, candidates.size());
+
+    // Start at the first candidate that can match the chunk's first docid.
+    size_t cand_pos = static_cast<size_t>(
+            std::lower_bound(candidates.begin(), candidates.end(), docids->front()) -
+            candidates.begin());
+    for (size_t doc_pos = 0; doc_pos < doc_total; ++doc_pos) {
+        const uint32_t docid = (*docids)[doc_pos];
+        for (; cand_pos < candidates.size() && candidates[cand_pos] < docid; ++cand_pos) {
+        }
+        const bool exhausted = cand_pos == candidates.size();
+        if (exhausted || candidates[cand_pos] != docid) {
+            // First miss: snapshot the all-matching prefix [0, doc_pos).
+            SNII_RETURN_IF_ERROR(materialize_selected_prefix_if_needed(
+                    &keeps_everything, doc_pos, kept_capacity, *docids, *prx_doc_ordinals,
+                    &kept_docids, &kept_ordinals));
+            if (exhausted) break;
+            continue;
+        }
+
+        if (!keeps_everything) {
+            SNII_RETURN_IF_ERROR(append_selected_doc(doc_pos, docid, *prx_doc_ordinals,
+                                                     &kept_docids, &kept_ordinals));
+        }
+        ++cand_pos;
+    }
+
+    if (keeps_everything) {
+        // Every doc is a candidate: hand the decoded vectors over without copying.
+        chunk->docids = std::move(*docids);
+        chunk->prx_doc_ordinals = std::move(*prx_doc_ordinals);
+        docids->clear();
+        prx_doc_ordinals->clear();
+        return Status::OK();
+    }
+    if (kept_docids.empty()) {
+        return Status::OK();
+    }
+    chunk->docids = std::move(kept_docids);
+    chunk->prx_doc_ordinals = std::move(kept_ordinals);
+    return Status::OK();
+}
+
+// Builds the single position chunk of a non-windowed term and appends it to
+// `src` when it still holds at least one docid.
+//
+// Steps, in order:
+//   1. Take over the docids/ordinals of the term's first DocidChunk, if any.
+//   2. Locate the prx bytes: fetched through a new fetcher (which is handed to
+//      `owners`) for a pod-ref term, or the entry's inline `prx_bytes`.
+//   3. If no docids were taken over, decode them from the dd region (`round1`
+//      for a pod-ref term, the inline bytes otherwise).
+//   4. Keep the docids as-is when the doc source marks them as final
+//      candidates; otherwise narrow them to `candidates`.
+Status BuildFlatPositionSource(const LogicalIndexReader& idx,
+                               const snii::io::BatchRangeFetcher& round1, DocidSource* doc_source,
+                               const TermPlan& p, const std::vector<uint32_t>& candidates,
+                               std::vector<std::unique_ptr<snii::io::BatchRangeFetcher>>* owners,
+                               PosSource* src) {
+    PosChunk chunk;
+    std::vector<uint32_t> docids;
+    std::vector<uint32_t> prx_doc_ordinals;
+    const bool has_doc_chunk = !doc_source->chunks.empty();
+    const bool docids_are_final_candidates =
+            doc_source->docids_are_final_candidates && has_doc_chunk;
+    if (has_doc_chunk) {
+        DocidChunk& decoded = doc_source->chunks.front();
+        chunk.prx_doc_count = decoded.prx_doc_count;
+        docids = std::move(decoded.docids);
+        prx_doc_ordinals = std::move(decoded.prx_doc_ordinals);
+    }
+
+    if (!p.pod_ref) {
+        chunk.prx = Slice(p.entry.prx_bytes);
+    } else {
+        uint64_t prx_off = 0;
+        uint64_t prx_len = 0;
+        SNII_RETURN_IF_ERROR(idx.resolve_prx_window(p.entry, p.prx_base, &prx_off, &prx_len));
+        auto prx_fetcher = std::make_unique<snii::io::BatchRangeFetcher>(idx.reader());
+        const size_t prx_handle = prx_fetcher->add(prx_off, prx_len);
+        SNII_RETURN_IF_ERROR(prx_fetcher->fetch());
+        chunk.prx = prx_fetcher->get(prx_handle);
+        // The slice points into the fetcher's buffer; keep the fetcher alive.
+        owners->push_back(std::move(prx_fetcher));
+    }
+
+    if (docids.empty()) {
+        Slice dd;
+        if (p.pod_ref) {
+            dd = round1.get(p.frq_handle);
+        } else {
+            SNII_RETURN_IF_ERROR(internal::inline_dd_region(p.entry, &dd));
+        }
+        SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd, p.entry.dd_meta,
+                                                            /*win_base=*/0, &docids));
+        if (docids.size() > std::numeric_limits<uint32_t>::max()) {
+            return Status::Corruption("phrase_query: prx doc count exceeds u32");
+        }
+        chunk.prx_doc_count = static_cast<uint32_t>(docids.size());
+    }
+
+    if (docids_are_final_candidates) {
+        chunk.docids = std::move(docids);
+        chunk.prx_doc_ordinals = std::move(prx_doc_ordinals);
+    } else {
+        SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx(&docids, &prx_doc_ordinals,
+                                                       chunk.prx_doc_count, candidates, &chunk));
+    }
+    if (!chunk.docids.empty()) {
+        src->chunks.push_back(std::move(chunk));
+    }
+    return Status::OK();
+}
+
+// Cheap range test: true when some candidate lies within
+// [chunk.docids.front(), chunk.docids.back()]. It does not check that the
+// candidate is actually one of the chunk's docids.
+bool ChunkMayContainCandidate(const DocidChunk& chunk, const std::vector<uint32_t>& candidates) {
+    if (chunk.docids.empty() || candidates.empty()) {
+        return false;
+    }
+    const auto first_at_or_after =
+            std::lower_bound(candidates.begin(), candidates.end(), chunk.docids.front());
+    if (first_at_or_after == candidates.end()) {
+        return false;
+    }
+    return *first_at_or_after <= chunk.docids.back();
+}
+
+// Builds the position chunks of a windowed term. Each DocidChunk of the term
+// that still holds a candidate becomes one PosChunk in `src`, and the prx
+// ranges of all kept windows are fetched in one batch.
+//
+// When the doc source's docids are already final candidates, every chunk is
+// taken over as-is. Otherwise chunks whose docid range holds no candidate are
+// skipped and the rest are narrowed to `candidates`. The prx fetcher is handed
+// to `owners` only if at least one window was fetched, because the chunks'
+// `prx` slices point into its buffers.
+Status DecodeWindowedPositionSource(
+        const LogicalIndexReader& idx, const TermPlan& p, DocidSource* doc_source,
+        const std::vector<uint32_t>& candidates,
+        std::vector<std::unique_ptr<snii::io::BatchRangeFetcher>>* owners, PosSource* src) {
+    // (index into src->chunks, handle in prx_fetcher) of every kept window.
+    using PendingPrx = std::pair<size_t, size_t>;
+
+    auto prx_fetcher = std::make_unique<snii::io::BatchRangeFetcher>(
+            idx.reader(), snii::reader::kSameTermCoalesceGap);
+    std::vector<PendingPrx> pending;
+    pending.reserve(doc_source->chunks.size());
+    const bool docids_final = doc_source->docids_are_final_candidates;
+
+    for (DocidChunk& doc_chunk : doc_source->chunks) {
+        if (!docids_final && !ChunkMayContainCandidate(doc_chunk, candidates)) {
+            continue;
+        }
+        if (!doc_chunk.windowed) {
+            return Status::Corruption("phrase_query: expected windowed doc chunk");
+        }
+
+        PosChunk chunk;
+        if (!docids_final) {
+            SNII_RETURN_IF_ERROR(
+                    SelectCandidateDocsForPrx(&doc_chunk.docids, &doc_chunk.prx_doc_ordinals,
+                                              doc_chunk.prx_doc_count, candidates, &chunk));
+        } else {
+            chunk.docids = std::move(doc_chunk.docids);
+            chunk.prx_doc_ordinals = std::move(doc_chunk.prx_doc_ordinals);
+            chunk.prx_doc_count = doc_chunk.prx_doc_count;
+        }
+        if (chunk.docids.empty()) {
+            continue;
+        }
+
+        snii::reader::WindowAbsRange range;
+        SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range(
+                idx, p.entry, p.frq_base, p.prx_base, p.prelude, doc_chunk.window,
+                /*want_positions=*/true, /*want_freq=*/false, &range));
+        chunk.windowed = true;
+        chunk.window = doc_chunk.window;
+
+        const size_t chunk_index = src->chunks.size();
+        const size_t prx_handle = prx_fetcher->add(range.prx_off, range.prx_len);
+        pending.emplace_back(chunk_index, prx_handle);
+        src->chunks.push_back(std::move(chunk));
+    }
+    if (prx_fetcher->pending() > 0) {
+        SNII_RETURN_IF_ERROR(prx_fetcher->fetch());
+    }
+
+    // Bind each kept chunk to its fetched prx bytes.
+    for (const PendingPrx& item : pending) {
+        src->chunks[item.first].prx = prx_fetcher->get(item.second);
+    }
+    if (!pending.empty()) {
+        owners->push_back(std::move(prx_fetcher));
+    }
+    return Status::OK();
+}
+
+// Produces one PosSource per term plan. `srcs` is reset to plans.size()
+// default-constructed entries, then each plan is routed to the windowed or the
+// flat builder depending on `TermPlan::windowed`. The first failing builder
+// aborts the whole call with its status.
+Status BuildPositionSourcesForCandidates(
+        const LogicalIndexReader& idx, const snii::io::BatchRangeFetcher& round1,
+        const std::vector<TermPlan>& plans, std::vector<DocidSource>* doc_sources,
+        const std::vector<uint32_t>& candidates,
+        std::vector<std::unique_ptr<snii::io::BatchRangeFetcher>>* owners,
+        std::vector<PosSource>* srcs) {
+    const size_t plan_count = plans.size();
+    srcs->assign(plan_count, PosSource {});
+    for (size_t term = 0; term < plan_count; ++term) {
+        const TermPlan& plan = plans[term];
+        DocidSource* const doc_source = &(*doc_sources)[term];
+        PosSource* const pos_source = &(*srcs)[term];
+        if (plan.windowed) {
+            SNII_RETURN_IF_ERROR(DecodeWindowedPositionSource(idx, plan, doc_source, candidates,
+                                                              owners, pos_source));
+        } else {
+            SNII_RETURN_IF_ERROR(BuildFlatPositionSource(idx, round1, doc_source, plan,
+                                                         candidates, owners, pos_source));
+        }
+    }
+    return Status::OK();
+}
+
+// Decodes the .prx payload of one PosChunk into a flat CSR layout
+// (`pflat_` holds positions, `poff_` holds per-doc offsets into it) and serves
+// per-doc position spans out of those buffers. The buffers are members, so
+// their capacity is reused from one decode() to the next.
+//
+// Three decode modes exist:
+//   * no prx ordinals   -> full window decode, offsets addressed by doc index;
+//   * "dense" selection -> full window decode, offsets addressed by prx ordinal;
+//   * sparse selection  -> selective decode, offsets addressed by doc index.
+//
+// decode() validates everything positions_unchecked() later relies on (ordinal
+// count and range, offset monotonicity, final offset), so the unchecked
+// accessor cannot index out of bounds after a successful decode.
+class PosChunkDecoder {
+public:
+    // Detaches the decoder from any chunk; buffers keep their capacity.
+    void reset() {
+        chunk_ = nullptr;
+        offsets_by_prx_ordinal_ = false;
+    }
+
+    // Decodes `chunk.prx`. On success the decoder stays bound to `chunk`, which
+    // must outlive subsequent positions*() calls. On failure the decoder is
+    // left detached so positions() reports an error instead of reading
+    // half-decoded buffers.
+    Status decode(const PosChunk& chunk) {
+        chunk_ = nullptr;
+        offsets_by_prx_ordinal_ = false;
+        ByteSource ps(chunk.prx);
+        if (chunk.prx_doc_ordinals.empty()) {
+            SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_));
+        } else if (should_decode_full_prx_window(chunk)) {
+            SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_));
+            offsets_by_prx_ordinal_ = true;
+        } else {
+            SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr_selective(
+                    &ps, chunk.prx_doc_ordinals, &pflat_, &poff_));
+        }
+        if (offsets_by_prx_ordinal_) {
+            if (poff_.size() != static_cast<size_t>(chunk.prx_doc_count) + 1) {
+                return Status::Corruption("phrase_query: full prx doc-count mismatch");
+            }
+            // Ordinals are looked up by doc index, so they must be parallel to
+            // docids and each must name a doc inside the decoded window.
+            if (chunk.prx_doc_ordinals.size() != chunk.docids.size()) {
+                return Status::Corruption("phrase_query: prx ordinal/doc-count mismatch");
+            }
+            for (const auto ordinal : chunk.prx_doc_ordinals) {
+                if (static_cast<uint64_t>(ordinal) >= chunk.prx_doc_count) {
+                    return Status::Corruption("phrase_query: prx ordinal out of range");
+                }
+            }
+        } else if (poff_.size() != chunk.docids.size() + 1) {
+            return Status::Corruption("phrase_query: selected prx/doc-count mismatch");
+        }
+        if (poff_.back() > pflat_.size()) {
+            return Status::Corruption("phrase_query: prx final offset out of range");
+        }
+        // Non-decreasing offsets plus a bounded final offset keep every
+        // [begin, end) span well-formed and inside pflat_.
+        for (size_t i = 1; i < poff_.size(); ++i) {
+            if (poff_[i - 1] > poff_[i]) {
+                return Status::Corruption("phrase_query: prx offsets not monotonic");
+            }
+        }
+        chunk_ = &chunk;
+        return Status::OK();
+    }
+
+    // Bounds-checked lookup of the position span of the doc at `doc_index`
+    // within the decoded chunk. An empty span is reported as
+    // {nullptr, nullptr}. Returns Corruption when nothing is decoded or an
+    // index / offset is out of range.
+    Status positions(size_t doc_index, std::pair<const uint32_t*, const uint32_t*>* out) const {
+        if (chunk_ == nullptr || doc_index >= chunk_->docids.size()) {
+            return Status::Corruption("phrase_query: decoded chunk doc index out of range");
+        }
+        const size_t pos_index =
+                offsets_by_prx_ordinal_ ? chunk_->prx_doc_ordinals[doc_index] : doc_index;
+        if (pos_index + 1 >= poff_.size()) {
+            return Status::Corruption("phrase_query: prx ordinal offset out of range");
+        }
+        const uint32_t begin = poff_[pos_index];
+        const uint32_t end = poff_[pos_index + 1];
+        if (begin == end) {
+            *out = {nullptr, nullptr};
+            return Status::OK();
+        }
+        if (end > pflat_.size()) {
+            return Status::Corruption("phrase_query: prx offset out of range");
+        }
+        *out = {pflat_.data() + begin, pflat_.data() + end};
+        return Status::OK();
+    }
+
+    // Same lookup as positions() without any checks. Only valid after a
+    // successful decode() and with doc_index < chunk docids.size().
+    inline __attribute__((always_inline)) std::pair<const uint32_t*, const uint32_t*>
+    positions_unchecked(size_t doc_index) const {
+        const size_t pos_index =
+                offsets_by_prx_ordinal_ ? chunk_->prx_doc_ordinals[doc_index] : doc_index;
+        const uint32_t begin = poff_[pos_index];
+        const uint32_t end = poff_[pos_index + 1];
+        if (begin == end) {
+            return {nullptr, nullptr};
+        }
+        return {pflat_.data() + begin, pflat_.data() + end};
+    }
+
+private:
+    // A full-window decode is chosen when the selected docs make up at least
+    // half of the docs stored in the window.
+    static bool should_decode_full_prx_window(const PosChunk& chunk) {
+        return chunk.prx_doc_count != 0 &&
+               static_cast<uint64_t>(chunk.prx_doc_ordinals.size()) * 2 >= chunk.prx_doc_count;
+    }
+
+    const PosChunk* chunk_ = nullptr;     // chunk of the last successful decode
+    bool offsets_by_prx_ordinal_ = false; // poff_ indexed by prx ordinal, not doc index
+    std::vector<uint32_t> pflat_;         // decoded positions, all docs back to back
+    std::vector<uint32_t> poff_;          // CSR offsets into pflat_
+};
+
+// Forward-only cursor over the retained position chunks of a single term.
+//
+// Callers feed it ascending docids. The cursor walks chunk by chunk and doc by
+// doc without ever moving backwards, and decodes a chunk's positions lazily,
+// at most once while that chunk is current (the decoder's CSR buffers are
+// reused for the next chunk). Positions are addressed through the doc's local
+// index inside its chunk, so there is no per-doc allocation and no binary
+// search per candidate. All chunk bytes are expected to be in memory already
+// (fetched in batch beforehand); the cursor itself performs no I/O.
+class PostingCursor {
+public:
+    // Rebinds the cursor to `src` and rewinds it to the first chunk.
+    void init(const PosSource* src) {
+        src_ = src;
+        ci_ = 0;
+        li_ = 0;
+        decoded_pos_chunk_ = kNoChunk;
+        decoder_.reset();
+    }
+
+    // Advances (never rewinds) until the cursor sits on `target`. Candidates
+    // are expected to be present in these chunks; a missing docid is reported
+    // as Corruption.
+    Status seek(uint32_t target) {
+        const size_t chunk_count = src_->chunks.size();
+        while (ci_ < chunk_count) {
+            const std::vector<uint32_t>& ids = src_->chunks[ci_].docids;
+            if (!ids.empty() && ids.back() >= target) {
+                break;
+            }
+            ++ci_;
+            li_ = 0;
+        }
+        if (ci_ >= chunk_count) {
+            return Status::Corruption("phrase_query: cursor exhausted before target docid");
+        }
+        const std::vector<uint32_t>& ids = src_->chunks[ci_].docids;
+        for (; li_ < ids.size(); ++li_) {
+            if (ids[li_] >= target) {
+                break;
+            }
+        }
+        if (li_ < ids.size() && ids[li_] == target) {
+            return Status::OK();
+        }
+        return Status::Corruption("phrase_query: candidate missing from posting chunk");
+    }
+
+    // Yields the [begin,end) position span of the doc the cursor is on. The
+    // current chunk's .prx is decoded on first use and then cached until the
+    // cursor moves to another chunk.
+    Status positions(std::pair<const uint32_t*, const uint32_t*>* out) {
+        const bool chunk_in_range = ci_ < src_->chunks.size();
+        if (!chunk_in_range || li_ >= src_->chunks[ci_].docids.size()) {
+            return Status::Corruption("phrase_query: cursor positions out of range");
+        }
+        if (decoded_pos_chunk_ != ci_) {
+            SNII_RETURN_IF_ERROR(decoder_.decode(src_->chunks[ci_]));
+            decoded_pos_chunk_ = ci_;
+        }
+        return decoder_.positions(li_, out);
+    }
+
+    // Emits the next doc in posting order together with its positions, then
+    // steps past it. Running out of docs is reported as Corruption.
+    Status next(uint32_t* docid, std::pair<const uint32_t*, const uint32_t*>* out) {
+        const size_t chunk_count = src_->chunks.size();
+        // Skip chunks that are empty or already fully consumed.
+        while (ci_ < chunk_count && li_ >= src_->chunks[ci_].docids.size()) {
+            ++ci_;
+            li_ = 0;
+        }
+        if (ci_ >= chunk_count) {
+            return Status::Corruption("phrase_query: cursor exhausted before next docid");
+        }
+        *docid = src_->chunks[ci_].docids[li_];
+        SNII_RETURN_IF_ERROR(positions(out));
+        ++li_;
+        return Status::OK();
+    }
+
+private:
+    // Sentinel for "decoder_ holds no chunk".
+    static constexpr size_t kNoChunk = static_cast<size_t>(-1);
+
+    const PosSource* src_ = nullptr;
+    size_t ci_ = 0;                       // index of the current chunk
+    size_t li_ = 0;                       // index of the current doc inside that chunk
+    size_t decoded_pos_chunk_ = kNoChunk; // chunk whose positions decoder_ holds
+    PosChunkDecoder decoder_;
+};
+
+// Lazily loads, per candidate doc, the position span of each term plan and
+// memoizes it for the duration of that doc. One PostingCursor is kept per plan;
+// an epoch counter marks which cached spans belong to the current doc, so
+// phrase slots that share a plan reuse a single seek + decode.
+class PhrasePositionLoader {
+public:
+    // Binds cursor i to srcs[i] for every i in [0, plan_count).
+    PhrasePositionLoader(size_t plan_count, std::vector<PosSource>& srcs)
+            : cursors_(plan_count), plan_spans_(plan_count), loaded_epoch_(plan_count, 0) {
+        for (size_t plan = 0; plan < plan_count; ++plan) {
+            cursors_[plan].init(&srcs[plan]);
+        }
+    }
+
+    // Starts a new doc: all cached spans become stale by bumping the epoch.
+    void begin_doc(uint32_t docid) {
+        docid_ = docid;
+        if (++epoch_ != 0) {
+            return;
+        }
+        // The counter wrapped; wipe the stamps so old entries cannot collide
+        // with a reused epoch value, and restart from 1 (0 means "never").
+        std::ranges::fill(loaded_epoch_, 0);
+        epoch_ = 1;
+    }
+
+    // Writes to `out` the position span of the plan behind phrase slot
+    // `phrase_pos` for the current doc, seeking that plan's cursor only on the
+    // first request within the doc.
+    Status positions_for_phrase_pos(const std::vector<size_t>& phrase_plan_index, size_t phrase_pos,
+                                    std::pair<const uint32_t*, const uint32_t*>* out) {
+        const size_t plan = phrase_plan_index[phrase_pos];
+        const bool cached = loaded_epoch_[plan] == epoch_;
+        if (!cached) {
+            PostingCursor& cursor = cursors_[plan];
+            SNII_RETURN_IF_ERROR(cursor.seek(docid_));
+            SNII_RETURN_IF_ERROR(cursor.positions(&plan_spans_[plan]));
+            loaded_epoch_[plan] = epoch_;
+        }
+        *out = plan_spans_[plan];
+        return Status::OK();
+    }
+
+private:
+    std::vector<PostingCursor> cursors_;                                  // one per plan
+    std::vector<std::pair<const uint32_t*, const uint32_t*>> plan_spans_; // cached spans
+    std::vector<uint32_t> loaded_epoch_; // epoch at which each span was loaded
+    uint32_t docid_ = 0;                 // doc set by the last begin_doc()
+    uint32_t epoch_ = 0;                 // current doc's stamp
+};
+
+// Returns true when some position p in `left_span` has p + right_delta present
+// in `right_span`. Both spans are walked once with forward-only pointers,
+// which relies on each span being sorted ascending. A left position whose sum
+// would overflow uint32_t ends the search with `false`.
+bool ContainsTwoTermPhrase(std::pair<const uint32_t*, const uint32_t*> left_span,
+                           std::pair<const uint32_t*, const uint32_t*> right_span,
+                           uint32_t right_delta) {
+    const uint32_t* const left_end = left_span.second;
+    const uint32_t* const right_end = right_span.second;
+    const uint32_t highest_start = std::numeric_limits<uint32_t>::max() - right_delta;
+    const uint32_t* right_it = right_span.first;
+    for (const uint32_t* left_it = left_span.first; left_it != left_end && right_it != right_end;
+         ++left_it) {
+        if (*left_it > highest_start) {
+            return false;
+        }
+        const uint32_t target = *left_it + right_delta;
+        // Skip right positions that are too small for this and any later start.
+        for (; right_it != right_end && *right_it < target; ++right_it) {
+        }
+        if (right_it == right_end) {
+            return false;
+        }
+        if (*right_it == target) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Picks the adjacent phrase pair (left, left + 1) whose summed `df` is the
+// smallest and returns `left`. Ties keep the earliest pair; with fewer than
+// two phrase slots the result is 0.
+size_t SelectPhraseVerificationPair(const std::vector<TermPlan>& plans,
+                                    const std::vector<size_t>& phrase_plan_index) {
+    const size_t phrase_len = phrase_plan_index.size();
+    size_t chosen = 0;
+    uint64_t lowest_cost = std::numeric_limits<uint64_t>::max();
+    for (size_t right = 1; right < phrase_len; ++right) {
+        const size_t left = right - 1;
+        const uint64_t cost = static_cast<uint64_t>(plans[phrase_plan_index[left]].df) +
+                              plans[phrase_plan_index[right]].df;
+        if (cost >= lowest_cost) {
+            continue;
+        }
+        lowest_cost = cost;
+        chosen = left;
+    }
+    return chosen;
+}
+
+// Fills `starts` (cleared first) with the phrase start of every left position p
+// for which p + right_delta occurs in `right_span`. The start is reported as
+// p - left_offset; matches with p < left_offset are dropped because the start
+// would be negative. Both spans are scanned once, forward only, so they must
+// be sorted ascending. Scanning stops early once p + right_delta would
+// overflow or the right span is exhausted.
+void CollectTwoTermPhraseStarts(std::pair<const uint32_t*, const uint32_t*> left_span,
+                                std::pair<const uint32_t*, const uint32_t*> right_span,
+                                uint32_t right_delta, uint32_t left_offset,
+                                std::vector<uint32_t>& starts) {
+    starts.clear();
+    const uint32_t* const left_end = left_span.second;
+    const uint32_t* const right_end = right_span.second;
+    const uint32_t highest_left = std::numeric_limits<uint32_t>::max() - right_delta;
+    const uint32_t* right_it = right_span.first;
+    for (const uint32_t* left_it = left_span.first; left_it != left_end && right_it != right_end;
+         ++left_it) {
+        const uint32_t left_pos = *left_it;
+        if (left_pos > highest_left) {
+            return;
+        }
+        const uint32_t target = left_pos + right_delta;
+        for (; right_it != right_end && *right_it < target; ++right_it) {
+        }
+        if (right_it == right_end) {
+            return;
+        }
+        const bool adjacent = *right_it == target;
+        if (adjacent && left_pos >= left_offset) {
+            starts.push_back(left_pos - left_offset);
+        }
+    }
+}
+
+// Verifies a two-term phrase by stepping posting cursors in lockstep with the
+// candidate list: every cursor.next() must produce exactly the next candidate
+// docid, otherwise Corruption is returned. Docs in which the second term
+// occurs `position_offsets[1] - position_offsets[0]` after the first are
+// appended to `docids`.
+//
+// When both phrase slots map to the same plan a single cursor is used and the
+// span is matched against itself.
+Status EmitTwoTermPhraseStreaming(const std::vector<size_t>& phrase_plan_index,
+                                  const std::vector<uint32_t>& position_offsets,
+                                  std::vector<PosSource>& srcs,
+                                  const std::vector<uint32_t>& candidates,
+                                  std::vector<uint32_t>* docids) {
+    using Span = std::pair<const uint32_t*, const uint32_t*>;
+    const size_t first_plan = phrase_plan_index[0];
+    const size_t second_plan = phrase_plan_index[1];
+    const uint32_t gap = position_offsets[1] - position_offsets[0];
+
+    if (first_plan != second_plan) {
+        PostingCursor first_cursor;
+        PostingCursor second_cursor;
+        first_cursor.init(&srcs[first_plan]);
+        second_cursor.init(&srcs[second_plan]);
+        for (const uint32_t candidate : candidates) {
+            uint32_t first_docid = 0;
+            uint32_t second_docid = 0;
+            Span first_span;
+            Span second_span;
+            SNII_RETURN_IF_ERROR(first_cursor.next(&first_docid, &first_span));
+            SNII_RETURN_IF_ERROR(second_cursor.next(&second_docid, &second_span));
+            const bool aligned = first_docid == candidate && second_docid == candidate;
+            if (!aligned) {
+                return Status::Corruption("phrase_query: two-term cursor/docid mismatch");
+            }
+            if (ContainsTwoTermPhrase(first_span, second_span, gap)) {
+                docids->push_back(candidate);
+            }
+        }
+        return Status::OK();
+    }
+
+    // Repeated term: one cursor, one span, matched against itself.
+    PostingCursor shared_cursor;
+    shared_cursor.init(&srcs[first_plan]);
+    for (const uint32_t candidate : candidates) {
+        uint32_t docid = 0;
+        Span span;
+        SNII_RETURN_IF_ERROR(shared_cursor.next(&docid, &span));
+        if (docid != candidate) {
+            return Status::Corruption("phrase_query: repeated-term cursor/docid mismatch");
+        }
+        if (ContainsTwoTermPhrase(span, span, gap)) {
+            docids->push_back(docid);
+        }
+    }
+    return Status::OK();
+}
+
+// Merge-joins the docids of two already decoded chunks and appends to `docids`
+// every shared doc in which the right term occurs `right_delta` positions after
+// the left term. Both chunks must be non-empty (front() is read), sorted
+// ascending, and the decoders must currently hold `left` / `right`, because
+// positions are read through the unchecked accessor.
+void EmitTwoTermPhraseChunkPair(const PosChunk& left, const PosChunk& right,
+                                const PosChunkDecoder& left_decoder,
+                                const PosChunkDecoder& right_decoder, uint32_t right_delta,
+                                std::vector<uint32_t>& docids) {
+    const auto& left_ids = left.docids;
+    const auto& right_ids = right.docids;
+    // Start each side at the first doc that can possibly appear on the other.
+    auto left_it = std::lower_bound(left_ids.begin(), left_ids.end(), right_ids.front());
+    auto right_it = std::lower_bound(right_ids.begin(), right_ids.end(), left_ids.front());
+    while (left_it != left_ids.end() && right_it != right_ids.end()) {
+        if (*left_it != *right_it) {
+            if (*left_it < *right_it) {
+                ++left_it;
+            } else {
+                ++right_it;
+            }
+            continue;
+        }
+
+        const size_t left_index = static_cast<size_t>(left_it - left_ids.begin());
+        const size_t right_index = static_cast<size_t>(right_it - right_ids.begin());
+        const auto left_positions = left_decoder.positions_unchecked(left_index);
+        const auto right_positions = right_decoder.positions_unchecked(right_index);
+        if (ContainsTwoTermPhrase(left_positions, right_positions, right_delta)) {
+            docids.push_back(*left_it);
+        }
+        ++left_it;
+        ++right_it;
+    }
+}
+
+// Verifies a two-term phrase by merging the two terms' chunk lists directly,
+// without consulting a candidate list: chunks whose docid ranges overlap are
+// decoded (each at most once while it stays current) and joined doc by doc via
+// EmitTwoTermPhraseChunkPair, which appends matching docids to `docids`.
+//
+// The advance logic relies on chunk docids being ascending within and across
+// chunks. Returns the first decode error, otherwise OK.
+Status EmitTwoTermPhraseChunkMerge(const std::vector<size_t>& phrase_plan_index,
+                                   const std::vector<uint32_t>& position_offsets,
+                                   std::vector<PosSource>& srcs,
+                                   std::vector<uint32_t>* const docids) {
+    const size_t left_plan = phrase_plan_index[0];
+    const size_t right_plan = phrase_plan_index[1];
+    // Required distance between a left-term position and a right-term position.
+    const uint32_t right_delta = position_offsets[1] - position_offsets[0];
+    const PosSource& left_src = srcs[left_plan];
+    const PosSource& right_src = srcs[right_plan];
+
+    PosChunkDecoder left_decoder;
+    PosChunkDecoder right_decoder;
+    // Index of the chunk each decoder currently holds; size_t(-1) = none yet.
+    size_t decoded_left_chunk = static_cast<size_t>(-1);
+    size_t decoded_right_chunk = static_cast<size_t>(-1);
+    size_t left_chunk = 0;
+    size_t right_chunk = 0;
+    while (left_chunk < left_src.chunks.size() && right_chunk < right_src.chunks.size()) {
+        const PosChunk& left = left_src.chunks[left_chunk];
+        const PosChunk& right = right_src.chunks[right_chunk];
+        // Empty chunks carry nothing to join.
+        if (left.docids.empty()) {
+            ++left_chunk;
+            continue;
+        }
+        if (right.docids.empty()) {
+            ++right_chunk;
+            continue;
+        }
+        // Disjoint docid ranges: drop the chunk that ends first, undecoded.
+        if (left.docids.back() < right.docids.front()) {
+            ++left_chunk;
+            continue;
+        }
+        if (right.docids.back() < left.docids.front()) {
+            ++right_chunk;
+            continue;
+        }
+
+        // Ranges overlap: decode lazily, reusing a decode when one side's
+        // chunk stays current across several chunks of the other side.
+        if (decoded_left_chunk != left_chunk) {
+            SNII_RETURN_IF_ERROR(left_decoder.decode(left));
+            decoded_left_chunk = left_chunk;
+        }
+        if (decoded_right_chunk != right_chunk) {
+            SNII_RETURN_IF_ERROR(right_decoder.decode(right));
+            decoded_right_chunk = right_chunk;
+        }
+
+        EmitTwoTermPhraseChunkPair(left, right, left_decoder, right_decoder, right_delta, *docids);
+
+        // Retire whichever chunk ends first; both when they end on the same doc.
+        const uint32_t left_last = left.docids.back();
+        const uint32_t right_last = right.docids.back();
+        if (left_last <= right_last) {
+            ++left_chunk;
+        }
+        if (right_last <= left_last) {
+            ++right_chunk;
+        }
+    }
+    return Status::OK();
+}
+
+// Checks a tentative phrase `start` against every phrase slot except the two
+// already verified (`pair_left`, `pair_right`): slot t must contain the
+// position start + position_offsets[t]. An offset addition rejected by
+// internal::add_position_offset counts as a miss. Spans are binary-searched,
+// so they must be sorted ascending.
+bool PhraseStartMatchesAllTerms(
+        uint32_t start, size_t phrase_len, size_t pair_left, size_t pair_right,
+        const std::vector<uint32_t>& position_offsets,
+        const std::vector<std::pair<const uint32_t*, const uint32_t*>>& span) {
+    for (size_t slot = 0; slot < phrase_len; ++slot) {
+        const bool already_verified = slot == pair_left || slot == pair_right;
+        if (already_verified) {
+            continue;
+        }
+        uint32_t expected = 0;
+        const bool in_range =
+                internal::add_position_offset(start, position_offsets[slot], &expected);
+        if (!in_range || !std::binary_search(span[slot].first, span[slot].second, expected)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Single-slot phrase: a candidate matches as soon as the term has at least one
+// position in it. Candidates are visited in order and matching docids are
+// appended to `docids`.
+Status EmitSingleTermPhraseStreaming(const std::vector<size_t>& phrase_plan_index,
+                                     std::vector<PosSource>& srcs,
+                                     const std::vector<uint32_t>& candidates,
+                                     std::vector<uint32_t>* docids) {
+    PhrasePositionLoader loader(srcs.size(), srcs);
+    for (const uint32_t candidate : candidates) {
+        loader.begin_doc(candidate);
+        std::pair<const uint32_t*, const uint32_t*> term_positions;
+        SNII_RETURN_IF_ERROR(
+                loader.positions_for_phrase_pos(phrase_plan_index, 0, &term_positions));
+        const bool has_positions = term_positions.first != term_positions.second;
+        if (has_positions) {
+            docids->push_back(candidate);
+        }
+    }
+    return Status::OK();
+}
+
+// Phrase verification for three or more slots. For each candidate doc:
+//   1. load only the "anchor" pair -- the adjacent slots chosen by
+//      SelectPhraseVerificationPair -- and collect the phrase starts at which
+//      that pair lines up;
+//   2. if there are none, move on without touching the other terms;
+//   3. otherwise load the remaining slots and accept the doc on the first
+//      start that every other slot confirms.
+// Matching docids are appended to `docids` in candidate order.
+Status EmitMultiTermPhraseStreaming(const std::vector<TermPlan>& plans,
+                                    const std::vector<size_t>& phrase_plan_index,
+                                    const std::vector<uint32_t>& position_offsets,
+                                    std::vector<PosSource>& srcs,
+                                    const std::vector<uint32_t>& candidates,
+                                    std::vector<uint32_t>* docids) {
+    using Span = std::pair<const uint32_t*, const uint32_t*>;
+    const size_t slot_count = phrase_plan_index.size();
+    PhrasePositionLoader loader(plans.size(), srcs);
+    std::vector<Span> spans(slot_count);
+    std::vector<uint32_t> anchor_starts;
+    const size_t anchor_left = SelectPhraseVerificationPair(plans, phrase_plan_index);
+    const size_t anchor_right = anchor_left + 1;
+    for (const uint32_t candidate : candidates) {
+        loader.begin_doc(candidate);
+        Span anchor_left_span;
+        Span anchor_right_span;
+        SNII_RETURN_IF_ERROR(
+                loader.positions_for_phrase_pos(phrase_plan_index, anchor_left, &anchor_left_span));
+        SNII_RETURN_IF_ERROR(loader.positions_for_phrase_pos(phrase_plan_index, anchor_right,
+                                                             &anchor_right_span));
+
+        const uint32_t anchor_delta =
+                position_offsets[anchor_right] - position_offsets[anchor_left];
+        CollectTwoTermPhraseStarts(anchor_left_span, anchor_right_span, anchor_delta,
+                                   position_offsets[anchor_left], anchor_starts);
+        if (!anchor_starts.empty()) {
+            spans[anchor_left] = anchor_left_span;
+            spans[anchor_right] = anchor_right_span;
+            // Only now pay for decoding the terms outside the anchor pair.
+            for (size_t slot = 0; slot < slot_count; ++slot) {
+                if (slot != anchor_left && slot != anchor_right) {
+                    SNII_RETURN_IF_ERROR(loader.positions_for_phrase_pos(phrase_plan_index, slot,
+                                                                         &spans[slot]));
+                }
+            }
+
+            bool verified = false;
+            for (size_t s = 0; s < anchor_starts.size() && !verified; ++s) {
+                verified = PhraseStartMatchesAllTerms(anchor_starts[s], slot_count, anchor_left,
+                                                      anchor_right, position_offsets, spans);
+            }
+            if (verified) {
+                docids->push_back(candidate);
+            }
+        }
+    }
+    return Status::OK();
+}
+
+// Entry point of phrase verification: dispatches on the number of phrase slots.
+//   * 1 slot  -> a doc matches when the term has any position in it;
+//   * 2 slots -> distinct plans are joined chunk-against-chunk; a repeated
+//                term is streamed with a single cursor;
+//   * else    -> the cheapest adjacent pair (by df) is tested first and the
+//                remaining terms are decoded only for docs that pass it.
+// Positions are gathered lazily, each retained chunk is decoded at most once
+// while current, and positions are addressed by local index, so there is no
+// per-candidate docid binary search and no up-front materialization of every
+// candidate's positions. Candidates are ascending, hence the emitted docids
+// come out sorted.
+Status EmitPhraseStreaming(const std::vector<TermPlan>& plans,
+                           const std::vector<size_t>& phrase_plan_index,
+                           const std::vector<uint32_t>& position_offsets,
+                           std::vector<PosSource>& srcs, const std::vector<uint32_t>& candidates,
+                           std::vector<uint32_t>* docids) {
+    switch (phrase_plan_index.size()) {
+    case 1:
+        return EmitSingleTermPhraseStreaming(phrase_plan_index, srcs, candidates, docids);
+    case 2: {
+        const bool repeated_term = phrase_plan_index[0] == phrase_plan_index[1];
+        if (repeated_term) {
+            return EmitTwoTermPhraseStreaming(phrase_plan_index, position_offsets, srcs,
+                                              candidates, docids);
+        }
+        return EmitTwoTermPhraseChunkMerge(phrase_plan_index, position_offsets, srcs, docids);
+    }
+    default:
+        return EmitMultiTermPhraseStreaming(plans, phrase_plan_index, position_offsets, srcs,
+                                            candidates, docids);
+    }
+}
+
+// Runs the shared front half of a phrase query and leaves the result in
+// `state`:
+//   1. flushes any ranges still pending on `round1`;
+//   2. opens the term preludes (positions required);
+//   3. computes the docid-only conjunction into `state->candidates`;
+//   4. if candidates exist, builds one position source per plan into
+//      `state->srcs`, registering the fetchers that back them in
+//      `state->owners`.
+//
+// `state` is fully reset first. `srcs` is cleared together with `owners`
+// because position chunks take their `prx` payload from those fetchers
+// (NOTE(review): presumably as non-owning views); keeping old sources after
+// dropping the owners would leave a reused state with dangling chunks on the
+// empty-candidate early return.
+//
+// Returns the first error from any step; an empty `state->candidates` with OK
+// means "no match".
+Status BuildPhraseExecutionState(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1,
+                                 std::vector<TermPlan>* plans, PhraseExecutionState* state) {
+    if (round1->pending() > 0) SNII_RETURN_IF_ERROR(round1->fetch());
+    SNII_RETURN_IF_ERROR(internal::open_preludes(*round1, plans,
+                                                 /*need_positions=*/true));
+
+    // Drop sources before the fetchers that back them.
+    state->srcs.clear();
+    state->owners.clear();
+    state->candidates.clear();
+    std::vector<DocidSource> doc_sources;
+    SNII_RETURN_IF_ERROR(internal::build_docid_only_conjunction(idx, *round1, *plans,
+                                                                &state->candidates, &doc_sources));
+    if (state->candidates.empty()) return Status::OK();
+    SNII_RETURN_IF_ERROR(BuildPositionSourcesForCandidates(
+            idx, *round1, *plans, &doc_sources, state->candidates, &state->owners, &state->srcs));
+    return Status::OK();
+}
+
+// Executes an already planned phrase: builds the candidate / position state,
+// and if any candidate survives, derives the per-slot position offsets and
+// streams the phrase check into `docids`. No candidates is a successful empty
+// result. Returns InvalidArgument when the offsets cannot be built for this
+// phrase length.
+Status ExecutePhrasePlans(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1,
+                          std::vector<TermPlan>* plans,
+                          const std::vector<size_t>& phrase_plan_index,
+                          std::vector<uint32_t>* docids) {
+    PhraseExecutionState state;
+    SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, round1, plans, &state));
+    if (state.candidates.empty()) {
+        return Status::OK();
+    }
+
+    std::vector<uint32_t> position_offsets;
+    const bool offsets_ok =
+            internal::build_position_offsets(phrase_plan_index.size(), &position_offsets);
+    if (!offsets_ok) {
+        return Status::InvalidArgument("phrase_query: phrase length exceeds doc position range");
+    }
+    return EmitPhraseStreaming(*plans, phrase_plan_index, position_offsets, state.srcs,
+                               state.candidates, docids);
+}
+
+// Plans the resolved terms on a fresh round-1 fetcher and executes them as a
+// phrase in which slot i is served by plan i (identity mapping).
+Status ExecuteResolvedPhraseTerms(const LogicalIndexReader& idx,
+                                  const std::vector<ResolvedQueryTerm>& terms,
+                                  std::vector<uint32_t>* docids) {
+    snii::io::BatchRangeFetcher round1(idx.reader());
+    std::vector<TermPlan> plans;
+    SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, terms, &round1, &plans,
+                                                       /*need_positions=*/false));
+    const size_t slot_count = terms.size();
+    std::vector<size_t> phrase_plan_index;
+    phrase_plan_index.reserve(slot_count);
+    for (size_t slot = 0; slot < slot_count; ++slot) {
+        phrase_plan_index.push_back(slot);
+    }
+    return ExecutePhrasePlans(idx, &round1, &plans, phrase_plan_index, docids);
+}
+
+// For each candidate doc, finds every start position at which all `plans`
+// terms line up as a phrase and records where the tail term would then have to
+// be: start + position_offsets[plans.size()]. Positions are appended to
+// `out->positions`; a doc that produced at least one gets an entry
+// {docid, begin, end} in `out->docs` delimiting its slice.
+//
+// Cursors are laid out in phrase order through `TermPlan::order`, so span[k]
+// belongs to the k-th phrase slot. Sums rejected by
+// internal::add_position_offset are skipped.
+Status CollectExpectedTailPositions(const std::vector<TermPlan>& plans,
+                                    const std::vector<uint32_t>& position_offsets,
+                                    std::vector<PosSource>& srcs,
+                                    const std::vector<uint32_t>& candidates,
+                                    ExpectedTailPositionSet* out) {
+    using Span = std::pair<const uint32_t*, const uint32_t*>;
+    const size_t term_count = plans.size();
+    std::vector<PostingCursor> cursors(term_count);
+    std::vector<PostingCursor*> by_phrase_order(term_count);
+    for (size_t i = 0; i < term_count; ++i) {
+        cursors[i].init(&srcs[i]);
+        by_phrase_order[plans[i].order] = &cursors[i];
+    }
+
+    std::vector<Span> spans(term_count);
+    // True when every slot after the first holds start + its offset.
+    const auto later_terms_align = [&](uint32_t start) {
+        for (size_t t = 1; t < term_count; ++t) {
+            uint32_t want = 0;
+            if (!internal::add_position_offset(start, position_offsets[t], &want)) {
+                return false;
+            }
+            if (!std::binary_search(spans[t].first, spans[t].second, want)) {
+                return false;
+            }
+        }
+        return true;
+    };
+
+    for (const uint32_t docid : candidates) {
+        for (PostingCursor& cursor : cursors) {
+            SNII_RETURN_IF_ERROR(cursor.seek(docid));
+        }
+        for (size_t slot = 0; slot < term_count; ++slot) {
+            SNII_RETURN_IF_ERROR(by_phrase_order[slot]->positions(&spans[slot]));
+        }
+
+        const size_t slice_begin = out->positions.size();
+        for (const uint32_t* it = spans[0].first; it != spans[0].second; ++it) {
+            const uint32_t start = *it;
+            uint32_t tail_pos = 0;
+            if (later_terms_align(start) &&
+                internal::add_position_offset(start, position_offsets[term_count], &tail_pos)) {
+                out->positions.push_back(tail_pos);
+            }
+        }
+        const size_t slice_end = out->positions.size();
+        if (slice_end != slice_begin) {
+            out->docs.push_back({docid, slice_begin, slice_end});
+        }
+    }
+    return Status::OK();
+}
+
+// Single exact term variant: every position p of the term in a candidate doc
+// yields an expected tail position p + tail_offset (sums rejected by
+// internal::add_position_offset are skipped). Docs contributing at least one
+// position are recorded in `out->docs` with their slice of `out->positions`.
+// Only the first entry of `srcs` is read.
+Status CollectSingleTermExpectedTailPositions(std::vector<PosSource>& srcs,
+                                              const std::vector<uint32_t>& candidates,
+                                              uint32_t tail_offset, ExpectedTailPositionSet* out) {
+    using Span = std::pair<const uint32_t*, const uint32_t*>;
+    PostingCursor term_cursor;
+    term_cursor.init(srcs.data());
+    out->reserve_docs(out->docs.size() + candidates.size());
+
+    for (const uint32_t docid : candidates) {
+        SNII_RETURN_IF_ERROR(term_cursor.seek(docid));
+        Span term_positions;
+        SNII_RETURN_IF_ERROR(term_cursor.positions(&term_positions));
+
+        const size_t slice_begin = out->positions.size();
+        const uint32_t* it = term_positions.first;
+        while (it != term_positions.second) {
+            uint32_t tail_pos = 0;
+            if (internal::add_position_offset(*it, tail_offset, &tail_pos)) {
+                out->positions.push_back(tail_pos);
+            }
+            ++it;
+        }
+        const size_t slice_end = out->positions.size();
+        if (slice_end != slice_begin) {
+            out->docs.push_back({docid, slice_begin, slice_end});
+        }
+    }
+    return Status::OK();
+}
+
+// Index-level entry point: plans `exact_terms`, builds the candidate /
+// position state and fills `out` (cleared first) with the positions at which a
+// tail term would have to occur to extend the exact phrase. No candidates is a
+// successful empty result. Offsets are built for plans.size() + 1 slots, the
+// extra one being the tail. Returns InvalidArgument when they cannot be built.
+Status CollectExpectedTailPositions(const LogicalIndexReader& idx,
+                                    const std::vector<ResolvedQueryTerm>& exact_terms,
+                                    ExpectedTailPositionSet* out) {
+    out->clear();
+    snii::io::BatchRangeFetcher round1(idx.reader());
+    std::vector<TermPlan> plans;
+    SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, exact_terms, &round1, &plans,
+                                                       /*need_positions=*/false));
+
+    PhraseExecutionState state;
+    SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state));
+    if (state.candidates.empty()) {
+        return Status::OK();
+    }
+    out->reserve_docs(state.candidates.size());
+
+    std::vector<uint32_t> position_offsets;
+    const bool offsets_ok = internal::build_position_offsets(plans.size() + 1, &position_offsets);
+    if (!offsets_ok) {
+        return Status::InvalidArgument(
+                "phrase_prefix_query: phrase length exceeds doc position range");
+    }
+
+    const bool single_term = plans.size() == 1;
+    if (!single_term) {
+        return CollectExpectedTailPositions(plans, position_offsets, state.srcs, state.candidates,
+                                            out);
+    }
+    return CollectSingleTermExpectedTailPositions(state.srcs, state.candidates,
+                                                  position_offsets[1], out);
+}
+
+// True when at least one expected position of `wanted` -- the slice
+// [positions_begin, positions_end) of `expected.positions` -- occurs in the
+// `actual` span. `actual` is binary-searched and must be sorted ascending.
+bool contains_any_position(const ExpectedTailPositionSet& expected,
+                           const ExpectedTailPositions& wanted,
+                           std::pair<const uint32_t*, const uint32_t*> actual) {
+    size_t cursor = wanted.positions_begin;
+    while (cursor < wanted.positions_end) {
+        const bool found =
+                std::binary_search(actual.first, actual.second, expected.positions[cursor]);
+        if (found) {
+            return true;
+        }
+        ++cursor;
+    }
+    return false;
+}
+
+// Intersects the docs that carry expected tail positions with the postings of
+// one concrete `tail` term, and appends to `out` every doc in which the tail
+// actually occurs at one of its expected positions. Both doc lists are walked
+// with a two-pointer merge, so they must be ascending. Nothing is planned or
+// fetched when `expected` has no docs.
+Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx,
+                                             const ResolvedQueryTerm& tail,
+                                             const ExpectedTailPositionSet& expected,
+                                             std::vector<uint32_t>* out) {
+    if (expected.docs.empty()) {
+        return Status::OK();
+    }
+
+    snii::io::BatchRangeFetcher round1(idx.reader());
+    std::vector<TermPlan> plans;
+    SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, {tail}, &round1, &plans,
+                                                       /*need_positions=*/false));
+
+    PhraseExecutionState state;
+    SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state));
+    if (state.candidates.empty()) {
+        return Status::OK();
+    }
+
+    PostingCursor tail_cursor;
+    tail_cursor.init(&state.srcs[0]);
+    const size_t expected_count = expected.docs.size();
+    const size_t tail_count = state.candidates.size();
+    size_t expected_at = 0;
+    size_t tail_at = 0;
+    while (expected_at < expected_count && tail_at < tail_count) {
+        const uint32_t expected_doc = expected.docs[expected_at].docid;
+        const uint32_t tail_doc = state.candidates[tail_at];
+        if (expected_doc != tail_doc) {
+            // Advance whichever side is behind.
+            if (expected_doc < tail_doc) {
+                ++expected_at;
+            } else {
+                ++tail_at;
+            }
+            continue;
+        }
+
+        SNII_RETURN_IF_ERROR(tail_cursor.seek(expected_doc));
+        std::pair<const uint32_t*, const uint32_t*> tail_positions;
+        SNII_RETURN_IF_ERROR(tail_cursor.positions(&tail_positions));
+        if (contains_any_position(expected, expected.docs[expected_at], tail_positions)) {
+            out->push_back(expected_doc);
+        }
+        ++expected_at;
+        ++tail_at;
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Runs a phrase query for `terms` and stores the resulting docids in `docids`.
+//
+// Returns InvalidArgument when `docids` is null and Unsupported when two or
+// more terms are given but the index carries no positions. `docids` is always
+// cleared first; an empty term list leaves it empty and a single term is
+// delegated to term_query.
+Status phrase_query(const LogicalIndexReader& idx, const std::vector<std::string>& terms,
+                    std::vector<uint32_t>* const docids) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("phrase_query: null out");
+    }
+    docids->clear();
+
+    const size_t term_count = terms.size();
+    if (term_count == 0) {
+        return Status::OK();
+    }
+    if (term_count == 1) {
+        return term_query(idx, terms.front(), docids);
+    }
+    if (!idx.has_positions()) {
+        return Status::Unsupported("phrase_query: index has no positions");
+    }
+
+    // Round 1 batches the preludes (windowed) together with the docid postings
+    // (slim/inline). Positions are requested only after the docid-only
+    // conjunction has produced the final candidates, so phrase verification
+    // never reads PRX for windows the docid intersection later discards.
+    snii::io::BatchRangeFetcher round1(idx.reader());
+    const PhraseTermMapping mapping = BuildPhraseTermMapping(terms);
+    std::vector<TermPlan> plans;
+    bool all_present = false;
+    SNII_RETURN_IF_ERROR(internal::plan_terms(idx, mapping.unique_terms, &round1, &plans,
+                                              &all_present,
+                                              /*need_positions=*/false));
+    if (!all_present) {
+        // plan_terms reported that not every term is present: nothing to run.
+        return Status::OK();
+    }
+    return ExecutePhrasePlans(idx, &round1, &plans, mapping.phrase_plan_index, docids);
+}
+
+// Profiled variant of phrase_query: a QueryProfileScope bound to the index's
+// reader and to `profile` stays alive for the whole inner call.
+Status phrase_query(const LogicalIndexReader& idx, const std::vector<std::string>& terms,
+                    std::vector<uint32_t>* const docids, QueryProfile* profile) {
+    QueryProfileScope scope(idx.reader(), profile);
+    return phrase_query(idx, terms, docids);
+}
+
+// Runs a phrase query whose last term is expanded as a prefix and stores the
+// resulting docids in `docids`.
+//
+// Every term except the last is resolved exactly; the last one is expanded via
+// idx.prefix_terms(), which receives `max_expansions`.
+//
+// Returns InvalidArgument when `docids` is null and Unsupported when two or
+// more terms are given but the index carries no positions. `docids` is always
+// cleared first. It stays empty when `terms` is empty, when any exact term is
+// missing, or when the prefix expands to nothing. A single term is delegated
+// to prefix_query.
+Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector<std::string>& terms,
+                           std::vector<uint32_t>* const docids, int32_t max_expansions) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("phrase_prefix_query: null out");
+    }
+    docids->clear();
+    if (terms.empty()) {
+        return Status::OK();
+    }
+    if (terms.size() == 1) {
+        return prefix_query(idx, terms.front(), docids, max_expansions);
+    }
+    if (!idx.has_positions()) {
+        return Status::Unsupported("phrase_prefix_query: index has no positions");
+    }
+
+    // Resolve the exact (non-tail) terms. Capacity covers the tail as well,
+    // because the single-expansion path below appends it in place.
+    std::vector<ResolvedQueryTerm> exact_terms;
+    exact_terms.reserve(terms.size());
+    for (size_t i = 0; i + 1 < terms.size(); ++i) {
+        ResolvedQueryTerm resolved;
+        bool found = false;
+        SNII_RETURN_IF_ERROR(internal::resolve_query_term(idx, terms[i], &resolved, &found));
+        if (!found) {
+            return Status::OK();
+        }
+        exact_terms.push_back(std::move(resolved));
+    }
+
+    std::vector<LogicalIndexReader::PrefixHit> tail_hits;
+    SNII_RETURN_IF_ERROR(idx.prefix_terms(terms.back(), &tail_hits, max_expansions));
+    if (tail_hits.empty()) {
+        return Status::OK();
+    }
+    if (tail_hits.size() == 1) {
+        // One expansion only: this is an ordinary phrase over resolved terms.
+        // Append the tail to exact_terms directly instead of deep-copying the
+        // whole vector; exact_terms is not used again on this path.
+        LogicalIndexReader::PrefixHit& only_hit = tail_hits.front();
+        exact_terms.push_back(ResolvedQueryTerm {std::move(only_hit.entry), only_hit.frq_base,
+                                                 only_hit.prx_base});
+        return ExecuteResolvedPhraseTerms(idx, exact_terms, docids);
+    }
+
+    // Several expansions: evaluate the exact part once, recording where the
+    // tail would have to sit, then test each expansion against that set.
+    ExpectedTailPositionSet expected;
+    SNII_RETURN_IF_ERROR(CollectExpectedTailPositions(idx, exact_terms, &expected));
+    if (expected.docs.empty()) {
+        return Status::OK();
+    }
+
+    std::vector<uint32_t> acc;
+    for (LogicalIndexReader::PrefixHit& hit : tail_hits) {
+        ResolvedQueryTerm tail {std::move(hit.entry), hit.frq_base, hit.prx_base};
+        std::vector<uint32_t> tail_docs;
+        SNII_RETURN_IF_ERROR(
+                CollectTailMatchesAtExpectedPositions(idx, tail, expected, &tail_docs));
+        internal::union_sorted_into(&acc, tail_docs);
+    }
+    *docids = std::move(acc);
+    return Status::OK();
+}
+
+// Profiled variant of phrase_prefix_query: a QueryProfileScope bound to the
+// index's reader and to `profile` stays alive for the whole inner call.
+Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector<std::string>& terms,
+                           std::vector<uint32_t>* const docids, QueryProfile* profile,
+                           int32_t max_expansions) {
+    QueryProfileScope scope(idx.reader(), profile);
+    return phrase_prefix_query(idx, terms, docids, max_expansions);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/prefix_query.cpp b/be/src/storage/index/snii/core/src/query/prefix_query.cpp
new file mode 100644
index 00000000000000..4ad9b6629bdf77
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/prefix_query.cpp
@@ -0,0 +1,47 @@
+#include "snii/query/prefix_query.h"
+
+#include <utility>
+#include <vector>
+
+#include "snii/query/internal/docid_posting_reader.h"
+#include "snii/query/internal/docid_union.h"
+
+namespace snii::query {
+
+using snii::reader::LogicalIndexReader;
+
+// Vector-output prefix query: clears `docids`, wraps it in a VectorDocIdSink
+// and forwards to the sink overload. Returns InvalidArgument when `docids` is
+// null.
+Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix,
+                    std::vector<uint32_t>* const docids, int32_t max_expansions) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("prefix_query: null out");
+    }
+    docids->clear();
+    VectorDocIdSink vector_sink(*docids);
+    return prefix_query(idx, prefix, &vector_sink, max_expansions);
+}
+
+// Profiled variant of the vector-output prefix query: a QueryProfileScope bound
+// to the index's reader and to `profile` stays alive for the whole inner call.
+Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix,
+                    std::vector<uint32_t>* const docids, QueryProfile* profile,
+                    int32_t max_expansions) {
+    QueryProfileScope scope(idx.reader(), profile);
+    return prefix_query(idx, prefix, docids, max_expansions);
+}
+
+// Sink-output prefix query: expands `prefix` through idx.prefix_terms() (which
+// receives `max_expansions`) and emits the union of the expanded terms' docid
+// postings into `sink`. Returns InvalidArgument when `sink` is null.
+Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, DocIdSink* const sink,
+                    int32_t max_expansions) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("prefix_query: null sink");
+    }
+
+    std::vector<LogicalIndexReader::PrefixHit> expansions;
+    SNII_RETURN_IF_ERROR(idx.prefix_terms(prefix, &expansions, max_expansions));
+
+    // Hand each hit over to the docid-union machinery; the dictionary entry is
+    // moved out of the hit because the hit list is not used afterwards.
+    std::vector<internal::ResolvedDocidPosting> postings;
+    postings.reserve(expansions.size());
+    for (LogicalIndexReader::PrefixHit& expansion : expansions) {
+        postings.push_back(
+                {std::move(expansion.entry), expansion.frq_base, expansion.prx_base});
+    }
+    return internal::emit_docid_union(idx, postings, sink);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/query_profile.cpp b/be/src/storage/index/snii/core/src/query/query_profile.cpp
new file mode 100644
index 00000000000000..9ecd333cb231ed
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/query_profile.cpp
@@ -0,0 +1,46 @@
+#include "snii/query/query_profile.h"
+
+#include <algorithm>
+#include <chrono>
+
+#include "snii/io/file_reader.h"
+
+namespace snii::query {
+
+// Records the start time and, when `profile` is non-null, resets it to a
+// default-constructed QueryProfile. If `reader` is non-null and exposes
+// IoMetrics, has_io_metrics is set and the current metrics are stored in
+// io_before.
+QueryProfileScope::QueryProfileScope(snii::io::FileReader* reader, QueryProfile* profile)
+        : reader_(reader), profile_(profile), start_(std::chrono::steady_clock::now()) {
+    if (profile_ == nullptr) {
+        return;
+    }
+    *profile_ = QueryProfile {};
+
+    const snii::io::IoMetrics* baseline = nullptr;
+    if (reader_ != nullptr) {
+        baseline = reader_->io_metrics();
+    }
+    if (baseline != nullptr) {
+        profile_->has_io_metrics = true;
+        profile_->io_before = *baseline;
+    }
+}
+
+// Finalizes the profile on scope exit; finish() ignores repeated calls.
+QueryProfileScope::~QueryProfileScope() {
+    this->finish();
+}
+
+// Completes the profile once: stores the elapsed time (clamped to at least
+// 1 ns) and, if an I/O baseline was captured, the closing metrics and their
+// delta. Does nothing without a profile or after the first call.
+void QueryProfileScope::finish() {
+    if (finished_ || profile_ == nullptr) {
+        return;
+    }
+    finished_ = true;
+
+    const auto stop = std::chrono::steady_clock::now();
+    const auto elapsed_ns =
+            std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start_).count();
+    profile_->elapsed_ns = std::max<uint64_t>(1, static_cast<uint64_t>(elapsed_ns));
+
+    if (!profile_->has_io_metrics || reader_ == nullptr) {
+        return;
+    }
+    const snii::io::IoMetrics* current = reader_->io_metrics();
+    if (current != nullptr) {
+        profile_->io_after = *current;
+        profile_->io_delta = snii::io::delta(profile_->io_after, profile_->io_before);
+    } else {
+        // The reader no longer exposes metrics: withdraw the I/O part.
+        profile_->has_io_metrics = false;
+    }
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/regexp_query.cpp b/be/src/storage/index/snii/core/src/query/regexp_query.cpp
new file mode 100644
index 00000000000000..13377732b17201
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/regexp_query.cpp
@@ -0,0 +1,91 @@
+#include "snii/query/regexp_query.h"
+
+#include <regex>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/query/internal/term_expansion.h"
+
+namespace snii::query {
+
+namespace {
+
+// Reports whether `c` is treated as a regex metacharacter, i.e. whether it
+// ends a run of literal characters when scanning a pattern.
+bool is_regex_metachar(char c) {
+    switch (c) {
+    case '.':
+    case '^':
+    case '$':
+    case '|':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '*':
+    case '+':
+    case '?':
+    case '{':
+    case '}':
+    case '\\':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Conservatively reports whether `pattern` may contain a '|' that is not
+// enclosed in a group or a bracket expression. A top-level alternation means
+// the pattern's leading literals only constrain the first alternative, so no
+// common literal prefix can be assumed. Returning true too often is safe (it
+// only disables the prefix optimisation); returning false wrongly is not.
+bool may_have_top_level_alternation(std::string_view pattern) {
+    size_t group_depth = 0;
+    bool in_bracket = false;
+    for (size_t i = 0; i < pattern.size(); ++i) {
+        const char c = pattern[i];
+        if (c == '\\') {
+            ++i; // the escaped character is never structural
+            continue;
+        }
+        if (in_bracket) {
+            if (c == '[') {
+                // Nested bracket syntax such as [[:alpha:]]: the end of the
+                // expression is not tracked here, so give up conservatively.
+                return true;
+            }
+            if (c == ']') in_bracket = false;
+            continue;
+        }
+        if (c == '[') {
+            in_bracket = true;
+        } else if (c == '(') {
+            ++group_depth;
+        } else if (c == ')') {
+            if (group_depth > 0) --group_depth;
+        } else if (c == '|' && group_depth == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Returns a literal string that every full match of `pattern` must start
+// with; it may be empty.
+//
+// A leading '^' is skipped, then characters are collected up to the first
+// metacharacter. Two cases shrink the result so that no matching term is lost:
+//  - a top-level alternation ("foo|bar") yields an empty prefix, because the
+//    leading literals belong to the first alternative only;
+//  - when the run stops at '*', '?' or '{', the last collected character is
+//    dropped, because the quantifier may allow it to occur zero times
+//    ("ab*" also matches "a").
+std::string literal_prefix_for_regex(std::string_view pattern) {
+    if (may_have_top_level_alternation(pattern)) {
+        return {};
+    }
+
+    std::string out;
+    size_t i = 0;
+    if (!pattern.empty() && pattern.front() == '^') {
+        i = 1;
+    }
+    for (; i < pattern.size(); ++i) {
+        const char c = pattern[i];
+        if (is_regex_metachar(c)) {
+            break;
+        }
+        out.push_back(c);
+    }
+
+    if (i < pattern.size() && !out.empty()) {
+        const char stop = pattern[i];
+        if (stop == '*' || stop == '?' || stop == '{') {
+            out.pop_back();
+        }
+    }
+    return out;
+}
+
+} // namespace
+
+// Vector-output regexp query: clears `docids`, wraps it in a VectorDocIdSink
+// and forwards to the sink overload. Returns InvalidArgument when `docids` is
+// null.
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    std::vector<uint32_t>* const docids, int32_t max_expansions) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("regexp_query: null out");
+    }
+    docids->clear();
+    VectorDocIdSink vector_sink(*docids);
+    return regexp_query(idx, pattern, &vector_sink, max_expansions);
+}
+
+// Profiled variant of the vector-output regexp query: a QueryProfileScope bound
+// to the index's reader and to `profile` stays alive for the whole inner call.
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    std::vector<uint32_t>* const docids, QueryProfile* profile,
+                    int32_t max_expansions) {
+    QueryProfileScope scope(idx.reader(), profile);
+    return regexp_query(idx, pattern, docids, max_expansions);
+}
+
+// Sink-output regexp query. Compiles `pattern` with std::regex (default
+// grammar) and passes emit_expanded_docid_union the pattern's literal prefix
+// plus a predicate that accepts a term only when the whole term matches.
+//
+// Returns InvalidArgument when `sink` is null or the pattern does not compile.
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    DocIdSink* const sink, int32_t max_expansions) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("regexp_query: null sink");
+    }
+
+    std::regex compiled;
+    try {
+        compiled = std::regex(std::string(pattern));
+    } catch (const std::regex_error& err) {
+        return Status::InvalidArgument(std::string("regexp_query: invalid regex: ") + err.what());
+    }
+
+    const std::string enum_prefix = literal_prefix_for_regex(pattern);
+    return internal::emit_expanded_docid_union(
+            idx, enum_prefix,
+            [&compiled](std::string_view term) {
+                // regex_match (not regex_search): the entire term must match.
+                return std::regex_match(term.begin(), term.end(), compiled);
+            },
+            sink, max_expansions);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/scoring_query.cpp b/be/src/storage/index/snii/core/src/query/scoring_query.cpp
new file mode 100644
index 00000000000000..4813b3560ca7d7
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/scoring_query.cpp
@@ -0,0 +1,684 @@
+#include "snii/query/scoring_query.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/frq_pod.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/io/batch_range_fetcher.h"
+#include "snii/reader/windowed_posting.h"
+
+namespace snii::query {
+
+using snii::format::DictEntry;
+using snii::format::DictEntryEnc;
+using snii::format::DictEntryKind;
+using snii::format::FrqPreludeReader;
+using snii::format::WindowMeta;
+using snii::reader::LogicalIndexReader;
+
+namespace {
+
+// One scored posting: a single query term's score in a single document.
+struct TermPosting {
+    uint32_t docid = 0; // document this posting belongs to
+    double score = 0.0; // the term's score for that document (see ScoreDecoded)
+};
+
+// Upper bound on a term's score inside one window, plus the docid range that
+// window covers. block_max records whether max_score came from the frq_prelude
+// columns (true) or from the exact-score fallback (false).
+struct WindowBound {
+    uint32_t first_docid = 0; // inclusive
+    uint32_t last_docid = 0;  // inclusive
+    double max_score = 0.0;   // block-max upper bound for any doc in this window
+    bool block_max = false; // informational only: either source is a valid upper bound
+};
+
+// Eagerly built state of one query term: all scored postings plus window bounds.
+struct TermCursor {
+    std::vector<TermPosting> postings; // ascending docid, exact per-doc scores
+    std::vector<WindowBound> windows;  // ascending, covering all postings
+    size_t pos = 0;                    // DAAT cursor: index of the current entry in postings
+};
+
+// Docid under the cursor, or UINT32_MAX once every posting has been consumed.
+uint32_t CurrentDoc(const TermCursor& c) {
+    if (c.pos >= c.postings.size()) {
+        return std::numeric_limits<uint32_t>::max();
+    }
+    return c.postings[c.pos].docid;
+}
+
+// Exposes through `window` the bytes of the single .frq window of a slim
+// pod_ref or inline entry (prelude stripped). Windowed entries take a separate
+// path via the prelude decode.
+//
+// Inline entries are served straight from entry.frq_bytes with no I/O. For
+// everything else the window range is resolved through the index, fetched, and
+// copied into `window_owned`, which then backs `window`.
+Status FetchSlimWindowBytes(const LogicalIndexReader& idx, const DictEntry& entry,
+                            uint64_t frq_base, std::vector<uint8_t>* window_owned, Slice* window) {
+    if (entry.kind != DictEntryKind::kInline) {
+        uint64_t window_offset = 0;
+        uint64_t window_length = 0;
+        SNII_RETURN_IF_ERROR(
+                idx.resolve_frq_window(entry, frq_base, &window_offset, &window_length));
+
+        snii::io::BatchRangeFetcher fetcher(idx.reader());
+        const size_t handle = fetcher.add(window_offset, window_length);
+        SNII_RETURN_IF_ERROR(fetcher.fetch());
+
+        // Copy the bytes into caller-provided storage before the local fetcher
+        // goes out of scope.
+        Slice fetched = fetcher.get(handle);
+        window_owned->assign(fetched.data(), fetched.data() + fetched.size());
+        *window = Slice(*window_owned);
+        return Status::OK();
+    }
+
+    *window = Slice(entry.frq_bytes);
+    return Status::OK();
+}
+
+// Fetches a windowed entry's frq_prelude (where the block-max columns live)
+// and opens it into `out`. The prelude starts at posting_region.offset +
+// frq_base + entry.frq_off_delta and is entry.prelude_len bytes long.
+Status FetchPrelude(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                    FrqPreludeReader* out) {
+    const auto& posting_region = idx.section_refs().posting_region;
+    const uint64_t prelude_offset = posting_region.offset + frq_base + entry.frq_off_delta;
+
+    snii::io::BatchRangeFetcher fetcher(idx.reader());
+    const size_t handle = fetcher.add(prelude_offset, entry.prelude_len);
+    SNII_RETURN_IF_ERROR(fetcher.fetch());
+    return FrqPreludeReader::open(fetcher.get(handle), out);
+}
+
+// Appends one block-max bound per non-empty prelude window to `windows`.
+//
+// Each WindowMeta carries the window's max_freq / max_norm and its covered
+// docid range (win_base+1 .. last_docid; the first window starts at win_base
+// itself), so the bounds come straight from the directory. Windows whose
+// doc_count is zero are skipped.
+Status BuildWindowBounds(const FrqPreludeReader& prelude, const ScorerContext& ctx, double avgdl,
+                         const Bm25Params& params, std::vector<WindowBound>* windows) {
+    const uint32_t window_count = prelude.n_windows();
+    for (uint32_t wi = 0; wi < window_count; ++wi) {
+        WindowMeta meta;
+        SNII_RETURN_IF_ERROR(prelude.window(wi, &meta));
+        if (meta.doc_count != 0) {
+            const uint32_t first_offset = (wi == 0) ? 0u : 1u;
+            WindowBound bound;
+            bound.first_docid = static_cast<uint32_t>(meta.win_base) + first_offset;
+            bound.last_docid = meta.last_docid;
+            bound.max_score = ctx.max_score(meta.max_freq, meta.max_norm, avgdl, params);
+            bound.block_max = true;
+            windows->push_back(bound);
+        }
+    }
+    return Status::OK();
+}
+
+// Appends one fallback window spanning all of `postings`, bounded by the
+// largest exact score among them (always a valid upper bound, so pruning stays
+// correct). Appends nothing when `postings` is empty.
+void SingleWindowFallback(const std::vector<TermPosting>& postings,
+                          std::vector<WindowBound>* windows) {
+    if (postings.empty()) {
+        return;
+    }
+    WindowBound bound;
+    bound.first_docid = postings.front().docid;
+    bound.last_docid = postings.back().docid;
+    bound.block_max = false;
+    for (const TermPosting& posting : postings) {
+        if (bound.max_score < posting.score) {
+            bound.max_score = posting.score;
+        }
+    }
+    windows->push_back(bound);
+}
+
+// Appends to `out` one exactly-scored posting per entry of `docids`, in order.
+//
+// The encoded norm of each doc is read from `stats`. The term frequency is
+// taken from `freqs` at the same index and defaults to 1 when `freqs` is
+// shorter than `docids`. Stops at the first norm lookup that fails and returns
+// its status.
+Status ScoreDecoded(const snii::stats::SniiStatsProvider& stats, const ScorerContext& ctx,
+                    const Bm25Params& params, const std::vector<uint32_t>& docids,
+                    const std::vector<uint32_t>& freqs, std::vector<TermPosting>* out) {
+    const double avgdl = stats.avgdl();
+    const size_t doc_count = docids.size();
+    out->reserve(doc_count);
+    for (size_t i = 0; i < doc_count; ++i) {
+        const uint32_t docid = docids[i];
+        uint8_t norm = 0;
+        SNII_RETURN_IF_ERROR(stats.encoded_norm(docid, &norm));
+        uint32_t tf = 1;
+        if (i < freqs.size()) {
+            tf = freqs[i];
+        }
+        out->push_back({docid, ctx.score(tf, norm, avgdl, params)});
+    }
+    return Status::OK();
+}
+
+// Decodes the single .frq window of a slim/inline term into `docids` and
+// `freqs`. The window is laid out as [dd_region][freq_region]; the split point
+// is entry.dd_meta.disk_len. Returns Corruption when that length exceeds the
+// window.
+Status DecodeSlim(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                  std::vector<uint32_t>* docids, std::vector<uint32_t>* freqs) {
+    std::vector<uint8_t> backing;
+    Slice window;
+    SNII_RETURN_IF_ERROR(FetchSlimWindowBytes(idx, entry, frq_base, &backing, &window));
+
+    const uint64_t dd_len = entry.dd_meta.disk_len;
+    if (window.size() < dd_len) {
+        return Status::Corruption("scoring_query: slim dd region exceeds window");
+    }
+    const size_t dd_bytes = static_cast<size_t>(dd_len);
+
+    Slice dd_region = window.subslice(0, dd_bytes);
+    SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd_region, entry.dd_meta,
+                                                        /*win_base=*/0, docids));
+
+    // Whatever follows the dd region is the freq region.
+    Slice freq_region = window.subslice(dd_bytes, window.size() - dd_bytes);
+    return snii::format::decode_freq_region(freq_region, entry.freq_meta, docids->size(), freqs);
+}
+
+// Fills `cursor` for a windowed term: reads all windows to compute exact
+// scores, then reads the prelude once for the per-window block-max bounds.
+//
+// A failed prelude fetch is not reported: the cursor is simply left without
+// window bounds. Errors from reading the postings, scoring, or building the
+// bounds are returned.
+Status BuildWindowedCursor(const LogicalIndexReader& idx,
+                           const snii::stats::SniiStatsProvider& stats, const ScorerContext& ctx,
+                           const DictEntry& entry, uint64_t frq_base, uint64_t prx_base,
+                           const Bm25Params& params, TermCursor* cursor) {
+    // BM25 needs term frequencies, so the full windows are requested
+    // (want_freq=true); positions are not needed.
+    snii::reader::DecodedPosting decoded;
+    SNII_RETURN_IF_ERROR(snii::reader::read_windowed_posting(idx, entry, frq_base, prx_base,
+                                                             /*want_positions=*/false,
+                                                             /*want_freq=*/true, &decoded));
+    SNII_RETURN_IF_ERROR(
+            ScoreDecoded(stats, ctx, params, decoded.docids, decoded.freqs, &cursor->postings));
+
+    FrqPreludeReader prelude;
+    if (!FetchPrelude(idx, entry, frq_base, &prelude).ok()) {
+        return Status::OK();
+    }
+    SNII_RETURN_IF_ERROR(BuildWindowBounds(prelude, ctx, stats.avgdl(), params, &cursor->windows));
+    return Status::OK();
+}
+
+// Fills `cursor` for one term: postings with exact scores plus window bounds.
+//
+// `*found` receives the dictionary lookup result; when the term is absent the
+// cursor is left untouched and OK is returned. If no window bounds were
+// produced, a single fallback window covering all postings is added.
+Status BuildCursor(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats,
+                   const std::string& term, const Bm25Params& params, bool* found,
+                   TermCursor* cursor) {
+    DictEntry entry;
+    uint64_t frq_base = 0;
+    uint64_t prx_base = 0;
+    SNII_RETURN_IF_ERROR(idx.lookup(term, found, &entry, &frq_base, &prx_base));
+    if (!*found) {
+        return Status::OK();
+    }
+
+    const ScorerContext ctx = ScorerContext::make(stats.indexed_doc_count(), entry.df);
+    const bool is_windowed =
+            entry.kind == DictEntryKind::kPodRef && entry.enc == DictEntryEnc::kWindowed;
+
+    if (!is_windowed) {
+        std::vector<uint32_t> docids;
+        std::vector<uint32_t> freqs;
+        SNII_RETURN_IF_ERROR(DecodeSlim(idx, entry, frq_base, &docids, &freqs));
+        SNII_RETURN_IF_ERROR(ScoreDecoded(stats, ctx, params, docids, freqs, &cursor->postings));
+    } else {
+        SNII_RETURN_IF_ERROR(
+                BuildWindowedCursor(idx, stats, ctx, entry, frq_base, prx_base, params, cursor));
+    }
+
+    if (cursor->windows.empty()) {
+        SingleWindowFallback(cursor->postings, &cursor->windows);
+    }
+    return Status::OK();
+}
+
+// Block-max upper bound of a term at `docid`: the max_score of the first window
+// whose last_docid is >= docid. Windows are ascending and contiguous, so that
+// window covers docid; because its block-max bounds every doc it contains, it
+// also bounds any gap before the window. Past the last window the bound is 0
+// (the term cannot contribute).
+double TermBoundAt(const TermCursor& c, uint32_t docid) {
+    const auto covering =
+            std::find_if(c.windows.begin(), c.windows.end(),
+                         [docid](const WindowBound& w) { return docid <= w.last_docid; });
+    if (covering == c.windows.end()) {
+        return 0.0;
+    }
+    return covering->max_score;
+}
+
+// Bounded top-K collector backed by a min-heap keyed on score (the weakest
+// retained entry sits at the top). Among equal scores the larger docid is
+// considered weaker, so the smaller docid survives.
+struct TopK {
+    explicit TopK(uint32_t k) : k_(k) {}
+
+    // Offers (docid, score). While fewer than k entries are held it is always
+    // kept; afterwards it replaces the weakest entry only if it is strictly
+    // better (higher score, or equal score and smaller docid). With k == 0
+    // nothing is ever retained.
+    void offer(uint32_t docid, double score) {
+        if (k_ == 0) return;
+        if (heap_.size() < k_) {
+            heap_.push({score, docid});
+            return;
+        }
+        const Entry& worst = heap_.top(); // lowest score; ties: largest docid
+        const bool better = score > worst.first || (score == worst.first && docid < worst.second);
+        if (better) {
+            heap_.pop();
+            heap_.push({score, docid});
+        }
+    }
+
+    // Score a candidate has to beat to enter the top-K: -1.0 while the heap is
+    // not yet full, otherwise the weakest retained score. For k == 0 the heap
+    // is permanently empty, so +infinity is returned (nothing can enter)
+    // instead of reading top() of an empty heap.
+    double threshold() const {
+        if (k_ == 0) return std::numeric_limits<double>::infinity();
+        return heap_.size() < k_ ? -1.0 : heap_.top().first;
+    }
+
+    using Entry = std::pair<double, uint32_t>; // (score, docid)
+    struct Cmp {
+        bool operator()(const Entry& a, const Entry& b) const {
+            if (a.first != b.first) return a.first > b.first; // min-score at top
+            return a.second < b.second; // for ties, largest docid at top (evictable)
+        }
+    };
+    uint32_t k_;
+    std::priority_queue<Entry, std::vector<Entry>, Cmp> heap_;
+};
+
+// Empties `topk` into `out`, ordered by descending score with ascending docid
+// as the tie-break. Any previous content of `out` is replaced.
+void DrainSorted(TopK* topk, std::vector<ScoredDoc>* out) {
+    std::vector<ScoredDoc> drained;
+    for (; !topk->heap_.empty(); topk->heap_.pop()) {
+        const TopK::Entry& weakest = topk->heap_.top();
+        drained.push_back({weakest.second, weakest.first});
+    }
+    std::sort(drained.begin(), drained.end(), [](const ScoredDoc& lhs, const ScoredDoc& rhs) {
+        return lhs.score != rhs.score ? lhs.score > rhs.score : lhs.docid < rhs.docid;
+    });
+    *out = std::move(drained);
+}
+
+// Builds one TermCursor per query term and appends it to `cursors`. Terms that
+// are not found, or that yield no postings, are skipped. The first failing
+// cursor build aborts the loop and its status is returned.
+Status BuildCursors(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats,
+                    const std::vector<std::string>& terms, const Bm25Params& params,
+                    std::vector<TermCursor>* cursors) {
+    for (const std::string& term : terms) {
+        TermCursor cursor;
+        bool found = false;
+        SNII_RETURN_IF_ERROR(BuildCursor(idx, stats, term, params, &found, &cursor));
+        if (!found || cursor.postings.empty()) {
+            continue;
+        }
+        cursors->push_back(std::move(cursor));
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Exhaustive top-k scoring: sums every term's exact per-doc score over all of
+// its postings and returns the k best documents in `out`, ordered by
+// descending score with ascending docid as the tie-break.
+//
+// Returns InvalidArgument when `out` is null. `out` is cleared first and stays
+// empty when k == 0.
+Status scoring_query_exhaustive(const LogicalIndexReader& idx,
+                                const snii::stats::SniiStatsProvider& stats,
+                                const std::vector<std::string>& terms, uint32_t k,
+                                const Bm25Params& params, std::vector<ScoredDoc>* out) {
+    if (out == nullptr) return Status::InvalidArgument("scoring_query: null out");
+    out->clear();
+    if (k == 0) return Status::OK();
+
+    std::vector<TermCursor> cursors;
+    SNII_RETURN_IF_ERROR(BuildCursors(idx, stats, terms, params, &cursors));
+
+    // Accumulate each document's total score across all terms.
+    std::unordered_map<uint32_t, double> scores;
+    for (const auto& c : cursors) {
+        for (const auto& p : c.postings) scores[p.docid] += p.score;
+    }
+
+    std::vector<ScoredDoc> all;
+    all.reserve(scores.size());
+    for (const auto& [docid, score] : scores) all.push_back({docid, score});
+
+    const auto best_first = [](const ScoredDoc& a, const ScoredDoc& b) {
+        if (a.score != b.score) return a.score > b.score;
+        return a.docid < b.docid;
+    };
+    if (all.size() > k) {
+        // Only the first k entries are returned, so order just those rather
+        // than sorting every candidate. Docids are unique map keys, so the
+        // ordering is total and the result equals a full sort plus truncation.
+        std::partial_sort(all.begin(), all.begin() + k, all.end(), best_first);
+        all.resize(k);
+    } else {
+        std::sort(all.begin(), all.end(), best_first);
+    }
+    *out = std::move(all);
+    return Status::OK();
+}
+
+namespace {
+
+// --- Phase C: selective-fetch (lazy window) WAND -----------------------------
+//
+// A LazyTermCursor knows its per-window block-max bounds + docid ranges from the
+// frq_prelude WITHOUT fetching any .frq window. Each window's exact (docid,score)
+// postings are decoded on first access and cached, so a window is fetched at most
+// once and ONLY when the WAND control flow touches a posting in it. Combined with
+// window-level SkipTo (advance past whole windows whose last_docid < target via
+// the prelude, never fetching them), the offer sequence is byte-identical to the
+// eager scoring_query_wand path -- only the bytes read differ.
+//
+// Soundness: a window is fetched only when LazyCurrentDoc/LazySkipTo land the
+// cursor inside it, i.e. it covers a candidate the WAND pivot already proved can
+// reach the running theta (bound >= theta). LazySkipTo jumps the cursor to the
+// SAME posting (first docid >= target) the eager per-doc walk would, so pivots,
+// alignments and offers are identical to the eager path; only windows the eager
+// path read-through-but-never-offered-from are skipped. Windows whose block-max
+// bound never reaches theta are never the pivot, so never fetched.
+
+// Scoring state of one query term whose windows are fetched and decoded lazily.
+struct LazyTermCursor {
+    const LogicalIndexReader* idx = nullptr; // index queried; set by BuildLazyCursor
+    const snii::stats::SniiStatsProvider* stats = nullptr; // source of avgdl and per-doc norms
+    ScorerContext ctx = ScorerContext::make(1, 1); // placeholder until BuildLazyCursor sets it
+    Bm25Params params; // scoring parameters passed to the scorer
+    DictEntry entry; // dictionary entry returned by the term lookup
+    uint64_t frq_base = 0; // frq base offset returned by the term lookup
+    uint64_t prx_base = 0; // prx base offset returned by the term lookup
+    FrqPreludeReader prelude; // opened by BuildLazyWindowed; not set for slim/inline terms
+    bool windowed = false; // false => slim/inline single block already materialized
+
+    std::vector<WindowBound> windows;  // ascending; from prelude (or slim fallback)
+    std::vector<TermPosting> postings; // sparse: only fetched windows are filled
+    std::vector<uint32_t> win_start;   // prefix offsets, size = windows.size()+1
+    std::vector<char> fetched;         // size = windows.size(); nonzero once a window is decoded
+    size_t pos = 0;                    // virtual cursor over all windows' postings
+};
+
+// Length of the virtual posting stream: the last prefix offset in win_start,
+// or 0 when no offsets have been built.
+uint32_t TotalPostings(const LazyTermCursor& c) {
+    if (c.win_start.empty()) {
+        return 0;
+    }
+    return c.win_start.back();
+}
+
+// Index of the window whose virtual range contains posting index `p`
+// (requires p < total): one less than the position of the first prefix offset
+// that is strictly greater than p.
+uint32_t WindowOf(const LazyTermCursor& c, uint32_t p) {
+    const auto first_past = std::upper_bound(c.win_start.begin(), c.win_start.end(), p);
+    const auto slot = (first_past - c.win_start.begin()) - 1;
+    return static_cast<uint32_t>(slot);
+}
+
+// Fetches and decodes window `w` into the cursor's posting cache (idempotent).
+// Only reached when the WAND proves window w can still contribute to the
+// top-K.
+//
+// `w` indexes the cursor's bound list (windows / win_start / fetched), which
+// holds only the NON-EMPTY prelude windows, whereas the prelude and the range
+// resolver are addressed by raw prelude window index. The two agree only while
+// the prelude has no empty window, so the raw index is resolved explicitly.
+//
+// Returns Corruption when no prelude window corresponds to `w` or when the
+// decoded doc count differs from the count recorded in win_start.
+Status MaterializeWindow(LazyTermCursor* c, uint32_t w) {
+    if (c->fetched[w]) return Status::OK();
+
+    const uint32_t prelude_windows = c->prelude.n_windows();
+    uint32_t prelude_w = w;
+    WindowMeta meta;
+    if (c->windows.size() == prelude_windows) {
+        // No empty windows were dropped: bound index == prelude index.
+        SNII_RETURN_IF_ERROR(c->prelude.window(prelude_w, &meta));
+    } else {
+        // Locate the w-th non-empty prelude window.
+        bool located = false;
+        uint32_t non_empty_seen = 0;
+        for (prelude_w = 0; prelude_w < prelude_windows; ++prelude_w) {
+            SNII_RETURN_IF_ERROR(c->prelude.window(prelude_w, &meta));
+            if (meta.doc_count == 0) continue;
+            if (non_empty_seen == w) {
+                located = true;
+                break;
+            }
+            ++non_empty_seen;
+        }
+        if (!located) {
+            return Status::Corruption("scoring_query: selective window index out of range");
+        }
+    }
+
+    snii::reader::WindowAbsRange r;
+    SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range(
+            *c->idx, c->entry, c->frq_base, c->prx_base, c->prelude, prelude_w,
+            /*want_positions=*/false, /*want_freq=*/true, &r));
+    // Scoring needs docids + freqs: fetch the window's dd sub-range AND freq sub-range.
+    snii::io::BatchRangeFetcher fetcher(c->idx->reader(), snii::reader::kSameTermCoalesceGap);
+    const size_t dh = fetcher.add(r.dd_off, r.dd_len);
+    const size_t fh = fetcher.add(r.freq_off, r.freq_len);
+    SNII_RETURN_IF_ERROR(fetcher.fetch());
+
+    std::vector<uint32_t> docids;
+    std::vector<uint32_t> freqs;
+    std::vector<std::vector<uint32_t>> pos; // required by the decoder; positions are not requested
+    SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices(
+            meta, fetcher.get(dh), fetcher.get(fh), Slice(), /*want_positions=*/false,
+            /*want_freq=*/true, &docids, &freqs, &pos));
+    // The cache slot was sized from the prelude's doc_count; a mismatch would
+    // make the copy below write outside this window's slot.
+    if (docids.size() != c->win_start[w + 1] - c->win_start[w]) {
+        return Status::Corruption("scoring_query: selective window doc-count drift");
+    }
+
+    std::vector<TermPosting> scored;
+    SNII_RETURN_IF_ERROR(ScoreDecoded(*c->stats, c->ctx, c->params, docids, freqs, &scored));
+    std::copy(scored.begin(), scored.end(), c->postings.begin() + c->win_start[w]);
+    c->fetched[w] = 1;
+    return Status::OK();
+}
+
+// Writes the docid under the cursor to `*docid`, materializing the covering
+// window first if necessary. An exhausted cursor yields UINT32_MAX.
+Status LazyCurrentDoc(LazyTermCursor* c, uint32_t* docid) {
+    if (c->pos < TotalPostings(*c)) {
+        const uint32_t covering = WindowOf(*c, static_cast<uint32_t>(c->pos));
+        SNII_RETURN_IF_ERROR(MaterializeWindow(c, covering));
+        *docid = c->postings[c->pos].docid;
+    } else {
+        *docid = std::numeric_limits<uint32_t>::max();
+    }
+    return Status::OK();
+}
+
+// Moves pos to the first posting with docid >= target.
+//
+// Whole windows whose last_docid < target are stepped over using the bounds
+// alone (no fetch); only the landing window is materialized, and pos then
+// advances posting by posting. If every remaining window is skipped the cursor
+// ends up exhausted and nothing is fetched.
+Status LazySkipTo(LazyTermCursor* c, uint32_t target) {
+    const uint32_t total = TotalPostings(*c);
+    for (;;) {
+        if (c->pos >= total) {
+            return Status::OK(); // ran off the end: cursor exhausted
+        }
+        const uint32_t w = WindowOf(*c, static_cast<uint32_t>(c->pos));
+        if (c->windows[w].last_docid < target) {
+            c->pos = c->win_start[w + 1]; // skip this window entirely (no fetch)
+            continue;
+        }
+        // Landing window: it is the only one that has to be decoded.
+        SNII_RETURN_IF_ERROR(MaterializeWindow(c, w));
+        break;
+    }
+    while (c->pos < total && c->postings[c->pos].docid < target) {
+        ++c->pos;
+    }
+    return Status::OK();
+}
+
+// Prepares a lazy windowed cursor from the prelude alone: per-window block-max
+// bounds, prefix offsets and cache slots, with NO .frq window fetched.
+Status BuildLazyWindowed(LazyTermCursor* c) {
+    SNII_RETURN_IF_ERROR(
+            snii::reader::fetch_windowed_prelude(*c->idx, c->entry, c->frq_base, &c->prelude));
+    SNII_RETURN_IF_ERROR(
+            BuildWindowBounds(c->prelude, c->ctx, c->stats->avgdl(), c->params, &c->windows));
+
+    // BuildWindowBounds keeps only the non-empty windows, in window order. The
+    // prefix sum of doc_counts is built over the same non-empty windows so that
+    // the bound list, win_start and fetched stay 1:1.
+    const uint32_t bound_count = static_cast<uint32_t>(c->windows.size());
+    c->win_start.assign(bound_count + 1, 0);
+    c->fetched.assign(bound_count, 0);
+
+    uint32_t filled = 0;
+    uint32_t running_total = 0;
+    for (uint32_t w = 0; w < c->prelude.n_windows() && filled < bound_count; ++w) {
+        WindowMeta meta;
+        SNII_RETURN_IF_ERROR(c->prelude.window(w, &meta));
+        if (meta.doc_count != 0) {
+            running_total += meta.doc_count;
+            ++filled;
+            c->win_start[filled] = running_total;
+        }
+    }
+
+    // One default-initialized slot per posting; MaterializeWindow overwrites
+    // the slots of a window when it is fetched.
+    c->postings.assign(running_total, TermPosting {});
+    return Status::OK();
+}
+
+// Slim/inline term: its only window is small, so it is decoded and scored right
+// away (no lazy fetch) and described by a single bound spanning all postings.
+Status BuildLazySlim(LazyTermCursor* c) {
+    std::vector<uint32_t> docids, freqs;
+    SNII_RETURN_IF_ERROR(DecodeSlim(*c->idx, c->entry, c->frq_base, &docids, &freqs));
+    SNII_RETURN_IF_ERROR(ScoreDecoded(*c->stats, c->ctx, c->params, docids, freqs, &c->postings));
+    SingleWindowFallback(c->postings, &c->windows);
+    const auto n_postings = static_cast<uint32_t>(c->postings.size());
+    c->win_start = {0, n_postings}; // one window covering [0, n_postings)
+    c->fetched.assign(1, 1);        // flagged as already materialized
+    return Status::OK();
+}
+
+// Builds a LazyTermCursor for one term and reports via *found whether the term
+// exists: prelude-only for windowed terms, eagerly decoded for slim/inline ones.
+Status BuildLazyCursor(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats,
+                       const std::string& term, const Bm25Params& params, bool* found,
+                       LazyTermCursor* c) {
+    uint64_t prx_base = 0;
+    SNII_RETURN_IF_ERROR(idx.lookup(term, found, &c->entry, &c->frq_base, &prx_base));
+    if (!*found) return Status::OK(); // term not in the dictionary: nothing to build
+    c->idx = &idx;     // stored by address: idx must outlive the cursor
+    c->stats = &stats; // stored by address: stats must outlive the cursor
+    c->params = params;
+    c->prx_base = prx_base;
+    c->ctx = ScorerContext::make(stats.indexed_doc_count(), c->entry.df);
+    c->windowed =
+            c->entry.kind == DictEntryKind::kPodRef && c->entry.enc == DictEntryEnc::kWindowed;
+    return c->windowed ? BuildLazyWindowed(c) : BuildLazySlim(c);
+}
+
+Status SelectiveBuildCursors(const LogicalIndexReader& idx,
+                             const snii::stats::SniiStatsProvider& stats,
+                             const std::vector<std::string>& terms, const Bm25Params& params,
+                             std::vector<LazyTermCursor>* cursors) {
+    for (const std::string& t : terms) { // terms without postings yield no cursor
+        LazyTermCursor cursor;
+        bool present = false;
+        SNII_RETURN_IF_ERROR(BuildLazyCursor(idx, stats, t, params, &present, &cursor));
+        if (present && TotalPostings(cursor) > 0) cursors->emplace_back(std::move(cursor));
+    }
+    return Status::OK();
+}
+
+// Block-max upper bound for a lazy cursor at docid: max_score of the first window
+// whose last_docid >= docid, or 0 when docid lies beyond the last window. Same
+// semantics as TermBoundAt over the eager cursor's window list. Windows are in
+// ascending last_docid order, so binary-search instead of scanning linearly.
+double LazyTermBoundAt(const LazyTermCursor& c, uint32_t docid) {
+    const auto it = std::lower_bound(c.windows.begin(), c.windows.end(), docid,
+                                     [](const auto& w, uint32_t d) { return w.last_docid < d; });
+    return it == c.windows.end() ? 0.0 : it->max_score;
+}
+
+// Reorders the cursors by ascending current docid and stores the smallest one in
+// *front. Reading a cursor's current docid goes through LazyCurrentDoc.
+Status SelectiveSortByDoc(std::vector<LazyTermCursor>* cursors, uint32_t* front) {
+    std::vector<uint32_t> docs(cursors->size());
+    std::vector<size_t> perm(cursors->size());
+    for (size_t i = 0; i < perm.size(); ++i) {
+        perm[i] = i;
+        SNII_RETURN_IF_ERROR(LazyCurrentDoc(&(*cursors)[i], &docs[i]));
+    }
+    std::sort(perm.begin(), perm.end(), [&](size_t l, size_t r) { return docs[l] < docs[r]; });
+    std::vector<LazyTermCursor> reordered;
+    reordered.reserve(perm.size());
+    for (const size_t src : perm) reordered.push_back(std::move((*cursors)[src]));
+    *cursors = std::move(reordered);
+    *front = perm.empty() ? std::numeric_limits<uint32_t>::max() : docs[perm.front()];
+    return Status::OK();
+}
+
+// Chooses the WAND pivot: walking cursors in their current order, accumulate each
+// one's block-max bound at its current docid; the first cursor where the running
+// sum is >= theta (ties kept) is the pivot. *found stays false if none qualifies.
+Status SelectivePivot(std::vector<LazyTermCursor>* cursors, double theta, size_t* pivot,
+                      uint32_t* pivot_doc, bool* found) {
+    *found = false;
+    double running = 0.0;
+    for (size_t idx = 0; idx < cursors->size(); ++idx) {
+        LazyTermCursor& cur = (*cursors)[idx];
+        uint32_t doc = 0;
+        SNII_RETURN_IF_ERROR(LazyCurrentDoc(&cur, &doc));
+        if (doc == std::numeric_limits<uint32_t>::max()) break; // sentinel: stop the walk
+        running += LazyTermBoundAt(cur, doc);
+        if (!(running >= theta)) continue; // bound still below the threshold
+        *pivot = idx;
+        *pivot_doc = doc;
+        *found = true;
+        break;
+    }
+    return Status::OK();
+}
+
+// Exact scoring of pivot_doc: every cursor whose current doc equals pivot_doc
+// contributes its posting score and is stepped forward by one posting; the sum is
+// then offered to topk.
+Status SelectiveScorePivot(std::vector<LazyTermCursor>* cursors, uint32_t pivot_doc, TopK* topk) {
+    double total = 0.0;
+    for (LazyTermCursor& cur : *cursors) {
+        uint32_t doc = 0;
+        SNII_RETURN_IF_ERROR(LazyCurrentDoc(&cur, &doc));
+        if (doc != pivot_doc) continue;
+        // The LazyCurrentDoc call above has materialized the window under cur.pos.
+        total += cur.postings[cur.pos++].score;
+    }
+    topk->offer(pivot_doc, total);
+    return Status::OK();
+}
+
+// Finds the first cursor (in vector order) whose current doc is below pivot_doc
+// and skips it forward to pivot_doc. At most one cursor is moved per call.
+Status SelectiveAdvanceLagging(std::vector<LazyTermCursor>* cursors, uint32_t pivot_doc) {
+    for (LazyTermCursor& cur : *cursors) {
+        uint32_t doc = 0;
+        SNII_RETURN_IF_ERROR(LazyCurrentDoc(&cur, &doc));
+        if (doc >= pivot_doc) continue;
+        SNII_RETURN_IF_ERROR(LazySkipTo(&cur, pivot_doc));
+        break;
+    }
+    return Status::OK();
+}
+
+// One WAND iteration body: sort, pick pivot, then either score (aligned) or skip
+// a lagging cursor forward. *done=true ends the loop; it is never reset to false.
+Status SelectiveStep(std::vector<LazyTermCursor>* cursors, TopK* topk, bool* done) {
+    uint32_t front = 0; // smallest current docid across the cursors after sorting
+    SNII_RETURN_IF_ERROR(SelectiveSortByDoc(cursors, &front));
+    if (cursors->empty() || front == std::numeric_limits<uint32_t>::max()) {
+        *done = true; // no cursors, or the smallest docid is the max sentinel
+        return Status::OK();
+    }
+    size_t pivot = 0;
+    uint32_t pivot_doc = 0;
+    bool found_pivot = false;
+    SNII_RETURN_IF_ERROR(
+            SelectivePivot(cursors, topk->threshold(), &pivot, &pivot_doc, &found_pivot));
+    if (!found_pivot) {
+        *done = true; // the summed bounds never reach the current threshold
+        return Status::OK();
+    }
+    if (front == pivot_doc) { // the leading cursor already sits on the pivot doc
+        return SelectiveScorePivot(cursors, pivot_doc, topk);
+    }
+    return SelectiveAdvanceLagging(cursors, pivot_doc);
+}
+
+Status SelectiveWandLoop(std::vector<LazyTermCursor>* cursors, TopK* topk) {
+    // Keep stepping until SelectiveStep reports completion through `finished`.
+    for (bool finished = false; !finished;) {
+        SNII_RETURN_IF_ERROR(SelectiveStep(cursors, topk, &finished));
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+Status scoring_query_wand_selective(const LogicalIndexReader& idx,
+                                    const snii::stats::SniiStatsProvider& stats,
+                                    const std::vector<std::string>& terms, uint32_t k,
+                                    const Bm25Params& params, std::vector<ScoredDoc>* out) {
+    if (out == nullptr) return Status::InvalidArgument("scoring_query: null out");
+    out->clear();                    // results replace any previous contents
+    if (k == 0) return Status::OK(); // top-0 request: empty result
+
+    std::vector<LazyTermCursor> cursors; // one per term that exists and has postings
+    SNII_RETURN_IF_ERROR(SelectiveBuildCursors(idx, stats, terms, params, &cursors));
+
+    TopK topk(k);
+    SNII_RETURN_IF_ERROR(SelectiveWandLoop(&cursors, &topk));
+    DrainSorted(&topk, out); // moves the collected top-k into *out
+    return Status::OK();
+}
+
+Status scoring_query_wand(const LogicalIndexReader& idx,
+                          const snii::stats::SniiStatsProvider& stats,
+                          const std::vector<std::string>& terms, uint32_t k,
+                          const Bm25Params& params, std::vector<ScoredDoc>* out) {
+    if (out == nullptr) return Status::InvalidArgument("scoring_query: null out");
+    out->clear();                    // results replace any previous contents
+    if (k == 0) return Status::OK(); // top-0 request: empty result
+
+    std::vector<TermCursor> cursors;
+    SNII_RETURN_IF_ERROR(BuildCursors(idx, stats, terms, params, &cursors));
+
+    TopK topk(k);
+    // Document-at-a-time WAND with block-max bounds.
+    while (true) {
+        // Sort cursors by current docid (ascending; exhausted cursors sink).
+        std::sort(cursors.begin(), cursors.end(), [](const TermCursor& a, const TermCursor& b) {
+            return CurrentDoc(a) < CurrentDoc(b);
+        });
+        if (cursors.empty() ||
+            CurrentDoc(cursors.front()) == std::numeric_limits<uint32_t>::max()) {
+            break; // nothing left to visit
+        }
+
+        const double theta = topk.threshold();
+        // Accumulate block-max upper bounds in docid order to find the pivot term.
+        double bound = 0.0;
+        size_t pivot = 0;
+        bool found_pivot = false;
+        for (size_t i = 0; i < cursors.size(); ++i) {
+            const uint32_t d = CurrentDoc(cursors[i]);
+            if (d == std::numeric_limits<uint32_t>::max()) break;
+            bound += TermBoundAt(cursors[i], d);
+            // Use >= (not >) so a doc whose upper bound only TIES the K-th threshold is
+            // still explored and exact-scored: under the (score desc, docid asc) total
+            // order a tie can still evict the current K-th entry (smaller docid wins),
+            // exactly as the exhaustive path would. Strict > would wrongly prune ties.
+            if (bound >= theta) {
+                pivot = i;
+                found_pivot = true;
+                break;
+            }
+        }
+        if (!found_pivot) break; // no doc can beat the threshold anymore.
+
+        const uint32_t pivot_doc = CurrentDoc(cursors[pivot]);
+        if (CurrentDoc(cursors.front()) == pivot_doc) {
+            // All cursors at the pivot doc are aligned: score it exactly.
+            double doc_score = 0.0;
+            for (auto& c : cursors) {
+                if (CurrentDoc(c) == pivot_doc) {
+                    doc_score += c.postings[c.pos].score;
+                    ++c.pos; // consume the posting that was just scored
+                }
+            }
+            topk.offer(pivot_doc, doc_score);
+        } else {
+            // Advance a lagging cursor toward pivot_doc (skip docs it cannot win on).
+            for (auto& c : cursors) {
+                if (CurrentDoc(c) < pivot_doc) {
+                    while (c.pos < c.postings.size() && c.postings[c.pos].docid < pivot_doc) {
+                        ++c.pos;
+                    }
+                    break; // only the first lagging cursor moves per iteration
+                }
+            }
+        }
+    }
+    DrainSorted(&topk, out); // moves the collected top-k into *out
+    return Status::OK();
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/term_expansion.cpp b/be/src/storage/index/snii/core/src/query/term_expansion.cpp
new file mode 100644
index 00000000000000..ce1cffb0f141f1
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/term_expansion.cpp
@@ -0,0 +1,33 @@
+#include "snii/query/internal/term_expansion.h"
+
+#include <utility>
+#include <vector>
+
+#include "snii/query/internal/docid_posting_reader.h"
+#include "snii/query/internal/docid_union.h"
+
+namespace snii::query::internal {
+
+Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx,
+                                 std::string_view enum_prefix, const TermMatcher& matches,
+                                 DocIdSink* const sink, int32_t max_expansions) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("term_expansion: null sink");
+    }
+
+    // Collect the posting locators of every matching term (capped at
+    // max_expansions when that is positive), then union their docids into sink.
+    std::vector<ResolvedDocidPosting> matched;
+    SNII_RETURN_IF_ERROR(idx.visit_prefix_terms(
+            enum_prefix, [&](snii::reader::LogicalIndexReader::PrefixHit&& hit, bool* stop) {
+                if (matches(hit.term)) {
+                    matched.push_back({std::move(hit.entry), hit.frq_base, hit.prx_base});
+                    const bool capped = max_expansions > 0;
+                    *stop = capped && matched.size() >= static_cast<size_t>(max_expansions);
+                }
+                return Status::OK();
+            }));
+    return emit_docid_union(idx, matched, sink);
+}
+
+} // namespace snii::query::internal
diff --git a/be/src/storage/index/snii/core/src/query/term_query.cpp b/be/src/storage/index/snii/core/src/query/term_query.cpp
new file mode 100644
index 00000000000000..4cf6e97bc2471b
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/term_query.cpp
@@ -0,0 +1,39 @@
+#include "snii/query/term_query.h"
+
+#include <vector>
+
+#include "snii/format/dict_entry.h"
+#include "snii/query/internal/docid_posting_reader.h"
+
+namespace snii::query {
+
+using snii::format::DictEntry;
+using snii::reader::LogicalIndexReader;
+
+Status term_query(const LogicalIndexReader& idx, std::string_view term,
+                  std::vector<uint32_t>* docids) {
+    if (docids == nullptr) return Status::InvalidArgument("term_query: null out");
+    docids->clear();               // previous contents are discarded first
+    VectorDocIdSink sink(*docids); // adapts the vector to the DocIdSink overload
+    return term_query(idx, term, &sink);
+}
+
+Status term_query(const LogicalIndexReader& idx, std::string_view term, DocIdSink* sink) {
+    if (sink == nullptr) return Status::InvalidArgument("term_query: null sink");
+
+    // Resolve the term in the dictionary; an absent term emits nothing to sink.
+    DictEntry dict_entry;
+    uint64_t frq = 0, prx = 0;
+    bool present = false;
+    SNII_RETURN_IF_ERROR(idx.lookup(term, &present, &dict_entry, &frq, &prx));
+    if (!present) return Status::OK();
+    return internal::read_docid_posting(idx, dict_entry, frq, prx, sink);
+}
+
+Status term_query(const LogicalIndexReader& idx, std::string_view term,
+                  std::vector<uint32_t>* docids, QueryProfile* profile) {
+    QueryProfileScope profile_scope(idx.reader(), profile); // lives for this call only
+    return term_query(idx, term, docids);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/wildcard_query.cpp b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp
new file mode 100644
index 00000000000000..a3d5fd72bfbb71
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp
@@ -0,0 +1,79 @@
+#include "snii/query/wildcard_query.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/query/internal/term_expansion.h"
+
+namespace snii::query {
+
+namespace {
+
+// Literal head of a wildcard pattern: every character before the first '*' or
+// '?'. A pattern with no wildcard is returned whole; one that starts with a
+// wildcard gives an empty prefix.
+std::string literal_prefix_for_wildcard(std::string_view pattern) {
+    const size_t first_wild = pattern.find_first_of("*?");
+    if (first_wild == std::string_view::npos) {
+        return std::string(pattern);
+    }
+    return std::string(pattern.substr(0, first_wild));
+}
+
+// Glob match of `text` against `pattern`: '*' matches any run of characters
+// (including none), '?' matches exactly one. Uses a rolling DP row where
+// reach[i] != 0 means the pattern consumed so far can match text[0, i).
+bool wildcard_match(std::string_view pattern, std::string_view text) {
+    const size_t n = text.size();
+    std::vector<uint8_t> reach(n + 1, 0);
+    std::vector<uint8_t> next(n + 1, 0);
+    reach[0] = 1; // empty pattern matches empty text
+
+    for (const char pc : pattern) {
+        const bool star = pc == '*';
+        next[0] = star ? reach[0] : 0;
+        for (size_t i = 1; i <= n; ++i) {
+            const bool via_star = star && (reach[i] != 0 || next[i - 1] != 0);
+            const bool via_char = !star && reach[i - 1] != 0 && (pc == '?' || pc == text[i - 1]);
+            next[i] = (via_star || via_char) ? 1 : 0;
+        }
+        reach.swap(next);
+    }
+    return reach[n] != 0;
+}
+
+} // namespace
+
+Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                      std::vector<uint32_t>* const docids, int32_t max_expansions) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("wildcard_query: null out");
+    }
+    docids->clear();               // previous contents are discarded first
+    VectorDocIdSink sink(*docids); // adapts the vector to the DocIdSink overload
+    return wildcard_query(idx, pattern, &sink, max_expansions);
+}
+
+Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                      std::vector<uint32_t>* const docids, QueryProfile* profile,
+                      int32_t max_expansions) {
+    QueryProfileScope profile_scope(idx.reader(), profile); // lives for this call only
+    return wildcard_query(idx, pattern, docids, max_expansions);
+}
+
+Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                      DocIdSink* const sink, int32_t max_expansions) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("wildcard_query: null sink");
+    }
+    // Only terms sharing the pattern's literal head are enumerated; each of them
+    // is then tested against the full pattern before its docids are unioned.
+    const auto matcher = [pattern](std::string_view term) { return wildcard_match(pattern, term); };
+    return internal::emit_expanded_docid_union(idx, literal_prefix_for_wildcard(pattern), matcher,
+                                               sink, max_expansions);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp
new file mode 100644
index 00000000000000..be6c01b2cb97d6
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp
@@ -0,0 +1,390 @@
+#include "snii/reader/logical_index_reader.h"
+
+#include <cstdlib>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "snii/encoding/crc32c.h"
+#include "snii/encoding/zstd_codec.h"
+#include "snii/format/dict_block.h"
+#include "snii/format/dict_block_directory.h"
+
+namespace snii::reader {
+
+using snii::format::BlockRef;
+using snii::format::bsbf_hash;
+using snii::format::bsbf_probe;
+using snii::format::DictBlockDirectoryReader;
+using snii::format::DictBlockReader;
+using snii::format::DictEntry;
+using snii::format::IndexTier;
+using snii::format::kBsbfBytesPerBlock;
+using snii::format::kBsbfHeaderSize;
+using snii::format::PerIndexMetaReader;
+using snii::format::RegionRef;
+using snii::format::SampledTermIndexReader;
+
+namespace {
+constexpr uint64_t kMaxDictBlockUncompBytes = 256ULL * 1024 * 1024;
+constexpr uint64_t kDefaultDictResidentMaxBytes = 256ULL * 1024;
+
+// Parses env var `name` as a base-10 uint64. Returns `fallback` when the variable
+// is unset, empty, or not entirely numeric: a value such as "64KB" must not be
+// silently truncated to 64. Read fresh on every call (no caching).
+uint64_t env_u64_or(const char* name, uint64_t fallback) {
+    const char* s = std::getenv(name);
+    if (s == nullptr) {
+        return fallback;
+    }
+    char* end = nullptr;
+    const unsigned long long v = std::strtoull(s, &end, 10);
+    // end == s: no digits consumed; *end != '\0': trailing garbage after digits.
+    return (end != s && *end == '\0') ? v : fallback;
+}
+
+// L0/L1 tiering threshold (bytes). Defaults to kBsbfResidentMaxBytes; the env
+// SNII_BSBF_RESIDENT_MAX overrides it for tuning and for exercising the
+// on-demand L1 path in tests without a 250K-term corpus.
+uint64_t bsbf_resident_max_bytes() {
+    return env_u64_or("SNII_BSBF_RESIDENT_MAX", snii::format::kBsbfResidentMaxBytes);
+}
+
+// Byte budget for keeping DICT blocks resident; SNII_DICT_RESIDENT_MAX overrides
+// the kDefaultDictResidentMaxBytes default (0 disables resident blocks).
+uint64_t dict_resident_max_bytes() {
+    return env_u64_or("SNII_DICT_RESIDENT_MAX", kDefaultDictResidentMaxBytes);
+}
+
+Status checked_size(uint64_t value, const char* error, size_t* out) {
+    if (value <= std::numeric_limits<size_t>::max()) { // representable as size_t
+        *out = static_cast<size_t>(value);
+        return Status::OK();
+    }
+    return Status::Corruption(error);
+}
+
+Status dict_block_memory_bytes(const BlockRef& ref, uint64_t* out) {
+    const bool compressed = (ref.flags & snii::format::block_ref_flags::kZstd) != 0;
+    if (compressed) {
+        // In-memory size is the decompressed size; out-of-range values are corrupt.
+        if (ref.uncomp_len == 0 || ref.uncomp_len > kMaxDictBlockUncompBytes) {
+            return Status::Corruption("dict block: zstd uncomp_len out of range");
+        }
+    }
+    *out = compressed ? ref.uncomp_len : ref.length;
+    return Status::OK();
+}
+
+Status read_dict_block_bytes(snii::io::FileReader* reader, const BlockRef& ref,
+                             std::vector<uint8_t>* out) {
+    size_t read_len = 0; // on-disk byte count, range-checked into size_t
+    SNII_RETURN_IF_ERROR(
+            checked_size(ref.length, "dict block: on-disk length out of range", &read_len));
+
+    std::vector<uint8_t> block_bytes;
+    SNII_RETURN_IF_ERROR(reader->read_at(ref.offset, read_len, &block_bytes));
+    if (block_bytes.size() != read_len) {
+        return Status::Corruption("dict block: short read");
+    }
+
+    if ((ref.flags & snii::format::block_ref_flags::kZstd) == 0) {
+        *out = std::move(block_bytes); // not compressed: return the raw bytes
+        return Status::OK();
+    }
+
+    uint64_t memory_bytes = 0; // validated uncompressed length of the zstd block
+    SNII_RETURN_IF_ERROR(dict_block_memory_bytes(ref, &memory_bytes));
+    size_t uncomp_len = 0;
+    SNII_RETURN_IF_ERROR(
+            checked_size(memory_bytes, "dict block: zstd length out of range", &uncomp_len));
+    return snii::zstd_decompress(Slice(block_bytes), uncomp_len, out);
+}
+
+Status open_dict_block(snii::io::FileReader* reader, const BlockRef& ref, IndexTier tier,
+                       bool has_positions, std::vector<uint8_t>* bytes, DictBlockReader* out) {
+    SNII_RETURN_IF_ERROR(read_dict_block_bytes(reader, ref, bytes)); // fills *bytes
+    return DictBlockReader::open(Slice(*bytes), tier, has_positions, out); // opened over *bytes
+}
+} // namespace
+
+Status LogicalIndexReader::load_resident_dict_blocks() {
+    resident_dict_blocks_.clear(); // all-or-nothing: every block becomes resident, or none
+
+    const uint64_t max_bytes = dict_resident_max_bytes();
+    if (max_bytes == 0 || dbd_.n_blocks() == 0) {
+        return Status::OK(); // residency disabled, or there is nothing to load
+    }
+
+    uint64_t total_bytes = 0; // pass 1: add up in-memory sizes; no block is opened here
+    for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) {
+        BlockRef ref {};
+        SNII_RETURN_IF_ERROR(dbd_.get(ord, &ref));
+        uint64_t block_bytes = 0;
+        SNII_RETURN_IF_ERROR(dict_block_memory_bytes(ref, &block_bytes));
+        if (block_bytes > max_bytes - total_bytes) { // would exceed the budget
+            return Status::OK();
+        }
+        total_bytes += block_bytes;
+    }
+
+    resident_dict_blocks_.reserve(dbd_.n_blocks()); // pass 2: read and open every block
+    for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) {
+        BlockRef ref {};
+        SNII_RETURN_IF_ERROR(dbd_.get(ord, &ref));
+        ResidentDictBlock block;
+        SNII_RETURN_IF_ERROR(
+                open_dict_block(reader_, ref, tier_, has_positions_, &block.bytes, &block.reader));
+        resident_dict_blocks_.push_back(std::move(block));
+    }
+    return Status::OK();
+}
+
+Status LogicalIndexReader::dict_block_reader_for_ordinal(uint32_t ordinal,
+                                                         OnDemandDictBlock* on_demand,
+                                                         const DictBlockReader** out) const {
+    if (!resident_dict_blocks_.empty()) { // resident mode: *on_demand is not used
+        if (resident_dict_blocks_.size() != dbd_.n_blocks() ||
+            ordinal >= resident_dict_blocks_.size()) {
+            return Status::Corruption("logical_index: incomplete resident dict");
+        }
+        *out = &resident_dict_blocks_[ordinal].reader;
+        return Status::OK();
+    }
+
+    BlockRef ref {};
+    SNII_RETURN_IF_ERROR(dbd_.get(ordinal, &ref));
+    SNII_RETURN_IF_ERROR(open_dict_block(reader_, ref, tier_, has_positions_, &on_demand->bytes,
+                                         &on_demand->reader));
+    *out = &on_demand->reader; // points into *on_demand: valid only while it lives
+    return Status::OK();
+}
+
+Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tier,
+                                bool has_positions, Slice meta_block, LogicalIndexReader* out) {
+    if (file_reader == nullptr) {
+        return Status::InvalidArgument("logical_index: null file reader");
+    }
+    if (out == nullptr) {
+        return Status::InvalidArgument("logical_index: null out");
+    }
+    *out = LogicalIndexReader {}; // start from a default-constructed reader
+
+    out->reader_ = file_reader;
+    out->tier_ = tier;
+    out->has_positions_ = has_positions;
+
+    SNII_RETURN_IF_ERROR(PerIndexMetaReader::open(meta_block, &out->meta_));
+    SNII_RETURN_IF_ERROR(
+            SampledTermIndexReader::open(out->meta_.sampled_term_index_bytes(), &out->sti_));
+    SNII_RETURN_IF_ERROR(
+            DictBlockDirectoryReader::open(out->meta_.dict_block_directory_bytes(), &out->dbd_));
+    SNII_RETURN_IF_ERROR(out->load_resident_dict_blocks());
+
+    // Block-split bloom XFilter: located via the section ref (offset+length).
+    // Open reads either the whole section (L0, resident) or only the fixed-size
+    // header (L1); in L1 the bitset, which follows the header, is probed by
+    // lookup() on demand via bsbf_probe.
+    const RegionRef& bsbf = out->meta_.section_refs().bsbf;
+    if (bsbf.length > 0) {
+        if (bsbf.length <= kBsbfHeaderSize) {
+            return Status::Corruption("logical_index: bsbf section too small");
+        }
+        const uint64_t num_bytes = bsbf.length - kBsbfHeaderSize;
+        const bool resident = bsbf.length <= bsbf_resident_max_bytes();
+        // L0: read the WHOLE section (header + bitset) so probes are in-memory AND
+        // the bitset crc can be verified once. L1: read only the 28-byte header so
+        // open stays near-zero I/O; the on-demand single-block probe cannot verify
+        // a whole-bitset crc, so L1 relies on the storage layer's own integrity for
+        // the bitset body. Either way the header (magic/version/strategy/geometry +
+        // header crc) is parsed and verified -- BsbfHeader::parse rejects a corrupt
+        // header.
+        std::vector<uint8_t> head;
+        SNII_RETURN_IF_ERROR(
+                file_reader->read_at(bsbf.offset, resident ? bsbf.length : kBsbfHeaderSize, &head));
+        if (head.size() < kBsbfHeaderSize) {
+            return Status::Corruption("logical_index: short bsbf header read");
+        }
+        SNII_RETURN_IF_ERROR(snii::format::BsbfHeader::parse(Slice(head.data(), kBsbfHeaderSize),
+                                                             bsbf.offset, &out->bsbf_header_));
+        // Cross-check the header geometry against the section ref.
+        if (out->bsbf_header_.num_bytes != num_bytes) {
+            return Status::Corruption("logical_index: bsbf header/section size mismatch");
+        }
+        out->has_bsbf_ = true;
+        if (resident) {
+            if (head.size() < bsbf.length) {
+                return Status::Corruption("logical_index: short bsbf resident read");
+            }
+            const Slice bitset(head.data() + kBsbfHeaderSize, out->bsbf_header_.num_bytes);
+            if (snii::crc32c(bitset) != out->bsbf_header_.bitset_crc) {
+                return Status::Corruption("logical_index: bsbf bitset crc mismatch");
+            }
+            out->bsbf_resident_bitset_.assign(bitset.data(), bitset.data() + bitset.size());
+            out->bsbf_resident_ = true;
+        }
+    }
+    return Status::OK();
+}
+
+Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* entry,
+                                  uint64_t* frq_base, uint64_t* prx_base) const {
+    *found = false; // stays false on every early return below
+    if (reader_ == nullptr) {
+        return Status::InvalidArgument("logical_index: not opened");
+    }
+
+    // 1. XFilter fast rejection. DEFINITELY-ABSENT returns empty without the
+    // DICT read. L0 probes the resident bitset; L1 reads one 32-byte block.
+    if (has_bsbf_) {
+        const uint64_t h = bsbf_hash(term);
+        bool maybe = false;
+        if (bsbf_resident_) {
+            // L0: in-memory probe of the resident bitset (no round).
+            const uint32_t blk = snii::format::bsbf_block_index(h, bsbf_header_.num_blocks);
+            maybe = snii::format::bsbf_block_contains(
+                    h,
+                    bsbf_resident_bitset_.data() + static_cast<size_t>(blk) * kBsbfBytesPerBlock);
+        } else {
+            // L1: on-demand single-block probe.
+            SNII_RETURN_IF_ERROR(bsbf_probe(reader_, bsbf_header_, h, &maybe));
+        }
+        if (!maybe) {
+            return Status::OK(); // filter says the term is absent
+        }
+    }
+
+    // 2. SampledTermIndex -> candidate block ordinal.
+    bool maybe = false;
+    uint32_t ordinal = 0;
+    SNII_RETURN_IF_ERROR(sti_.locate(term, &maybe, &ordinal));
+    if (!maybe) {
+        return Status::OK(); // no candidate block for this term
+    }
+
+    // 3. Use a resident small-DICT block when present; otherwise read the DICT
+    //    block on demand and parse it with the same validation path used at open.
+    const DictBlockReader* br = nullptr;
+    OnDemandDictBlock on_demand; // storage for the block when it is not resident
+    SNII_RETURN_IF_ERROR(dict_block_reader_for_ordinal(ordinal, &on_demand, &br));
+
+    bool hit = false;
+    SNII_RETURN_IF_ERROR(br->find_term(term, &hit, entry));
+    if (!hit) {
+        return Status::OK(); // candidate block does not contain the term
+    }
+
+    *found = true;
+    *frq_base = br->frq_base(); // bases are written only on a hit
+    *prx_base = br->prx_base();
+    return Status::OK();
+}
+
+Status LogicalIndexReader::visit_prefix_terms(std::string_view prefix,
+                                              const PrefixHitVisitor& visitor) const {
+    if (!visitor) {
+        return Status::InvalidArgument("logical_index: null prefix visitor");
+    }
+    if (reader_ == nullptr) {
+        return Status::InvalidArgument("logical_index: not opened");
+    }
+
+    // Seek the start block: the SampledTermIndex block whose first term <= prefix
+    // (terms with `prefix` are >= prefix, so they begin in that block or later).
+    // If the prefix sorts before every sample (or is empty), start at block 0.
+    uint32_t start = 0;
+    if (!prefix.empty()) {
+        bool maybe = false;
+        uint32_t ordinal = 0;
+        SNII_RETURN_IF_ERROR(sti_.locate(prefix, &maybe, &ordinal));
+        if (maybe) {
+            start = ordinal;
+        }
+    }
+
+    for (uint32_t ord = start; ord < dbd_.n_blocks(); ++ord) {
+        const DictBlockReader* br = nullptr;
+        OnDemandDictBlock on_demand; // per-iteration storage for a non-resident block
+        SNII_RETURN_IF_ERROR(dict_block_reader_for_ordinal(ord, &on_demand, &br));
+        std::vector<DictEntry> entries;
+        SNII_RETURN_IF_ERROR(br->decode_all(&entries));
+
+        for (DictEntry& e : entries) {
+            const std::string_view t(e.term);
+            if (t < prefix) {
+                continue; // not yet at the prefix range
+            }
+            const bool has_prefix =
+                    t.size() >= prefix.size() && t.compare(0, prefix.size(), prefix) == 0;
+            if (!has_prefix) {
+                return Status::OK(); // past the prefix range; sorted -> done
+            }
+            PrefixHit hit;
+            hit.term = e.term; // copied first: e is moved from on the next line
+            hit.entry = std::move(e);
+            hit.frq_base = br->frq_base();
+            hit.prx_base = br->prx_base();
+            bool stop = false;
+            SNII_RETURN_IF_ERROR(visitor(std::move(hit), &stop));
+            if (stop) {
+                return Status::OK(); // visitor asked to end the enumeration early
+            }
+        }
+    }
+    return Status::OK();
+}
+
+Status LogicalIndexReader::prefix_terms(std::string_view prefix, std::vector<PrefixHit>* const out,
+                                        int32_t max_terms) const {
+    if (out == nullptr) {
+        return Status::InvalidArgument("logical_index: null out");
+    }
+    out->clear(); // hits replace any previous contents
+    return visit_prefix_terms(prefix, [&](PrefixHit&& hit, bool* stop) {
+        out->push_back(std::move(hit));
+        *stop = max_terms > 0 && out->size() >= static_cast<size_t>(max_terms); // <= 0: no cap
+        return Status::OK();
+    });
+}
+
+namespace {
+
+// Checks a pod_ref window locator against the posting region and yields the
+// absolute range of the window payload that follows the prelude. A corrupt
+// locator is rejected here so no wrapped offset or length reaches read_at.
+Status resolve_window(const snii::format::RegionRef& section, uint64_t base, uint64_t off_delta,
+                      uint64_t total_len, uint64_t prelude_len, uint64_t* abs_off, uint64_t* len) {
+    if (total_len < prelude_len) {
+        return Status::Corruption("logical_index: prelude_len exceeds window len");
+    }
+    const uint64_t rel = base + off_delta; // region-relative start; may wrap
+    if (rel < base) {
+        return Status::Corruption("logical_index: locator overflow");
+    }
+    if (rel > section.length || section.length - rel < total_len) {
+        return Status::Corruption("logical_index: window past posting region");
+    }
+    *len = total_len - prelude_len;
+    *abs_off = section.offset + rel + prelude_len;
+    return Status::OK();
+}
+
+} // namespace
+
+Status LogicalIndexReader::resolve_frq_window(const snii::format::DictEntry& entry,
+                                              uint64_t frq_base, uint64_t* abs_off,
+                                              uint64_t* len) const {
+    return resolve_window(section_refs().posting_region, frq_base, entry.frq_off_delta,
+                          entry.frq_len, entry.prelude_len, abs_off, len); // skips prelude
+}
+
+Status LogicalIndexReader::resolve_prx_window(const snii::format::DictEntry& entry,
+                                              uint64_t prx_base, uint64_t* abs_off,
+                                              uint64_t* len) const {
+    // .prx windows carry no prelude, so prelude_len is passed as 0. Both spans are
+    // resolved against the same posting region (prx span precedes frq span for the same term).
+    return resolve_window(section_refs().posting_region, prx_base, entry.prx_off_delta,
+                          entry.prx_len, 0, abs_off, len);
+}
+
+} // namespace snii::reader
diff --git a/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp b/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp
new file mode 100644
index 00000000000000..41e6ba06800152
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp
@@ -0,0 +1,97 @@
+#include "snii/reader/snii_segment_reader.h"
+
+#include <vector>
+
+#include "snii/encoding/crc32c.h"
+#include "snii/format/bootstrap_header.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/per_index_meta.h"
+#include "snii/format/stats_block.h"
+#include "snii/format/tail_pointer.h"
+
+namespace snii::reader {
+
+using snii::format::BootstrapHeader;
+using snii::format::IndexTier;
+using snii::format::PerIndexMetaReader;
+using snii::format::StatsBlock;
+using snii::format::TailMetaRegionReader;
+using snii::format::TailPointer;
+
+namespace {
+
+// Loads the fixed-size bootstrap header found at file offset 0 and decodes it into *bh.
+Status ReadBootstrap(snii::io::FileReader* reader, BootstrapHeader* bh) {
+    std::vector<uint8_t> raw;
+    SNII_RETURN_IF_ERROR(reader->read_at(0, snii::format::kBootstrapHeaderSize, &raw));
+    return snii::format::decode_bootstrap_header(Slice(raw), bh);
+}
+
+// Loads the trailer -- the last tail_pointer_size() bytes of the file -- and decodes it into *tp.
+Status ReadTailPointer(snii::io::FileReader* reader, TailPointer* tp) {
+    const size_t need = snii::format::tail_pointer_size();
+    const uint64_t size = reader->size();
+    // Checked first so the offset subtraction below cannot underflow.
+    if (size < need) return Status::Corruption("segment: file smaller than tail pointer");
+    const uint64_t trailer_off = size - need;
+    std::vector<uint8_t> raw;
+    SNII_RETURN_IF_ERROR(reader->read_at(trailer_off, need, &raw));
+    return snii::format::decode_tail_pointer(Slice(raw), tp);
+}
+
+} // namespace
+
+Status SniiSegmentReader::open(snii::io::FileReader* reader, SniiSegmentReader* out) {
+    // Validates the bootstrap header and tail pointer, then loads and parses the meta region.
+    if (reader == nullptr) return Status::InvalidArgument("segment: null reader");
+    if (out == nullptr) return Status::InvalidArgument("segment: null out");
+    BootstrapHeader bh; // decoded for validation only; no field is used afterwards
+    SNII_RETURN_IF_ERROR(ReadBootstrap(reader, &bh));
+    TailPointer tp;
+    SNII_RETURN_IF_ERROR(ReadTailPointer(reader, &tp));
+    if (tp.meta_region_length == 0) return Status::Corruption("segment: empty tail meta region");
+
+    // *out is mutated from here on. Keep it marked unopened (reader_ == nullptr) until every
+    // step has succeeded, so open_index() rejects an object whose open() failed part-way.
+    out->reader_ = nullptr;
+    SNII_RETURN_IF_ERROR(
+            reader->read_at(tp.meta_region_offset, tp.meta_region_length, &out->meta_region_));
+    // Check the bytes against the tail pointer's checksum BEFORE parsing them, so corruption
+    // is caught at the read boundary, before any framed sub-section is touched.
+    if (snii::crc32c(Slice(out->meta_region_)) != tp.meta_region_checksum) {
+        return Status::Corruption("segment: meta region checksum mismatch");
+    }
+    SNII_RETURN_IF_ERROR(
+            TailMetaRegionReader::open(Slice(out->meta_region_), &out->region_reader_));
+    out->reader_ = reader;
+    return Status::OK();
+}
+
+Status SniiSegmentReader::open_index(uint64_t index_id, std::string_view suffix,
+                                     LogicalIndexReader* out) const {
+    if (out == nullptr) return Status::InvalidArgument("segment: null index out");
+    if (reader_ == nullptr) return Status::InvalidArgument("segment: not opened");
+    Slice meta_bytes;
+    bool found = false;
+    SNII_RETURN_IF_ERROR(region_reader_.find(index_id, suffix, &found, &meta_bytes));
+    if (!found) return Status::NotFound("segment: logical index not found");
+
+    PerIndexMetaReader meta;
+    SNII_RETURN_IF_ERROR(PerIndexMetaReader::open(meta_bytes, &meta));
+    // Positions capability comes from the PERSISTED kHasPositions flag, never from a region
+    // length: since the frq/prx merge, posting_region.length is non-zero for ANY index with
+    // a pod_ref term (docs-only included), so a length heuristic would classify a docs-only
+    // index as positional and make DictBlockReader::check_flags hard-fail. OR-ing in the
+    // norms test is purely defensive: a scoring index always has positions.
+    const bool scoring = meta.section_refs().norms.length > 0;
+    const bool positional = meta.has_positions() || scoring;
+    IndexTier tier = IndexTier::kT1;
+    if (scoring) {
+        tier = IndexTier::kT3;
+    } else if (positional) {
+        tier = IndexTier::kT2;
+    }
+    return LogicalIndexReader::open(reader_, tier, positional, meta_bytes, out);
+}
+
+} // namespace snii::reader
diff --git a/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp b/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp
new file mode 100644
index 00000000000000..1299660f0658a8
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp
@@ -0,0 +1,253 @@
+#include "snii/reader/windowed_posting.h"
+
+#include <cstddef>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/format/frq_pod.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/format/prx_pod.h"
+#include "snii/io/batch_range_fetcher.h"
+
+namespace snii::reader {
+
+using snii::format::DictEntry;
+using snii::format::FrqPreludeReader;
+using snii::format::FrqRegionMeta;
+using snii::format::WindowMeta;
+
+namespace {
+
+// Absolute file offset at which a windowed entry's frq span -- and so its prelude -- begins.
+// No bounds or overflow checking is done here; the sum is returned as computed.
+uint64_t PreludeAbs(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base) {
+    const uint64_t span_in_region = frq_base + entry.frq_off_delta;
+    return idx.section_refs().posting_region.offset + span_in_region;
+}
+
+// OK iff the half-open range [off, off + len) lies inside [0, total); Corruption otherwise.
+Status InBounds(uint64_t off, uint64_t len, uint64_t total) {
+    // Overflow-safe: `off + len` is never formed; len is compared to the room left after off.
+    const bool fits = off <= total && len <= total - off;
+    if (fits) return Status::OK();
+    return Status::Corruption("windowed_posting: range out of section");
+}
+
+// Absolute-offset geometry of a windowed entry's post-prelude .frq payload (see ResolveBlocks).
+struct BlockGeometry {
+    uint64_t dd_block_off = 0; // absolute start of the dd-block (first byte after the prelude)
+    uint64_t dd_block_len = 0; // bytes, as recorded by the prelude
+    uint64_t freq_block_off = 0; // absolute start of the freq-block (dd_block_off + dd_block_len)
+    uint64_t freq_block_len = 0; // bytes, as recorded by the prelude
+    uint64_t frq_region_len = 0; // entry.frq_len - prelude_len; the two blocks must fit in it
+};
+
+// Derives the absolute dd-block / freq-block ranges from the entry + prelude lengths into *g.
+// Only an overrun of the post-prelude .frq region is rejected; exact tiling is not enforced.
+Status ResolveBlocks(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                     const FrqPreludeReader& prelude, BlockGeometry* g) {
+    if (entry.frq_len < entry.prelude_len) {
+        return Status::Corruption("windowed_posting: prelude_len exceeds frq_len");
+    }
+    const uint64_t payload_start = PreludeAbs(idx, entry, frq_base) + entry.prelude_len;
+    // The lengths are stored before they are validated, so *g keeps them on the error path.
+    g->frq_region_len = entry.frq_len - entry.prelude_len;
+    g->dd_block_len = prelude.dd_block_len();
+    g->freq_block_len = prelude.freq_block_len();
+    const bool dd_fits = g->dd_block_len <= g->frq_region_len;
+    if (!dd_fits || g->freq_block_len > g->frq_region_len - g->dd_block_len) {
+        return Status::Corruption("windowed_posting: blocks exceed frq region");
+    }
+    g->dd_block_off = payload_start;
+    g->freq_block_off = payload_start + g->dd_block_len; // freq-block follows the dd-block
+    return Status::OK();
+}
+
+// Per-window decode state for the full-posting path (filled in by read_windowed_posting).
+struct WindowSlices {
+    WindowMeta meta; // descriptor obtained from prelude.window(w, ...)
+    Slice dd_region; // window's bytes within the fetched dd-block
+    Slice freq_region; // window's bytes within the freq-block; set only when want_freq
+    Slice prx_window; // window's bytes within the prx span; set only when want_positions
+};
+
+// Bounds-checks window m's locators and slices its dd (+ freq if want_freq) bytes into *out.
+Status CarveRegionSlices(const WindowMeta& m, Slice dd_block, Slice freq_block, bool want_freq,
+                         WindowSlices* out) {
+    SNII_RETURN_IF_ERROR(InBounds(m.dd_off, m.dd_disk_len, dd_block.size()));
+    const auto dd_begin = static_cast<size_t>(m.dd_off);
+    out->dd_region = dd_block.subslice(dd_begin, static_cast<size_t>(m.dd_disk_len));
+    if (want_freq) {
+        SNII_RETURN_IF_ERROR(InBounds(m.freq_off, m.freq_disk_len, freq_block.size()));
+        const auto freq_begin = static_cast<size_t>(m.freq_off);
+        out->freq_region = freq_block.subslice(freq_begin, static_cast<size_t>(m.freq_disk_len));
+    }
+    return Status::OK();
+}
+
+// Decodes one window into temporaries, then appends them to *out (positions only if wanted).
+Status AppendWindow(const WindowSlices& ws, bool want_positions, bool want_freq,
+                    DecodedPosting* out) {
+    std::vector<uint32_t> docs;
+    std::vector<uint32_t> tfs;
+    std::vector<std::vector<uint32_t>> pos_lists;
+    SNII_RETURN_IF_ERROR(decode_window_slices(ws.meta, ws.dd_region, ws.freq_region, ws.prx_window,
+                                              want_positions, want_freq, &docs, &tfs, &pos_lists));
+    out->docids.insert(out->docids.end(), docs.begin(), docs.end());
+    out->freqs.insert(out->freqs.end(), tfs.begin(), tfs.end());
+    if (!want_positions) return Status::OK();
+    for (std::vector<uint32_t>& doc_pos : pos_lists) out->positions.push_back(std::move(doc_pos));
+    return Status::OK();
+}
+
+} // namespace
+
+Status fetch_windowed_prelude(const LogicalIndexReader& idx, const DictEntry& entry,
+                              uint64_t frq_base, FrqPreludeReader* prelude) {
+    // Fetches + parses the prelude. NOTE(review): the bytes live in `batch`; open() must copy.
+    if (entry.prelude_len == 0) {
+        return Status::Corruption("windowed_posting: windowed entry has no prelude");
+    }
+    if (entry.frq_len < entry.prelude_len) {
+        return Status::Corruption("windowed_posting: prelude_len exceeds frq_len");
+    }
+    snii::io::BatchRangeFetcher batch(idx.reader());
+    const size_t slot = batch.add(PreludeAbs(idx, entry, frq_base), entry.prelude_len);
+    SNII_RETURN_IF_ERROR(batch.fetch());
+    return FrqPreludeReader::open(batch.get(slot), prelude);
+}
+
+Status windowed_window_range(const LogicalIndexReader& idx, const DictEntry& entry,
+                             uint64_t frq_base, uint64_t prx_base, const FrqPreludeReader& prelude,
+                             uint32_t w, bool want_positions, bool want_freq, WindowAbsRange* out) {
+    if (out == nullptr) return Status::InvalidArgument("windowed_posting: null range");
+    *out = WindowAbsRange {};
+    BlockGeometry blocks;
+    SNII_RETURN_IF_ERROR(ResolveBlocks(idx, entry, frq_base, prelude, &blocks));
+    WindowMeta win;
+    SNII_RETURN_IF_ERROR(prelude.window(w, &win));
+
+    // Every locator below is relative to its block / span; rebase each one onto the file.
+    SNII_RETURN_IF_ERROR(InBounds(win.dd_off, win.dd_disk_len, blocks.dd_block_len));
+    out->dd_len = win.dd_disk_len;
+    out->dd_off = blocks.dd_block_off + win.dd_off;
+
+    if (want_freq) {
+        SNII_RETURN_IF_ERROR(InBounds(win.freq_off, win.freq_disk_len, blocks.freq_block_len));
+        out->freq_len = win.freq_disk_len;
+        out->freq_off = blocks.freq_block_off + win.freq_off;
+    }
+
+    if (want_positions) {
+        if (!prelude.has_prx()) {
+            return Status::Corruption("windowed_posting: positions requested but prelude has none");
+        }
+        const auto& region = idx.section_refs().posting_region;
+        SNII_RETURN_IF_ERROR(InBounds(win.prx_off, win.prx_len, entry.prx_len));
+        out->prx_len = win.prx_len;
+        out->prx_off = region.offset + prx_base + entry.prx_off_delta + win.prx_off;
+    }
+    return Status::OK();
+}
+
+Status decode_window_slices(const WindowMeta& meta, Slice dd_region, Slice freq_region,
+                            Slice prx_window, bool want_positions, bool want_freq,
+                            std::vector<uint32_t>* docids, std::vector<uint32_t>* freqs,
+                            std::vector<std::vector<uint32_t>>* positions) {
+    // Decodes one window: docids always; freqs when want_freq (otherwise *freqs is cleared);
+    // positions when want_positions. Decoded sizes are cross-checked so a window whose
+    // regions disagree on the doc count is reported as Corruption.
+    FrqRegionMeta dd_desc;
+    dd_desc.verify_crc = meta.verify_crc;
+    dd_desc.crc = meta.crc_dd;
+    dd_desc.disk_len = meta.dd_disk_len;
+    dd_desc.uncomp_len = meta.dd_uncomp_len;
+    dd_desc.zstd = meta.dd_zstd;
+    SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd_region, dd_desc, meta.win_base, docids));
+    if (docids->size() != meta.doc_count) {
+        return Status::Corruption("windowed_posting: frq doc_count mismatch");
+    }
+    if (!want_freq) {
+        freqs->clear();
+    } else {
+        FrqRegionMeta freq_desc;
+        freq_desc.verify_crc = meta.verify_crc;
+        freq_desc.crc = meta.crc_freq;
+        freq_desc.disk_len = meta.freq_disk_len;
+        freq_desc.uncomp_len = meta.freq_uncomp_len;
+        freq_desc.zstd = meta.freq_zstd;
+        SNII_RETURN_IF_ERROR(
+                snii::format::decode_freq_region(freq_region, freq_desc, meta.doc_count, freqs));
+    }
+    if (!want_positions) return Status::OK();
+    ByteSource prx_bytes(prx_window);
+    SNII_RETURN_IF_ERROR(snii::format::read_prx_window(&prx_bytes, positions));
+    if (positions->size() == docids->size()) return Status::OK();
+    return Status::Corruption("windowed_posting: prx/frq doc-count mismatch");
+}
+
+namespace {
+
+// Queues the dd-block (always), the freq-block (want_freq) and the entry's entire .prx span
+// (want_positions) on *fetcher, then issues them as ONE batched fetch. Handles for ranges
+// that were not requested are left untouched. The dd-block is one contiguous range, so the
+// docid-only / phrase path reads it with a single Range GET (the byte-saving core).
+Status FetchBlocks(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t prx_base,
+                   const BlockGeometry& g, bool want_positions, bool want_freq,
+                   snii::io::BatchRangeFetcher* fetcher, size_t* dd_h, size_t* freq_h,
+                   size_t* prx_h) {
+    *dd_h = fetcher->add(g.dd_block_off, g.dd_block_len);
+    if (want_freq) {
+        *freq_h = fetcher->add(g.freq_block_off, g.freq_block_len);
+    }
+    if (want_positions) {
+        const auto& region = idx.section_refs().posting_region;
+        const uint64_t prx_span_start = region.offset + prx_base + entry.prx_off_delta;
+        *prx_h = fetcher->add(prx_span_start, entry.prx_len);
+    }
+    return fetcher->fetch();
+}
+
+} // namespace
+
+Status read_windowed_posting(const LogicalIndexReader& idx, const DictEntry& entry,
+                             uint64_t frq_base, uint64_t prx_base, bool want_positions,
+                             bool want_freq, DecodedPosting* out) {
+    if (out == nullptr) return Status::InvalidArgument("windowed_posting: null out");
+    *out = DecodedPosting {};
+
+    // First fetch: the prelude, which describes every window of the entry.
+    FrqPreludeReader prelude;
+    SNII_RETURN_IF_ERROR(fetch_windowed_prelude(idx, entry, frq_base, &prelude));
+    if (want_positions && !prelude.has_prx()) {
+        return Status::Corruption("windowed_posting: positions requested but prelude has none");
+    }
+    BlockGeometry blocks;
+    SNII_RETURN_IF_ERROR(ResolveBlocks(idx, entry, frq_base, prelude, &blocks));
+
+    // Second fetch: every block the caller asked for, issued as a single batch.
+    snii::io::BatchRangeFetcher batch(idx.reader());
+    size_t dd_slot = 0, freq_slot = 0, prx_slot = 0;
+    SNII_RETURN_IF_ERROR(FetchBlocks(idx, entry, prx_base, blocks, want_positions, want_freq,
+                                     &batch, &dd_slot, &freq_slot, &prx_slot));
+    const Slice dd_bytes = batch.get(dd_slot);
+    const Slice freq_bytes = want_freq ? batch.get(freq_slot) : Slice();
+    const Slice prx_bytes = want_positions ? batch.get(prx_slot) : Slice();
+
+    // Decode window by window, appending to *out in prelude order.
+    for (uint32_t w = 0, n = prelude.n_windows(); w < n; ++w) {
+        WindowSlices ws;
+        SNII_RETURN_IF_ERROR(prelude.window(w, &ws.meta));
+        SNII_RETURN_IF_ERROR(CarveRegionSlices(ws.meta, dd_bytes, freq_bytes, want_freq, &ws));
+        if (want_positions) {
+            SNII_RETURN_IF_ERROR(InBounds(ws.meta.prx_off, ws.meta.prx_len, prx_bytes.size()));
+            ws.prx_window = prx_bytes.subslice(static_cast<size_t>(ws.meta.prx_off),
+                                               static_cast<size_t>(ws.meta.prx_len));
+        }
+        SNII_RETURN_IF_ERROR(AppendWindow(ws, want_positions, want_freq, out));
+    }
+    return Status::OK();
+}
+
+} // namespace snii::reader
diff --git a/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp b/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp
new file mode 100644
index 00000000000000..f4457c96273f40
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp
@@ -0,0 +1,93 @@
+#include "snii/stats/snii_stats_provider.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "snii/common/slice.h"
+#include "snii/format/dict_entry.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/stats_block.h"
+#include "snii/io/batch_range_fetcher.h"
+
+namespace snii::stats {
+
+using snii::format::DictEntry;
+using snii::format::NormsPodReader;
+using snii::format::RegionRef;
+
+namespace {
+
+// Looks `term` up in idx, discarding the frq/prx bases that lookup() also reports.
+// An absent term is reported as an OK status with *found == false.
+Status LookupEntry(const snii::reader::LogicalIndexReader& idx, std::string_view term, bool* found,
+                   DictEntry* entry) {
+    uint64_t unused_frq_base = 0, unused_prx_base = 0;
+    return idx.lookup(term, found, entry, &unused_frq_base, &unused_prx_base);
+}
+
+} // namespace
+
+Status SniiStatsProvider::open(const snii::reader::LogicalIndexReader* idx,
+                               SniiStatsProvider* out) {
+    // Binds *out to idx (pointer stored, so idx must outlive *out); loads norms when present.
+    if (idx == nullptr || out == nullptr) {
+        return Status::InvalidArgument("stats_provider: null argument");
+    }
+    out->idx_ = idx;
+    // Cleared up front so every early return below (no norms section, fetch failure, parse
+    // failure) leaves *out consistently reporting "no norms", even when *out is being reused.
+    out->has_norms_ = false;
+    const auto& sb = idx->stats();
+    out->doc_count_ = sb.doc_count;
+    out->indexed_doc_count_ = sb.indexed_doc_count;
+    out->sum_total_term_freq_ = sb.sum_total_term_freq;
+    const RegionRef& norms = idx->section_refs().norms;
+    if (norms.length == 0) return Status::OK();
+
+    snii::io::BatchRangeFetcher fetcher(idx->reader());
+    const size_t h = fetcher.add(norms.offset, norms.length);
+    SNII_RETURN_IF_ERROR(fetcher.fetch());
+    Slice framed = fetcher.get(h);
+    out->norms_bytes_.assign(framed.data(), framed.data() + framed.size());
+    SNII_RETURN_IF_ERROR(NormsPodReader::open(Slice(out->norms_bytes_), &out->norms_reader_));
+    out->has_norms_ = true;
+    return Status::OK();
+}
+
+double SniiStatsProvider::avgdl() const {
+    const uint64_t docs = indexed_doc_count_; // an empty index divides by 1, never by 0
+    return static_cast<double>(sum_total_term_freq_) / static_cast<double>(docs == 0 ? 1 : docs);
+}
+
+Status SniiStatsProvider::doc_freq(std::string_view term, uint64_t* df) const {
+    if (df == nullptr) return Status::InvalidArgument("stats_provider: null df");
+    DictEntry hit;
+    bool present = false;
+    *df = 0; // stays 0 when the term is absent or the lookup fails
+    SNII_RETURN_IF_ERROR(LookupEntry(*idx_, term, &present, &hit));
+    *df = present ? hit.df : 0;
+    return Status::OK();
+}
+
+Status SniiStatsProvider::total_term_freq(std::string_view term, uint64_t* ttf) const {
+    if (ttf == nullptr) return Status::InvalidArgument("stats_provider: null ttf");
+    DictEntry hit;
+    bool present = false;
+    *ttf = 0; // reported for an absent term and left in place if the lookup fails
+    SNII_RETURN_IF_ERROR(LookupEntry(*idx_, term, &present, &hit));
+    if (present) {
+        // ttf_delta holds the total itself (tier >= T2): the writer stores ttf, not ttf - df.
+        *ttf = hit.ttf_delta;
+    }
+    return Status::OK();
+}
+
+Status SniiStatsProvider::encoded_norm(uint32_t docid, uint8_t* out) const {
+    if (out == nullptr) return Status::InvalidArgument("stats_provider: null out");
+    // norms_reader_ is only set up by open() when the index carries a norms section, so it
+    // is consulted solely when has_norms_ says so.
+    if (has_norms_) return norms_reader_.try_encoded_norm(docid, out);
+    return Status::InvalidArgument("stats_provider: index has no norms");
+}
+
+} // namespace snii::stats
diff --git a/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp b/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp
new file mode 100644
index 00000000000000..a6ce29aee03557
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp
@@ -0,0 +1,155 @@
+#include "snii/writer/compact_posting_pool.h"
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+
+namespace snii::writer {
+
+// Payload capacity (bytes) of a slice at each level: a gentle ~1.5x ramp starting at 5. The
+// small start keeps the over-allocated FINAL slice cheap for the millions of low-df terms
+// (the dominant arena-overhead source); the ramp still reaches multi-KiB slices for high-df
+// chains in a bounded number of hops, so the per-slice 4-byte forward pointer stays a small
+// fraction of a large chain. kNextLevel[l] is the level used after l; level 15 repeats.
+const uint32_t CompactPostingPool::kSliceSizes[kLevelCount] = {
+        5, 8, 12, 18, 27, 40, 60, 90, 135, 202, 303, 455, 683, 1024, 1536, 2304};
+const uint8_t CompactPostingPool::kNextLevel[kLevelCount] = {1, 2,  3,  4,  5,  6,  7,  8,
+                                                             9, 10, 11, 12, 13, 14, 15, 15};
+
+CompactPostingPool::CompactPostingPool() = default; // no custom setup is performed here
+
+uint32_t CompactPostingPool::kSliceSizes_level0() {
+    return kSliceSizes[0]; // payload capacity of the level-0 slice that starts every chain
+}
+
+uint32_t CompactPostingPool::kSliceSize_at(int level) {
+    return kSliceSizes[level]; // unchecked: level must lie in [0, kLevelCount)
+}
+
+uint8_t CompactPostingPool::kNextLevel_at(int level) {
+    return kNextLevel[level]; // unchecked index; the top level maps to itself
+}
+
+void CompactPostingPool::reset() {
+    payload_bytes_ = 0;
+    next_offset_ = 0;
+    decltype(blocks_)().swap(blocks_); // swap with a temporary so the block memory is freed
+}
+
+uint32_t CompactPostingPool::alloc_run(uint32_t bytes) {
+    // Reserves `bytes` contiguous bytes inside ONE block and returns the offset of the run.
+    const uint32_t tail_used = next_offset_ & kBlockMask;
+    // A fresh block is opened when (a) no block exists under next_offset_ yet, or (b) the
+    // run would spill past the end of the current tail block. Case (a) also covers a
+    // previous run that filled its block exactly: next_offset_ then equals
+    // blocks_.size() * kBlockSize, so tail_used == 0 must NOT be read as "empty tail".
+    const bool has_tail = (next_offset_ >> kBlockShift) < blocks_.size();
+    const bool open_block = !has_tail || tail_used + bytes > kBlockSize;
+    // Hard invariant (see arena_bytes()): the uint32 offset must never wrap. The spimi
+    // accumulator force-spills below 4 GiB, but the pool enforces the limit too -- in
+    // release as well as debug -- so a direct user fails loudly instead of silently
+    // aliasing block 0. As a library we throw and let the caller decide rather than abort.
+    // The run begins either at next_offset_ or at the base of the new block; that start is
+    // formed in 64 bits so the check itself cannot wrap.
+    const uint64_t first_byte =
+            open_block ? static_cast<uint64_t>(blocks_.size()) * kBlockSize : next_offset_;
+    if (first_byte + bytes > UINT32_MAX) {
+        throw std::overflow_error(
+                "snii: CompactPostingPool arena exceeded the 4 GiB uint32 offset limit; "
+                "the caller must spill before this point");
+    }
+    if (open_block) {
+        blocks_.emplace_back(kBlockSize, 0); // every new block starts zero-filled
+        next_offset_ = static_cast<uint32_t>((blocks_.size() - 1) * kBlockSize);
+    }
+    // Hand out the run and advance the bump pointer past it.
+    const uint32_t run_off = next_offset_;
+    next_offset_ += bytes;
+    return run_off;
+}
+
+uint32_t CompactPostingPool::alloc_slice(int level, uint32_t* slice_end) {
+    // One run holds `capacity` payload bytes plus the kPtrBytes forward pointer at *slice_end.
+    const uint32_t capacity = kSliceSizes[level];
+    const uint32_t head = alloc_run(capacity + kPtrBytes);
+    *slice_end = head + capacity;
+    std::memset(at(head + capacity), 0, kPtrBytes); // zero pointer == "no next slice yet"
+    return head;
+}
+
+uint32_t CompactPostingPool::read_ptr(uint32_t slice_end) const {
+    uint32_t next_head = 0; // memcpy: the pointer bytes may sit at an unaligned offset
+    std::memcpy(&next_head, at(slice_end), sizeof(next_head));
+    return next_head;
+}
+
+void CompactPostingPool::write_ptr(uint32_t slice_end, uint32_t next_head) {
+    std::memcpy(at(slice_end), &next_head, sizeof(next_head)); // raw bytes, host byte order
+}
+
+uint32_t CompactPostingPool::start_chain(SliceWriter* w, uint8_t* level) {
+    // A chain always opens with a level-0 slice; the writer starts at its first payload byte.
+    *level = 0;
+    w->cur = alloc_slice(0, &w->slice_end);
+    return w->cur;
+}
+
+void CompactPostingPool::append_byte(SliceWriter* w, uint8_t* level, uint8_t value) {
+    if (w->cur == w->slice_end) {
+        // Payload region exhausted: chain on a slice of the next level and publish its head
+        // through the full slice's trailing pointer bytes before moving the writer over.
+        const uint8_t grown_level = kNextLevel[*level];
+        uint32_t fresh_end = 0;
+        const uint32_t fresh_head = alloc_slice(grown_level, &fresh_end);
+        write_ptr(w->slice_end, fresh_head);
+        w->slice_end = fresh_end;
+        w->cur = fresh_head;
+        *level = grown_level;
+    }
+    *at(w->cur) = value;
+    ++w->cur;
+    ++payload_bytes_;
+}
+
+CompactPostingPool::Cursor::Cursor(const CompactPostingPool* pool, uint32_t head, uint64_t budget)
+        : pool_(pool), cur_(head), level_(0), budget_(budget) {
+    // Reading starts in the chain's level-0 head slice, whose payload spans kSliceSizes[0] bytes.
+    slice_end_ = head + CompactPostingPool::kSliceSizes[0];
+}
+
+bool CompactPostingPool::Cursor::has_next() const {
+    // Mirrors next(): a byte is available only while budget remains AND the chain continues.
+    // Inside a slice it always does. At a slice boundary it does only when the forward
+    // pointer is non-zero -- zero marks the tail, since offset 0 is never a next-slice head --
+    // so has_next() never promises a byte that next() would have to fabricate.
+    const bool at_boundary = (cur_ == slice_end_);
+    return budget_ != 0 && (!at_boundary || pool_->read_ptr(slice_end_) != 0);
+}
+
+uint8_t CompactPostingPool::Cursor::next() {
+    // Returns the next payload byte, or 0 once the budget is spent or the chain has ended.
+    if (budget_ == 0) return 0;
+    const bool at_boundary = (cur_ == slice_end_);
+    if (at_boundary) {
+        // End of this slice's payload: the forward pointer stored here names the next slice.
+        // A ZERO pointer is the chain tail -- offset 0 is always the pool's very first slice,
+        // never a valid *next*-slice head -- so it unambiguously means "no more slices".
+        // Following it would alias block 0's bytes (or touch an unallocated block), which is
+        // UB; stopping instead makes the cursor self-terminating and safe however large a
+        // budget the caller passed.
+        const uint32_t successor = pool_->read_ptr(slice_end_);
+        if (successor == 0) {
+            budget_ = 0; // latch exhaustion so later calls take the early return above
+            return 0;
+        }
+        level_ = CompactPostingPool::kNextLevel[level_];
+        slice_end_ = successor + CompactPostingPool::kSliceSizes[level_];
+        cur_ = successor;
+    }
+    const uint8_t byte = *pool_->at(cur_);
+    ++cur_;
+    --budget_;
+    return byte;
+}
+
+} // namespace snii::writer
diff --git a/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp b/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp
new file mode 100644
index 00000000000000..8cbf1de2eee0d3
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp
@@ -0,0 +1,686 @@
+#include "snii/writer/logical_index_writer.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <span>
+#include <utility>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/encoding/zstd_codec.h"
+#include "snii/format/bsbf.h"
+#include "snii/format/dict_block.h"
+#include "snii/format/dict_block_directory.h"
+#include "snii/format/frq_pod.h"
+#include "snii/format/frq_prelude.h"
+#include "snii/format/norms_pod.h"
+#include "snii/format/null_bitmap.h"
+#include "snii/format/prx_pod.h"
+
+namespace snii::writer {
+
+using snii::format::BlockRef;
+using snii::format::DictBlockBuilder;
+using snii::format::DictBlockDirectoryBuilder;
+using snii::format::DictEntry;
+using snii::format::DictEntryEnc;
+using snii::format::DictEntryKind;
+using snii::format::FrqPreludeColumns;
+using snii::format::PerIndexMetaBuilder;
+using snii::format::SampledTermIndexBuilder;
+using snii::format::SectionRefs;
+using snii::format::WindowMeta;
+
+namespace {
+
+// Target false-positive probability for the block-split bloom XFilter. Sizes
+// the filter via Parquet OptimalNumOfBytes; L0 keeps the probe in memory and L1
+// keeps the per-query cost at one 32-byte block.
+constexpr double kBsbfFpp = 0.01;
+// Zstd "auto" level sentinel passed to the .prx window builder (raw for tiny payloads).
+constexpr int kAutoZstd = -1;
+// Force-raw level for .frq dd/freq regions. Their plaintext is PFOR-bit-packed
+// doc-deltas/freqs -- already high-entropy, so zstd shrinks ~30 MB of input by
+// <0.1 MiB while burning ~0.4s CPU (and an extra crc pass over the compressed
+// bytes) at 5M. We force raw here and keep zstd only on .prx (which compresses
+// ~77%). Output stays self-describing: the region meta records zstd=false.
+constexpr int kRawFrqRegion = 0;
+// Number of windows grouped under one super-block in the two-level .frq
+// prelude directory (design section 5).
+constexpr uint32_t kPreludeGroupSize = 64;
+// zstd level for whole-DICT-block compression. Level 3 (zstd default)
+// compresses the 64KiB front-coded term-key + entry-meta + inline-posting
+// blocks ~40% at ~120 MiB/s encode / ~600 MiB/s decode -- a large size win for
+// a small build-CPU cost, and a per-lookup decode (~0.1ms/64KiB) that is
+// dominated by the S3 round trip it shrinks. Higher levels gain <1% here for
+// materially more CPU.
+constexpr int kDictBlockZstdLevel = 3;
+
+using snii::format::FrqRegionMeta;
+
+// Encodes a single window into two independent buffers: the dd region (the
+// docs-only data) and, when has_freq, the freq region (the skippable suffix),
+// returning each region's codec metadata. Both regions are forced raw
+// (kRawFrqRegion). Without has_freq, *freq_out and *freq_meta are reset to
+// empty/default. Serves the single-window slim/inline layout [dd][freq].
+Status EncodeRegions(std::span<const uint32_t> docids, std::span<const uint32_t> freqs,
+                     uint64_t win_base, bool has_freq, std::vector<uint8_t>* dd_out,
+                     FrqRegionMeta* dd_meta, std::vector<uint8_t>* freq_out,
+                     FrqRegionMeta* freq_meta) {
+    ByteSink dd_buf;
+    SNII_RETURN_IF_ERROR(
+            snii::format::build_dd_region(docids, win_base, kRawFrqRegion, &dd_buf, dd_meta));
+    *dd_out = dd_buf.take();
+    if (has_freq) {
+        ByteSink freq_buf;
+        SNII_RETURN_IF_ERROR(
+                snii::format::build_freq_region(freqs, kRawFrqRegion, &freq_buf, freq_meta));
+        *freq_out = freq_buf.take();
+    } else {
+        *freq_out = std::vector<uint8_t>();
+        *freq_meta = FrqRegionMeta {};
+    }
+    return Status::OK();
+}
+
+// Scratch sinks shared by all windows of one windowed term. The builder calls
+// clear() on a sink before each reuse rather than constructing a new one; per
+// the original design note, clear() RETAINS capacity, so a high-df term split
+// into thousands of windows allocates the scratch once instead of churning
+// many small buffers (heap fragmentation, higher peak RSS).
+struct WindowScratch {
+    ByteSink dd_sink;   // current window's encoded dd region
+    ByteSink freq_sink; // current window's encoded freq region (has_freq only)
+    ByteSink prx_sink;  // current window's encoded .prx bytes (has_prx only)
+};
+
+// Encodes one window's dd region -- and, when has_freq, its freq region -- into
+// the reusable scratch sinks and reports each region's codec metadata. Without
+// has_freq only *freq_meta is reset; sc->freq_sink is left untouched.
+Status EncodeRegionsInto(WindowScratch* sc, std::span<const uint32_t> docids,
+                         std::span<const uint32_t> freqs, uint64_t win_base, bool has_freq,
+                         FrqRegionMeta* dd_meta, FrqRegionMeta* freq_meta) {
+    sc->dd_sink.clear();
+    SNII_RETURN_IF_ERROR(
+            snii::format::build_dd_region(docids, win_base, kRawFrqRegion, &sc->dd_sink, dd_meta));
+    if (!has_freq) {
+        *freq_meta = FrqRegionMeta {};
+        return Status::OK();
+    }
+    sc->freq_sink.clear();
+    SNII_RETURN_IF_ERROR(
+            snii::format::build_freq_region(freqs, kRawFrqRegion, &sc->freq_sink, freq_meta));
+    return Status::OK();
+}
+
+// Encodes one .prx window from a FLAT positions slice and its parallel freqs
+// slice: doc d owns the next freqs[d] entries of positions_flat. Working on
+// the flat buffer means no vector-of-vectors is ever materialized (the
+// original note states the bytes match a per-doc-vector build). The zstd
+// level is kAutoZstd; the encoded window replaces *out on success.
+Status MakePrxWindow(std::span<const uint32_t> positions_flat, std::span<const uint32_t> freqs,
+                     std::vector<uint8_t>* out) {
+    ByteSink window_sink;
+    SNII_RETURN_IF_ERROR(
+            snii::format::build_prx_window_flat(positions_flat, freqs, kAutoZstd, &window_sink));
+    *out = window_sink.take();
+    return Status::OK();
+}
+
+uint32_t MaxOf(std::span<const uint32_t> v) {
+    // Largest element of v. The running maximum starts at 0, so an empty span
+    // (or one holding only zeros) yields 0.
+    uint32_t largest = 0;
+    for (const uint32_t value : v) largest = std::max(largest, value);
+    return largest;
+}
+
+uint64_t SumOf(const std::vector<uint32_t>& v) {
+    uint64_t total = 0; // widened accumulator for the uint32 addends
+    for (size_t i = 0; i < v.size(); ++i) total += v[i];
+    return total;
+}
+
+// Computes a window's WAND max_norm: the encoded norm yielding the LARGEST BM25
+// length contribution, i.e. the SMALLEST encoded norm byte among the window's
+// docs (smaller dl => higher score). Returns 0 -- decode_norm(0)=1.0, the
+// loosest still-correct bound -- only when norms are unavailable or no doc has
+// an in-range norm; a window whose docs all carry 0xFF correctly yields 0xFF.
+uint8_t WindowMaxNorm(const std::vector<uint8_t>& norms, std::span<const uint32_t> docs) {
+    if (norms.empty() || docs.empty()) return 0;
+    unsigned best = 0x100; // above any byte value: "no in-range doc seen yet"
+    for (uint32_t docid : docs) {
+        if (docid >= norms.size()) continue; // defensive: out-of-range doc has no norm
+        if (norms[docid] < best) best = norms[docid];
+    }
+    return best > 0xFF ? 0 : static_cast<uint8_t>(best);
+}
+
+// Window size (in docs) for a term of the given df: kAdaptiveWindowDocs at or
+// above kAdaptiveWindowDfThreshold, else kFrqBaseUnit. Both are whole multiples
+// of the base unit, preserving .prx alignment and win_base/last_docid semantics.
+uint32_t AdaptiveWindowDocs(uint32_t df) {
+    if (df < snii::format::kAdaptiveWindowDfThreshold) return snii::format::kFrqBaseUnit;
+    return snii::format::kAdaptiveWindowDocs;
+}
+
+// Serializes the two-level .frq prelude describing a windowed term's windows.
+Status BuildPrelude(const std::vector<WindowMeta>& windows, bool has_freq, bool has_prx,
+                    std::vector<uint8_t>* out) {
+    FrqPreludeColumns columns;
+    columns.windows = windows; // copied: the caller keeps its own window list
+    columns.group_size = kPreludeGroupSize;
+    columns.has_prx = has_prx;
+    columns.has_freq = has_freq;
+    ByteSink prelude_sink;
+    SNII_RETURN_IF_ERROR(snii::format::build_frq_prelude(columns, &prelude_sink));
+    *out = prelude_sink.take();
+    return Status::OK();
+}
+
+void AppendBytes(std::vector<uint8_t>* dst, const std::vector<uint8_t>& src) {
+    dst->insert(dst->cend(), src.cbegin(), src.cend()); // tail-append; src is unchanged
+}
+
+// Grouped .frq layout of one windowed term (design 1.6): every window's dd
+// region is concatenated into dd_block and every freq region into freq_block,
+// so the entry's final frq span is [prelude][dd-block][freq-block]. The .prx
+// windows are NOT held here: pass 1 STREAMS them straight to the posting sink,
+// so the widest term's prx bytes (tens of MiB) never co-exist with the dd/freq
+// blocks at peak RSS; only their total length is tracked. `windows` carries
+// the per-window metadata the prelude is built from (region offsets, lengths,
+// codec flags, crcs, and prx_off within the entry's prx span).
+struct WindowedPosting {
+    std::vector<uint8_t> dd_block;   // dd_region_0 ++ dd_region_1 ++ ...
+    std::vector<uint8_t> freq_block; // freq_region_0 ++ ... (empty if !has_freq)
+    uint64_t prx_total_len = 0;      // total .prx bytes streamed for this entry
+    std::vector<WindowMeta> windows;
+};
+
+// Records where one window's dd region (and, when has_freq, its freq region)
+// lands inside the grouped blocks, then appends the region bytes to them.
+void LayoutWindowRegions(const FrqRegionMeta& dd_meta, const std::vector<uint8_t>& dd_bytes,
+                         const FrqRegionMeta& freq_meta, const std::vector<uint8_t>& freq_bytes,
+                         bool has_freq, WindowedPosting* out, WindowMeta* m) {
+    m->dd_off = static_cast<uint64_t>(out->dd_block.size()); // start = size before append
+    m->dd_disk_len = dd_meta.disk_len;
+    m->dd_uncomp_len = dd_meta.uncomp_len;
+    m->dd_zstd = dd_meta.zstd;
+    m->crc_dd = dd_meta.crc;
+    out->dd_block.insert(out->dd_block.end(), dd_bytes.begin(), dd_bytes.end());
+    if (has_freq) {
+        m->freq_off = static_cast<uint64_t>(out->freq_block.size());
+        m->freq_disk_len = freq_meta.disk_len;
+        m->freq_uncomp_len = freq_meta.uncomp_len;
+        m->freq_zstd = freq_meta.zstd;
+        m->crc_freq = freq_meta.crc;
+        out->freq_block.insert(out->freq_block.end(), freq_bytes.begin(), freq_bytes.end());
+    }
+}
+
+// Splits a windowed term's postings into windows of AdaptiveWindowDocs(df) docs
+// (the last window may be shorter). Each window's dd/freq regions are encoded
+// separately and grouped: all dd regions into out->dd_block, all freq regions
+// into out->freq_block. Per-window metadata (incl. WAND max_norm) goes to
+// out->windows; .prx bytes are appended to posting_out, not buffered in `out`.
+//
+// TWO-PASS, MEMORY-AWARE: the widest term (df in the millions) is the dominant
+// merge-phase peak-RSS source -- its flat positions_flat alone is tens of MiB
+// and would otherwise co-exist with the encoded output blocks at the peak
+// moment.
+//   pass 1 (prx): builds and streams every window's .prx bytes, records the
+//                 window skeleton, then FREES positions_flat before any
+//                 dd/freq block grows.
+//   pass 2 (dd/freq): encodes the dd/freq regions from docids/freqs only.
+// `tp` is taken by mutable reference: positions_flat is released here after
+// pass 1; docids/freqs stay valid and are released by the caller afterwards.
+// Output bytes are byte-identical to the single-pass build (independent regions).
+Status BuildWindowedPosting(TermPostings& tp, bool has_freq, bool has_prx,
+                            const std::vector<uint8_t>& norms, snii::io::FileWriter* posting_out,
+                            WindowedPosting* out) {
+    const uint32_t unit = AdaptiveWindowDocs(static_cast<uint32_t>(tp.docids.size()));
+    const size_t n = tp.docids.size();
+    const std::span<const uint32_t> all_docs(tp.docids);
+    const std::span<const uint32_t> all_freqs(tp.freqs);
+
+    // Size the per-term transient blocks up front so a very-high-df term (split
+    // into thousands of windows, dd/freq blocks of MiB) does not grow them by
+    // geometric doubling -- which would briefly hold the old+new buffer
+    // co-resident at the build peak. windows count is exact; dd/freq use a
+    // conservative byte/doc upper estimate (PFOR-packed deltas are typically <= 2
+    // B/doc). Slack is freed when the term ends.
+    out->windows.reserve((n + unit - 1) / unit);
+    out->dd_block.reserve(n * 2);
+    if (has_freq) out->freq_block.reserve(n);
+
+    WindowScratch sc; // reused across all windows (no per-window allocation churn)
+
+    // ---- pass 1: prx (STREAMED to the output) + window skeleton ----
+    // Each window's .prx bytes are appended straight to the posting sink
+    // (container output) as they are built, so the entry's full prx payload (tens
+    // of MiB for the widest term) is never buffered in RAM alongside the dd/freq
+    // blocks that pass 2 grows. m.prx_off is the byte offset WITHIN this entry's
+    // prx span (running prx_total_len), matching the reader's prx_off_delta +
+    // meta.prx_off contract.
+    {
+        // Positions come either from the flat buffer or, for very-high-df terms,
+        // from a sequential pump (so the term's full positions are never
+        // materialized). Both yield the SAME positions in the SAME order, so the
+        // prx bytes are identical.
+        const bool streamed = static_cast<bool>(tp.pos_pump);
+        const std::span<const uint32_t> all_pos(tp.positions_flat);
+        std::vector<uint32_t> win_pos_buf; // reused per window when streaming
+        uint64_t win_base = 0;
+        size_t pos_off = 0;
+        for (size_t start = 0; start < n; start += unit) {
+            const size_t len = std::min<size_t>(unit, n - start);
+            const auto docs = all_docs.subspan(start, len);
+            const auto freqs = all_freqs.subspan(start, len);
+            WindowMeta m;
+            m.last_docid = docs.back();
+            m.win_base = win_base;
+            m.doc_count = static_cast<uint32_t>(docs.size());
+            m.max_freq = MaxOf(freqs);
+            m.max_norm = WindowMaxNorm(norms, docs);
+            size_t win_pos = 0;
+            for (uint32_t f : freqs) win_pos += f;
+            if (has_prx) {
+                std::span<const uint32_t> pos_span;
+                if (streamed) {
+                    win_pos_buf.resize(win_pos);
+                    if (win_pos != 0) tp.pos_pump(win_pos_buf.data(), win_pos);
+                    pos_span = std::span<const uint32_t>(win_pos_buf);
+                } else {
+                    pos_span = all_pos.subspan(pos_off, win_pos);
+                }
+                sc.prx_sink.clear();
+                SNII_RETURN_IF_ERROR(snii::format::build_prx_window_flat(pos_span, freqs, kAutoZstd,
+                                                                         &sc.prx_sink));
+                m.prx_off = out->prx_total_len;
+                m.prx_len = static_cast<uint64_t>(sc.prx_sink.size());
+                SNII_RETURN_IF_ERROR(posting_out->append(sc.prx_sink.view()));
+                out->prx_total_len += m.prx_len;
+            }
+            pos_off += win_pos;
+            out->windows.push_back(m);
+            win_base = m.last_docid;
+        }
+    }
+    // Positions are fully consumed; free the largest source array before pass 2
+    // grows the dd/freq blocks, so the source positions never co-exist with them.
+    std::vector<uint32_t>().swap(tp.positions_flat);
+
+    // ---- pass 2: dd (and freq) regions from docids/freqs only ----
+    uint64_t win_base = 0;
+    size_t wi = 0;
+    for (size_t start = 0; start < n; start += unit, ++wi) {
+        const size_t len = std::min<size_t>(unit, n - start);
+        const auto docs = all_docs.subspan(start, len);
+        const auto freqs = all_freqs.subspan(start, len);
+        FrqRegionMeta dd_meta, freq_meta;
+        SNII_RETURN_IF_ERROR(
+                EncodeRegionsInto(&sc, docs, freqs, win_base, has_freq, &dd_meta, &freq_meta));
+        LayoutWindowRegions(dd_meta, sc.dd_sink.buffer(), freq_meta, sc.freq_sink.buffer(),
+                            has_freq, out, &out->windows[wi]);
+        win_base = out->windows[wi].last_docid;
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+LogicalIndexWriter::LogicalIndexWriter(const SniiIndexInput& in)
+        : index_id_(in.index_id),
+          index_suffix_(in.index_suffix),
+          tier_(snii::format::tier_of(in.config)), // tier derived from the index config
+          has_prx_(snii::format::has_positions(in.config)),
+          has_freq_(snii::format::tier_of(in.config) >= snii::format::IndexTier::kT2),
+          has_norms_(snii::format::has_scoring(in.config)),
+          doc_count_(in.doc_count),
+          null_docids_(in.null_docids),
+          terms_(in.terms), // materialized input, used when term_source_ is null
+          term_source_(in.term_source), // when non-null, build_blocks() streams from it
+          encoded_norms_(in.encoded_norms),
+          target_dict_block_bytes_(in.target_dict_block_bytes != 0 // 0 selects the default
+                                           ? in.target_dict_block_bytes
+                                           : snii::format::kDefaultTargetDictBlockBytes),
+          // The dict buffer has no cap of its own: per the design note it spills
+          // via the writer's UNIFIED gate-2 cap (in.mem_reporter->over_cap()), so
+          // UINT64_MAX is passed to disable the local per-buffer cap.
+          dict_buf_(UINT64_MAX, "dict", in.mem_reporter) {}
+
+Status LogicalIndexWriter::validate_term(const TermPostings& tp) const {
+    // Per-term invariants the encoders rely on, checked in this order: parallel
+    // docids/freqs, positions count == sum(freqs), strictly ascending docids.
+    if (tp.docids.size() != tp.freqs.size()) {
+        return Status::InvalidArgument("logical_index: freqs length must equal docids");
+    }
+    if (has_prx_) {
+        // A streamed term (pos_pump set) keeps positions_flat empty on purpose and
+        // declares its count in pos_total; otherwise the flat buffer is measured.
+        const uint64_t expected = SumOf(tp.freqs);
+        const uint64_t actual = tp.pos_pump ? tp.pos_total : tp.positions_flat.size();
+        if (actual != expected) {
+            return Status::InvalidArgument("logical_index: positions count must equal sum(freqs)");
+        }
+    }
+    const auto descent = std::adjacent_find(tp.docids.begin(), tp.docids.end(),
+                                            [](uint32_t a, uint32_t b) { return b <= a; });
+    if (descent != tp.docids.end()) {
+        return Status::InvalidArgument("logical_index: docids must be strictly ascending");
+    }
+    return Status::OK();
+}
+
+// Emits a windowed (high-df) term into the single posting region as
+// [prx span][prelude][dd-block][freq-block], CONTIGUOUSLY: the prx windows are
+// streamed first, then the frq span. Sets kind=pod_ref, enc=windowed, has_sb.
+// frq_docs_len = prelude_len + dd_block_len is the contiguous docs-only prefix
+// inside the frq span. Releases tp.docids/tp.freqs (and, via
+// BuildWindowedPosting, tp.positions_flat) once they have been encoded.
+Status LogicalIndexWriter::build_windowed_entry(TermPostings& tp, uint64_t frq_base,
+                                                uint64_t prx_base, DictEntry* e) {
+    // The prx span starts here: pass 1 streams each .prx window straight into
+    // the posting sink, so prx_off_delta is measured against the live
+    // posting-sink size.
+    const uint64_t prx_off = posting_size();
+    WindowedPosting wp;
+    SNII_RETURN_IF_ERROR(
+            BuildWindowedPosting(tp, has_freq_, has_prx_, encoded_norms_, posting_out_, &wp));
+    // wp.prx_total_len bytes were just streamed straight to the posting sink (0
+    // when !has_prx). docids/freqs are now fully encoded into wp; release the
+    // source arrays before the (potentially large) wp blocks are appended to
+    // disk.
+    std::vector<uint32_t>().swap(tp.docids);
+    std::vector<uint32_t>().swap(tp.freqs);
+    std::vector<uint8_t> prelude;
+    SNII_RETURN_IF_ERROR(BuildPrelude(wp.windows, has_freq_, has_prx_, &prelude));
+
+    e->kind = DictEntryKind::kPodRef;
+    e->enc = DictEntryEnc::kWindowed;
+    e->has_sb = true; // prelude is always a two-level skip directory.
+    e->prelude_len = static_cast<uint64_t>(prelude.size());
+    e->frq_docs_len =
+            e->prelude_len + static_cast<uint64_t>(wp.dd_block.size()); // [prelude][dd-block]
+
+    // The frq span starts immediately AFTER the prx span, in the SAME sink. The
+    // writer-side property frq_off == prx_off + wp.prx_total_len holds because
+    // nothing is appended to the posting sink between the prx pass and here --
+    // but the delta is measured from the live size, not assumed.
+    const uint64_t frq_off = posting_size();
+    SNII_RETURN_IF_ERROR(posting_out_->append(Slice(prelude)));
+    SNII_RETURN_IF_ERROR(posting_out_->append(Slice(wp.dd_block)));
+    SNII_RETURN_IF_ERROR(posting_out_->append(Slice(wp.freq_block)));
+    e->frq_off_delta = frq_off - frq_base;
+    e->frq_len = posting_size() - frq_off;
+    if (has_prx_) {
+        e->prx_off_delta = prx_off - prx_base;
+        e->prx_len = wp.prx_total_len; // == frq_off - prx_off
+    }
+    return Status::OK();
+}
+
+// Emits a slim term as a single .frq window (win_base=0) laid out [dd][freq]:
+// inline when the encoded .frq bytes fit kDefaultInlineThreshold, otherwise a
+// slim pod_ref (no prelude) whose [prx][frq] spans are appended to the posting
+// sink, prx span FIRST (consistent with the windowed path; the reader resolves
+// each delta independently, so the order is not load-bearing). Region codecs
+// are recorded in the DictEntry. Positions are read from tp.positions_flat only.
+// NOTE(review): a streamed term (pos_pump set) reaching this path would have an
+// empty positions_flat -- confirm producers only stream windowed-df terms.
+Status LogicalIndexWriter::build_slim_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                                            DictEntry* e) {
+    std::vector<uint8_t> dd_bytes, freq_bytes;
+    FrqRegionMeta dd_meta, freq_meta;
+    SNII_RETURN_IF_ERROR(EncodeRegions(tp.docids, tp.freqs, /*win_base=*/0, has_freq_, &dd_bytes,
+                                       &dd_meta, &freq_bytes, &freq_meta));
+    std::vector<uint8_t> frq_win = std::move(dd_bytes); // [dd_region][freq_region], no copy
+    AppendBytes(&frq_win, freq_bytes);
+    std::vector<uint8_t> prx_win;
+    if (has_prx_) {
+        SNII_RETURN_IF_ERROR(MakePrxWindow(tp.positions_flat, tp.freqs, &prx_win));
+    }
+
+    e->enc = DictEntryEnc::kSlim;
+    e->dd_meta = dd_meta;
+    e->freq_meta = freq_meta;
+
+    if (frq_win.size() <= snii::format::kDefaultInlineThreshold) {
+        e->kind = DictEntryKind::kInline;
+        e->inline_dd_disk_len = dd_meta.disk_len;
+        e->frq_bytes = std::move(frq_win);
+        if (has_prx_) e->prx_bytes = std::move(prx_win);
+        return Status::OK();
+    }
+
+    // POD_REF: write [prx][frq] into the single posting sink, prx span first.
+    e->kind = DictEntryKind::kPodRef;
+    e->frq_docs_len = dd_meta.disk_len; // docs-only prefix = the single dd region
+    if (has_prx_) {
+        const uint64_t prx_off = posting_size();
+        SNII_RETURN_IF_ERROR(posting_out_->append(Slice(prx_win)));
+        e->prx_off_delta = prx_off - prx_base;
+        e->prx_len = posting_size() - prx_off;
+    }
+    const uint64_t frq_off = posting_size(); // immediately after the prx span
+    SNII_RETURN_IF_ERROR(posting_out_->append(Slice(frq_win)));
+    e->frq_off_delta = frq_off - frq_base;
+    e->frq_len = posting_size() - frq_off;
+    return Status::OK();
+}
+
+// Fills the DictEntry for one term: term key, df, ttf (stored directly in
+// ttf_delta) and max_freq, then dispatches on df -- kSlimDfThreshold and above
+// take the windowed layout, everything below the slim/inline layout. frq_base
+// and prx_base are the block's delta bases, forwarded to the chosen builder.
+Status LogicalIndexWriter::build_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                                       DictEntry* e) {
+    const auto doc_freq = static_cast<uint32_t>(tp.docids.size());
+    e->term = tp.term;
+    e->df = doc_freq;
+    e->max_freq = MaxOf(tp.freqs);
+    e->ttf_delta = SumOf(tp.freqs);
+
+    const bool windowed = e->df >= snii::format::kSlimDfThreshold;
+    return windowed ? build_windowed_entry(tp, frq_base, prx_base, e)
+                    : build_slim_entry(tp, frq_base, prx_base, e);
+}
+
+// Finishes the open DICT block and stages it in the dict buffer, recording a
+// BlockRecord (offset, entry count, first term, crc, codec) for the directory.
+// The block is zstd-compressed at kDictBlockZstdLevel; when compression fails
+// or does not shrink it, the plain bytes are stored instead (flags = 0,
+// uncomp_len = 0) so a lookup never pays a pointless decompress. The crc32c
+// always covers the UNCOMPRESSED bytes, so per the original note the reader
+// verifies integrity after it decompresses.
+Status LogicalIndexWriter::flush_block(DictBlockBuilder* block, std::string first_term) {
+    ByteSink plain_sink;
+    block->finish(&plain_sink);
+    const Slice plain = plain_sink.view();
+
+    BlockRecord rec;
+    rec.first_term = std::move(first_term);
+    rec.n_entries = block->n_entries();
+    rec.rel_offset = dict_buf_.size(); // captured BEFORE this block is appended
+    rec.checksum = snii::crc32c(plain);
+
+    std::vector<uint8_t> packed;
+    Status zstd_status = snii::zstd_compress(plain, kDictBlockZstdLevel, &packed);
+    const bool store_raw = !zstd_status.ok() || packed.size() >= plain.size();
+    if (store_raw) {
+        rec.flags = 0;
+        rec.uncomp_len = 0;
+        rec.length = static_cast<uint64_t>(plain.size());
+        SNII_RETURN_IF_ERROR(dict_buf_.append_move(plain_sink.take()));
+    } else {
+        rec.flags = snii::format::block_ref_flags::kZstd;
+        rec.uncomp_len = static_cast<uint64_t>(plain.size());
+        rec.length = static_cast<uint64_t>(packed.size());
+        SNII_RETURN_IF_ERROR(dict_buf_.append_move(std::move(packed)));
+    }
+    blocks_.push_back(std::move(rec));
+    return Status::OK();
+}
+
+// Running state for the in-flight DICT block while terms stream past.
+struct LogicalIndexWriter::BlockState {
+    std::unique_ptr<DictBlockBuilder> block; // null while no block is open
+    std::string block_first_term;            // term that opened the current block
+    uint64_t frq_base = 0;                   // posting-sink size at block open
+    uint64_t prx_base = 0;                   // same snapshot as frq_base
+};
+
+Status LogicalIndexWriter::process_term(TermPostings& tp, BlockState* st) {
+    // Validates one term, folds it into the running totals, encodes it as a
+    // DictEntry in the open DICT block, and flushes that block once it is full.
+    SNII_RETURN_IF_ERROR(validate_term(tp));
+    // Totals are taken BEFORE build_entry, which may release tp's arrays. Only the
+    // 8-byte BSBF key (XXH64 seed 0, Parquet-canonical) is kept per term.
+    stats_.sum_total_term_freq += SumOf(tp.freqs);
+    term_hashes_.push_back(snii::format::bsbf_hash(tp.term));
+    ++term_count_;
+    if (st->block == nullptr) {
+        // Open a fresh block: both delta bases snapshot the same posting-sink size.
+        st->frq_base = st->prx_base = posting_size();
+        st->block = std::make_unique<DictBlockBuilder>(tier_, has_prx_, st->frq_base, st->prx_base);
+        st->block_first_term = tp.term;
+    }
+
+    DictEntry entry;
+    SNII_RETURN_IF_ERROR(build_entry(tp, st->frq_base, st->prx_base, &entry));
+    st->block->add_entry(entry);
+
+    const bool block_full = st->block->estimated_bytes() >= target_dict_block_bytes_;
+    if (block_full) {
+        SNII_RETURN_IF_ERROR(flush_block(st->block.get(), st->block_first_term));
+        st->block.reset();
+    }
+    return Status::OK();
+}
+
+Status LogicalIndexWriter::build_blocks() {
+    BlockState state;
+    if (term_source_ == nullptr) {
+        // Materialized input (tests / callers holding a vector). process_term may
+        // free a term's arrays, so each term is fed as a private copy to leave
+        // terms_ intact for the caller; this is not the large out-of-core path.
+        for (const auto& original : terms_) {
+            TermPostings scratch = original;
+            SNII_RETURN_IF_ERROR(process_term(scratch, &state));
+        }
+    } else {
+        // Streaming input: the source hands over one term at a time in sorted
+        // order, so only a single TermPostings is alive at any moment. The callback
+        // returns nothing, so the first process_term failure is latched in
+        // first_error and later terms are skipped. The source's own Status (per the
+        // original note: spill/merge I/O and add_token validation) is checked first.
+        Status first_error = Status::OK();
+        SNII_RETURN_IF_ERROR(term_source_->for_each_term_sorted([&](TermPostings&& tp) {
+            if (first_error.ok()) first_error = process_term(tp, &state);
+        }));
+        SNII_RETURN_IF_ERROR(first_error);
+    }
+    // Flush the partially filled trailing block, if any.
+    if (state.block) SNII_RETURN_IF_ERROR(flush_block(state.block.get(), state.block_first_term));
+    return Status::OK();
+}
+
+Status LogicalIndexWriter::build(snii::io::FileWriter* posting_out) {
+    // Builds the index: streams postings into posting_out, stages the DICT
+    // region in dict_buf_, then materializes stats, norms, null bitmap and BSBF.
+    if (posting_out == nullptr) {
+        return Status::InvalidArgument("logical_index: null posting sink");
+    }
+    if (has_norms_ && encoded_norms_.size() != doc_count_) {
+        return Status::InvalidArgument("logical_index: norms length must equal doc_count");
+    }
+    // More nulls than docs would wrap the unsigned indexed_doc_count below.
+    if (null_docids_.size() > doc_count_) {
+        return Status::InvalidArgument("logical_index: null count must not exceed doc_count");
+    }
+    // Postings go STRAIGHT into the container output (no temp round-trip);
+    // posting_size() is relative to the offset snapshotted here. The DICT region
+    // is staged in dict_buf_ since it must land contiguously after the postings.
+    posting_out_ = posting_out;
+    posting_off0_ = posting_out->bytes_written();
+    SNII_RETURN_IF_ERROR(build_blocks());
+    // Seal so a spilled temp is flushed before it is read back (no-op in RAM).
+    SNII_RETURN_IF_ERROR(dict_buf_.seal());
+
+    stats_.doc_count = doc_count_;
+    stats_.indexed_doc_count = doc_count_ - static_cast<uint32_t>(null_docids_.size());
+    stats_.term_count = term_count_;
+    stats_.null_count = static_cast<uint32_t>(null_docids_.size());
+
+    if (has_norms_) {
+        snii::format::NormsPodWriter nw;
+        for (uint8_t n : encoded_norms_) nw.add(n);
+        ByteSink nsink;
+        nw.finish(&nsink);
+        norms_section_ = nsink.take();
+    }
+
+    if (!null_docids_.empty()) {
+        snii::format::NullBitmapWriter null_writer;
+        for (uint32_t docid : null_docids_) null_writer.add_null(docid);
+        ByteSink null_sink;
+        null_writer.finish(doc_count_, &null_sink);
+        null_bitmap_section_ = null_sink.take();
+    }
+
+    // Absent-term filter: a Parquet-canonical block-split bloom built from the
+    // per-term keys (no retained strings), serialized as a [28B header][bitset]
+    // blob that the compound writer places as a PHYSICAL section.
+    bsbf_bytes_.clear();
+    if (!term_hashes_.empty()) {
+        snii::format::BsbfBuilder bf;
+        SNII_RETURN_IF_ERROR(snii::format::BsbfBuilder::create(
+                static_cast<uint32_t>(term_hashes_.size()), kBsbfFpp, &bf));
+        for (uint64_t k : term_hashes_) bf.insert(k);
+        ByteSink bsink;
+        SNII_RETURN_IF_ERROR(bf.serialize(&bsink));
+        bsbf_bytes_ = bsink.take();
+    }
+    std::vector<uint64_t>().swap(term_hashes_); // release
+    return Status::OK();
+}
+
+Status LogicalIndexWriter::finish_meta(const SectionRefs& abs_refs, uint64_t dict_region_offset,
+                                       ByteSink* out) const {
+    // Assembles this index's per-index meta blob into `out`: stats, the sampled
+    // term index (one first-term per DICT block), the DICT block directory with
+    // absolute offsets, the capability flags and the absolute section refs.
+    if (out == nullptr) return Status::InvalidArgument("logical_index: null meta sink");
+    SampledTermIndexBuilder term_index;
+    DictBlockDirectoryBuilder directory;
+    for (const auto& rec : blocks_) {
+        term_index.add_block_first_term(rec.first_term);
+        BlockRef ref;
+        ref.offset = dict_region_offset + rec.rel_offset; // region-relative -> absolute
+        ref.length = rec.length;
+        ref.uncomp_len = rec.uncomp_len;
+        ref.n_entries = rec.n_entries;
+        ref.flags = rec.flags;
+        ref.checksum = rec.checksum;
+        directory.add(ref);
+    }
+    ByteSink term_index_bytes;
+    term_index.finish(&term_index_bytes);
+    ByteSink directory_bytes;
+    directory.finish(&directory_bytes);
+
+    // Capability bits. Positions support is persisted explicitly (the R1 fix): a
+    // reader must NOT infer it from posting_region.length, which is non-zero for
+    // any docs-only pod_ref index.
+    uint32_t flags = bsbf_bytes_.empty() ? 0u : PerIndexMetaBuilder::kHasBsbf;
+    if (has_prx_) flags |= PerIndexMetaBuilder::kHasPositions;
+    PerIndexMetaBuilder builder(index_id_, index_suffix_, flags);
+    builder.set_stats(stats_);
+    builder.set_sampled_term_index(term_index_bytes.view());
+    builder.set_dict_block_directory(directory_bytes.view());
+    builder.set_section_refs(abs_refs); // the BSBF is a physical section, not embedded
+    return builder.finish(out);
+}
+
+} // namespace snii::writer
diff --git a/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp b/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp
new file mode 100644
index 00000000000000..8e6f9b9adc61b3
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp
@@ -0,0 +1,146 @@
+#include "snii/writer/snii_compound_writer.h"
+
+#include <utility>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/format/bootstrap_header.h"
+#include "snii/format/per_index_meta.h" // SectionRefs
+#include "snii/format/tail_meta_region.h"
+#include "snii/format/tail_pointer.h"
+
+namespace snii::writer {
+
+using snii::format::BootstrapHeader;
+using snii::format::SectionRefs;
+using snii::format::TailMetaRegionBuilder;
+using snii::format::TailPointer;
+
+SniiCompoundWriter::SniiCompoundWriter(snii::io::FileWriter* out) : out_(out) {} // null is checked at use, not here
+
+Status SniiCompoundWriter::append(const std::vector<uint8_t>& bytes) {
+    if (!bytes.empty()) return out_->append(Slice(bytes));
+    return Status::OK(); // empty buffer: the underlying writer is not called at all
+}
+
+// Emits the bootstrap header the first time it is called; later calls do nothing.
+// The header sits at offset 0, ahead of the first posting region that build()
+// streams into the output, so add_logical_index() and finish() both call this.
+Status SniiCompoundWriter::ensure_bootstrap() {
+    // Latch the flag before writing: a failed header write is not attempted again.
+    if (std::exchange(bootstrap_written_, true)) return Status::OK();
+    return write_bootstrap();
+}
+
+Status SniiCompoundWriter::add_logical_index(const SniiIndexInput& in) {
+    if (out_ == nullptr) return Status::InvalidArgument("compound: null file writer");
+    if (finished_) return Status::Internal("compound: add after finish");
+    SNII_RETURN_IF_ERROR(ensure_bootstrap());
+    // Per-index layout is [posting][dict]: build() streams the posting region straight in
+    // and the DICT trailer follows; every offset is sampled from out_->bytes_written().
+    auto writer = std::make_unique<LogicalIndexWriter>(in);
+    const uint64_t posting_begin = out_->bytes_written();
+    SNII_RETURN_IF_ERROR(writer->build(out_));
+    const uint64_t dict_begin = out_->bytes_written();
+    SNII_RETURN_IF_ERROR(writer->stream_dict_region_into(out_));
+    const uint64_t dict_end = out_->bytes_written();
+    Placement where;
+    where.post_off = posting_begin;
+    where.post_len = dict_begin - posting_begin;
+    where.dict_off = dict_begin;
+    where.dict_len = dict_end - dict_begin;
+    indexes_.push_back(std::move(writer));
+    placements_.push_back(where);
+    return Status::OK();
+}
+
+Status SniiCompoundWriter::write_bootstrap() {
+    ByteSink encoded; // header bytes are staged here, then appended in one call
+    BootstrapHeader header;
+    header.tail_pointer_size = static_cast<uint8_t>(snii::format::tail_pointer_size());
+    SNII_RETURN_IF_ERROR(snii::format::encode_bootstrap_header(header, &encoded));
+    return append(encoded.buffer());
+}
+
+// Appends the optional per-index sections after all per-index [posting][dict]
+// regions, in three passes (each in add order): norms PODs, null bitmaps, then BSBF.
+Status SniiCompoundWriter::write_norms() {
+    for (size_t k = 0; k != indexes_.size(); ++k) {
+        const auto& liw = *indexes_[k];
+        if (!liw.has_norms() || liw.norms_bytes().empty()) continue; // no norms to place
+        auto& at = placements_[k];
+        at.norms_off = out_->bytes_written();
+        SNII_RETURN_IF_ERROR(append(liw.norms_bytes()));
+        at.norms_len = out_->bytes_written() - at.norms_off;
+    }
+    for (size_t k = 0; k != indexes_.size(); ++k) {
+        const auto& liw = *indexes_[k];
+        if (!liw.has_null_bitmap()) continue; // no null bitmap for this index
+        auto& at = placements_[k];
+        at.null_off = out_->bytes_written();
+        SNII_RETURN_IF_ERROR(append(liw.null_bitmap_bytes()));
+        at.null_len = out_->bytes_written() - at.null_off;
+    }
+    for (size_t k = 0; k != indexes_.size(); ++k) {
+        const auto& liw = *indexes_[k];
+        if (!liw.has_bsbf()) continue; // no BSBF section for this index
+        auto& at = placements_[k];
+        at.bsbf_off = out_->bytes_written();
+        SNII_RETURN_IF_ERROR(append(liw.bsbf_bytes()));
+        at.bsbf_len = out_->bytes_written() - at.bsbf_off;
+    }
+    return Status::OK();
+}
+
+Status SniiCompoundWriter::write_tail() {
+    TailMetaRegionBuilder tail_region;
+    for (size_t idx = 0; idx < indexes_.size(); ++idx) {
+        const LogicalIndexWriter& liw = *indexes_[idx];
+        const Placement& at = placements_[idx];
+
+        // Absolute {offset, length} of every physical section recorded for this index.
+        SectionRefs sections;
+        sections.posting_region = {at.post_off, at.post_len};
+        sections.dict_region = {at.dict_off, at.dict_len};
+        sections.norms = {at.norms_off, at.norms_len};
+        sections.null_bitmap = {at.null_off, at.null_len};
+        sections.bsbf = {at.bsbf_off, at.bsbf_len};
+        ByteSink index_meta;
+        SNII_RETURN_IF_ERROR(liw.finish_meta(sections, at.dict_off, &index_meta));
+        tail_region.add_index(liw.index_id(), liw.index_suffix(), index_meta.view());
+    }
+
+    ByteSink region_bytes;
+    tail_region.finish(&region_bytes);
+    const uint64_t region_begin = out_->bytes_written();
+    SNII_RETURN_IF_ERROR(append(region_bytes.buffer()));
+    const uint64_t region_size = out_->bytes_written() - region_begin;
+
+    // bootstrap_header_checksum stays 0 (reserved): the bootstrap header carries (and
+    // decode_bootstrap_header verifies) its OWN internal crc32c, so a copy here is
+    // redundant until a cross-region check needs it. The tail pointer's own
+    // tail_checksum still covers this field's bytes.
+    TailPointer pointer;
+    pointer.meta_region_offset = region_begin;
+    pointer.meta_region_length = region_size;
+    pointer.meta_region_checksum = snii::crc32c(region_bytes.view());
+    pointer.hot_off = 0;
+    pointer.bootstrap_header_checksum = 0;
+    ByteSink pointer_bytes;
+    SNII_RETURN_IF_ERROR(snii::format::encode_tail_pointer(pointer, &pointer_bytes));
+    return append(pointer_bytes.buffer());
+}
+
+Status SniiCompoundWriter::finish() {
+    if (!out_) return Status::InvalidArgument("compound: null file writer");
+    if (finished_) return Status::Internal("compound: finish called twice");
+    finished_ = true; // set before any write, so a failed finish() cannot be re-run
+    // Tail order: bootstrap (only if no index was added), sections, meta + pointer.
+    SNII_RETURN_IF_ERROR(ensure_bootstrap());
+    SNII_RETURN_IF_ERROR(write_norms());
+    SNII_RETURN_IF_ERROR(write_tail());
+    return out_->finalize();
+}
+
+} // namespace snii::writer
diff --git a/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp b/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp
new file mode 100644
index 00000000000000..e68ba24b9a4164
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp
@@ -0,0 +1,597 @@
+#include "snii/writer/spill_run_codec.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <cstring>
+#include <memory>
+#include <queue>
+#include <stdexcept>
+#include <utility>
+
+#include "snii/encoding/varint.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::writer {
+
+namespace {
+
+// Flush staging once it grows past this. A LARGE write buffer (4 MiB) collapses
+// the per-flush write() syscall count by ~64x: at 64 KiB the 5M build issued
+// ~8800 write()s to ext4 (~9s of syscall overhead) for ~553 MiB of runs, versus
+// a raw dd of the same bytes taking ~1.2s. Runs are PRIVATE temp files, so the
+// on-disk index is unaffected. The threshold is checked after each whole term is
+// staged, so the buffer can exceed 4 MiB by the size of one encoded term.
+constexpr size_t kWriteFlushBytes = 1u << 22; // 4 MiB
+// RunReader reads this much per disk fill; the window slides so a single record
+// never needs the whole run in RAM (only the current term's encoded span). KEEP
+// this small (64 KiB): a large read chunk x many open runs would inflate the
+// merge-phase peak RSS at low spill thresholds (each reader holds a window).
+constexpr size_t kReadChunkBytes = 1u << 16; // 64 KiB
+
+void AppendVarint(std::vector<uint8_t>* buf, uint64_t v) {
+    uint8_t scratch[10]; // room for the longest 64-bit varint
+    uint8_t* const stop = scratch + encode_varint64(v, scratch);
+    buf->insert(buf->end(), scratch, stop);
+}
+
+// Appends `count` uint32 values to `buf` as their raw in-memory bytes in a single
+// range insert (little-endian on a little-endian host). Runs are private temp
+// files, so the on-disk index format is not affected. Raw blocks are much cheaper
+// to encode/decode than per-value varints, at the cost of a somewhat larger run.
+void AppendRawU32(std::vector<uint8_t>* buf, const uint32_t* src, size_t count) {
+    if (count != 0) {
+        const auto* first = reinterpret_cast<const uint8_t*>(src);
+        buf->insert(buf->end(), first, first + count * sizeof(uint32_t));
+    }
+}
+
+// Writes the full byte range [data, data+len) to fd, looping over short writes.
+Status WriteAll(int fd, const uint8_t* data, size_t len) {
+    size_t off = 0;
+    while (off < len) {
+        const ssize_t n = ::write(fd, data + off, len - off);
+        if (n < 0) {
+            if (errno == EINTR) continue;
+            return Status::IoError(std::string("run write failed: ") + std::strerror(errno));
+        }
+        off += static_cast<size_t>(n);
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// RunWriter
+// ---------------------------------------------------------------------------
+
+RunWriter::~RunWriter() {
+    if (fd_ >= 0) ::close(fd_); // closes only; anything still in buf_ is not flushed here
+}
+
+Status RunWriter::open(const std::string& path) {
+    if (fd_ >= 0) ::close(fd_); // re-open: do not leak the previous descriptor
+    // O_CLOEXEC: keep this private spill file out of any exec'ed child process.
+    fd_ = ::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0600);
+    if (fd_ < 0) return Status::IoError("run open(" + path + "): " + std::strerror(errno));
+    buf_.clear();
+    return Status::OK();
+}
+
+Status RunWriter::flush() {
+    // Staged bytes go out in one WriteAll(); buf_ is emptied only after it succeeds.
+    if (!buf_.empty()) SNII_RETURN_IF_ERROR(WriteAll(fd_, buf_.data(), buf_.size()));
+    buf_.clear();
+    return Status::OK();
+}
+
+Status RunWriter::write_term(uint32_t term_id, const TermPostings& tp) {
+    // Record: varint term_id | varint n_docs | raw u32 docids | raw u32 freqs |
+    //         varint n_pos | raw u32 positions. Blocks are raw fixed-width u32 (bulk
+    //         copies), not per-value varints: runs are private temp files, so the
+    //         larger run is traded for cheap encode and decode. Docids are stored
+    //         ABSOLUTE; the merge concatenates them across runs and re-deltas later.
+    const size_t n_docs = tp.docids.size();
+    const size_t n_pos = tp.positions_flat.size();
+    AppendVarint(&buf_, term_id);
+    AppendVarint(&buf_, n_docs);
+    AppendRawU32(&buf_, tp.docids.data(), n_docs);
+    // No freqs count is written: the reader takes n_docs freqs back, so callers
+    // must pass freqs.size() == docids.size().
+    AppendRawU32(&buf_, tp.freqs.data(), tp.freqs.size());
+    // n_pos is written explicitly so the reader can size the position block.
+    AppendVarint(&buf_, n_pos);
+    AppendRawU32(&buf_, tp.positions_flat.data(), n_pos);
+    if (buf_.size() >= kWriteFlushBytes) SNII_RETURN_IF_ERROR(flush());
+    return Status::OK();
+}
+
+Status RunWriter::close() {
+    if (fd_ < 0) return Status::OK(); // not open (or already closed): nothing to do
+    SNII_RETURN_IF_ERROR(flush());
+    // Drop ownership of the descriptor BEFORE ::close(): whatever ::close() reports,
+    // fd_ is already -1, so neither a second close() nor the destructor closes it
+    // again.
+    const int closing = std::exchange(fd_, -1);
+    if (::close(closing) == 0) return Status::OK();
+    return Status::IoError(std::string("run close: ") + std::strerror(errno));
+}
+
+// ---------------------------------------------------------------------------
+// RunReader
+// ---------------------------------------------------------------------------
+
+RunReader::~RunReader() {
+    if (fd_ >= 0) ::close(fd_); // fd_ < 0 means no descriptor is held
+}
+
+Status RunReader::open(const std::string& path, bool has_positions) {
+    if (fd_ >= 0) ::close(fd_); // re-open: do not leak the previous descriptor
+    // O_CLOEXEC keeps this private spill file out of any exec'ed child process.
+    fd_ = ::open(path.c_str(), O_RDONLY | O_CLOEXEC);
+    if (fd_ < 0) {
+        return Status::IoError("run reopen(" + path + "): " + std::strerror(errno));
+    }
+    // Remember the run's byte size: every length decoded from the stream is bounded
+    // against it before allocating, so a corrupt/truncated length surfaces as
+    // Status::Corruption instead of an uncaught std::bad_alloc from a giant resize().
+    struct stat st {};
+    if (::fstat(fd_, &st) != 0) {
+        return Status::IoError(std::string("run fstat: ") + std::strerror(errno));
+    }
+    file_size_ = static_cast<uint64_t>(st.st_size);
+    has_positions_ = has_positions;
+    exhausted_ = false;
+    eof_ = false;
+    pos_ = 0;
+    pos_count_ = 0;
+    pos_remaining_ = 0;
+    window_.clear();
+    return advance();
+}
+
+// Drops already-consumed bytes from the window, then appends at most one disk chunk.
+Status RunReader::fill() {
+    if (pos_ > 0) {
+        window_.erase(window_.begin(), window_.begin() + pos_);
+        pos_ = 0;
+    }
+    if (eof_) return Status::OK();
+    const size_t old_size = window_.size();
+    window_.resize(old_size + kReadChunkBytes); // scratch room for one read()
+    ssize_t got = -1;
+    do {
+        got = ::read(fd_, window_.data() + old_size, kReadChunkBytes);
+    } while (got < 0 && errno == EINTR);
+    if (got < 0) return Status::IoError(std::string("run read: ") + std::strerror(errno));
+    window_.resize(old_size + static_cast<size_t>(got)); // keep only what was read
+    if (got == 0) eof_ = true;
+    return Status::OK();
+}
+
+// Bytes the decoder can consume right now (from pos_ to the end of the window).
+// fill() compacts the window, so "did more data arrive?" must compare THIS value.
+size_t RunReader::available() const {
+    const size_t window_end = window_.size();
+    return window_end - pos_;
+}
+
+Status RunReader::ensure(size_t n) {
+    // Top up until n bytes are buffered; no growth at eof means the run is short.
+    for (size_t before = available(); before < n; before = available()) {
+        SNII_RETURN_IF_ERROR(fill());
+        if (eof_ && available() == before) {
+            return Status::Corruption("run truncated: needed more bytes than available");
+        }
+    }
+    return Status::OK();
+}
+
+// Streamed varint: decode from the current window; if the encoding straddles the
+// buffered boundary, top up from disk and retry. A varint is at most 10 bytes, so a
+// failure with >= 10 bytes buffered is malformed data, not a short window.
+Status RunReader::read_varint(uint64_t* v) {
+    while (true) {
+        const uint8_t* p = window_.data() + pos_;
+        const uint8_t* end = window_.data() + window_.size();
+        const uint8_t* next = nullptr;
+        Status s = decode_varint64(p, end, v, &next);
+        if (s.ok()) {
+            pos_ += static_cast<size_t>(next - p);
+            return Status::OK();
+        }
+        if (available() >= 10) return Status::Corruption("run: malformed varint"); // refill cannot help
+        if (eof_) return Status::Corruption("run truncated: incomplete varint");
+        const size_t had = available();
+        SNII_RETURN_IF_ERROR(fill());
+        if (available() == had && eof_) {
+            return Status::Corruption("run truncated: incomplete varint at eof");
+        }
+    }
+}
+
+// Copies `count` raw u32s (count * 4 bytes) from the run into `dst`; the caller
+// owns `dst` and must have sized it for them. Each pass drains whatever the window
+// holds and refills only once it is empty, so a large block moves through in
+// read-chunk-sized pieces and is never resident in the window as a whole. Reaching
+// eof before every byte has arrived is reported as corruption.
+Status RunReader::pull_raw_u32(uint8_t* dst, size_t count) {
+    size_t remaining = count * sizeof(uint32_t);
+    uint8_t* out = dst;
+    while (remaining != 0) {
+        if (available() == 0) {
+            // Window drained: fetch the next chunk. Nothing new at eof => truncated.
+            SNII_RETURN_IF_ERROR(fill());
+            if (available() == 0 && eof_) {
+                return Status::Corruption("run truncated: needed more raw bytes than available");
+            }
+        }
+        // Take as much of the request as the window can serve right now.
+        const size_t chunk = std::min(remaining, available());
+        std::memcpy(out, window_.data() + pos_, chunk);
+        pos_ += chunk;
+        out += chunk;
+        remaining -= chunk;
+    }
+    return Status::OK();
+}
+
+// Reads `count` raw u32s from the run into `out`, which ends up resized to exactly
+// `count` elements (an empty vector for count == 0).
+Status RunReader::read_raw_u32(size_t count, std::vector<uint32_t>* out) {
+    // No record can hold more u32s than the whole run file, so `count` is checked
+    // against file_size_ BEFORE resize(): a corrupt length varint becomes Corruption
+    // instead of an unbounded resize (and an uncaught std::bad_alloc).
+    const uint64_t max_u32s = file_size_ / sizeof(uint32_t);
+    if (count > max_u32s) return Status::Corruption("run: raw u32 count exceeds file size");
+    out->resize(count);
+    if (count != 0) return pull_raw_u32(reinterpret_cast<uint8_t*>(out->data()), count);
+    return Status::OK();
+}
+
+// Loads the current term's unread position block into positions_flat (or, for a
+// no-positions reader, reads past it). Call once per term: with nothing unread it
+// CLEARS positions_flat.
+Status RunReader::materialize_positions() {
+    if (pos_remaining_ != 0) {
+        const size_t n = static_cast<size_t>(pos_remaining_);
+        if (has_positions_) {
+            SNII_RETURN_IF_ERROR(read_raw_u32(n, &current_.positions_flat));
+        } else {
+            // A no-positions run should carry n_pos == 0; a stray block is skipped.
+            std::vector<uint32_t> discarded;
+            SNII_RETURN_IF_ERROR(read_raw_u32(n, &discarded));
+            current_.positions_flat.clear();
+        }
+        pos_remaining_ = 0;
+    } else {
+        current_.positions_flat.clear();
+    }
+    return Status::OK();
+}
+
+// Copies the next `n` unread positions of the current term into `dst` and counts
+// them as consumed. Asking for more than remain in the block is a Corruption error.
+Status RunReader::stream_positions(uint32_t* dst, size_t n) {
+    if (n > pos_remaining_) return Status::Corruption("run: stream_positions past block end");
+    if (n != 0) {
+        SNII_RETURN_IF_ERROR(pull_raw_u32(reinterpret_cast<uint8_t*>(dst), n));
+        pos_remaining_ -= n;
+    }
+    return Status::OK();
+}
+
+// Reads past (and throws away) whatever positions of the current term are still
+// unread, leaving the cursor on the next record boundary for advance().
+Status RunReader::skip_remaining_positions() {
+    if (pos_remaining_ != 0) {
+        std::vector<uint32_t> discarded;
+        SNII_RETURN_IF_ERROR(read_raw_u32(static_cast<size_t>(pos_remaining_), &discarded));
+        pos_remaining_ = 0;
+    }
+    return Status::OK();
+}
+
+Status RunReader::advance() {
+    // Drain any positions the owner left unread for the previous term so the window
+    // cursor lands at the next record boundary.
+    SNII_RETURN_IF_ERROR(skip_remaining_positions());
+    // End-of-run detection: at a record boundary, if no bytes remain we are done.
+    if (available() == 0) {
+        SNII_RETURN_IF_ERROR(fill());
+        if (available() == 0 && eof_) {
+            exhausted_ = true;
+            return Status::OK();
+        }
+    }
+    uint64_t term_id = 0;
+    SNII_RETURN_IF_ERROR(read_varint(&term_id));
+    if (term_id > UINT32_MAX) return Status::Corruption("run term_id exceeds uint32");
+    current_id_ = static_cast<uint32_t>(term_id);
+    current_.term.clear(); // runs store only the id; owner resolves the string
+    // Docids, then freqs: two RAW u32 blocks of n_docs values (read_raw_u32 bounds n_docs).
+    uint64_t n_docs = 0;
+    SNII_RETURN_IF_ERROR(read_varint(&n_docs));
+    SNII_RETURN_IF_ERROR(read_raw_u32(static_cast<size_t>(n_docs), &current_.docids));
+    SNII_RETURN_IF_ERROR(read_raw_u32(static_cast<size_t>(n_docs), &current_.freqs));
+    uint64_t n_pos = 0;
+    SNII_RETURN_IF_ERROR(read_varint(&n_pos));
+    // Positions are LAZY: only the count is kept and the cursor stays parked at the
+    // block start (owner picks materialize_positions() or stream_positions()). Owners
+    // size buffers from n_pos before any read vets it, so bound it by the file size here.
+    if (n_pos > file_size_ / sizeof(uint32_t)) {
+        return Status::Corruption("run: position count exceeds file size");
+    }
+    current_.positions_flat.clear();
+    pos_count_ = n_pos;
+    pos_remaining_ = n_pos;
+    return Status::OK();
+}
+
+// ---------------------------------------------------------------------------
+// K-way merge
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Min-heap entry for the k-way merge: the head term-id of one run plus that run's
+// index. HeapGreater orders entries by the VOCAB STRING the id resolves to (looked
+// up through the shared vocab), so the merged stream comes out lexicographic -- the
+// dictionary order the writer needs. Equal strings are tie-broken by run index, so
+// equal terms are gathered in run order and concatenated docids stay ascending.
+struct HeapItem {
+    uint32_t term_id;
+    size_t run;
+};
+struct HeapGreater {
+    const std::vector<std::string>* vocab;
+    bool operator()(const HeapItem& lhs, const HeapItem& rhs) const {
+        const std::string& lhs_term = (*vocab)[lhs.term_id];
+        const std::string& rhs_term = (*vocab)[rhs.term_id];
+        return lhs_term == rhs_term ? lhs.run > rhs.run : lhs_term > rhs_term;
+    }
+};
+
+// Appends src's postings onto dst (run order). Later runs only cover docids >= dst's
+// last, so docids stay ascending. COALESCE the boundary doc: if a spill fell
+// BETWEEN two tokens of the same doc, that doc ends one run and begins the next
+// with the SAME docid -- fold the two entries into one (sum their freqs) so the
+// merged term has exactly one entry per docid (matching the in-memory build).
+//
+// Positions are FLAT: doc order, partitioned by freqs. dst's last doc owns the TAIL
+// of dst->positions_flat and src's first doc owns the HEAD of src.positions_flat, so
+// the boundary doc's positions from src belong exactly at dst's end(), directly
+// followed by the positions of src's remaining docs. With or without a boundary
+// overlap the position merge is therefore ONE bulk append of src.positions_flat;
+// only docids/freqs need the boundary special case.
+//
+// Appending the whole vector never forms begin() + freq, so a corrupt run whose
+// first freq exceeds its position count cannot push an iterator out of range
+// (which would be undefined behavior). Nothing here indexes positions by freq.
+//
+// Preconditions: src.freqs holds one entry per src docid (RunReader::advance()
+// reads n_docs of each); when has_positions is true the caller has already loaded
+// src.positions_flat. has_positions == false leaves dst->positions_flat untouched.
+//
+// dst: accumulated postings of the term, extended in place.
+// src: one run's slice for the same term (unchanged).
+void Concat(TermPostings* dst, const TermPostings& src, bool has_positions) {
+    if (src.docids.empty()) return;
+    size_t first_new = 0; // index of the first src doc that becomes a NEW dst entry
+    if (!dst->docids.empty() && dst->docids.back() == src.docids.front()) {
+        dst->freqs.back() += src.freqs.front();
+        first_new = 1; // boundary doc folded in; append the rest
+    }
+    dst->docids.insert(dst->docids.end(), src.docids.begin() + first_new, src.docids.end());
+    dst->freqs.insert(dst->freqs.end(), src.freqs.begin() + first_new, src.freqs.end());
+    if (has_positions) {
+        dst->positions_flat.insert(dst->positions_flat.end(), src.positions_flat.begin(),
+                                   src.positions_flat.end());
+    }
+}
+
+// Docids/freqs-only variant of Concat, for the WIDE-term path whose positions are
+// streamed through a pos_pump rather than materialized. The boundary-doc handling
+// (same docid ending dst and starting src => one entry, freqs summed) is the same
+// as Concat's, so the merged df / freqs / ttf come out identical. Positions are
+// not touched here: the pump emits them as a plain run-order concatenation, which
+// is the same sequence Concat would have built.
+void ConcatDocsFreqs(TermPostings* dst, const TermPostings& src) {
+    if (src.docids.empty()) return;
+    const bool same_doc =
+            !dst->docids.empty() && dst->docids.back() == src.docids.front();
+    if (same_doc) dst->freqs.back() += src.freqs.front();
+    const size_t skip = same_doc ? 1 : 0; // boundary doc is already folded in
+    dst->docids.insert(dst->docids.end(), src.docids.begin() + skip, src.docids.end());
+    dst->freqs.insert(dst->freqs.end(), src.freqs.begin() + skip, src.freqs.end());
+}
+
+// Decides whether a merged term hands its positions over through a STREAMED pump
+// instead of a materialized positions_flat. Streaming is for terms wide enough that
+// their flat positions would dominate the merge-phase peak RSS. The gate reuses the
+// writer's own df threshold (kSlimDfThreshold): terms at or above it go through the
+// windowed path (build_windowed_entry), the only consumer of pos_pump, while slim
+// terms read positions_flat directly and so must stay materialized. Using the same
+// threshold keeps producer and consumer in lockstep. A term with no positions at
+// all (index without positions, or total_pos == 0) has nothing to stream and is
+// never streamed, however wide it is.
+bool ShouldStreamPositions(uint64_t total_docs, uint64_t total_pos, bool has_positions) {
+    if (!has_positions || total_pos == 0) return false;
+    return total_docs >= snii::format::kSlimDfThreshold;
+}
+
+} // namespace
+
+Status MergeRuns(const std::vector<std::string>& run_paths, const std::vector<std::string>& vocab,
+                 bool has_positions, const std::function<void(TermPostings&&)>& fn,
+                 bool allow_stream_positions) { // false => pos_pump is never used
+    std::vector<std::unique_ptr<RunReader>> readers;
+    readers.reserve(run_paths.size());
+    std::priority_queue<HeapItem, std::vector<HeapItem>, HeapGreater> heap(HeapGreater {&vocab});
+    for (size_t i = 0; i < run_paths.size(); ++i) {
+        auto r = std::make_unique<RunReader>();
+        SNII_RETURN_IF_ERROR(r->open(run_paths[i], has_positions));
+        if (!r->exhausted()) {
+            if (r->current_id() >= vocab.size()) {
+                return Status::Corruption("run term_id out of vocab range");
+            }
+            heap.push({r->current_id(), i});
+        }
+        readers.push_back(std::move(r));
+    }
+
+    std::vector<size_t> matching; // run indices contributing the current term
+    while (!heap.empty()) {
+        const uint32_t id = heap.top().term_id;
+        TermPostings merged;
+        merged.term = vocab[id]; // resolve the id -> dictionary string once
+        // Gather every run whose head id maps to the same string (the heap's run
+        // tie-break keeps them in run order, so concatenated docids stay ascending).
+        // Equal strings imply equal ids for a dense vocab; compare by string so a
+        // duplicate string still groups correctly. The matching runs' current slices
+        // are already loaded in their readers (they were read to seed the heap), so
+        // summing their sizes here costs nothing extra in RAM.
+        matching.clear();
+        uint64_t total_docs = 0, total_pos = 0;
+        while (!heap.empty() && vocab[heap.top().term_id] == merged.term) {
+            const size_t ri = heap.top().run;
+            heap.pop();
+            const RunReader* r = readers[ri].get();
+            total_docs += r->current().docids.size();
+            total_pos += r->current_pos_count(); // positions are LAZY: use the count
+            matching.push_back(ri);
+        }
+        // Reserve EXACTLY the summed sizes (an upper bound -- boundary-doc coalescing
+        // only shrinks the final size). This eliminates std::vector's geometric
+        // over-allocation, which left ~32 MiB of dead capacity on the widest term (df
+        // in the millions split across spills) -- a dominant merge-phase peak-RSS
+        // overhang at 5M. The reserved-but-unwritten pages are not faulted in, so the
+        // empty reservation itself does not raise RSS; only the actual data does.
+        merged.docids.reserve(static_cast<size_t>(total_docs));
+        merged.freqs.reserve(static_cast<size_t>(total_docs));
+
+        bool stream = allow_stream_positions &&
+                      ShouldStreamPositions(total_docs, total_pos, has_positions);
+        if (!stream && has_positions) {
+            merged.positions_flat.reserve(static_cast<size_t>(total_pos));
+        }
+        // Coalesce docids/freqs from every matching run (always materialized -- a few
+        // u32 vectors). For the non-wide case, also coalesce positions here. For the
+        // wide case, leave positions for the streamed pump and keep the readers PARKED
+        // at their position blocks until fn() drains the pump.
+        for (size_t ri : matching) {
+            RunReader* r = readers[ri].get();
+            if (stream) {
+                ConcatDocsFreqs(&merged, r->current());
+            } else {
+                if (has_positions) SNII_RETURN_IF_ERROR(r->materialize_positions());
+                Concat(&merged, r->current(), has_positions);
+            }
+        }
+
+        // The stream gate keyed on PRE-coalesce total_docs, but the writer's slim vs
+        // windowed dispatch keys on the POST-coalesce df (merged.docids.size()).
+        // Boundary-doc coalescing across spill seams can drop df below kSlimDfThreshold
+        // while total_docs stayed above it; that term routes to build_slim_entry, which
+        // reads positions_flat directly and ignores pos_pump. Materialize positions now
+        // from the still-parked readers (mirrors drain_sorted()'s slim fallback).
+        if (stream && merged.docids.size() < snii::format::kSlimDfThreshold) {
+            merged.positions_flat.reserve(static_cast<size_t>(total_pos));
+            for (size_t ri : matching) {
+                RunReader* r = readers[ri].get();
+                SNII_RETURN_IF_ERROR(r->materialize_positions());
+                const std::vector<uint32_t>& pf = r->current().positions_flat;
+                merged.positions_flat.insert(merged.positions_flat.end(), pf.begin(), pf.end());
+            }
+            stream = false;
+        }
+
+        if (stream) {
+            // WIDE term: STREAM positions via a pump that walks the matching readers in
+            // run order (pure flat concatenation == the coalesced positions_flat,
+            // byte-for-byte). positions_flat stays empty -- the widest term's tens-of-MiB
+            // position buffer is never resident; only one ~64 KiB window per pull is. The
+            // readers are still parked at this term's blocks, so the pump pulls from them
+            // synchronously while fn() runs (fn consumes synchronously -- the windowed
+            // writer does). After fn(), advance the readers past the (now-drained) blocks.
+            merged.pos_total = total_pos;
+            size_t cursor = 0; // index into `matching` for the run currently being drained
+            Status pump_status = Status::OK();
+            std::vector<std::unique_ptr<RunReader>>* rd = &readers;
+            const std::vector<size_t>* match = &matching;
+            // Self-contained liveness guard. The pump captures references into THIS stack
+            // frame (&cursor, &pump_status) and the parked run readers (rd/match), valid
+            // ONLY while fn() runs synchronously -- after fn() the readers advance past the
+            // drained blocks. `pump_alive` is heap-owned and captured BY VALUE, so a
+            // stored/deferred pos_pump fails loudly (throws) instead of dereferencing
+            // dangling state. See the contract on TermPostings::pos_pump.
+            auto pump_alive = std::make_shared<bool>(true);
+            merged.pos_pump = [rd, match, &cursor, &pump_status, pump_alive](uint32_t* dst,
+                                                                             size_t n) {
+                if (!*pump_alive) {
+                    throw std::logic_error(
+                            "TermPostings::pos_pump invoked after its producing merge scope ended; "
+                            "the streamed TermPostings must be consumed synchronously inside fn() "
+                            "and never stored for later use");
+                }
+                size_t off = 0;
+                while (off < n) {
+                    // Advance to the next run that still has positions to yield.
+                    while (cursor < match->size() &&
+                           (*rd)[(*match)[cursor]]->positions_remaining() == 0) {
+                        ++cursor;
+                    }
+                    if (cursor >= match->size()) break; // defensive: pump over-pulled
+                    RunReader* r = (*rd)[(*match)[cursor]].get();
+                    const size_t take =
+                            std::min(n - off, static_cast<size_t>(r->positions_remaining()));
+                    Status s = r->stream_positions(dst + off, take);
+                    if (!s.ok()) {
+                        // Mid-stream I/O / corruption: zero-fill dst[off..n) before
+                        // returning. fn() has the pump and will consume dst BEFORE pump_status
+                        // is surfaced after fn(); never hand it uninitialized bytes (the
+                        // failed stream_positions may have copied only part of its slice). The
+                        // error is still latched and surfaced after fn(), so the build aborts --
+                        // the zero fill only guarantees deterministic, defined bytes meanwhile.
+                        std::memset(dst + off, 0, (n - off) * sizeof(uint32_t));
+                        if (pump_status.ok()) pump_status = std::move(s);
+                        return;
+                    }
+                    off += take;
+                }
+                // Short-fill on over-pull (cursor ran past the matching runs without an
+                // error status): the readers held fewer positions than n. Zero-fill the
+                // unfilled tail so the writer never reads uninitialized storage. With
+                // valid runs n == pos_total == sum(positions_remaining), so off == n and
+                // this memset spans zero bytes -- the produced .idx is unchanged.
+                if (off < n) std::memset(dst + off, 0, (n - off) * sizeof(uint32_t));
+            };
+            fn(std::move(merged));
+            *pump_alive = false;               // any later pos_pump call now throws instead of UAF
+            SNII_RETURN_IF_ERROR(pump_status); // surface a streamed-read I/O error
+        } else {
+            fn(std::move(merged));
+        }
+
+        // Advance every matching reader to its next term and re-seed the heap. For the
+        // wide path this also skips any positions the pump did not pull (none, when fn
+        // drained the whole stream); for the non-wide path positions were already
+        // materialized so nothing remains.
+        for (size_t ri : matching) {
+            RunReader* r = readers[ri].get();
+            SNII_RETURN_IF_ERROR(r->advance()); // frees this run's slice, loads next term
+            if (!r->exhausted()) {
+                if (r->current_id() >= vocab.size()) {
+                    return Status::Corruption("run term_id out of vocab range");
+                }
+                heap.push({r->current_id(), ri});
+            }
+        }
+    }
+    return Status::OK();
+}
+
+} // namespace snii::writer
diff --git a/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp b/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp
new file mode 100644
index 00000000000000..7fc8cd58ec0bf6
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp
@@ -0,0 +1,594 @@
+#include "snii/writer/spimi_term_buffer.h"
+
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+
+#include "snii/encoding/varint.h"
+#include "snii/format/format_constants.h"
+#include "snii/writer/spill_run_codec.h"
+#include "snii/writer/temp_dir.h"
+
+#if defined(__GLIBC__)
+#include <malloc.h>
+#endif
+
+namespace snii::writer {
+
+namespace {
+
+// Asks the allocator to hand free heap memory back to the OS. Only glibc provides
+// malloc_trim, so the call is compiled in under __GLIBC__ and the function is an
+// empty no-op elsewhere. Callers in this file invoke it after dropping the
+// accumulators and after the run merge, to shed allocator-retained free chunks.
+void TrimMalloc() {
+#ifdef __GLIBC__
+    (void)::malloc_trim(0); // pad 0: leave no extra free space at the top of the heap
+#endif
+}
+
+// Builds a spill-run file path under `dir` that is unique within this process: the
+// name embeds the pid plus a process-wide, atomically incremented sequence number.
+std::string MakeRunPath(const std::string& dir) {
+    static std::atomic<uint64_t> next_seq {0};
+    const std::string prefix = dir + "/snii_spill_" + std::to_string(::getpid());
+    return prefix + "_" + std::to_string(next_seq.fetch_add(1)) + ".run";
+}
+
+} // namespace
+
+SpimiTermBuffer::SpimiTermBuffer(const std::vector<std::string>* vocab, bool has_positions,
+                                 size_t spill_threshold_bytes, MemoryReporter* reporter)
+        : vocab_(vocab),
+          has_positions_(has_positions),
+          spill_threshold_bytes_(spill_threshold_bytes),
+          mem_reporter_(reporter) {
+    // Borrowed-vocab mode: the caller owns the vocabulary. Give every id a zeroed
+    // slot_of_ entry (0 == "no live slot") up front; Term slots themselves are only
+    // claimed when an id is first touched, so untouched ids cost one uint32 each.
+    const size_t id_count = vocab_->size();
+    slot_of_.assign(id_count, 0);
+    // That index is resident from now on, so publish its bytes to the reporter.
+    report_arena_delta();
+}
+
+SpimiTermBuffer::SpimiTermBuffer(bool has_positions, size_t spill_threshold_bytes,
+                                 MemoryReporter* reporter)
+        : vocab_(&owned_vocab_),
+          has_positions_(has_positions),
+          spill_threshold_bytes_(spill_threshold_bytes),
+          mem_reporter_(reporter) {
+    // Owned-vocab mode: vocab_ aliases owned_vocab_, which grows as strings are
+    // interned by add_token(string_view, ...); slot_of_ grows alongside it there.
+}
+
+SpimiTermBuffer::~SpimiTermBuffer() {
+    // Hand back any bytes the reporter still counts as resident (e.g. the build was
+    // abandoned before a drain), so its running total nets out for this buffer.
+    const bool has_outstanding = reported_resident_ != 0;
+    if (has_outstanding && mem_reporter_ != nullptr) {
+        mem_reporter_->report(-reported_resident_);
+        reported_resident_ = 0;
+    }
+    cleanup_runs(); // then delete every spill run file this buffer recorded
+}
+
+void SpimiTermBuffer::report_arena_delta() {
+    // Reporter attached? Publish the signed resident_bytes() change since the last call.
+    if (mem_reporter_ != nullptr) {
+        const auto current = static_cast<int64_t>(resident_bytes());
+        mem_reporter_->report(current - reported_resident_);
+        reported_resident_ = current;
+    }
+}
+
+size_t SpimiTermBuffer::unique_terms() const {
+    return live_term_count_; // number of terms currently holding a live slot
+}
+
+uint64_t SpimiTermBuffer::resident_bytes() const {
+    // Real resident accumulator footprint = posting arena + slot index. The index is
+    // counted by capacity(): reserved-but-unused entries are still allocated. This
+    // figure feeds the local spill-threshold check and the spill space pre-check.
+    const uint64_t arena = pool_.arena_bytes();
+    const uint64_t slot_index = static_cast<uint64_t>(slot_of_.capacity()) * sizeof(uint32_t);
+    return arena + slot_index;
+}
+
+// Maps `term_id` to its live Term, claiming a pool slot the first time the id is
+// seen; *new_term tells the caller which case occurred. slot_of_[term_id] stores
+// (slot index + 1), so that 0 can mean "no slot yet".
+SpimiTermBuffer::Term& SpimiTermBuffer::term_slot(uint32_t term_id, bool* new_term) {
+    const uint32_t encoded = slot_of_[term_id];
+    *new_term = (encoded == 0);
+    if (!*new_term) return slots_[encoded - 1];
+
+    // First touch: recycle a slot released earlier when one is available,
+    // otherwise grow the pool by one default-constructed Term.
+    uint32_t index = 0;
+    if (free_slots_.empty()) {
+        index = static_cast<uint32_t>(slots_.size());
+        slots_.emplace_back();
+    } else {
+        index = free_slots_.back();
+        free_slots_.pop_back();
+    }
+    slot_of_[term_id] = index + 1;
+    return slots_[index];
+}
+
+// Writes `b` at the tail of t's byte chain, opening the chain on the very first write.
+void SpimiTermBuffer::put_byte(Term* t, uint8_t b) {
+    if (t->head == kNoChain) { t->head = pool_.start_chain(&t->w, &t->level); }
+    pool_.append_byte(&t->w, &t->level, b);
+}
+
+// Varint-encodes `v` into a 10-byte scratch buffer, then appends the bytes to t's chain.
+void SpimiTermBuffer::put_varint(Term* t, uint64_t v) {
+    uint8_t buf[10];
+    for (size_t i = 0, len = encode_varint64(v, buf); i < len; ++i) put_byte(t, buf[i]);
+}
+
+// Records one token for `term_id` (the caller has range-checked it): appends the
+// tagged entry to the term's chain, reports the resident-byte change, and spills
+// the buffer to a run file when a memory trigger fires.
+void SpimiTermBuffer::accumulate(uint32_t term_id, uint32_t docid, uint32_t pos) {
+    // A latched error dooms the build: for_each_term_sorted / finalize_sorted emit
+    // nothing once spill_status_ is set. Stop buffering here, because the spill below
+    // is skipped while an error is latched -- accumulating on would grow the arena
+    // with no safety stop, past the memory cap and toward the uint32-offset limit.
+    if (!spill_status_.ok()) return;
+
+    bool new_term = false;
+    Term& t = term_slot(term_id, &new_term);
+    if (new_term) {
+        touched_ids_.push_back(term_id);
+        ++live_term_count_;
+    }
+    // A token starts a new doc unless it continues the most-recent doc for this term.
+    const bool new_doc = !t.started || t.cur_docid != docid;
+    // Tagged entry: varint((pos << 1) | new_doc); the position part is 0 when positions
+    // are disabled. The new_doc bit lets the decoder recover per-doc freqs by counting.
+    // Widen to 64-bit so a full 32-bit position survives the << 1 without truncation.
+    const uint64_t tagged = has_positions_
+                                    ? ((static_cast<uint64_t>(pos) << 1) | (new_doc ? 1u : 0u))
+                                    : (new_doc ? 1u : 0u);
+    put_varint(&t, tagged);
+    if (new_doc) {
+        // Out-of-order docids are tolerated (zigzag delta is signed) and reordered at
+        // finalize; flag them so to_postings sorts. The delta base is the previous
+        // distinct doc (cur_docid), which is 0 for the very first doc (started==false).
+        const int64_t base = t.started ? static_cast<int64_t>(t.cur_docid) : 0;
+        if (t.started && docid < t.cur_docid) t.sorted = false;
+        const int64_t delta = static_cast<int64_t>(docid) - base;
+        put_varint(&t, zigzag_encode(delta));
+        t.cur_docid = docid;
+        t.started = true;
+    }
+    ++t.ntok;
+    ++total_tokens_;
+
+    // Hard safety stop, active even with no memory cap configured: spill before the
+    // arena nears the 4 GiB uint32-offset limit. A forced spill is merged back with
+    // the other runs at drain time.
+    constexpr uint64_t kArenaSpillCap = 0xE0000000ull; // 3.5 GiB, < UINT32_MAX margin
+    // Report this token's resident growth FIRST so a reporter-backed cap check sees it.
+    report_arena_delta();
+    // Memory trigger: with a reporter attached, defer to its over_cap(); otherwise
+    // compare resident_bytes() with the local spill_threshold_bytes_ (0 disables it).
+    const bool over_cap = mem_reporter_ != nullptr ? mem_reporter_->over_cap()
+                                                   : (spill_threshold_bytes_ != 0 &&
+                                                      resident_bytes() >= spill_threshold_bytes_);
+    const bool arena_near_limit = pool_.arena_bytes() >= kArenaSpillCap;
+    // spill_to_run() resets the arena and reports the drop (via drain_to_writer).
+    if ((over_cap || arena_near_limit) && spill_status_.ok()) {
+        spill_status_ = spill_to_run();
+    }
+}
+
+void SpimiTermBuffer::add_token(uint32_t term_id, uint32_t docid, uint32_t pos) {
+    // Id-keyed hot path: bounds-check, then accumulate -- no hashing or string work.
+    if (term_id < slot_of_.size()) {
+        accumulate(term_id, docid, pos);
+        return;
+    }
+    // Out-of-range id: drop the token and latch the error unless one is already set.
+    if (spill_status_.ok()) {
+        spill_status_ = Status::InvalidArgument("spimi: term_id out of vocab range");
+    }
+}
+
+void SpimiTermBuffer::add_token(std::string_view term, uint32_t docid, uint32_t pos) {
+    // String-keyed compatibility path; legal ONLY in owned-vocab mode. With a
+    // borrowed vocabulary vocab_ is the caller's vector rather than &owned_vocab_,
+    // so interning here would grow owned_vocab_ / intern_ / slot_of_ out of step
+    // with the vocabulary actually in use: the fresh id would name the wrong string
+    // and leave a slot_of_ entry nothing reconciles. Refuse, and latch the error.
+    const bool owned_mode = (vocab_ == &owned_vocab_);
+    if (!owned_mode) {
+        if (spill_status_.ok()) {
+            spill_status_ = Status::InvalidArgument(
+                    "spimi: add_token(string_view) requires owned-vocab mode");
+        }
+        return;
+    }
+    const auto known = intern_.find(std::string(term));
+    if (known != intern_.end()) {
+        accumulate(known->second, docid, pos); // already interned: reuse its id
+        return;
+    }
+    // First occurrence: the next dense id is the current vocabulary size. Register
+    // the string, index it, and give the new id an empty (0) slot_of_ entry.
+    const auto fresh_id = static_cast<uint32_t>(owned_vocab_.size());
+    owned_vocab_.emplace_back(term);
+    intern_.emplace(owned_vocab_.back(), fresh_id);
+    slot_of_.push_back(0);
+    accumulate(fresh_id, docid, pos);
+}
+
+namespace {
+
+// Rewrites a term's parallel arrays so docids are strictly ascending with exactly
+// one entry per docid. Called only for terms that were fed out-of-order docids.
+//
+// Input: (*docids)[g] / (*freqs)[g] describe doc-group g in arrival order, and
+// *positions_flat holds each group's (*freqs)[g] positions back to back (read only
+// when has_positions). A docid may appear in several groups (feed 5,1,5 => two
+// groups for doc 5).
+//
+// Output: groups are stably sorted by docid, then equal-docid neighbours are merged:
+// their freqs are summed and their positions concatenated in arrival order, so
+// the flat positions stay partitioned by the merged freqs.
+void SortByDocid(std::vector<uint32_t>* docids, std::vector<uint32_t>* freqs,
+                 std::vector<uint32_t>* positions_flat, bool has_positions) {
+    const std::vector<uint32_t>& in_docs = *docids;
+    const std::vector<uint32_t>& in_freqs = *freqs;
+    const size_t group_count = in_docs.size();
+    // first_pos[g]: where group g's positions begin inside the flat input array.
+    std::vector<uint32_t> first_pos;
+    if (has_positions) {
+        first_pos.reserve(group_count);
+        uint32_t cursor = 0;
+        for (size_t g = 0; g < group_count; ++g) {
+            first_pos.push_back(cursor);
+            cursor += in_freqs[g];
+        }
+    }
+
+    // Stable permutation of the groups by docid: equal docids keep arrival order.
+    std::vector<size_t> perm(group_count);
+    std::iota(perm.begin(), perm.end(), 0);
+    std::stable_sort(perm.begin(), perm.end(), [&in_docs](size_t lhs, size_t rhs) {
+        return in_docs[lhs] < in_docs[rhs];
+    });
+    std::vector<uint32_t> out_docs, out_freqs, out_pos;
+    out_docs.reserve(group_count);
+    out_freqs.reserve(group_count);
+    if (has_positions) out_pos.reserve(positions_flat->size());
+    for (const size_t g : perm) {
+        const uint32_t freq = in_freqs[g];
+        if (out_docs.empty() || out_docs.back() != in_docs[g]) {
+            out_docs.push_back(in_docs[g]);
+            out_freqs.push_back(freq);
+        } else {
+            out_freqs.back() += freq; // revisited docid: fold into the previous entry
+        }
+        if (!has_positions) continue;
+        const auto group_begin = positions_flat->begin() + first_pos[g];
+        out_pos.insert(out_pos.end(), group_begin, group_begin + freq);
+    }
+    *docids = std::move(out_docs);
+    *freqs = std::move(out_freqs);
+    if (has_positions) *positions_flat = std::move(out_pos);
+}
+
+} // namespace
+
+namespace {
+
+// Reads one LEB128 varint from chain cursor `c`: each byte contributes its low 7
+// bits, least-significant group first, and a clear high bit marks the final byte.
+uint64_t DecodeChainVarint(CompactPostingPool::Cursor* c) {
+    uint64_t value = 0;
+    uint8_t byte = 0;
+    int bit_offset = 0;
+    do {
+        byte = c->next();
+        value |= static_cast<uint64_t>(byte & 0x7F) << bit_offset;
+        bit_offset += 7;
+    } while ((byte & 0x80) != 0);
+    return value;
+}
+
+} // namespace
+
+// SpimiTermBuffer::to_postings (defined below the threshold constant): decodes a
+// term's compact tagged chain into a flat TermPostings -- docids, freqs and, when
+// positions are enabled, positions in document order. The chain holds one entry
+// per token: varint((pos << 1) | new_doc), and each new_doc entry is followed by a
+// zigzag(docid-delta). A doc's freq is the count of consecutive same-doc tokens.
+// `term` becomes tp.term; `allow_stream_positions` permits the pos_pump path.
+// kStreamPositionsTokenThreshold: minimum token count at which a SORTED term may
+// stream its positions instead of materializing a flat uint32-per-token buffer;
+// below it the flat buffer is used.
+static constexpr uint32_t kStreamPositionsTokenThreshold = 1u << 16; // 65536
+
+TermPostings SpimiTermBuffer::to_postings(std::string term, Term&& t,
+                                          bool allow_stream_positions) const {
+    TermPostings tp;
+    tp.term = std::move(term);
+    if (t.ntok == 0 || t.head == kNoChain) return tp; // nothing buffered: term string only
+
+    // Reserve docids/freqs by ntok, an upper bound on the doc count (ntok >= ndocs).
+    // The doc count is not stored in Term, so this may over-reserve when docs repeat
+    // the term; presumably freq is ~1 per (term, doc) in practice -- TODO confirm.
+    tp.docids.reserve(t.ntok);
+    tp.freqs.reserve(t.ntok);
+
+    // For a large SORTED term, stream positions on demand instead of materializing a
+    // multi-MiB flat buffer: the writer (prx builder) pulls them window by window via
+    // pos_pump, decoding straight from the still-resident arena chain. Out-of-order
+    // terms (rare, defensive) need a full sort, so they always use the flat path.
+    const bool stream_pos = allow_stream_positions && has_positions_ && t.sorted &&
+                            t.ntok >= kStreamPositionsTokenThreshold;
+    if (has_positions_ && !stream_pos) tp.positions_flat.reserve(t.ntok);
+
+    CompactPostingPool::Cursor c = pool_.cursor(t.head, t.w.cur);
+    int64_t prev = 0;
+    for (uint32_t i = 0; i < t.ntok; ++i) {
+        const uint64_t tagged = DecodeChainVarint(&c);
+        const bool new_doc = (tagged & 1u) != 0;
+        if (new_doc) {
+            prev += zigzag_decode(DecodeChainVarint(&c));
+            tp.docids.push_back(static_cast<uint32_t>(prev));
+            tp.freqs.push_back(0);
+        }
+        ++tp.freqs.back(); // count this token toward the current doc's freq
+        if (has_positions_ && !stream_pos) {
+            tp.positions_flat.push_back(static_cast<uint32_t>(tagged >> 1));
+        }
+    }
+
+    // Decide the FINAL position handling now that df (= docids.size()) is known.
+    // pos_pump is honored ONLY by the windowed writer path (build_windowed_entry),
+    // taken when df >= kSlimDfThreshold. A SLIM term (df below it) goes through
+    // build_slim_entry, which reads positions_flat directly -- so streaming would
+    // leave it empty and crash. A high-ntok but low-df term (many repeats in few
+    // docs) therefore falls back to materializing its df-bounded positions here.
+    const bool windowed_path = tp.docids.size() >= snii::format::kSlimDfThreshold;
+    if (stream_pos && windowed_path) {
+        // Hand the writer a sequential position source backed by a SECOND pass over the
+        // same chain (the chain stays resident in pool_ for the whole drain). The pump
+        // yields positions in document order -- identical to positions_flat -- so the
+        // produced .prx is byte-for-byte the same. The cursor is shared/advanced across
+        // calls (the writer pulls in order, exactly pos_total positions total).
+        tp.pos_total = t.ntok;
+        auto cur = std::make_shared<CompactPostingPool::Cursor>(pool_.cursor(t.head, t.w.cur));
+        tp.pos_pump = [cur](uint32_t* dst, size_t count) {
+            // Re-walk the tagged token stream, yielding one position per token. A new-doc
+            // token is followed by a zigzag docid-delta varint that must be consumed and
+            // discarded so the cursor stays aligned with the encoding.
+            for (size_t k = 0; k < count; ++k) {
+                const uint64_t tagged = DecodeChainVarint(cur.get());
+                if ((tagged & 1u) != 0) (void)DecodeChainVarint(cur.get()); // skip docid delta
+                dst[k] = static_cast<uint32_t>(tagged >> 1);
+            }
+        };
+    } else if (stream_pos && has_positions_) {
+        // Slim fallback: the decode loop skipped positions (stream candidate) but the
+        // term is slim, so materialize positions_flat in a second pass for build_slim.
+        tp.positions_flat.reserve(t.ntok);
+        CompactPostingPool::Cursor pc = pool_.cursor(t.head, t.w.cur);
+        for (uint32_t i = 0; i < t.ntok; ++i) {
+            const uint64_t tagged = DecodeChainVarint(&pc);
+            if ((tagged & 1u) != 0) (void)DecodeChainVarint(&pc); // skip docid delta
+            tp.positions_flat.push_back(static_cast<uint32_t>(tagged >> 1));
+        }
+    } else if (!t.sorted) {
+        // Out-of-order docid feed (t.sorted was cleared in accumulate): sort + coalesce.
+        // An unsorted term never streams, so positions_flat is already materialized.
+        SortByDocid(&tp.docids, &tp.freqs, &tp.positions_flat, has_positions_);
+    }
+    return tp;
+}
+
+void SpimiTermBuffer::ensure_string_rank() const {
+    const std::vector<std::string>& strings = vocab();
+    const size_t count = strings.size();
+    if (string_rank_.size() == count) return; // ranks already sized to this vocab (or empty)
+    // Sort the ids lexicographically by string, then invert: string_rank_[id] = position.
+    std::vector<uint32_t> by_string(count);
+    std::iota(by_string.begin(), by_string.end(), 0u);
+    std::sort(by_string.begin(), by_string.end(),
+              [&strings](uint32_t lhs, uint32_t rhs) { return strings[lhs] < strings[rhs]; });
+    string_rank_.assign(count, 0u);
+    for (uint32_t pos = 0; pos < count; ++pos) string_rank_[by_string[pos]] = pos;
+}
+
+std::vector<uint32_t> SpimiTermBuffer::sorted_ids() const {
+    // Copy of the touched ids ordered by vocabulary-string rank. Each id has its own
+    // integer rank, so (given distinct vocab strings) comparing ranks yields the same
+    // order as comparing the strings, without any string compares.
+    ensure_string_rank();
+    std::vector<uint32_t> result(touched_ids_);
+    const auto by_rank = [&r = string_rank_](uint32_t x, uint32_t y) { return r[x] < r[y]; };
+    std::sort(result.begin(), result.end(), by_rank);
+    return result;
+}
+
+void SpimiTermBuffer::release_term(uint32_t term_id) {
+    // Returns term_id's pool slot to the free list and marks the id as not live.
+    const uint32_t encoded = std::exchange(slot_of_[term_id], 0u);
+    if (encoded == 0) return; // defensive: the id held no live slot
+    const uint32_t slot = encoded - 1;
+    slots_[slot] = Term(); // drop the term's state; the empty slot is reusable
+    free_slots_.push_back(slot);
+    --live_term_count_;
+}
+
+Status SpimiTermBuffer::drain_sorted(const std::function<void(TermPostings&&)>& fn,
+                                     bool allow_stream_positions) {
+    const std::vector<std::string>& strings = vocab();
+    const std::vector<uint32_t> ordered = sorted_ids();
+    for (const uint32_t term_id : ordered) {
+        // Take the Term out of its slot and free the slot before decoding. The chain
+        // bytes stay in pool_ until the reset below, so a streamed pos_pump can still
+        // read them while fn consumes the postings synchronously.
+        Term taken = std::move(slots_[slot_of_[term_id] - 1]);
+        release_term(term_id);
+        fn(to_postings(strings[term_id], std::move(taken), allow_stream_positions));
+    }
+    touched_ids_.clear();
+    // Everything is decoded: drop the arena, the slot pool and the slot index, then
+    // trim so the freed chunks can go back to the OS.
+    pool_.reset();
+    std::vector<Term>().swap(slots_);
+    std::vector<uint32_t>().swap(free_slots_);
+    std::vector<uint32_t>().swap(slot_of_);
+    TrimMalloc();
+    // With the arena reset and slot_of_ released, this report returns the bytes
+    // that were previously reported as resident.
+    report_arena_delta();
+    return Status::OK();
+}
+
+Status SpimiTermBuffer::drain_to_writer(RunWriter* w) {
+    // Writes every touched term to `w` keyed by term id, in vocab-string order so the
+    // run is sorted. After the first write failure the remaining terms are still
+    // released (so the buffer ends up empty) but no longer written.
+    const std::vector<std::string>& strings = vocab();
+    Status result = Status::OK();
+    for (const uint32_t term_id : sorted_ids()) {
+        Term taken = std::move(slots_[slot_of_[term_id] - 1]);
+        release_term(term_id);
+        // allow_stream=false: positions are materialized into the TermPostings here
+        // rather than exposed through a streaming pump.
+        TermPostings postings =
+                to_postings(strings[term_id], std::move(taken), /*allow_stream=*/false);
+        if (result.ok()) result = w->write_term(term_id, postings);
+    }
+    touched_ids_.clear();
+    pool_.reset(); // every chain has been decoded; free the arena for the next fill
+    // slot_of_ keeps its capacity across the spill; report the arena drop right away.
+    report_arena_delta();
+    return result;
+}
+
+Status SpimiTermBuffer::spill_to_run() {
+    const std::string temp_dir = resolve_temp_dir();
+    // Space pre-check (best effort; inherently racy): refuse early with a clear
+    // message instead of failing mid-write and leaving a half-written run behind.
+    // resident_bytes() -- arena plus slot index -- serves as the estimate of how
+    // much is about to be written.
+    const uint64_t needed = resident_bytes();
+    const uint64_t free_bytes = temp_dir_available_bytes(temp_dir);
+    if (free_bytes < needed) {
+        std::string msg = "spimi: insufficient temp space in '" + temp_dir + "' to spill ~";
+        msg += std::to_string(needed) + " B (~" + std::to_string(free_bytes);
+        msg += " B free); set SNII_TEMP_DIR/TMPDIR to a larger disk";
+        return Status::IoError(std::move(msg));
+    }
+    const std::string run_path = MakeRunPath(temp_dir);
+    RunWriter writer;
+    SNII_RETURN_IF_ERROR(writer.open(run_path));
+    run_paths_.push_back(run_path); // tracked for cleanup_runs() even if a later step fails
+    // Drain every touched term into the run. This clears touched_ids_ and resets the
+    // arena; slot_of_ keeps its capacity, so the next fill reuses it.
+    SNII_RETURN_IF_ERROR(drain_to_writer(&writer));
+    return writer.close();
+}
+
+Status SpimiTermBuffer::merge_runs(const std::function<void(TermPostings&&)>& fn,
+                                   bool allow_stream_positions) {
+    // Spill whatever is still buffered as one last run, so the k-way merge below
+    // reads run files only and never has to combine them with in-memory terms.
+    // A failure here is latched unless an earlier error is already recorded.
+    const bool has_resident_terms = !touched_ids_.empty();
+    if (has_resident_terms) {
+        const Status flushed = spill_to_run();
+        if (spill_status_.ok() && !flushed.ok()) spill_status_ = flushed;
+    }
+    // Any latched spill / add_token error aborts before a single term is emitted.
+    if (!spill_status_.ok()) return spill_status_;
+
+    // The merge never touches the accumulators: free the slot pool and the slot
+    // index first, then trim, so the merge phase does not carry input-phase memory.
+    std::vector<Term>().swap(slots_);
+    std::vector<uint32_t>().swap(free_slots_);
+    std::vector<uint32_t>().swap(slot_of_);
+    TrimMalloc();
+    // A final spill, if one ran, already reset (and reported) the arena; this report
+    // accounts for the slot index that was just released.
+    report_arena_delta();
+
+    Status merged =
+            MergeRuns(run_paths_, vocab(), has_positions_, fn, allow_stream_positions);
+    // Trim once more so chunks freed during the merge (coalesced per-term postings,
+    // per-run reader buffers) are not retained by the allocator after this call
+    // returns.
+    TrimMalloc();
+    return merged;
+}
+
+Status SpimiTermBuffer::for_each_term_sorted(const std::function<void(TermPostings&&)>& fn) {
+    // Single-drain contract: the buffer can be drained once. A repeat call is refused
+    // with an error and emits no terms.
+    if (drained_) {
+        return Status::Internal("spimi: already drained (single-drain contract)");
+    }
+    drained_ = true;
+    // fn runs synchronously during the drain, so on both paths large terms are allowed
+    // to stream their positions through pos_pump instead of materializing them.
+    constexpr bool kAllowStreaming = true;
+    const bool spilled_or_failed = !run_paths_.empty() || !spill_status_.ok();
+    if (spilled_or_failed) {
+        // Run files exist, or an error is latched: merge_runs flushes the remainder,
+        // surfaces any latched error, and otherwise k-way merges the runs into fn.
+        return merge_runs(fn, kAllowStreaming);
+    }
+    // Nothing was spilled and no error is latched: drain straight from memory.
+    return drain_sorted(fn, kAllowStreaming);
+}
+
+std::vector<TermPostings> SpimiTermBuffer::finalize_sorted() {
+    std::vector<TermPostings> collected;
+    // Single-drain contract, shared with for_each_term_sorted: a second drain of
+    // either kind latches an error (unless one is already set) and returns an
+    // empty vector instead of a wrong result.
+    if (drained_) {
+        if (spill_status_.ok()) {
+            spill_status_ = Status::Internal("spimi: already drained (single-drain contract)");
+        }
+        return collected;
+    }
+    drained_ = true;
+    collected.reserve(touched_ids_.size());
+
+    // The returned postings outlive the drain, so positions must be MATERIALIZED on
+    // both paths: a streamed pos_pump would read from the arena (in-memory drain)
+    // or from the run readers (merge), and neither survives past the end of the
+    // drain. Any failure is latched into spill_status_ unless one is already set.
+    constexpr bool kAllowStreaming = false;
+    const auto collect = [&collected](TermPostings&& tp) {
+        collected.push_back(std::move(tp));
+    };
+    const bool in_memory = run_paths_.empty() && spill_status_.ok();
+    Status outcome = in_memory ? drain_sorted(collect, kAllowStreaming)
+                               : merge_runs(collect, kAllowStreaming);
+    if (spill_status_.ok() && !outcome.ok()) spill_status_ = outcome;
+    return collected;
+}
+
+void SpimiTermBuffer::cleanup_runs() {
+    for (const auto& run_path : run_paths_) (void)std::remove(run_path.c_str()); // best effort
+    run_paths_.clear();
+}
+
+} // namespace snii::writer
diff --git a/be/src/storage/index/snii/snii_doris_adapter.cpp b/be/src/storage/index/snii/snii_doris_adapter.cpp
new file mode 100644
index 00000000000000..5756bdc8678540
--- /dev/null
+++ b/be/src/storage/index/snii/snii_doris_adapter.cpp
@@ -0,0 +1,249 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/index/snii/snii_doris_adapter.h"
+
+#include <fmt/format.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+
+#include "common/cast_set.h"
+
+namespace doris::segment_v2::snii_doris {
+
+thread_local const io::IOContext* DorisSniiFileReader::_scoped_io_ctx = nullptr; // per-thread override; set and restored by ScopedIOContext
+
+Status to_doris_status(const ::snii::Status& status) {
+    // Maps an SNII status onto the matching Doris status; messages get a "SNII: " prefix.
+    if (status.ok()) {
+        return Status::OK();
+    }
+    const auto& msg = status.message();
+    switch (status.code()) {
+    case ::snii::StatusCode::kNotFound:
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>("SNII: {}", msg);
+    case ::snii::StatusCode::kUnsupported:
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>("SNII: {}", msg);
+    case ::snii::StatusCode::kInvalidArgument:
+        return Status::Error<ErrorCode::INVALID_ARGUMENT>("SNII: {}", msg);
+    case ::snii::StatusCode::kCorruption:
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_CORRUPTED>("SNII: {}", msg);
+    case ::snii::StatusCode::kIoError:
+        return Status::IOError("SNII: {}", msg);
+    case ::snii::StatusCode::kInternal:
+    case ::snii::StatusCode::kOk: // presumably unreachable: ok() was handled above
+        break;
+    }
+    // kInternal, kOk and any value outside the enum all become an internal error.
+    return Status::InternalError("SNII: {}", msg);
+}
+
+::snii::Status to_snii_status(const Status& status) {
+    // Every Doris failure is reported to SNII as an IoError; the Doris error code is
+    // not carried over, only the stack-free message text.
+    return status.ok() ? ::snii::Status::OK()
+                       : ::snii::Status::IoError(status.to_string_no_stack());
+}
+
+::snii::Status DorisSniiFileWriter::append(::snii::Slice data) {
+    // Forwards the bytes to the wrapped Doris writer; a missing writer is an argument error.
+    if (_writer == nullptr) return ::snii::Status::InvalidArgument("doris writer is null");
+    // Re-wrap the SNII byte view as a Doris Slice built from a const char* and a length.
+    const Slice payload(reinterpret_cast<const char*>(data.data()), data.size());
+    return to_snii_status(_writer->append(payload));
+}
+
+::snii::Status DorisSniiFileWriter::finalize() {
+    // Only validates the handle: nothing is flushed or closed on the Doris writer here.
+    // NOTE(review): presumably the owner of the io::FileWriter closes it - confirm.
+    return _writer != nullptr ? ::snii::Status::OK()
+                              : ::snii::Status::InvalidArgument("doris writer is null");
+}
+
+uint64_t DorisSniiFileWriter::bytes_written() const {
+    return _writer != nullptr ? _writer->bytes_appended() : 0; // 0 when no writer is attached
+}
+
+DorisSniiFileReader::DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx)
+        : _reader(std::move(reader)), _default_io_ctx(_make_index_io_context(io_ctx)) {} // io_ctx is copied and tagged as index I/O, not retained
+
+io::IOContext DorisSniiFileReader::_make_index_io_context(const io::IOContext* io_ctx) {
+    // Start from a copy of the caller's context when one is given, otherwise from a
+    // fresh one, then tag the result as inverted-index / index-data I/O.
+    // The caller's object is only read, never modified.
+    io::IOContext tagged = (io_ctx != nullptr) ? *io_ctx : io::IOContext();
+    tagged.is_inverted_index = true;
+    tagged.is_index_data = true;
+    return tagged;
+}
+
+DorisSniiFileReader::ScopedIOContext::ScopedIOContext(const io::IOContext* io_ctx)
+        : _previous(_scoped_io_ctx), _io_ctx(DorisSniiFileReader::_make_index_io_context(io_ctx)) {
+    _scoped_io_ctx = &_io_ctx; // this thread's reads now use the scope's own tagged copy
+}
+
+DorisSniiFileReader::ScopedIOContext::~ScopedIOContext() {
+    _scoped_io_ctx = _previous; // restore the context that was active before this scope
+}
+
+::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len,
+                                            std::vector<uint8_t>* const out) {
+    SNII_RETURN_IF_ERROR(_read_at(offset, len, out));
+    if (len == 0) return ::snii::Status::OK(); // empty reads are not counted in the stats
+    const auto byte_count = cast_set<int64_t>(len);
+    _record_read_stats(byte_count, byte_count, /*range_read_count=*/1, /*serial_read_rounds=*/1);
+    return ::snii::Status::OK();
+}
+
+::snii::Status DorisSniiFileReader::_read_at(uint64_t offset, size_t len,
+                                             std::vector<uint8_t>* const out) const { // validated read of exactly len bytes; records no statistics
+    if (_reader == nullptr) {
+        return ::snii::Status::InvalidArgument("doris reader is null");
+    }
+    if (out == nullptr) {
+        return ::snii::Status::InvalidArgument("output buffer is null");
+    }
+    SNII_RETURN_IF_ERROR(_check_read_range(offset, len)); // overflow / past-end-of-file check
+    if (len == 0) {
+        out->clear(); // a zero-length read succeeds with an empty buffer and no I/O
+        return ::snii::Status::OK();
+    }
+    out->resize(len);
+    size_t bytes_read = 0;
+    auto status = _reader->read_at(offset, Slice(out->data(), len), &bytes_read, _current_io_ctx());
+    if (!status.ok()) {
+        return to_snii_status(status);
+    }
+    if (bytes_read != len) { // a short read is reported as an error, not as a partial result
+        return ::snii::Status::IoError(
+                fmt::format("short read at offset {}, expect {}, got {}", offset, len, bytes_read));
+    }
+    return ::snii::Status::OK();
+}
+
+::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Range>& ranges,
+                                               std::vector<std::vector<uint8_t>>* const outs) { // fills (*outs)[i] with the bytes of ranges[i]
+    if (outs == nullptr) {
+        return ::snii::Status::InvalidArgument("output buffers is null");
+    }
+    outs->clear();
+    outs->resize(ranges.size()); // one (initially empty) buffer per requested range
+    if (ranges.empty()) {
+        return ::snii::Status::OK();
+    }
+    // Pass 1: validate every range and keep the non-empty ones with their caller index.
+    struct IndexedRange {
+        uint64_t offset = 0;
+        size_t len = 0;
+        size_t index = 0; // position of the range in ranges / outs
+    };
+    int64_t request_bytes = 0;
+    std::vector<IndexedRange> sorted;
+    sorted.reserve(ranges.size());
+    for (size_t i = 0; i < ranges.size(); ++i) {
+        SNII_RETURN_IF_ERROR(_check_read_range(ranges[i].offset, ranges[i].len));
+        request_bytes += cast_set<int64_t>(ranges[i].len);
+        if (ranges[i].len == 0) {
+            continue; // its output buffer stays empty
+        }
+        sorted.push_back({ranges[i].offset, ranges[i].len, i});
+    }
+    if (sorted.empty()) { // every requested range was zero-length; no statistics are recorded
+        return ::snii::Status::OK();
+    }
+    std::sort(sorted.begin(), sorted.end(), [](const IndexedRange& lhs, const IndexedRange& rhs) {
+        return lhs.offset < rhs.offset;
+    });
+    // Pass 2: merge neighbours (gap <= max_coalesced_gap, span <= max_coalesced_read).
+    constexpr uint64_t max_coalesced_gap = 4096; // bytes
+    constexpr uint64_t max_coalesced_read = 1ULL << 20; // bytes (1 MiB)
+    int64_t read_bytes = 0;
+    int64_t range_read_count = 0;
+    for (size_t begin = 0; begin < sorted.size();) {
+        uint64_t read_offset = sorted[begin].offset;
+        uint64_t read_end = sorted[begin].offset + sorted[begin].len;
+        size_t end = begin + 1;
+        while (end < sorted.size()) {
+            const uint64_t next_end = sorted[end].offset + sorted[end].len;
+            if ((sorted[end].offset > read_end &&
+                 sorted[end].offset - read_end > max_coalesced_gap) ||
+                next_end - read_offset > max_coalesced_read) {
+                break;
+            }
+            read_end = std::max(read_end, next_end); // a range may lie inside the current span
+            ++end;
+        }
+        // One physical read per merged span; each caller range is then copied out of it.
+        std::vector<uint8_t> bytes;
+        const size_t read_len = cast_set<size_t>(read_end - read_offset);
+        SNII_RETURN_IF_ERROR(_read_at(read_offset, read_len, &bytes));
+        read_bytes += cast_set<int64_t>(read_len);
+        ++range_read_count;
+        for (size_t i = begin; i < end; ++i) {
+            const uint64_t pos = sorted[i].offset - read_offset;
+            auto& out = (*outs)[sorted[i].index];
+            out.assign(bytes.begin() + cast_set<ptrdiff_t>(pos),
+                       bytes.begin() + cast_set<ptrdiff_t>(pos + sorted[i].len));
+        }
+        begin = end; // the next group starts at the first range that was not merged
+    }
+    _record_read_stats(request_bytes, read_bytes, range_read_count, range_read_count); // reads run one after another, so rounds == reads
+    return ::snii::Status::OK();
+}
+
+uint64_t DorisSniiFileReader::size() const {
+    return _reader != nullptr ? _reader->size() : 0; // 0 when no reader is attached
+}
+
+const io::IOContext* DorisSniiFileReader::_current_io_ctx() const {
+    return _scoped_io_ctx == nullptr ? &_default_io_ctx : _scoped_io_ctx; // a scoped ctx wins
+}
+
+void DorisSniiFileReader::_record_read_stats(int64_t request_bytes, int64_t read_bytes,
+                                             int64_t range_read_count,
+                                             int64_t serial_read_rounds) const {
+    // Adds the four counters to the active context's file-cache statistics, if it has any.
+    auto* const sink = _current_io_ctx()->file_cache_stats;
+    if (sink == nullptr) {
+        return;
+    }
+    sink->inverted_index_request_bytes += request_bytes;
+    sink->inverted_index_read_bytes += read_bytes;
+    sink->inverted_index_range_read_count += range_read_count;
+    sink->inverted_index_serial_read_rounds += serial_read_rounds;
+}
+
+::snii::Status DorisSniiFileReader::_check_read_range(uint64_t offset, size_t len) const {
+    if (_reader == nullptr) {
+        return ::snii::Status::InvalidArgument("doris reader is null");
+    }
+    // offset + len must not wrap around uint64_t before it is compared with the size.
+    if (len > std::numeric_limits<uint64_t>::max() - offset) {
+        return ::snii::Status::Corruption(
+                fmt::format("read range overflows: offset {}, len {}", offset, len));
+    }
+    const uint64_t file_size = _reader->size();
+    if (offset + len <= file_size) {
+        return ::snii::Status::OK();
+    }
+    return ::snii::Status::Corruption(fmt::format(
+            "read range exceeds file size: offset {}, len {}, file size {}", offset, len, file_size));
+}
+
+} // namespace doris::segment_v2::snii_doris
diff --git a/be/src/storage/index/snii/snii_doris_adapter.h b/be/src/storage/index/snii/snii_doris_adapter.h
new file mode 100644
index 00000000000000..7f099466704d5b
--- /dev/null
+++ b/be/src/storage/index/snii/snii_doris_adapter.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "common/status.h"
+#include "io/fs/file_reader.h"
+#include "io/fs/file_writer.h"
+#include "io/io_common.h"
+#include "snii/common/status.h"
+#include "snii/io/file_reader.h"
+#include "snii/io/file_writer.h"
+#include "util/slice.h"
+
+namespace doris::segment_v2::snii_doris {
+
+Status to_doris_status(const ::snii::Status& status);
+::snii::Status to_snii_status(const Status& status);
+
+class DorisSniiFileWriter final : public ::snii::io::FileWriter { // SNII writer interface over a Doris io::FileWriter
+public:
+    explicit DorisSniiFileWriter(io::FileWriter* writer) : _writer(writer) {}
+    // ::snii::io::FileWriter overrides.
+    ::snii::Status append(::snii::Slice data) override;
+    ::snii::Status finalize() override;
+    uint64_t bytes_written() const override;
+
+private:
+    io::FileWriter* _writer = nullptr; // raw pointer; this class declares no destructor and never deletes it
+};
+
+class DorisSniiFileReader final : public ::snii::io::FileReader { // SNII reader interface over a Doris io::FileReader
+public:
+    class ScopedIOContext { // installs an IO context for the current thread for the object's lifetime
+    public:
+        explicit ScopedIOContext(const io::IOContext* io_ctx);
+        ~ScopedIOContext();
+        // Not copyable: the thread-local pointer refers to this object's own _io_ctx.
+        ScopedIOContext(const ScopedIOContext&) = delete;
+        ScopedIOContext& operator=(const ScopedIOContext&) = delete;
+
+    private:
+        const io::IOContext* _previous = nullptr; // value of _scoped_io_ctx at construction
+        io::IOContext _io_ctx; // tagged copy of the caller's context
+    };
+    // io_ctx (optional) is copied into the default context used outside any ScopedIOContext.
+    explicit DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx = nullptr);
+    // ::snii::io::FileReader overrides.
+    ::snii::Status read_at(uint64_t offset, size_t len, std::vector<uint8_t>* const out) override;
+    ::snii::Status read_batch(const std::vector<::snii::io::Range>& ranges,
+                              std::vector<std::vector<uint8_t>>* const outs) override;
+    uint64_t size() const override;
+
+private:
+    static io::IOContext _make_index_io_context(const io::IOContext* io_ctx);
+    ::snii::Status _check_read_range(uint64_t offset, size_t len) const;
+    ::snii::Status _read_at(uint64_t offset, size_t len, std::vector<uint8_t>* const out) const;
+    const io::IOContext* _current_io_ctx() const;
+    void _record_read_stats(int64_t request_bytes, int64_t read_bytes, int64_t range_read_count,
+                            int64_t serial_read_rounds) const;
+
+    io::FileReaderSPtr _reader;
+    io::IOContext _default_io_ctx; // used when no ScopedIOContext is active on the thread
+    static thread_local const io::IOContext* _scoped_io_ctx; // per-thread override, may be null
+};
+
+} // namespace doris::segment_v2::snii_doris
diff --git a/be/src/storage/index/snii/snii_index_reader.cpp b/be/src/storage/index/snii/snii_index_reader.cpp
new file mode 100644
index 00000000000000..2b7129074d92a7
--- /dev/null
+++ b/be/src/storage/index/snii/snii_index_reader.cpp
@@ -0,0 +1,398 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/index/snii/snii_index_reader.h"
+
+#include <CLucene.h>
+#include <fmt/format.h>
+
+#include <algorithm>
+#include <cctype>
+#include <charconv>
+#include <memory>
+#include <roaring/roaring.hh>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "common/config.h"
+#include "runtime/runtime_profile.h"
+#include "runtime/runtime_state.h"
+#include "snii/format/null_bitmap.h"
+#include "snii/query/boolean_query.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/phrase_query.h"
+#include "snii/query/prefix_query.h"
+#include "snii/query/regexp_query.h"
+#include "snii/query/term_query.h"
+#include "snii/query/wildcard_query.h"
+#include "snii/reader/logical_index_reader.h"
+#include "storage/index/index_file_reader.h"
+#include "storage/index/inverted/analyzer/analyzer.h"
+#include "storage/index/inverted/inverted_index_cache.h"
+#include "storage/index/inverted/inverted_index_iterator.h"
+#include "storage/index/snii/snii_doris_adapter.h"
+
+namespace doris::segment_v2 {
+
+namespace {
+
+class RoaringDocIdSink final : public snii::query::DocIdSink { // DocIdSink that writes into a caller-provided Roaring bitmap
+public:
+    explicit RoaringDocIdSink(roaring::Roaring* bitmap) : _bitmap(bitmap) {
+        DCHECK(_bitmap != nullptr);
+    }
+    // Adds a batch of doc ids to the bitmap; an empty batch is skipped.
+    snii::Status append_sorted(std::span<const uint32_t> docids) override {
+        if (!docids.empty()) {
+            _bitmap->addMany(docids.size(), docids.data());
+        }
+        return snii::Status::OK();
+    }
+    // Forwards to Roaring::addRange only when last_exclusive > first; otherwise a no-op.
+    snii::Status append_range(uint32_t first, uint64_t last_exclusive) override {
+        if (last_exclusive > first) {
+            _bitmap->addRange(first, last_exclusive);
+        }
+        return snii::Status::OK();
+    }
+
+private:
+    roaring::Roaring* _bitmap; // target bitmap; raw pointer, never deleted by this class
+};
+
+struct SniiQueryExecutionResult {
+    std::shared_ptr<roaring::Roaring> bitmap; // matching doc ids, assigned by execute_snii_query
+};
+
+std::vector<std::string> to_terms(const InvertedIndexQueryInfo& query_info) {
+    std::vector<std::string> flat; // one string per term info, in the same order
+    flat.reserve(query_info.term_infos.size());
+    for (const auto& info : query_info.term_infos) {
+        DCHECK(info.is_single_term());
+        flat.emplace_back(info.get_single_term());
+    }
+    return flat;
+}
+
+void parse_phrase_slop(std::string* query, InvertedIndexQueryInfo* query_info) {
+    // Recognises a trailing " ~N" or " ~N+" suffix (N = decimal digits) on a phrase query.
+    // On success stores N in query_info->slop, stores whether the '+' was present in
+    // query_info->ordered, and cuts the suffix (from the last space on) off *query.
+    // Any other input leaves *query and *query_info untouched.
+    DCHECK(query != nullptr);
+    DCHECK(query_info != nullptr);
+    const auto is_digits = [](std::string_view str) {
+        return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); });
+    };
+    const size_t last_space_pos = query->find_last_of(' ');
+    if (last_space_pos == std::string::npos) {
+        return;
+    }
+    // Need a '~' right after the last space and at least one more character after it.
+    const size_t tilde_pos = last_space_pos + 1;
+    if (tilde_pos >= query->size() - 1 || (*query)[tilde_pos] != '~') {
+        return;
+    }
+    const size_t slop_pos = tilde_pos + 1;
+    std::string_view slop_str(query->data() + slop_pos, query->size() - slop_pos);
+    bool ordered = false;
+    if (slop_str.size() == 1) {
+        if (!std::isdigit(static_cast<unsigned char>(slop_str[0]))) {
+            return;
+        }
+    } else if (slop_str.back() == '+') {
+        ordered = true;
+        slop_str.remove_suffix(1);
+    }
+    if (!is_digits(slop_str)) {
+        return;
+    }
+    // from_chars takes const char* bounds; string_view iterators need not be pointers.
+    const char* const slop_begin = slop_str.data();
+    auto result = std::from_chars(slop_begin, slop_begin + slop_str.size(), query_info->slop);
+    if (result.ec != std::errc()) {
+        return;
+    }
+    query_info->ordered = ordered;
+    *query = query->substr(0, last_space_pos);
+}
+
+std::string build_snii_query_cache_value(const InvertedIndexQueryInfo& query_info) {
+    // Encodes every term as "<byte length>:<term>@<position>;". The length prefix keeps
+    // term boundaries unambiguous whatever characters a term contains.
+    std::string encoded;
+    for (const auto& info : query_info.term_infos) {
+        DCHECK(info.is_single_term());
+        const auto& text = info.get_single_term();
+        encoded += std::to_string(text.size());
+        encoded.append(1, ':').append(text).append(1, '@');
+        encoded += std::to_string(info.position);
+        encoded += ';';
+    }
+    return encoded;
+}
+
+std::shared_ptr<roaring::Roaring> docids_to_bitmap(const std::vector<uint32_t>& docids) {
+    // Copies the doc ids into a fresh bitmap; runOptimize() is called on it in every
+    // case, including for an empty input.
+    auto bitmap = std::make_shared<roaring::Roaring>();
+    if (!docids.empty()) bitmap->addMany(docids.size(), docids.data());
+    bitmap->runOptimize();
+    return bitmap;
+}
+
+Status execute_snii_query(const snii::reader::LogicalIndexReader& logical_reader,
+                          InvertedIndexQueryType query_type,
+                          const InvertedIndexQueryInfo& query_info, std::string_view search_str,
+                          const std::vector<std::string>& terms, int32_t max_expansions,
+                          SniiQueryExecutionResult* result) { // dispatches query_type to the matching snii::query call
+    result->bitmap = std::make_shared<roaring::Roaring>(); // sink target; replaced below on the docid-vector paths
+    RoaringDocIdSink sink(result->bitmap.get());
+    std::vector<uint32_t> docids; // filled by the multi-term AND / phrase / phrase-prefix paths
+    bool emitted_to_sink = false; // true when the query wrote straight into result->bitmap
+    snii::Status status;
+    switch (query_type) {
+    case InvertedIndexQueryType::EQUAL_QUERY:
+    case InvertedIndexQueryType::MATCH_ANY_QUERY:
+        status = terms.size() == 1 ? snii::query::term_query(logical_reader, terms.front(), &sink)
+                                   : snii::query::boolean_or(logical_reader, terms, &sink);
+        emitted_to_sink = true;
+        break;
+    case InvertedIndexQueryType::MATCH_ALL_QUERY:
+        if (terms.size() == 1) {
+            status = snii::query::term_query(logical_reader, terms.front(), &sink);
+            emitted_to_sink = true;
+        } else {
+            status = snii::query::boolean_and(logical_reader, terms, &docids);
+        }
+        break;
+    case InvertedIndexQueryType::MATCH_PHRASE_QUERY:
+        if (query_info.slop != 0) { // checked before the single-term shortcut below
+            return Status::Error<ErrorCode::INVERTED_INDEX_BYPASS>(
+                    "SNII does not support sloppy phrase query yet");
+        }
+        if (terms.size() == 1) {
+            status = snii::query::term_query(logical_reader, terms.front(), &sink);
+            emitted_to_sink = true;
+        } else {
+            status = snii::query::phrase_query(logical_reader, terms, &docids);
+        }
+        break;
+    case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY:
+        if (terms.size() == 1) {
+            status =
+                    snii::query::prefix_query(logical_reader, terms.front(), &sink, max_expansions);
+            emitted_to_sink = true;
+        } else {
+            status = snii::query::phrase_prefix_query(logical_reader, terms, &docids,
+                                                      max_expansions);
+        }
+        break;
+    case InvertedIndexQueryType::MATCH_REGEXP_QUERY:
+        status = snii::query::regexp_query(logical_reader, search_str, &sink, max_expansions); // uses the raw search string, not terms
+        emitted_to_sink = true;
+        break;
+    case InvertedIndexQueryType::WILDCARD_QUERY:
+        status = snii::query::wildcard_query(logical_reader, search_str, &sink, max_expansions); // uses the raw search string, not terms
+        emitted_to_sink = true;
+        break;
+    case InvertedIndexQueryType::LESS_THAN_QUERY:
+    case InvertedIndexQueryType::LESS_EQUAL_QUERY:
+    case InvertedIndexQueryType::GREATER_THAN_QUERY:
+    case InvertedIndexQueryType::GREATER_EQUAL_QUERY:
+    case InvertedIndexQueryType::RANGE_QUERY:
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "SNII inverted index storage format does not support BKD/range query");
+    default:
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "SNII unsupported inverted index query type {}", query_type_to_string(query_type));
+    }
+    RETURN_IF_ERROR(snii_doris::to_doris_status(status)); // convert and propagate any SNII failure
+    if (emitted_to_sink) {
+        result->bitmap->runOptimize();
+    } else {
+        result->bitmap = docids_to_bitmap(docids); // replaces the (unused) sink bitmap
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+Status SniiIndexReader::new_iterator(std::unique_ptr<IndexIterator>* iterator) {
+    // Creates the iterator on first use; rejects a foreign iterator type (was a null deref).
+    if (*iterator == nullptr) *iterator = InvertedIndexIterator::create_unique();
+    auto* inverted_iterator = dynamic_cast<InvertedIndexIterator*>(iterator->get());
+    if (inverted_iterator == nullptr) return Status::InternalError("SNII: unexpected iterator type");
+    inverted_iterator->add_reader(
+            _reader_type, dynamic_pointer_cast<InvertedIndexReader>(shared_from_this()));
+    return Status::OK();
+}
+
+Status SniiIndexReader::_parse_query_terms(const IndexQueryContextPtr& context,
+                                           std::string search_str,
+                                           InvertedIndexQueryType query_type,
+                                           const InvertedIndexAnalyzerCtx* analyzer_ctx,
+                                           InvertedIndexQueryInfo* query_info) { // fills query_info->term_infos from search_str
+    DCHECK(query_info != nullptr);
+    if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY ||
+        query_type == InvertedIndexQueryType::WILDCARD_QUERY) {
+        query_info->term_infos.emplace_back(search_str, 0); // the pattern is kept verbatim, not analysed
+        return Status::OK();
+    }
+    if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+        query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
+        parse_phrase_slop(&search_str, query_info); // strips a trailing " ~N" / " ~N+" suffix from the local copy
+        SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
+        try { // NOTE(review): analyzer_ctx is not consulted on this path - confirm this is intended
+            query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+                    search_str, _index_meta.properties());
+        } catch (const CLuceneError& e) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                    "SNII analyze query failed: {}", e.what());
+        } catch (const Exception& e) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                    "SNII analyze query failed: {}", e.what());
+        }
+        return Status::OK();
+    }
+    // All remaining query types: the analyzer choice depends on analyzer_ctx.
+    SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
+    try {
+        if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) {
+            query_info->term_infos.emplace_back(search_str); // whole string becomes one term
+        } else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) {
+            auto reader = inverted_index::InvertedIndexAnalyzer::create_reader(
+                    analyzer_ctx->char_filter_map);
+            reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
+            query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+                    reader, analyzer_ctx->analyzer.get());
+        } else { // no usable analyzer in analyzer_ctx: analyse from the index properties
+            query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+                    search_str, _index_meta.properties());
+        }
+    } catch (const CLuceneError& e) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                "SNII analyze query failed: {}", e.what());
+    } catch (const Exception& e) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                "SNII analyze query failed: {}", e.what());
+    }
+    return Status::OK();
+}
+
+Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::string& column_name,
+                              const Field& query_value, InvertedIndexQueryType query_type,
+                              std::shared_ptr<roaring::Roaring>& bit_map,
+                              const InvertedIndexAnalyzerCtx* analyzer_ctx) { // evaluates one predicate; matching doc ids go to bit_map
+    SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
+    std::string search_str = query_value.get<PrimitiveType::TYPE_STRING>();
+    // Skip evaluation for string-type readers when the value exceeds the ignore_above property.
+    if (int ignore_above =
+                std::stoi(get_parser_ignore_above_value_from_properties(_index_meta.properties()));
+        _reader_type == InvertedIndexReaderType::STRING_TYPE && search_str.size() > ignore_above) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED>(
+                "query value is too long, evaluate skipped.");
+    }
+    // Turn the search string into terms; an empty result is tolerated only for match queries.
+    InvertedIndexQueryInfo query_info;
+    RETURN_IF_ERROR(_parse_query_terms(context, search_str, query_type, analyzer_ctx, &query_info));
+    if (query_info.term_infos.empty()) {
+        auto msg = fmt::format("token parser result is empty for SNII query '{}'", search_str);
+        if (is_match_query(query_type)) {
+            LOG(WARNING) << msg;
+            bit_map = std::make_shared<roaring::Roaring>(); // match query without terms matches nothing
+            return Status::OK();
+        }
+        return Status::Error<ErrorCode::INVERTED_INDEX_NO_TERMS>(msg);
+    }
+    // Build the cache key from the terms plus the options that are appended below.
+    auto terms = to_terms(query_info);
+    const int32_t max_expansions =
+            context->runtime_state == nullptr
+                    ? 50 // fallback when no runtime state is attached
+                    : context->runtime_state->query_options().inverted_index_max_expansions;
+    std::string cache_value = build_snii_query_cache_value(query_info);
+    if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+        cache_value += " " + std::to_string(query_info.slop);
+        cache_value += " " + std::to_string(query_info.ordered);
+    } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+               query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY ||
+               query_type == InvertedIndexQueryType::WILDCARD_QUERY) {
+        cache_value += " " + std::to_string(max_expansions); // these three query types receive max_expansions
+    }
+    auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
+    InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type,
+                                                 std::move(cache_value)};
+    auto* cache = InvertedIndexQueryCache::instance();
+    InvertedIndexQueryCacheHandle cache_handler;
+    if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) { // presumably a cache hit that already set bit_map
+        return Status::OK();
+    }
+    // Not served by handle_query_cache: open the SNII index with the query's IO context scoped.
+    snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(context->io_ctx);
+    RETURN_IF_ERROR(
+            _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx));
+    auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta));
+    // Run the query, hand the bitmap to the caller and insert it into the query cache.
+    SniiQueryExecutionResult query_result;
+    RETURN_IF_ERROR(execute_snii_query(*logical_reader, query_type, query_info, search_str, terms,
+                                       max_expansions, &query_result));
+    bit_map = std::move(query_result.bitmap);
+    cache->insert(cache_key, bit_map, &cache_handler);
+    return Status::OK();
+}
+
+Status SniiIndexReader::try_query(const IndexQueryContextPtr& /*context*/,
+                                  const std::string& /*column_name*/, const Field& /*query_value*/,
+                                  InvertedIndexQueryType /*query_type*/, size_t* /*count*/) {
+    return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>("SNII does not support try_query"); // always fails; no argument is read
+}
+
+Status SniiIndexReader::read_null_bitmap(const IndexQueryContextPtr& context,
+                                         InvertedIndexQueryCacheHandle* cache_handle,
+                                         lucene::store::Directory* /*dir*/) { // loads the null bitmap into the query cache; dir is unused
+    SCOPED_RAW_TIMER(&context->stats->inverted_index_query_null_bitmap_timer);
+    auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
+    InvertedIndexQueryCache::CacheKey cache_key {
+            index_file_key, "", InvertedIndexQueryType::UNKNOWN_QUERY, "null_bitmap"};
+    auto* cache = InvertedIndexQueryCache::instance();
+    if (cache->lookup(cache_key, cache_handle)) { // presumably a hit leaves the entry in cache_handle
+        return Status::OK();
+    }
+    // Not found by the lookup: open the SNII index and read its null-bitmap section.
+    snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(context->io_ctx);
+    RETURN_IF_ERROR(
+            _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx));
+    auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta));
+    auto null_bitmap = std::make_shared<roaring::Roaring>();
+    const auto& ref = logical_reader->section_refs().null_bitmap;
+    if (ref.length > 0) { // a zero-length section leaves the bitmap empty
+        std::vector<uint8_t> bytes;
+        RETURN_IF_ERROR(snii_doris::to_doris_status(
+                logical_reader->reader()->read_at(ref.offset, ref.length, &bytes)));
+        snii::format::NullBitmapReader reader;
+        RETURN_IF_ERROR(snii_doris::to_doris_status(
+                snii::format::NullBitmapReader::open(snii::Slice(bytes), &reader)));
+        reader.copy_to(null_bitmap.get());
+        null_bitmap->runOptimize();
+    }
+    cache->insert(cache_key, null_bitmap, cache_handle); // the empty bitmap is inserted as well
+    return Status::OK();
+}
+
+} // namespace doris::segment_v2
diff --git a/be/src/storage/index/snii/snii_index_reader.h b/be/src/storage/index/snii/snii_index_reader.h
new file mode 100644
index 00000000000000..5b504802a28f9f
--- /dev/null
+++ b/be/src/storage/index/snii/snii_index_reader.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "storage/index/inverted/inverted_index_query_type.h"
+#include "storage/index/inverted/inverted_index_reader.h"
+
+namespace doris::segment_v2 {
+
+class SniiIndexReader final : public InvertedIndexReader { // SNII-format index reader
+    ENABLE_FACTORY_CREATOR(SniiIndexReader);
+
+public:
+    SniiIndexReader(const TabletIndex* index_meta,
+                    const std::shared_ptr<IndexFileReader>& index_file_reader,
+                    InvertedIndexReaderType reader_type)
+            : InvertedIndexReader(index_meta, index_file_reader), _reader_type(reader_type) {}
+
+    Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
+    Status query(const IndexQueryContextPtr& context, const std::string& column_name,
+                 const Field& query_value, InvertedIndexQueryType query_type,
+                 std::shared_ptr<roaring::Roaring>& bit_map,
+                 const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) override;
+    Status try_query(const IndexQueryContextPtr& context, const std::string& column_name,
+                     const Field& query_value, InvertedIndexQueryType query_type,
+                     size_t* count) override; // not supported by SNII; see the .cpp
+    Status read_null_bitmap(const IndexQueryContextPtr& context,
+                            InvertedIndexQueryCacheHandle* cache_handle,
+                            lucene::store::Directory* dir = nullptr) override; // dir is unused
+    InvertedIndexReaderType type() override { return _reader_type; } // as passed to the ctor
+
+private:
+    Status _parse_query_terms(const IndexQueryContextPtr& context, std::string search_str,
+                              InvertedIndexQueryType query_type,
+                              const InvertedIndexAnalyzerCtx* analyzer_ctx,
+                              InvertedIndexQueryInfo* query_info);
+
+    InvertedIndexReaderType _reader_type; // returned by type(); set once in the constructor
+};
+
+} // namespace doris::segment_v2
diff --git a/be/src/storage/index/snii/snii_index_writer.cpp b/be/src/storage/index/snii/snii_index_writer.cpp
new file mode 100644
index 00000000000000..37f2d41963fb9a
--- /dev/null
+++ b/be/src/storage/index/snii/snii_index_writer.cpp
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/index/snii/snii_index_writer.h"
+
+#include <CLucene.h>
+
+#include <algorithm>
+
+#include "common/cast_set.h"
+#include "common/config.h"
+#include "storage/index/index_file_writer.h"
+#include "storage/index/inverted/analyzer/analyzer.h"
+#include "storage/index/inverted/query/query_info.h"
+#include "storage/tablet/tablet_schema.h"
+
+namespace doris::segment_v2 {
+
+SniiIndexColumnWriter::SniiIndexColumnWriter(IndexFileWriter* index_file_writer,
+                                             const TabletIndex* index_meta, bool /*single_field*/)
+        : _index_file_writer(index_file_writer), _index_meta(index_meta) {} // single_field unused
+
+Status SniiIndexColumnWriter::init() { // set up config, term buffer and analyzer from properties
+    _should_analyzer =
+            inverted_index::InvertedIndexAnalyzer::should_analyzer(_index_meta->properties());
+    _has_positions = get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
+                     INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES;
+    _config = _has_positions ? snii::format::IndexConfig::kDocsPositions
+                             : snii::format::IndexConfig::kDocsOnly;
+    auto ignore_above_value =
+            get_parser_ignore_above_value_from_properties(_index_meta->properties());
+    _ignore_above = cast_set<uint32_t>(std::stoul(ignore_above_value)); // stoul may throw here
+    const auto spill_threshold = // config value scaled by 1024 * 1024
+            static_cast<size_t>(config::inverted_index_ram_buffer_size * 1024 * 1024);
+    _memory_reporter = std::make_unique<snii::writer::MemoryReporter>(nullptr, spill_threshold);
+    _term_buffer = std::make_unique<snii::writer::SpimiTermBuffer>(_has_positions, spill_threshold,
+                                                                   _memory_reporter.get());
+    _analyzer_config.analyzer_name = get_analyzer_name_from_properties(_index_meta->properties());
+    _analyzer_config.parser_type = get_inverted_index_parser_type_from_string(
+            get_parser_string_from_properties(_index_meta->properties()));
+    _analyzer_config.parser_mode =
+            get_parser_mode_string_from_properties(_index_meta->properties());
+    _analyzer_config.char_filter_map =
+            get_parser_char_filter_map_from_properties(_index_meta->properties());
+    _analyzer_config.lower_case =
+            get_parser_lowercase_from_properties<true>(_index_meta->properties());
+    _analyzer_config.stop_words = get_parser_stopwords_from_properties(_index_meta->properties());
+    try { // NOTE(review): the std::stoul call above is outside this try block
+        _char_string_reader = inverted_index::InvertedIndexAnalyzer::create_reader(
+                _analyzer_config.char_filter_map);
+        if (_should_analyzer) { // analyzer is only built when analysis is enabled
+            _analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer(&_analyzer_config);
+        }
+    } catch (const CLuceneError& e) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                "SNII create analyzer failed: {}", e.what());
+    } catch (const Exception& e) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                "SNII create analyzer failed: {}", e.what());
+    }
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::_analyze(const Slice& value, std::vector<TermInfo>* terms) {
+    terms->clear(); // output is overwritten, never appended to
+    if (!_should_analyzer) { // no analysis: the whole value is a single term at position 0
+        TermInfo term;
+        term.term = std::string(value.data, value.size);
+        term.position = 0;
+        terms->emplace_back(std::move(term));
+        return Status::OK();
+    }
+    try { // analyzer exceptions are converted into an error Status
+        _char_string_reader->init(value.data, cast_set<int32_t>(value.size), false);
+        *terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result(_char_string_reader,
+                                                                           _analyzer.get());
+    } catch (const CLuceneError& e) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                "SNII analyze value failed: {}", e.what());
+    } catch (const Exception& e) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                "SNII analyze value failed: {}", e.what());
+    }
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::_add_value_tokens(const Slice& value, uint32_t docid,
+                                                uint32_t position_base, uint32_t* max_position) {
+    DCHECK(max_position != nullptr);
+    *max_position = position_base; // value reported when no token is added
+    if ((!_should_analyzer && value.size > _ignore_above) || (_should_analyzer && value.empty())) {
+        return Status::OK(); // value skipped: over ignore_above, or empty when analyzed
+    }
+    // Tokenize the value and add every resulting term to the term buffer under docid.
+    std::vector<TermInfo> terms;
+    RETURN_IF_ERROR(_analyze(value, &terms));
+    for (const auto& term_info : terms) {
+        DCHECK(term_info.is_single_term());
+        const auto& term = term_info.get_single_term();
+        const uint32_t position = // 0 whenever _has_positions is false
+                _has_positions ? position_base + cast_set<uint32_t>(term_info.position) : 0;
+        _term_buffer->add_token(term, docid, position);
+        *max_position = std::max(*max_position, position); // track the largest position used
+    }
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::add_values(const std::string /*name*/, const void* values,
+                                         size_t count) {
+    const auto* v = reinterpret_cast<const Slice*>(values); // values is read as count Slices
+    for (size_t i = 0; i < count; ++i) {
+        uint32_t max_position = 0; // required out-param; result unused here
+        RETURN_IF_ERROR(_add_value_tokens(*v, _rid, 0, &max_position));
+        ++v;
+        ++_rid; // each value is its own row id
+    }
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr,
+                                               const uint8_t* nested_null_map,
+                                               const uint8_t* offsets_ptr, size_t count) {
+    if (count == 0) {
+        return Status::OK();
+    }
+    const auto* offsets = reinterpret_cast<const uint64_t*>(offsets_ptr);
+    size_t start_off = 0; // index of row i's first element in value_ptr
+    for (size_t i = 0; i < count; ++i) { // touches offsets[0..count]
+        auto array_elem_size = offsets[i + 1] - offsets[i]; // element count of row i
+        uint32_t position_base = 0; // positions restart for every row
+        for (auto j = start_off; j < start_off + array_elem_size; ++j) {
+            if (nested_null_map != nullptr && nested_null_map[j] == 1) {
+                continue; // null elements add no tokens
+            }
+            const auto* value = reinterpret_cast<const Slice*>(
+                    reinterpret_cast<const uint8_t*>(value_ptr) + j * field_size);
+            uint32_t max_position = position_base;
+            RETURN_IF_ERROR(_add_value_tokens(*value, _rid, position_base, &max_position));
+            position_base = max_position + 1; // next element starts past this one
+        }
+        start_off += array_elem_size;
+        ++_rid; // one row id per array, not per element
+    }
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::add_nulls(uint32_t count) { // marks the next count rows null
+    _null_docids.reserve(_null_docids.size() + count);
+    for (uint32_t i = 0; i < count; ++i) {
+        _null_docids.push_back(_rid + i);
+    }
+    _rid += count; // null rows still consume row ids
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::add_array_nulls(const uint8_t* null_map, size_t num_rows) {
+    DCHECK(_rid >= num_rows); // null_map covers the last num_rows row ids
+    if (num_rows == 0 || null_map == nullptr) {
+        return Status::OK();
+    }
+    const auto first_row = _rid - num_rows; // row id matching null_map[0]
+    for (size_t i = 0; i < num_rows; ++i) {
+        if (null_map[i] == 1) { // 1 marks a null row
+            _null_docids.push_back(cast_set<uint32_t>(first_row + i));
+        }
+    }
+    return Status::OK();
+}
+
+Status SniiIndexColumnWriter::finish() { // hands buffered terms and nulls to the file writer
+    DCHECK(_term_buffer != nullptr);
+    auto status = _term_buffer->status();
+    if (!status.ok()) { // surface any error recorded by the term buffer
+        return Status::InternalError("SNII term buffer error: {}", status.to_string());
+    }
+    RETURN_IF_ERROR(_index_file_writer->add_snii_index(_index_meta, cast_set<uint32_t>(_rid),
+                                                       std::move(_null_docids), _term_buffer.get(),
+                                                       _config, _memory_reporter.get()));
+    _index_file_writer->retain_snii_memory_reporter(std::move(_memory_reporter));
+    _term_buffer.reset(); // buffer released after the reporter was handed over
+    return Status::OK();
+}
+
+void SniiIndexColumnWriter::close_on_error() { // drops buffered terms, reporter and null ids
+    _term_buffer.reset(); // reset before the reporter it was given in init()
+    _memory_reporter.reset();
+    _null_docids.clear();
+}
+
+} // namespace doris::segment_v2
diff --git a/be/src/storage/index/snii/snii_index_writer.h b/be/src/storage/index/snii/snii_index_writer.h
new file mode 100644
index 00000000000000..f9c6686bbed4cf
--- /dev/null
+++ b/be/src/storage/index/snii/snii_index_writer.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "snii/format/format_constants.h"
+#include "snii/writer/memory_reporter.h"
+#include "snii/writer/spimi_term_buffer.h"
+#include "storage/index/index_writer.h"
+#include "storage/index/inverted/inverted_index_parser.h"
+#include "storage/index/inverted/query/query_info.h"
+#include "storage/index/inverted/util/reader.h"
+#include "util/slice.h"
+
+namespace lucene::analysis {
+class Analyzer;
+}
+
+namespace doris::segment_v2 {
+
+class SniiIndexColumnWriter final : public IndexColumnWriter { // column writer for SNII indexes
+public:
+    SniiIndexColumnWriter(IndexFileWriter* index_file_writer, const TabletIndex* index_meta,
+                          bool single_field);
+    ~SniiIndexColumnWriter() override = default;
+
+    Status init() override;
+    Status add_values(const std::string name, const void* values, size_t count) override;
+    Status add_array_values(size_t field_size, const void* value_ptr,
+                            const uint8_t* nested_null_map, const uint8_t* offsets_ptr,
+                            size_t count) override;
+    Status add_nulls(uint32_t count) override;
+    Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override;
+    Status finish() override;
+    int64_t size() const override { return 0; } // always reports 0
+    void close_on_error() override;
+
+private:
+    Status _add_value_tokens(const Slice& value, uint32_t docid, uint32_t position_base,
+                             uint32_t* max_position);
+    Status _analyze(const Slice& value, std::vector<TermInfo>* terms);
+
+    IndexFileWriter* _index_file_writer = nullptr; // set by the constructor
+    const TabletIndex* _index_meta = nullptr; // set by the constructor
+    bool _should_analyzer = false; // when false, each value is indexed as one term
+    bool _has_positions = false; // true when phrase support is enabled in the properties
+    uint32_t _ignore_above = 0; // parsed from index properties in init()
+    uint32_t _rid = 0; // next row id to assign
+    snii::format::IndexConfig _config = snii::format::IndexConfig::kDocsOnly;
+    InvertedIndexAnalyzerConfig _analyzer_config;
+    inverted_index::ReaderPtr _char_string_reader;
+    std::shared_ptr<lucene::analysis::Analyzer> _analyzer; // only created when analyzing
+    std::unique_ptr<snii::writer::MemoryReporter> _memory_reporter;
+    std::unique_ptr<snii::writer::SpimiTermBuffer> _term_buffer;
+    std::vector<uint32_t> _null_docids; // row ids recorded as null
+};
+
+} // namespace doris::segment_v2
diff --git a/be/src/storage/rowset/beta_rowset.cpp b/be/src/storage/rowset/beta_rowset.cpp
index 70950dfe065634..4f6e038661958e 100644
--- a/be/src/storage/rowset/beta_rowset.cpp
+++ b/be/src/storage/rowset/beta_rowset.cpp
@@ -827,6 +827,9 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value,
     case InvertedIndexStorageFormatPB::V3:
         format_str = "V3";
         break;
+    case InvertedIndexStorageFormatPB::SNII:
+        format_str = "SNII";
+        break;
     default:
         return Status::InternalError("inverted index storage format error");
         break;
@@ -836,6 +839,19 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value,
     rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str.c_str(), allocator),
                             allocator);
     rapidjson::Value segments(rapidjson::kArrayType);
+    auto add_file_info_to_json = [&](const std::string& path,
+                                     rapidjson::Value& json_value) -> Status {
+        json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator), allocator);
+        int64_t idx_file_size = 0;
+        auto st = fs->file_size(path, &idx_file_size);
+        if (st != Status::OK()) {
+            LOG(WARNING) << "show nested index file get file size error, file: " << path
+                         << ", error: " << st.msg();
+            return st;
+        }
+        json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(), allocator);
+        return Status::OK();
+    };
     for (int seg_id = 0; seg_id < num_segments(); ++seg_id) {
         rapidjson::Value segment(rapidjson::kObjectType);
         segment.AddMember("segment_id", rapidjson::Value(seg_id).Move(), allocator);
@@ -846,24 +862,20 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value,
                 fs, std::string(index_file_path_prefix), storage_format, InvertedIndexFileInfo(),
                 _rowset_meta->tablet_id());
         RETURN_IF_ERROR(index_file_reader->init());
+        if (storage_format == InvertedIndexStorageFormatPB::SNII) {
+            rapidjson::Value index_file(rapidjson::kObjectType);
+            auto index_file_path =
+                    InvertedIndexDescriptor::get_index_file_path_v2(index_file_path_prefix);
+            RETURN_IF_ERROR(add_file_info_to_json(index_file_path, index_file));
+            segment.AddMember("index_files", rapidjson::Value(rapidjson::kArrayType).Move(),
+                              allocator);
+            auto& index_files = segment["index_files"];
+            index_files.PushBack(index_file, allocator);
+            segments.PushBack(segment, allocator);
+            continue;
+        }
         auto dirs = index_file_reader->get_all_directories();
 
-        auto add_file_info_to_json = [&](const std::string& path,
-                                         rapidjson::Value& json_value) -> Status {
-            json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator),
-                                 allocator);
-            int64_t idx_file_size = 0;
-            auto st = fs->file_size(path, &idx_file_size);
-            if (st != Status::OK()) {
-                LOG(WARNING) << "show nested index file get file size error, file: " << path
-                             << ", error: " << st.msg();
-                return st;
-            }
-            json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(),
-                                 allocator);
-            return Status::OK();
-        };
-
         auto process_files = [&allocator, &index_file_reader](auto& index_meta,
                                                               rapidjson::Value& indices,
                                                               rapidjson::Value& index) -> Status {
diff --git a/be/src/storage/segment/column_reader.cpp b/be/src/storage/segment/column_reader.cpp
index ebb1887c8ee920..262c1dd048be16 100644
--- a/be/src/storage/segment/column_reader.cpp
+++ b/be/src/storage/segment/column_reader.cpp
@@ -55,6 +55,7 @@
 #include "storage/index/index_reader.h"
 #include "storage/index/inverted/analyzer/analyzer.h"
 #include "storage/index/inverted/inverted_index_reader.h"
+#include "storage/index/snii/snii_index_reader.h"
 #include "storage/index/zone_map/zone_map_index.h"
 #include "storage/iterators.h"
 #include "storage/olap_common.h"
@@ -647,6 +648,17 @@ Status ColumnReader::_load_index(const std::shared_ptr<IndexFileReader>& index_f
     }
 
     IndexReaderPtr index_reader;
+    if (index_file_reader->get_storage_format() == InvertedIndexStorageFormatPB::SNII) {
+        if (!is_string_type(type)) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                    "SNII inverted index storage format does not support BKD index type {}", type);
+        }
+        auto reader_type = should_analyzer ? InvertedIndexReaderType::FULLTEXT
+                                           : InvertedIndexReaderType::STRING_TYPE;
+        index_reader = SniiIndexReader::create_shared(index_meta, index_file_reader, reader_type);
+        _index_readers[index_meta->index_id()] = index_reader;
+        return Status::OK();
+    }
 
     if (is_string_type(type)) {
         if (should_analyzer) {
diff --git a/be/src/storage/tablet/tablet_meta.cpp b/be/src/storage/tablet/tablet_meta.cpp
index b289cda58e7d3b..1e0660339fb4ec 100644
--- a/be/src/storage/tablet/tablet_meta.cpp
+++ b/be/src/storage/tablet/tablet_meta.cpp
@@ -101,6 +101,9 @@ TabletMetaSharedPtr TabletMeta::create(
         case TInvertedIndexStorageFormat::V2:
             inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V2;
             break;
+        case TInvertedIndexStorageFormat::SNII:
+            inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::SNII;
+            break;
         default:
             break;
         }
@@ -495,6 +498,9 @@ void TabletMeta::init_schema_from_thrift(const TTabletSchema& tablet_schema,
     case TInvertedIndexFileStorageFormat::V3:
         tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
         break;
+    case TInvertedIndexFileStorageFormat::SNII:
+        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::SNII);
+        break;
     default:
         tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
         break;
diff --git a/be/src/storage/task/index_builder.cpp b/be/src/storage/task/index_builder.cpp
index ef49626e143ab5..0e0ffeeb1d1036 100644
--- a/be/src/storage/task/index_builder.cpp
+++ b/be/src/storage/task/index_builder.cpp
@@ -338,6 +338,13 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
 
     if (_is_drop_op) {
         const auto& output_rs_tablet_schema = output_rowset_meta->tablet_schema();
+        if (output_rs_tablet_schema->get_inverted_index_storage_format() ==
+            InvertedIndexStorageFormatPB::SNII) {
+            LOG(INFO) << "skip physical SNII inverted index rewrite for drop index. tablet_id="
+                      << _tablet->tablet_id()
+                      << " rowset_id=" << output_rowset_meta->rowset_id().to_string();
+            return Status::OK();
+        }
         if (output_rs_tablet_schema->get_inverted_index_storage_format() !=
             InvertedIndexStorageFormatPB::V1) {
             const auto& fs = output_rowset_meta->fs();
@@ -421,6 +428,11 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
             _olap_data_convertor->reserve(_alter_inverted_indexes.size());
 
             std::unique_ptr<IndexFileWriter> index_file_writer = nullptr;
+            if (output_rowset_schema->get_inverted_index_storage_format() ==
+                InvertedIndexStorageFormatPB::SNII) {
+                return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                        "BUILD INDEX is not supported for SNII inverted index storage format yet");
+            }
             if (output_rowset_schema->get_inverted_index_storage_format() >=
                 InvertedIndexStorageFormatPB::V2) {
                 auto idx_file_reader_iter = _index_file_readers.find(
diff --git a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp
index e74ad758ac1db3..4e7bb6bb1d05a4 100644
--- a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp
+++ b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp
@@ -52,6 +52,10 @@ io::FileCacheStatistics make_file_cache_stats(int64_t multiplier) {
     stats.inverted_index_remote_io_timer = multiplier * 26;
     stats.inverted_index_peer_io_timer = multiplier * 27;
     stats.inverted_index_io_timer = multiplier * 28;
+    stats.inverted_index_request_bytes = multiplier * 29;
+    stats.inverted_index_read_bytes = multiplier * 30;
+    stats.inverted_index_range_read_count = multiplier * 31;
+    stats.inverted_index_serial_read_rounds = multiplier * 32;
     return stats;
 }
 
@@ -89,6 +93,10 @@ void expect_file_cache_stats_eq(const io::FileCacheStatistics& actual,
     EXPECT_EQ(actual.inverted_index_remote_io_timer, expected.inverted_index_remote_io_timer);
     EXPECT_EQ(actual.inverted_index_peer_io_timer, expected.inverted_index_peer_io_timer);
     EXPECT_EQ(actual.inverted_index_io_timer, expected.inverted_index_io_timer);
+    EXPECT_EQ(actual.inverted_index_request_bytes, expected.inverted_index_request_bytes);
+    EXPECT_EQ(actual.inverted_index_read_bytes, expected.inverted_index_read_bytes);
+    EXPECT_EQ(actual.inverted_index_range_read_count, expected.inverted_index_range_read_count);
+    EXPECT_EQ(actual.inverted_index_serial_read_rounds, expected.inverted_index_serial_read_rounds);
 }
 
 } // namespace
@@ -134,6 +142,14 @@ TEST(FileCacheProfileReporterTest, ReporterAggregatesDeltaReportsToExactFinalTot
     EXPECT_EQ(profile->get_counter("CacheGetOrSetTimer")->value(),
               after_second_report.cache_get_or_set_timer);
     EXPECT_EQ(profile->get_counter("LockWaitTimer")->value(), after_second_report.lock_wait_timer);
+    EXPECT_EQ(profile->get_counter("InvertedIndexRequestBytes")->value(),
+              after_second_report.inverted_index_request_bytes);
+    EXPECT_EQ(profile->get_counter("InvertedIndexReadBytes")->value(),
+              after_second_report.inverted_index_read_bytes);
+    EXPECT_EQ(profile->get_counter("InvertedIndexRangeReadCount")->value(),
+              after_second_report.inverted_index_range_read_count);
+    EXPECT_EQ(profile->get_counter("InvertedIndexSerialReadRounds")->value(),
+              after_second_report.inverted_index_serial_read_rounds);
 }
 
 } // namespace doris
diff --git a/be/test/storage/index/snii_doris_adapter_test.cpp b/be/test/storage/index/snii_doris_adapter_test.cpp
new file mode 100644
index 00000000000000..f307fb731daff5
--- /dev/null
+++ b/be/test/storage/index/snii_doris_adapter_test.cpp
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/index/snii/snii_doris_adapter.h"
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "common/status.h"
+#include "io/fs/file_reader.h"
+#include "io/fs/path.h"
+#include "io/io_common.h"
+#include "snii/io/file_reader.h"
+#include "util/slice.h"
+
+namespace doris::segment_v2::snii_doris {
+namespace {
+
+struct CapturedIOContext { // snapshot of the IOContext fields seen by a read
+    bool has_ctx = false; // true when the read received a non-null io_ctx
+    bool is_inverted_index = false;
+    bool is_index_data = false;
+    bool read_file_cache = true;
+    bool is_disposable = false;
+    io::FileCacheStatistics* file_cache_stats = nullptr;
+};
+
+struct CapturedRead { // one read_at_impl call recorded by RecordingFileReader
+    size_t offset = 0;
+    size_t len = 0; // size of the destination slice
+    CapturedIOContext io_ctx;
+};
+
+class RecordingFileReader final : public io::FileReader { // in-memory reader that logs reads
+public:
+    explicit RecordingFileReader(std::string data) : _data(std::move(data)) {}
+
+    Status close() override {
+        _closed = true;
+        return Status::OK();
+    }
+
+    const io::Path& path() const override { return _path; }
+    size_t size() const override { return _data.size(); }
+    bool closed() const override { return _closed; }
+    int64_t mtime() const override { return 0; }
+
+    const std::vector<CapturedRead>& reads() const { return _reads; } // in call order
+
+protected:
+    Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, // records, then copies
+                        const io::IOContext* io_ctx) override {
+        CapturedRead read;
+        read.offset = offset;
+        read.len = result.size;
+        if (io_ctx != nullptr) { // copy out the fields the tests assert on
+            read.io_ctx.has_ctx = true;
+            read.io_ctx.is_inverted_index = io_ctx->is_inverted_index;
+            read.io_ctx.is_index_data = io_ctx->is_index_data;
+            read.io_ctx.read_file_cache = io_ctx->read_file_cache;
+            read.io_ctx.is_disposable = io_ctx->is_disposable;
+            read.io_ctx.file_cache_stats = io_ctx->file_cache_stats;
+        }
+        _reads.push_back(read);
+
+        if (result.size > 0) {
+            std::memcpy(result.data, _data.data() + offset, result.size); // no bounds check
+        }
+        *bytes_read = result.size; // always reports a full read
+        return Status::OK();
+    }
+
+private:
+    std::string _data;
+    io::Path _path = "/tmp/snii_doris_adapter_test.idx";
+    bool _closed = false;
+    std::vector<CapturedRead> _reads;
+};
+
+} // namespace
+
+TEST(DorisSniiFileReaderTest, ReadAtPropagatesIndexIOContextAndRecordsStats) {
+    auto recording_reader = std::make_shared<RecordingFileReader>("0123456789abcdef");
+    DorisSniiFileReader reader(recording_reader);
+
+    io::FileCacheStatistics stats;
+    io::IOContext io_ctx;
+    io_ctx.is_disposable = true;
+    io_ctx.is_index_data = false; // the reader is expected to flip this to true
+    io_ctx.read_file_cache = false;
+    io_ctx.file_cache_stats = &stats;
+
+    std::vector<uint8_t> out;
+    { // presumably scope applies io_ctx to reads until it is destroyed
+        DorisSniiFileReader::ScopedIOContext scope(&io_ctx);
+        auto status = reader.read_at(2, 5, &out);
+        ASSERT_TRUE(status.ok()) << status.message();
+    }
+
+    ASSERT_EQ(out.size(), 5);
+    EXPECT_EQ(std::string(out.begin(), out.end()), "23456");
+    ASSERT_EQ(recording_reader->reads().size(), 1);
+    const auto& captured = recording_reader->reads()[0].io_ctx;
+    EXPECT_TRUE(captured.has_ctx);
+    EXPECT_TRUE(captured.is_inverted_index);
+    EXPECT_TRUE(captured.is_index_data); // overridden despite the caller passing false
+    EXPECT_FALSE(captured.read_file_cache); // caller-provided value passes through
+    EXPECT_TRUE(captured.is_disposable); // caller-provided value passes through
+    EXPECT_EQ(captured.file_cache_stats, &stats);
+
+    EXPECT_EQ(stats.inverted_index_request_bytes, 5); // single 5-byte read: requested == read
+    EXPECT_EQ(stats.inverted_index_read_bytes, 5);
+    EXPECT_EQ(stats.inverted_index_range_read_count, 1);
+    EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1);
+}
+
+TEST(DorisSniiFileReaderTest, ReadBatchRecordsLogicalAndCoalescedPhysicalIO) {
+    auto recording_reader =
+            std::make_shared<RecordingFileReader>("0123456789abcdefghijklmnopqrstuvwxyz");
+    DorisSniiFileReader reader(recording_reader);
+
+    io::FileCacheStatistics stats;
+    io::IOContext io_ctx;
+    io_ctx.file_cache_stats = &stats;
+
+    std::vector<std::vector<uint8_t>> outs;
+    {
+        DorisSniiFileReader::ScopedIOContext scope(&io_ctx);
+        std::vector<::snii::io::Range> ranges {{0, 4}, {6, 3}, {20, 2}}; // {offset, len}
+        auto status = reader.read_batch(ranges, &outs);
+        ASSERT_TRUE(status.ok()) << status.message();
+    }
+
+    ASSERT_EQ(outs.size(), 3);
+    EXPECT_EQ(std::string(outs[0].begin(), outs[0].end()), "0123");
+    EXPECT_EQ(std::string(outs[1].begin(), outs[1].end()), "678");
+    EXPECT_EQ(std::string(outs[2].begin(), outs[2].end()), "kl");
+
+    ASSERT_EQ(recording_reader->reads().size(), 1); // the three ranges become one physical read
+    EXPECT_EQ(recording_reader->reads()[0].offset, 0);
+    EXPECT_EQ(recording_reader->reads()[0].len, 22); // spans bytes 0..21, gaps included
+
+    EXPECT_EQ(stats.inverted_index_request_bytes, 9); // 4 + 3 + 2 logical bytes
+    EXPECT_EQ(stats.inverted_index_read_bytes, 22); // bytes of the coalesced read
+    EXPECT_EQ(stats.inverted_index_range_read_count, 1);
+    EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1);
+}
+
+} // namespace doris::segment_v2::snii_doris
diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp
new file mode 100644
index 00000000000000..d735770d8402cc
--- /dev/null
+++ b/be/test/storage/index/snii_query_test.cpp
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/pfor.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/prx_pod.h"
+#include "snii/io/file_reader.h"
+#include "snii/io/file_writer.h"
+#include "snii/query/docid_sink.h"
+#include "snii/query/phrase_query.h"
+#include "snii/query/term_query.h"
+#include "snii/reader/logical_index_reader.h"
+#include "snii/reader/snii_segment_reader.h"
+#include "snii/writer/snii_compound_writer.h"
+#include "snii/writer/spimi_term_buffer.h"
+
+namespace snii::query {
+namespace {
+
+// In-memory byte buffer that acts as both the FileWriter sink and the FileReader source.
+class MemoryFile final : public snii::io::FileReader, public snii::io::FileWriter {
+public:
+    Status append(Slice data) override {
+        const auto first = data.data();
+        bytes_.insert(bytes_.end(), first, first + data.size());
+        return Status::OK();
+    }
+
+    Status finalize() override {
+        finalized_ = true;
+        return Status::OK();
+    }
+
+    uint64_t bytes_written() const override { return bytes_.size(); }
+
+    // NOLINTBEGIN(readability-non-const-parameter): FileReader interface writes into out.
+    Status read_at(uint64_t offset, size_t len, std::vector<uint8_t>* out) override {
+        const bool in_bounds = offset <= bytes_.size() && len <= bytes_.size() - offset;
+        if (!in_bounds) {
+            return Status::Corruption("memory file read past eof");
+        }
+        out->assign(bytes_.data() + offset, bytes_.data() + offset + len);
+        return Status::OK();
+    }
+    // NOLINTEND(readability-non-const-parameter)
+
+    uint64_t size() const override { return bytes_.size(); }
+    bool finalized() const { return finalized_; }
+
+private:
+    std::vector<uint8_t> bytes_;
+    bool finalized_ = false;
+};
+
+class RecordingDocIdSink final : public DocIdSink {
+public:
+    Status append_sorted(std::span<const uint32_t> docids) override {
+        out.insert(out.cend(), docids.begin(), docids.end());
+        return Status::OK();
+    }
+    // Materializes every docid in [first, last_exclusive) and counts the range callback.
+    Status append_range(uint32_t first, uint64_t last_exclusive) override {
+        range_calls += 1;
+        for (uint64_t next = first; next < last_exclusive; ++next) {
+            out.emplace_back(static_cast<uint32_t>(next));
+        }
+        return Status::OK();
+    }
+    // Recorded results; the tests in this file read these fields directly.
+    std::vector<uint32_t> out;
+    size_t range_calls = 0;
+};
+
+struct PostingDoc { // test-side description of one document's postings for a single term
+    uint32_t docid = 0;
+    std::vector<uint32_t> positions; // one entry per occurrence; make_term() uses size() as freq
+};
+
+// Builds the writer-side postings of `term`: docids ascending, one freq per doc (its number
+// of positions) and the positions of all docs concatenated in docid order.
+writer::TermPostings make_term(std::string term, std::vector<PostingDoc> docs) {
+    std::ranges::sort(docs, {}, &PostingDoc::docid);
+
+    writer::TermPostings result;
+    result.term = std::move(term);
+    result.docids.reserve(docs.size());
+    result.freqs.reserve(docs.size());
+    for (const auto& [docid, positions] : docs) {
+        result.docids.push_back(docid);
+        result.freqs.push_back(static_cast<uint32_t>(positions.size()));
+        auto& flat = result.positions_flat;
+        flat.insert(flat.end(), positions.begin(), positions.end());
+    }
+    return result;
+}
+
+std::vector<PostingDoc> docs_with_one_position(uint32_t begin, uint32_t end, uint32_t position) {
+    std::vector<PostingDoc> docs; // one doc per id in [begin, end), each with only `position`
+    docs.reserve(end > begin ? end - begin : 0); // end <= begin: avoid unsigned wrap-around
+    for (uint32_t docid = begin; docid < end; ++docid) {
+        docs.push_back({docid, {position}});
+    }
+    return docs;
+}
+
+void assert_ok(const Status& status) {
+    ASSERT_TRUE(status.ok()) << status.to_string();
+}
+
+Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader,
+                    reader::LogicalIndexReader* index_reader) {
+    constexpr uint32_t kDocCount = 9000; // size of the shared fixture index built below
+    auto failed_docs = docs_with_one_position(0, kDocCount, 0); // "failed": position 0
+    auto order_docs = docs_with_one_position(0, kDocCount, 2); // "order": position 2
+    auto ordinal_docs = docs_with_one_position(0, kDocCount, 2); // "ordinal": position 2
+    auto driver_docs = docs_with_one_position(0, 8000, 0); // "driver": docs [0, 8000) only
+    auto almost_docs = docs_with_one_position(0, kDocCount, 1); // "almost": position 1
+    std::vector<PostingDoc> sparse_left_docs;
+    std::vector<PostingDoc> sparse_right_docs;
+    std::vector<PostingDoc> repeat_docs;
+    sparse_left_docs.reserve(kDocCount / 3 + 1);
+    sparse_right_docs.reserve(kDocCount);
+    repeat_docs.reserve(kDocCount);
+    for (uint32_t docid = 0; docid < kDocCount; ++docid) {
+        if (docid % 3 == 0) {
+            sparse_left_docs.push_back({docid, {0}});
+        }
+        if (docid % 4 != 1) {
+            sparse_right_docs.push_back({docid, {1}});
+        }
+        repeat_docs.push_back({docid, {0, 1, 2}}); // three consecutive positions in every doc
+    }
+    almost_docs.erase(almost_docs.begin() + 4000); // "almost" is missing from doc 4000
+    failed_docs[8000].positions = {0, 4}; // doc 8000 gets a second "failed" at position 4
+    for (PostingDoc& doc : order_docs) {
+        if (doc.docid == 5000 || doc.docid == 7000) {
+            doc.positions = {1}; // directly after "failed" at position 0
+        } else if (doc.docid == 8000) {
+            doc.positions = {5}; // directly after the extra "failed" at position 4
+        }
+    }
+    for (PostingDoc& doc : ordinal_docs) {
+        if (doc.docid == 6000) {
+            doc.positions = {1}; // directly after "failed" at position 0
+        }
+    }
+    // Describe one logical index (id 7, suffix "Body") carrying all eight terms.
+    writer::SniiIndexInput input;
+    input.index_id = 7;
+    input.index_suffix = "Body";
+    input.config = format::IndexConfig::kDocsPositions;
+    input.doc_count = kDocCount;
+    input.terms = {make_term("almost", std::move(almost_docs)),
+                   make_term("driver", std::move(driver_docs)),
+                   make_term("failed", std::move(failed_docs)),
+                   make_term("order", std::move(order_docs)),
+                   make_term("ordinal", std::move(ordinal_docs)),
+                   make_term("repeat", std::move(repeat_docs)),
+                   make_term("sparse_left", std::move(sparse_left_docs)),
+                   make_term("sparse_right", std::move(sparse_right_docs))};
+    // Write the index through the compound writer into `file`; finish() must finalize it.
+    writer::SniiCompoundWriter writer(file);
+    SNII_RETURN_IF_ERROR(writer.add_logical_index(input));
+    SNII_RETURN_IF_ERROR(writer.finish());
+    EXPECT_TRUE(file->finalized());
+    // Reopen the same buffer for reading and open the logical index by id and suffix.
+    SNII_RETURN_IF_ERROR(reader::SniiSegmentReader::open(file, segment_reader));
+    return segment_reader->open_index(input.index_id, input.index_suffix, index_reader);
+}
+
+TEST(SniiPhraseQueryTest, WindowedPhraseQueryKeepsCorrectCandidateOrdinals) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    // In the fixture only docs 5000, 7000 and 8000 place "order" right after "failed".
+    std::vector<uint32_t> hits;
+    assert_ok(phrase_query(logical_index, {"failed", "order"}, &hits));
+
+    EXPECT_EQ(hits, (std::vector<uint32_t> {5000, 7000, 8000}));
+}
+
+TEST(SniiPhraseQueryTest, WindowedPhrasePrefixQueryKeepsCorrectCandidateOrdinals) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    // "ord" is a prefix of both "order" and "ordinal"; doc 6000 qualifies through "ordinal".
+    std::vector<uint32_t> hits;
+    assert_ok(phrase_prefix_query(logical_index, {"failed", "ord"}, &hits, 10));
+
+    EXPECT_EQ(hits, (std::vector<uint32_t> {5000, 6000, 7000, 8000}));
+}
+
+TEST(SniiPhraseQueryTest, SingleTailPhrasePrefixUsesStreamingPhrasePath) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    // Among the fixture terms "orde" prefixes only "order": same docs as the exact phrase.
+    std::vector<uint32_t> hits;
+    assert_ok(phrase_prefix_query(logical_index, {"failed", "orde"}, &hits, 10));
+
+    EXPECT_EQ(hits, (std::vector<uint32_t> {5000, 7000, 8000}));
+}
+
+TEST(SniiPhraseQueryTest, MultiTermPhraseUsesPairPrefilter) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    // Doc 8000 has "failed order" adjacent (4, 5) but "ordinal" at 2, so it drops out.
+    std::vector<uint32_t> hits;
+    assert_ok(phrase_query(logical_index, {"failed", "order", "ordinal"}, &hits));
+
+    EXPECT_EQ(hits, (std::vector<uint32_t> {5000, 7000}));
+}
+
+TEST(SniiPhraseQueryTest, RepeatedTermPhraseUsesCachedPostingSpan) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    std::vector<uint32_t> hits;
+    assert_ok(phrase_query(logical_index, {"repeat", "repeat", "repeat"}, &hits));
+    // Every fixture doc stores "repeat" at positions 0, 1 and 2, so all 9000 docids match.
+    std::vector<uint32_t> all_docs(9000);
+    std::iota(all_docs.begin(), all_docs.end(), 0);
+    EXPECT_EQ(hits, all_docs);
+}
+
+TEST(SniiPhraseQueryTest, DenseTermWithMissingDocKeepsCandidateOrdinals) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    // "driver" is present in docs [0, 8000) only.
+    std::vector<uint32_t> driver_hits;
+    assert_ok(term_query(logical_index, "driver", &driver_hits));
+    EXPECT_EQ(driver_hits.size(), 8000);
+
+    // "almost" is present in every doc except 4000, so later ordinals are shifted by one.
+    std::vector<uint32_t> almost_hits;
+    assert_ok(term_query(logical_index, "almost", &almost_hits));
+    EXPECT_EQ(almost_hits.size(), 8999);
+    ASSERT_GT(almost_hits.size(), 6144);
+    EXPECT_EQ(almost_hits[3999], 3999);
+    EXPECT_EQ(almost_hits[4000], 4001);
+    EXPECT_EQ(almost_hits[6143], 6144);
+    EXPECT_EQ(almost_hits[6144], 6145);
+
+    std::vector<uint32_t> phrase_hits;
+    assert_ok(phrase_query(logical_index, {"driver", "almost"}, &phrase_hits));
+
+    // "driver" sits at position 0 and "almost" at position 1 wherever both occur, so the
+    // phrase matches the docs of "driver" minus the single doc that "almost" lacks.
+    std::vector<uint32_t> expected(8000);
+    std::iota(expected.begin(), expected.end(), 0);
+    expected.erase(expected.begin() + 4000);
+    EXPECT_EQ(phrase_hits, expected);
+}
+
+TEST(SniiPhraseQueryTest, SparseWindowBitsetKeepsCandidateOrdinals) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    std::vector<uint32_t> hits;
+    assert_ok(phrase_query(logical_index, {"sparse_left", "sparse_right"}, &hits));
+    // sparse_left covers the multiples of 3; sparse_right skips every docid % 4 == 1.
+    std::vector<uint32_t> expected;
+    for (uint32_t docid = 0; docid < 9000; docid += 3) {
+        if (docid % 4 != 1) {
+            expected.push_back(docid);
+        }
+    }
+    EXPECT_EQ(hits, expected);
+}
+
+TEST(SniiTermQueryTest, WindowedDenseTermEmitsRangesToSink) {
+    MemoryFile storage;
+    reader::SniiSegmentReader segment;
+    reader::LogicalIndexReader logical_index;
+    assert_ok(build_reader(&storage, &segment, &logical_index));
+
+    RecordingDocIdSink recorder;
+    assert_ok(term_query(logical_index, "failed", &recorder));
+    // "failed" occurs in all 9000 docs; at least one batch must arrive via append_range().
+    std::vector<uint32_t> all_docs(9000);
+    std::iota(all_docs.begin(), all_docs.end(), 0);
+    EXPECT_EQ(recorder.out, all_docs);
+    EXPECT_GT(recorder.range_calls, 0);
+}
+
+TEST(SniiPrxPodTest, SelectivePforCsrMatchesFullCsrAcrossRuns) {
+    // 320 docs: every fifth doc has two positions (3 * doc and 3 * doc + 2), the rest one.
+    std::vector<uint32_t> freqs;
+    std::vector<uint32_t> positions;
+    freqs.reserve(320);
+    for (uint32_t doc = 0; doc < 320; ++doc) {
+        const bool has_second = doc % 5 == 0;
+        freqs.push_back(has_second ? 2U : 1U);
+        positions.push_back(doc * 3);
+        if (has_second) {
+            positions.push_back(doc * 3 + 2);
+        }
+    }
+
+    ByteSink encoded;
+    assert_ok(format::build_prx_window_flat(positions, freqs, -1, &encoded));
+
+    std::vector<uint32_t> full_positions;
+    std::vector<uint32_t> full_offsets;
+    ByteSource full_source(encoded.view());
+    assert_ok(format::read_prx_window_csr(&full_source, &full_positions, &full_offsets));
+
+    // Decodes only `selected_docs` and compares each doc's positions with the full decode.
+    auto expect_selection_matches_full = [&](const std::vector<uint32_t>& selected_docs) {
+        std::vector<uint32_t> picked_positions;
+        std::vector<uint32_t> picked_offsets;
+        ByteSource picked_source(encoded.view());
+        assert_ok(format::read_prx_window_csr_selective(&picked_source, selected_docs,
+                                                        &picked_positions, &picked_offsets));
+        ASSERT_EQ(picked_offsets.size(), selected_docs.size() + 1);
+        for (size_t i = 0; i < selected_docs.size(); ++i) {
+            const uint32_t doc = selected_docs[i];
+            const std::vector<uint32_t> expected(full_positions.begin() + full_offsets[doc],
+                                                 full_positions.begin() + full_offsets[doc + 1]);
+            const std::vector<uint32_t> actual(picked_positions.begin() + picked_offsets[i],
+                                               picked_positions.begin() + picked_offsets[i + 1]);
+            EXPECT_EQ(actual, expected);
+        }
+    };
+
+    expect_selection_matches_full({0, 1, 2});
+    expect_selection_matches_full({0, 1, 127, 128, 129, 255, 256, 319});
+}
+
+TEST(SniiPforTest, LowBitWidthFastPathsRoundTrip) {
+    // Encodes `values`, checks the first encoded byte, then decodes over a sentinel fill.
+    auto check_round_trip = [](const std::vector<uint32_t>& values, uint8_t expected_width) {
+        ByteSink encoded;
+        snii::pfor_encode(values.data(), values.size(), &encoded);
+        ASSERT_FALSE(encoded.buffer().empty());
+        EXPECT_EQ(encoded.buffer().front(), expected_width);
+        std::vector<uint32_t> decoded(values.size(), 0xFFFFFFFF);
+        ByteSource input(encoded.view());
+        assert_ok(snii::pfor_decode(&input, values.size(), decoded.data()));
+        EXPECT_TRUE(input.eof());
+        EXPECT_EQ(decoded, values);
+    };
+
+    std::vector<uint32_t> one_bit(128);
+    for (uint32_t i = 0; uint32_t& value : one_bit) {
+        value = i++ & 1U;
+    }
+    check_round_trip(one_bit, 1);
+    // A single large outlier is expected to leave the first encoded byte at 1.
+    one_bit[17] = 1000;
+    check_round_trip(one_bit, 1);
+
+    std::vector<uint32_t> two_bit(128);
+    for (uint32_t i = 0; uint32_t& value : two_bit) {
+        value = i++ & 3U;
+    }
+    check_round_trip(two_bit, 2);
+
+    std::vector<uint32_t> three_bit(131);
+    for (uint32_t i = 0; uint32_t& value : three_bit) {
+        value = i++ & 7U;
+    }
+    check_round_trip(three_bit, 3);
+
+    std::vector<uint32_t> four_bit(128);
+    for (uint32_t i = 0; uint32_t& value : four_bit) {
+        value = i++ & 15U;
+    }
+    check_round_trip(four_bit, 4);
+
+    std::vector<uint32_t> five_bit(129);
+    for (uint32_t i = 0; uint32_t& value : five_bit) {
+        value = i++ & 31U;
+    }
+    check_round_trip(five_bit, 5);
+
+    std::vector<uint32_t> six_bit(130);
+    for (uint32_t i = 0; uint32_t& value : six_bit) {
+        value = i++ & 63U;
+    }
+    check_round_trip(six_bit, 6);
+
+    std::vector<uint32_t> seven_bit(131);
+    for (uint32_t i = 0; uint32_t& value : seven_bit) {
+        value = i++ & 127U;
+    }
+    check_round_trip(seven_bit, 7);
+
+    std::vector<uint32_t> eight_bit(256);
+    for (uint32_t i = 0; uint32_t& value : eight_bit) {
+        value = i++;
+    }
+    check_round_trip(eight_bit, 8);
+}
+
+} // namespace
+} // namespace snii::query
diff --git a/be/test/storage/segment/inverted_index_fs_directory_test.cpp b/be/test/storage/segment/inverted_index_fs_directory_test.cpp
index d42559a0e39975..99cd9d8b613cc7 100644
--- a/be/test/storage/segment/inverted_index_fs_directory_test.cpp
+++ b/be/test/storage/segment/inverted_index_fs_directory_test.cpp
@@ -287,6 +287,58 @@ TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalWithBytesReadError) {
     _CLDELETE(input);
 }
 
+TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalRecordsIndexIOStatsAndContext) {
+    std::filesystem::path test_file = _tmp_dir / "test_file_with_stats";
+    std::ofstream ofs(test_file);
+    ofs << "test content for stats";
+    ofs.close();
+
+    lucene::store::IndexInput* input = nullptr;
+    CLuceneError error;
+
+    bool result =
+            DorisFSDirectory::FSIndexInput::open(_fs, test_file.string().c_str(), input, error);
+    ASSERT_TRUE(result); // fatal on failure: everything below dereferences `input`
+    // Caller-side context: is_index_data starts out false and read_file_cache is disabled.
+    io::FileCacheStatistics stats;
+    io::IOContext io_ctx;
+    io_ctx.is_disposable = true;
+    io_ctx.is_index_data = false;
+    io_ctx.read_file_cache = false;
+    io_ctx.file_cache_stats = &stats;
+
+    input->setIoContext(&io_ctx);
+    input->setIndexFile(true);
+    // A single 6-byte read; the four inverted-index counters are checked further down.
+    uint8_t buffer[6];
+    input->readBytes(buffer, 6, false);
+    EXPECT_EQ(std::string(reinterpret_cast<char*>(buffer), 6), "test c");
+    // The context exposed by the input carries the caller's settings plus the index flags.
+    const auto* captured = static_cast<const io::IOContext*>(input->getIoContext());
+    EXPECT_TRUE(captured->is_inverted_index);
+    EXPECT_TRUE(captured->is_index_data);
+    EXPECT_FALSE(captured->read_file_cache);
+    EXPECT_TRUE(captured->is_disposable);
+    EXPECT_EQ(captured->file_cache_stats, &stats);
+
+    EXPECT_EQ(stats.inverted_index_request_bytes, 6);
+    EXPECT_EQ(stats.inverted_index_read_bytes, 6);
+    EXPECT_EQ(stats.inverted_index_range_read_count, 1);
+    EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1);
+    // Clearing the context must drop the stats pointer while the index flags stay set.
+    input->setIoContext(nullptr);
+    captured = static_cast<const io::IOContext*>(input->getIoContext());
+    EXPECT_TRUE(captured->is_inverted_index);
+    EXPECT_TRUE(captured->is_index_data);
+    EXPECT_EQ(captured->file_cache_stats, nullptr);
+    // setIndexFile(false) is expected to clear is_index_data again.
+    input->setIndexFile(false);
+    captured = static_cast<const io::IOContext*>(input->getIoContext());
+    EXPECT_FALSE(captured->is_index_data);
+
+    _CLDELETE(input);
+}
+
 // Test 19: FSIndexOutput init error
 TEST_F(DorisFSDirectoryTest, FSIndexOutputInitError) {
     DebugPoints::instance()->add(
@@ -841,4 +893,4 @@ TEST_F(DorisFSDirectoryTest, PrivGetFN) {
     }
 }
 
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 54129adf81bed0..3e8def3c9710a5 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -105,6 +105,12 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
             checkInvertedIndexProperties(properties, colType, invertedIndexFileStorageFormat);
         }
 
+        if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII
+                && !colType.isStringType() && !colType.isArrayType()) {
+            throw new AnalysisException("SNII inverted index storage format only supports string columns, column: "
+                    + indexColName + " type: " + colType);
+        }
+
         // default is "none" if not set
         if (parser == null) {
             parser = INVERTED_INDEX_PARSER_NONE;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
index b208e712c273c4..24337cd4929316 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
@@ -379,11 +379,15 @@ public OlapFile.TabletMetaCloudPB.Builder createTabletMetaBuilder(long tableId,
                 schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2);
             } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V3) {
                 schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3);
+            } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) {
+                schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.SNII);
             } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.DEFAULT) {
                 if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) {
                     schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V1);
                 } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) {
                     schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2);
+                } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) {
+                    schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.SNII);
                 } else {
                     schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3);
                 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
index b27db96bbe176b..392131a8cd4ea1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
@@ -1219,6 +1219,8 @@ public static TInvertedIndexFileStorageFormat analyzeInvertedIndexFileStorageFor
                 return TInvertedIndexFileStorageFormat.V1;
             } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) {
                 return TInvertedIndexFileStorageFormat.V2;
+            } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) {
+                return TInvertedIndexFileStorageFormat.SNII;
             } else {
                 return TInvertedIndexFileStorageFormat.V3;
             }
@@ -1230,11 +1232,15 @@ public static TInvertedIndexFileStorageFormat analyzeInvertedIndexFileStorageFor
             return TInvertedIndexFileStorageFormat.V2;
         } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v3")) {
             return TInvertedIndexFileStorageFormat.V3;
+        } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("snii")) {
+            return TInvertedIndexFileStorageFormat.SNII;
         } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("default")) {
             if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) {
                 return TInvertedIndexFileStorageFormat.V1;
             } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) {
                 return TInvertedIndexFileStorageFormat.V2;
+            } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) {
+                return TInvertedIndexFileStorageFormat.SNII;
             } else {
                 return TInvertedIndexFileStorageFormat.V3;
             }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
index 494e756538b112..bf5aac95225629 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
@@ -31,6 +31,7 @@
 import org.apache.doris.common.Config;
 import org.apache.doris.common.UserException;
 import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.thrift.TInvertedIndexFileStorageFormat;
 
 import com.google.common.collect.Maps;
 import org.apache.commons.lang3.StringUtils;
@@ -134,6 +135,10 @@ public void validate(ConnectContext ctx) throws UserException {
         }
 
         IndexType indexType = existedIdx.getIndexType();
+        OlapTable olapTable = (OlapTable) table;
+        if (olapTable.getInvertedIndexFileStorageFormat() == TInvertedIndexFileStorageFormat.SNII) {
+            throw new AnalysisException("BUILD INDEX is not supported for SNII inverted index storage format yet");
+        }
         if ((Config.isNotCloudMode() && indexType == IndexType.NGRAM_BF)
                 || indexType == IndexType.BLOOMFILTER
                 || (Config.isCloudMode()
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java
index 14869e7925cf86..9303ebf95bcb7b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java
@@ -848,8 +848,10 @@ public void validate(ConnectContext ctx) {
                 }
                 if (indexDef.getIndexType() == IndexType.ANN) {
                     if (invertedIndexFileStorageFormat != null
-                            && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) {
-                        throw new AnalysisException("ANN index is not supported in index format V1");
+                            && (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1
+                            || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII)) {
+                        throw new AnalysisException("ANN index is not supported in index format "
+                                + invertedIndexFileStorageFormat);
                     }
                 }
                 for (String indexColName : indexDef.getColumnNames()) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java
index 8630d80b7dc0ab..36f256994a7116 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java
@@ -164,6 +164,11 @@ public void checkColumn(ColumnDefinition column, KeysType keysType,
                         "ANN index can only be used in DUP_KEYS table or UNIQUE_KEYS table with"
                                 + " merge-on-write enabled");
             }
+            if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1
+                    || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) {
+                throw new AnalysisException("ANN index is not supported in index format "
+                        + invertedIndexFileStorageFormat);
+            }
             return;
         }
 
@@ -177,6 +182,17 @@ public void checkColumn(ColumnDefinition column, KeysType keysType,
                 throw new AnalysisException(colType + " is not supported in " + indexType.toString()
                         + " index. " + "invalid index: " + name);
             }
+            if (indexType == IndexType.INVERTED
+                    && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) {
+                boolean isStringIndex = colType.isStringLikeType()
+                        || (colType.isArrayType()
+                            && ((ArrayType) colType).getItemType().isStringLikeType());
+                if (!isStringIndex) {
+                    throw new AnalysisException(
+                            "SNII inverted index storage format does not support BKD index on column: "
+                                    + indexColName);
+                }
+            }
 
             // In inverted index format v1, each subcolumn of a variant has its own index file, leading to high IOPS.
             // when the subcolumn type changes, it may result in missing files, causing link file failure.
@@ -264,8 +280,10 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe
                         "ANN index can only be used in DUP_KEYS table or UNIQUE_KEYS table with"
                                 + " merge-on-write enabled");
             }
-            if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) {
-                throw new AnalysisException("ANN index is not supported in index format V1");
+            if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1
+                    || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) {
+                throw new AnalysisException("ANN index is not supported in index format "
+                        + invertedIndexFileStorageFormat);
             }
             return;
         }
@@ -280,9 +298,16 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe
                 throw new AnalysisException(colType + " is not supported in " + indexType.toString() + " index. "
                     + "invalid index: " + name);
             }
-
-            if (indexType == IndexType.ANN && !colType.isArrayType()) {
-                throw new AnalysisException("ANN index column must be array type");
+            if (indexType == IndexType.INVERTED
+                    && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) {
+                boolean isStringIndex = colType.isStringType()
+                        || (colType.isArrayType()
+                            && ((org.apache.doris.catalog.ArrayType) columnType).getItemType().isStringType());
+                if (!isStringIndex) {
+                    throw new AnalysisException(
+                            "SNII inverted index storage format does not support BKD index on column: "
+                                    + indexColName);
+                }
             }
 
             // In inverted index format v1, each subcolumn of a variant has its own index file, leading to high IOPS.
diff --git a/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java
index fa6260d19f7a8d..8a836b6b5d6f2c 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java
@@ -46,6 +46,7 @@
 import org.apache.doris.qe.ConnectContext;
 import org.apache.doris.task.AgentTask;
 import org.apache.doris.task.AgentTaskQueue;
+import org.apache.doris.thrift.TInvertedIndexFileStorageFormat;
 import org.apache.doris.thrift.TStatusCode;
 import org.apache.doris.thrift.TTaskType;
 import org.apache.doris.transaction.FakeTransactionIDGenerator;
@@ -195,6 +196,47 @@ public void testBuildIndexIndexChange() throws UserException {
         Assert.assertEquals(OlapTableState.NORMAL, olapTable.getState());
     }
 
+    @Test // BUILD INDEX validation must fail once the table's inverted index storage format is SNII.
+    public void testBuildIndexRejectedForSniiStorageFormat() throws UserException {
+        if (fakeEnv != null) { // close any previous fake Env before installing a fresh one
+            fakeEnv.close();
+        }
+        fakeEnv = new FakeEnv();
+        if (fakeEditLog != null) { // same close-then-recreate handling for the fake edit log
+            fakeEditLog.close();
+        }
+        fakeEditLog = new FakeEditLog();
+        FakeEnv.setEnv(masterEnv);
+        SchemaChangeHandler schemaChangeHandler = Env.getCurrentEnv().getSchemaChangeHandler();
+        ArrayList<AlterOp> alterOps = new ArrayList<>();
+        Database db = masterEnv.getInternalCatalog().getDbOrDdlException(CatalogTestUtil.testDbId1);
+        OlapTable olapTable = (OlapTable) db.getTableOrDdlException(CatalogTestUtil.testTableId1);
+        String indexName = "index1";
+        TableNameInfo tableNameInfo = new TableNameInfo(masterEnv.getInternalCatalog().getName(), db.getName(),
+                olapTable.getName());
+        IndexDefinition indexDefinition = new IndexDefinition(indexName, false,
+                Lists.newArrayList(olapTable.getBaseSchema().get(1).getName()),
+                "INVERTED",
+                Maps.newHashMap(), "balabala");
+        CreateIndexOp createIndexClause = new CreateIndexOp(tableNameInfo, indexDefinition, false);
+        ConnectContext connectContext = new ConnectContext();
+        createIndexClause.validate(connectContext);
+        alterOps.add(createIndexClause);
+        schemaChangeHandler.process(alterOps, db, olapTable); // submit CREATE INDEX for index1 first
+        TInvertedIndexFileStorageFormat originalFormat = olapTable.getInvertedIndexFileStorageFormat();
+        try {
+            olapTable.setInvertedIndexFileStorageFormat(TInvertedIndexFileStorageFormat.SNII);
+            BuildIndexOp buildIndexClause = new BuildIndexOp(tableNameInfo, indexName, null, false);
+            buildIndexClause.validate(connectContext);
+            Assert.fail("BUILD INDEX should be rejected for SNII inverted index storage format.");
+        } catch (AnalysisException e) { // expected path: validate() rejected the statement
+            Assert.assertTrue(e.getMessage().contains(
+                    "BUILD INDEX is not supported for SNII inverted index storage format yet"));
+        } finally { // put the table's original format back whether or not the assertions passed
+            olapTable.setInvertedIndexFileStorageFormat(originalFormat);
+        }
+    }
+
     @Test
     public void testDropIndexIndexChange() throws UserException {
         if (fakeEnv != null) {
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java
index 7b41ddc95cf840..060e687b495242 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java
@@ -18,7 +18,9 @@
 package org.apache.doris.nereids.trees.plans.commands;
 
 import org.apache.doris.catalog.AggregateType;
+import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.KeysType;
+import org.apache.doris.catalog.Type;
 import org.apache.doris.catalog.info.IndexType;
 import org.apache.doris.nereids.exceptions.AnalysisException;
 import org.apache.doris.nereids.trees.plans.commands.info.ColumnDefinition;
@@ -57,6 +59,69 @@ void testVariantIndexFormatV1() throws AnalysisException {
         }
     }
 
+    @Test // SNII accepts INVERTED indexes on string / array<string> columns and rejects integer-based ones.
+    void testSniiInvertedIndexColumnTypes() throws AnalysisException {
+        IndexDefinition def = new IndexDefinition("snii_index", false, Lists.newArrayList("col1"),
+                "INVERTED", null, "comment");
+
+        def.checkColumn(new ColumnDefinition("col1", StringType.INSTANCE, false, AggregateType.NONE, true,
+                        null, "comment"), KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII);
+        def.checkColumn(new ColumnDefinition("col1", ArrayType.of(StringType.INSTANCE), false,
+                        AggregateType.NONE, true, null, "comment"), KeysType.DUP_KEYS, false,
+                TInvertedIndexFileStorageFormat.SNII);
+
+        AnalysisException intException = Assertions.assertThrows(AnalysisException.class, () ->
+                def.checkColumn(new ColumnDefinition("col1", IntegerType.INSTANCE, false, AggregateType.NONE,
+                                true, null, "comment"), KeysType.DUP_KEYS, false,
+                        TInvertedIndexFileStorageFormat.SNII));
+        Assertions.assertTrue(intException.getMessage().contains("does not support BKD index"));
+
+        AnalysisException arrayIntException = Assertions.assertThrows(AnalysisException.class, () ->
+                def.checkColumn(new ColumnDefinition("col1", ArrayType.of(IntegerType.INSTANCE), false,
+                                AggregateType.NONE, true, null, "comment"), KeysType.DUP_KEYS, false,
+                        TInvertedIndexFileStorageFormat.SNII));
+        Assertions.assertTrue(arrayIntException.getMessage().contains("does not support BKD index"));
+    }
+
+    @Test // Same SNII column-type rules, exercised through the catalog Column overload of checkColumn.
+    void testSniiInvertedIndexCatalogColumnTypes() throws AnalysisException {
+        IndexDefinition def = new IndexDefinition("snii_index", false, Lists.newArrayList("col1"),
+                "INVERTED", null, "comment");
+
+        def.checkColumn(new Column("col1", Type.STRING, true), KeysType.DUP_KEYS, false,
+                TInvertedIndexFileStorageFormat.SNII);
+        def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.STRING), true),
+                KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII);
+
+        AnalysisException intException = Assertions.assertThrows(AnalysisException.class, () ->
+                def.checkColumn(new Column("col1", Type.INT, true), KeysType.DUP_KEYS, false,
+                        TInvertedIndexFileStorageFormat.SNII));
+        Assertions.assertTrue(intException.getMessage().contains("does not support BKD index"));
+
+        AnalysisException arrayIntException = Assertions.assertThrows(AnalysisException.class, () ->
+                def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.INT), true),
+                        KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII));
+        Assertions.assertTrue(arrayIntException.getMessage().contains("does not support BKD index"));
+    }
+
+    @Test // ANN indexes are rejected under SNII for both ColumnDefinition and catalog Column inputs.
+    void testSniiRejectsAnnIndex() {
+        IndexDefinition def = new IndexDefinition("ann_index", false, Lists.newArrayList("col1"),
+                "ANN", null, "comment");
+        AnalysisException exception = Assertions.assertThrows(AnalysisException.class, () ->
+                def.checkColumn(new ColumnDefinition("col1", ArrayType.of(FloatType.INSTANCE), false,
+                                AggregateType.NONE, false, null, "comment"), KeysType.DUP_KEYS, false,
+                        TInvertedIndexFileStorageFormat.SNII));
+        Assertions.assertTrue(exception.getMessage().contains("ANN index is not supported in index format SNII"));
+
+        AnalysisException catalogException = Assertions.assertThrows(AnalysisException.class, () ->
+                def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.FLOAT), false),
+                        KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII));
+        Assertions.assertTrue(catalogException.getMessage().contains(
+                "ANN index is not supported in index format SNII"));
+    }
+
+    @Test
     void testArrayTypeSupport() throws AnalysisException {
         IndexDefinition def = new IndexDefinition("array_index", false, Lists.newArrayList("col1"),
                 "INVERTED", null, "array test");
diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto
index 210a5ba0a1cf89..8577957927e64d 100644
--- a/gensrc/proto/olap_file.proto
+++ b/gensrc/proto/olap_file.proto
@@ -446,6 +446,7 @@ enum InvertedIndexStorageFormatPB {
     V1 = 0;
     V2 = 1;
     V3 = 2;
+    SNII = 3;
 }
 
 // Tablet-level storage format. Values match TStorageFormat (Thrift) integer values so
diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift
index 4b4780d933c6e2..8917efd68cd31a 100644
--- a/gensrc/thrift/AgentService.thrift
+++ b/gensrc/thrift/AgentService.thrift
@@ -196,7 +196,8 @@ enum TCompressionType {
 enum TInvertedIndexStorageFormat {
     DEFAULT = 0, // Default format, unspecified storage method.
     V1 = 1,      // Index per idx: Each index is stored separately based on its identifier.
-    V2 = 2       // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment.
+    V2 = 2,      // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment.
+    SNII = 4     // SNII native inverted index storage format. Value 3 is unused in this enum; 4 equals TInvertedIndexFileStorageFormat.SNII in Types.thrift.
 }
 
 enum TBinlogFormat {
diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift
index c6b9c705307380..d088a936b9e05f 100644
--- a/gensrc/thrift/Types.thrift
+++ b/gensrc/thrift/Types.thrift
@@ -130,7 +130,8 @@ enum TInvertedIndexFileStorageFormat {
     DEFAULT = 0, // Default format, unspecified storage method.
     V1 = 1,      // Index per idx: Each index is stored separately based on its identifier.
     V2 = 2,      // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment.
-    V3 = 3       // Position and dictionary compression
+    V3 = 3,      // Position and dictionary compression
+    SNII = 4     // SNII native inverted index storage format
 }
 
 struct TScalarType {
diff --git a/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out
new file mode 100644
index 00000000000000..33e05cf4214d2f
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !match_any --
+1
+2
+
+-- !match_all --
+1
+
+-- !match_phrase --
+5
+
+-- !null_bitmap --
+4
+
+-- !array_contains --
+1
diff --git a/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy
new file mode 100644
index 00000000000000..7800350fb6b753
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy
@@ -0,0 +1,212 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_storage_format_snii", "p0, nonConcurrent") { // End-to-end checks for the SNII inverted index storage format.
+    sql "DROP TABLE IF EXISTS test_storage_format_snii" // drop every table this suite may create before starting
+    sql "DROP TABLE IF EXISTS test_storage_format_snii_array"
+    sql "DROP TABLE IF EXISTS test_storage_format_snii_add_index"
+    sql "DROP TABLE IF EXISTS test_storage_format_snii_bkd"
+    sql "DROP TABLE IF EXISTS test_storage_format_snii_array_bkd"
+    sql "DROP TABLE IF EXISTS test_storage_format_snii_ann"
+
+    sql """
+        CREATE TABLE test_storage_format_snii (
+          id INT NULL,
+          body TEXT NULL,
+          INDEX idx_body (`body`) USING INVERTED PROPERTIES(
+            "parser" = "english",
+            "support_phrase" = "true",
+            "lower_case" = "true"
+          ) COMMENT ''
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY RANDOM BUCKETS 1
+        PROPERTIES (
+          "replication_allocation" = "tag.location.default: 1",
+          "disable_auto_compaction" = "true",
+          "inverted_index_storage_format" = "SNII"
+        );
+    """ // text table with a tokenized, phrase-enabled inverted index stored as SNII
+
+    sql """
+        INSERT INTO test_storage_format_snii VALUES
+          (1, 'alpha beta gamma'),
+          (2, 'alpha delta'),
+          (3, 'beta epsilon'),
+          (4, NULL),
+          (5, 'quick brown fox'),
+          (6, 'quick fox');
+    """ // row 4 carries the NULL body used by the IS NULL query below
+    sql "sync"
+
+    order_qt_match_any """
+        SELECT id FROM test_storage_format_snii
+        WHERE body MATCH_ANY 'alpha'
+        ORDER BY id
+    """
+    order_qt_match_all """
+        SELECT id FROM test_storage_format_snii
+        WHERE body MATCH_ALL 'alpha beta'
+        ORDER BY id
+    """
+    order_qt_match_phrase """
+        SELECT id FROM test_storage_format_snii
+        WHERE body MATCH_PHRASE 'quick brown'
+        ORDER BY id
+    """
+    order_qt_null_bitmap """
+        SELECT id FROM test_storage_format_snii
+        WHERE body IS NULL
+        ORDER BY id
+    """
+
+    sql """
+        CREATE TABLE test_storage_format_snii_array (
+          id INT NULL,
+          tags ARRAY<TEXT> NULL,
+          INDEX idx_tags (`tags`) USING INVERTED COMMENT ''
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY RANDOM BUCKETS 1
+        PROPERTIES (
+          "replication_allocation" = "tag.location.default: 1",
+          "inverted_index_storage_format" = "SNII"
+        );
+    """ // ARRAY<TEXT> column with an inverted index stored as SNII
+
+    sql """
+        INSERT INTO test_storage_format_snii_array VALUES
+          (1, '["alpha", "beta"]'),
+          (2, '["gamma"]'),
+          (3, NULL);
+    """
+    sql "sync"
+
+    order_qt_array_contains """
+        SELECT id FROM test_storage_format_snii_array
+        WHERE array_contains(tags, 'alpha')
+        ORDER BY id
+    """
+
+    test { // BUILD INDEX must be rejected on an SNII table; cloud mode uses the form without an index name
+        if (isCloudMode()) {
+            sql "BUILD INDEX ON test_storage_format_snii"
+        } else {
+            sql "BUILD INDEX idx_body ON test_storage_format_snii"
+        }
+        exception "BUILD INDEX is not supported for SNII inverted index storage format yet"
+    }
+
+    sql """
+        CREATE TABLE test_storage_format_snii_add_index (
+          id INT NULL,
+          body TEXT NULL,
+          score INT NULL,
+          scores ARRAY<INT> NULL,
+          embedding ARRAY<FLOAT> NOT NULL,
+          INDEX idx_body_added_table (`body`) USING INVERTED COMMENT ''
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY RANDOM BUCKETS 1
+        PROPERTIES (
+          "replication_allocation" = "tag.location.default: 1",
+          "inverted_index_storage_format" = "SNII"
+        );
+    """ // only body is indexed at creation; the index statements below target score, scores and embedding
+
+    test { // ADD INDEX ... USING INVERTED on an INT column is rejected under SNII
+        sql """
+            ALTER TABLE test_storage_format_snii_add_index
+            ADD INDEX idx_score_added (`score`) USING INVERTED COMMENT ''
+        """
+        exception "SNII inverted index storage format"
+    }
+
+    test { // ADD INDEX ... USING INVERTED on an ARRAY<INT> column is rejected under SNII
+        sql """
+            ALTER TABLE test_storage_format_snii_add_index
+            ADD INDEX idx_scores_added (`scores`) USING INVERTED COMMENT ''
+        """
+        exception "SNII inverted index storage format"
+    }
+
+    test { // CREATE INDEX ... USING ANN on an existing SNII table is rejected
+        sql """
+            CREATE INDEX idx_ann_added ON test_storage_format_snii_add_index (`embedding`) USING ANN PROPERTIES(
+              "index_type" = "hnsw",
+              "metric_type" = "l2_distance",
+              "dim" = "1"
+            )
+        """
+        exception "ANN index is not supported in index format SNII"
+    }
+
+    test { // CREATE TABLE with an inverted index on an INT column fails under SNII
+        sql """
+            CREATE TABLE test_storage_format_snii_bkd (
+              id INT NULL,
+              score INT NULL,
+              INDEX idx_score (`score`) USING INVERTED COMMENT ''
+            ) ENGINE=OLAP
+            DUPLICATE KEY(`id`)
+            DISTRIBUTED BY RANDOM BUCKETS 1
+            PROPERTIES (
+              "replication_allocation" = "tag.location.default: 1",
+              "inverted_index_storage_format" = "SNII"
+            );
+        """
+        exception "SNII inverted index storage format"
+    }
+
+    test { // CREATE TABLE with an inverted index on an ARRAY<INT> column fails under SNII
+        sql """
+            CREATE TABLE test_storage_format_snii_array_bkd (
+              id INT NULL,
+              scores ARRAY<INT> NULL,
+              INDEX idx_scores (`scores`) USING INVERTED COMMENT ''
+            ) ENGINE=OLAP
+            DUPLICATE KEY(`id`)
+            DISTRIBUTED BY RANDOM BUCKETS 1
+            PROPERTIES (
+              "replication_allocation" = "tag.location.default: 1",
+              "inverted_index_storage_format" = "SNII"
+            );
+        """
+        exception "SNII inverted index storage format"
+    }
+
+    test { // CREATE TABLE with an ANN index fails under SNII
+        sql """
+            CREATE TABLE test_storage_format_snii_ann (
+              id INT NULL,
+              embedding ARRAY<FLOAT> NOT NULL,
+              INDEX idx_ann (`embedding`) USING ANN PROPERTIES(
+                "index_type" = "hnsw",
+                "metric_type" = "l2_distance",
+                "dim" = "1"
+              )
+            ) ENGINE=OLAP
+            DUPLICATE KEY(`id`)
+            DISTRIBUTED BY RANDOM BUCKETS 1
+            PROPERTIES (
+              "replication_allocation" = "tag.location.default: 1",
+              "inverted_index_storage_format" = "SNII"
+            );
+        """
+        exception "ANN index is not supported in index format SNII"
+    }
+}