diff --git a/be/src/exec/scan/olap_scanner.cpp b/be/src/exec/scan/olap_scanner.cpp index 320976814679b9..efa536ea690779 100644 --- a/be/src/exec/scan/olap_scanner.cpp +++ b/be/src/exec/scan/olap_scanner.cpp @@ -152,7 +152,10 @@ static bool has_file_cache_statistics(const io::FileCacheStatistics& stats) { stats.inverted_index_bytes_read_from_remote != 0 || stats.inverted_index_bytes_read_from_peer != 0 || stats.inverted_index_local_io_timer != 0 || stats.inverted_index_remote_io_timer != 0 || - stats.inverted_index_peer_io_timer != 0 || stats.inverted_index_io_timer != 0; + stats.inverted_index_peer_io_timer != 0 || stats.inverted_index_io_timer != 0 || + stats.inverted_index_request_bytes != 0 || stats.inverted_index_read_bytes != 0 || + stats.inverted_index_range_read_count != 0 || + stats.inverted_index_serial_read_rounds != 0; } Status OlapScanner::_prepare_impl() { diff --git a/be/src/io/cache/block_file_cache_profile.cpp b/be/src/io/cache/block_file_cache_profile.cpp index 8f9c167c9989e6..10ea52670789a0 100644 --- a/be/src/io/cache/block_file_cache_profile.cpp +++ b/be/src/io/cache/block_file_cache_profile.cpp @@ -98,6 +98,10 @@ FileCacheStatistics diff_file_cache_statistics(const FileCacheStatistics& curren SUBTRACT_FIELD(inverted_index_remote_io_timer); SUBTRACT_FIELD(inverted_index_peer_io_timer); SUBTRACT_FIELD(inverted_index_io_timer); + SUBTRACT_FIELD(inverted_index_request_bytes); + SUBTRACT_FIELD(inverted_index_read_bytes); + SUBTRACT_FIELD(inverted_index_range_read_count); + SUBTRACT_FIELD(inverted_index_serial_read_rounds); #undef SUBTRACT_FIELD return diff; } @@ -156,6 +160,14 @@ FileCacheProfileReporter::FileCacheProfileReporter(RuntimeProfile* profile) { ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexPeerIOUseTimer", cache_profile, 1); inverted_index_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexIOTimer", cache_profile, 1); + inverted_index_request_bytes = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexRequestBytes", 
TUnit::BYTES, cache_profile, 1); + inverted_index_read_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "InvertedIndexReadBytes", + TUnit::BYTES, cache_profile, 1); + inverted_index_range_read_count = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexRangeReadCount", TUnit::UNIT, cache_profile, 1); + inverted_index_serial_read_rounds = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexSerialReadRounds", TUnit::UNIT, cache_profile, 1); } void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) const { @@ -193,6 +205,11 @@ void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) con COUNTER_UPDATE(inverted_index_remote_io_timer, statistics->inverted_index_remote_io_timer); COUNTER_UPDATE(inverted_index_peer_io_timer, statistics->inverted_index_peer_io_timer); COUNTER_UPDATE(inverted_index_io_timer, statistics->inverted_index_io_timer); + COUNTER_UPDATE(inverted_index_request_bytes, statistics->inverted_index_request_bytes); + COUNTER_UPDATE(inverted_index_read_bytes, statistics->inverted_index_read_bytes); + COUNTER_UPDATE(inverted_index_range_read_count, statistics->inverted_index_range_read_count); + COUNTER_UPDATE(inverted_index_serial_read_rounds, + statistics->inverted_index_serial_read_rounds); } } // namespace doris::io diff --git a/be/src/io/cache/block_file_cache_profile.h b/be/src/io/cache/block_file_cache_profile.h index 6c95e49791c054..41cc2e0c01b41a 100644 --- a/be/src/io/cache/block_file_cache_profile.h +++ b/be/src/io/cache/block_file_cache_profile.h @@ -58,7 +58,6 @@ class FileCacheMetrics { void register_entity(); void update_metrics_callback(); -private: std::mutex _mtx; // use shared_ptr for concurrent std::shared_ptr _statistics; @@ -97,6 +96,10 @@ struct FileCacheProfileReporter { RuntimeProfile::Counter* inverted_index_remote_io_timer = nullptr; RuntimeProfile::Counter* inverted_index_peer_io_timer = nullptr; RuntimeProfile::Counter* inverted_index_io_timer = nullptr; + RuntimeProfile::Counter* 
inverted_index_request_bytes = nullptr; + RuntimeProfile::Counter* inverted_index_read_bytes = nullptr; + RuntimeProfile::Counter* inverted_index_range_read_count = nullptr; + RuntimeProfile::Counter* inverted_index_serial_read_rounds = nullptr; FileCacheProfileReporter(RuntimeProfile* profile); void update(const FileCacheStatistics* statistics) const; diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 36b20517afb87c..391f3b15c34e8d 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -74,6 +74,10 @@ struct FileCacheStatistics { int64_t inverted_index_remote_io_timer = 0; int64_t inverted_index_peer_io_timer = 0; int64_t inverted_index_io_timer = 0; + int64_t inverted_index_request_bytes = 0; + int64_t inverted_index_read_bytes = 0; + int64_t inverted_index_range_read_count = 0; + int64_t inverted_index_serial_read_rounds = 0; }; struct IOContext { diff --git a/be/src/snii/common/slice.h b/be/src/snii/common/slice.h new file mode 100644 index 00000000000000..db10b2dfc52b6f --- /dev/null +++ b/be/src/snii/common/slice.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace snii { + +// Read-only byte view (does not own memory). Lifetime is managed by the underlying buffer. 
+class Slice { +public: + Slice() = default; + Slice(const uint8_t* d, size_t n) : data_(d), size_(n) {} + explicit Slice(const std::vector<uint8_t>& v) : data_(v.data()), size_(v.size()) {} + explicit Slice(std::string_view sv) + : data_(reinterpret_cast<const uint8_t*>(sv.data())), size_(sv.size()) {} + + const uint8_t* data() const { return data_; } + size_t size() const { return size_; } + bool empty() const { return size_ == 0; } + + uint8_t operator[](size_t i) const { + assert(i < size_); + return data_[i]; + } + + Slice subslice(size_t off, size_t n) const { + assert(off <= size_ && n <= size_ - off); // overflow-safe form of off + n <= size_ (the sum can wrap around size_t) + return Slice(data_ + off, n); + } + +private: + const uint8_t* data_ = nullptr; + size_t size_ = 0; +}; + +} // namespace snii diff --git a/be/src/snii/common/status.h b/be/src/snii/common/status.h new file mode 100644 index 00000000000000..a8e21da814184a --- /dev/null +++ b/be/src/snii/common/status.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include + +namespace snii { + +enum class StatusCode { + kOk, + kCorruption, + kNotFound, + kInvalidArgument, + kIoError, + kUnsupported, + kInternal, +}; + +// Lightweight error type: success is kOk with no message; failure carries a code + human-readable message. +// Always return Status across API boundaries; silent failures are not allowed. 
+class Status { +public: + Status() = default; + + static Status OK() { return Status(); } + static Status Corruption(std::string m) { + return Status(StatusCode::kCorruption, std::move(m)); + } + static Status NotFound(std::string m) { return Status(StatusCode::kNotFound, std::move(m)); } + static Status InvalidArgument(std::string m) { + return Status(StatusCode::kInvalidArgument, std::move(m)); + } + static Status IoError(std::string m) { return Status(StatusCode::kIoError, std::move(m)); } + static Status Unsupported(std::string m) { + return Status(StatusCode::kUnsupported, std::move(m)); + } + static Status Internal(std::string m) { return Status(StatusCode::kInternal, std::move(m)); } + + bool ok() const { return code_ == StatusCode::kOk; } + StatusCode code() const { return code_; } + const std::string& message() const { return message_; } + std::string to_string() const; + +private: + Status(StatusCode c, std::string m) : code_(c), message_(std::move(m)) {} + + StatusCode code_ = StatusCode::kOk; + std::string message_; +}; + +} // namespace snii + +// Short-circuit return for expressions returning Status (propagate errors upward). +#define SNII_RETURN_IF_ERROR(expr) \ + do { \ + ::snii::Status _s = (expr); \ + if (!_s.ok()) return _s; \ + } while (0) diff --git a/be/src/snii/encoding/byte_sink.h b/be/src/snii/encoding/byte_sink.h new file mode 100644 index 00000000000000..604e307228cf39 --- /dev/null +++ b/be/src/snii/encoding/byte_sink.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" + +namespace snii { + +// append-only write cursor: all section serialization goes through this; manual byte assembly is forbidden. +// All multi-byte fixed-width fields are little-endian. 
+class ByteSink { +public: + void put_u8(uint8_t v) { buf_.push_back(v); } + void put_fixed16(uint16_t v); + void put_fixed32(uint32_t v); + void put_fixed64(uint64_t v); + void put_varint32(uint32_t v); + void put_varint64(uint64_t v); + void put_zigzag(int64_t v); + void put_bytes(Slice s); + + size_t size() const { return buf_.size(); } + const std::vector& buffer() const { return buf_; } + Slice view() const { return Slice(buf_); } + + // Resets the cursor to empty while RETAINING the backing capacity, so a sink can + // be reused across many small encodes (e.g. per-window region/prx scratch in the + // windowed posting builder) without re-allocating each time -- this avoids the + // cumulative small-allocation churn that fragments the heap arena and inflates + // peak RSS during the merge of a high-df term split into thousands of windows. + void clear() { buf_.clear(); } + + // Moves the backing buffer OUT to the caller (the sink is left empty), so an encoded + // section can be handed off without the copy (+ copy-induced capacity slack) that + // reading buffer() and copy-assigning would incur. Use only when the sink is not + // reused afterward (a stack-local about to die, or one that is clear()'d next). + std::vector take() { return std::move(buf_); } + +private: + std::vector buf_; +}; + +} // namespace snii diff --git a/be/src/snii/encoding/byte_source.h b/be/src/snii/encoding/byte_source.h new file mode 100644 index 00000000000000..96cf4eed665269 --- /dev/null +++ b/be/src/snii/encoding/byte_source.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" + +namespace snii { + +// Slice read cursor: all section deserialization goes through this; any overrun returns Corruption. 
+class ByteSource { +public: + explicit ByteSource(Slice s) : s_(s) {} + + Status get_u8(uint8_t* v); + Status get_fixed16(uint16_t* v); + Status get_fixed32(uint32_t* v); + Status get_fixed64(uint64_t* v); + Status get_varint32(uint32_t* v); + Status get_varint64(uint64_t* v); + Status get_zigzag(int64_t* v); + Status get_bytes(size_t n, Slice* out); + + size_t remaining() const { return s_.size() - pos_; } + size_t position() const { return pos_; } + bool eof() const { return pos_ == s_.size(); } + + // Returns a sub-view starting at absolute offset start with length len (used by framer etc. to rewind over the CRC coverage region). + Slice slice_from(size_t start, size_t len) const { return s_.subslice(start, len); } + +private: + Slice s_; + size_t pos_ = 0; +}; + +} // namespace snii diff --git a/be/src/snii/encoding/crc32c.h b/be/src/snii/encoding/crc32c.h new file mode 100644 index 00000000000000..08210379064d91 --- /dev/null +++ b/be/src/snii/encoding/crc32c.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +#include "snii/common/slice.h" + +namespace snii { + +// CRC32C (Castagnoli, polynomial 0x1EDC6F41). Used to checksum the tail of each format block. +uint32_t crc32c_extend(uint32_t crc, Slice data); + +inline uint32_t crc32c(Slice data) { + return crc32c_extend(0, data); +} + +} // namespace snii diff --git a/be/src/snii/encoding/pfor.h b/be/src/snii/encoding/pfor.h new file mode 100644 index 00000000000000..743cfe6f58e1a7 --- /dev/null +++ b/be/src/snii/encoding/pfor.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" + +namespace snii { + +// PFOR integer block encoder/decoder (unsigned uint32 array). +// Encoded layout: [u8 bit_width][varint n_exceptions][bit-packed low +// bits][exception table]. Selects the bit_width that minimizes total byte size; +// values exceeding it go into the exception table (index_delta, full_value). 
+// delta/zigzag is handled by the upper layer (.frq window); PFOR only processes +// unsigned integer arrays. +void pfor_encode(const uint32_t* values, size_t n, ByteSink* out); +Status pfor_decode(ByteSource* src, size_t n, uint32_t* out); +Status pfor_skip(ByteSource* src, size_t n); + +} // namespace snii diff --git a/be/src/snii/encoding/section_framer.h b/be/src/snii/encoding/section_framer.h new file mode 100644 index 00000000000000..cd8594f589a8da --- /dev/null +++ b/be/src/snii/encoding/section_framer.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" + +namespace snii { + +// A framed section: type + payload view. +struct FramedSection { + uint8_t type = 0; + Slice payload; +}; + +// Unified section framing: [u8 type][varint64 len][payload][fixed32 crc32c(type+len+payload)]. +// All full-format sections reuse this encode/checksum path to avoid ad-hoc hand-assembly. +// Unknown optional sections are dispatched by the caller based on type; read still verifies the CRC and skips the payload. +class SectionFramer { +public: + static void write(ByteSink& sink, uint8_t section_type, Slice payload); + static Status read(ByteSource& src, FramedSection* out); +}; + +} // namespace snii diff --git a/be/src/snii/encoding/varint.h b/be/src/snii/encoding/varint.h new file mode 100644 index 00000000000000..8a878b1d2928b4 --- /dev/null +++ b/be/src/snii/encoding/varint.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" + +namespace snii { + +// LEB128 variable-length integer encoding + zigzag. out buffer must be >=10 bytes; returns number of bytes written. 
+size_t varint_len(uint64_t v); +size_t encode_varint32(uint32_t v, uint8_t* out); +size_t encode_varint64(uint64_t v, uint8_t* out); + +// Decode a varint from the range [p, end); on success *next points to the next byte after the consumed input. +Status decode_varint32(const uint8_t* p, const uint8_t* end, uint32_t* v, const uint8_t** next); +Status decode_varint64(const uint8_t* p, const uint8_t* end, uint64_t* v, const uint8_t** next); + +inline uint64_t zigzag_encode(int64_t v) { + return (static_cast(v) << 1) ^ static_cast(v >> 63); +} +inline int64_t zigzag_decode(uint64_t v) { + return static_cast(v >> 1) ^ -static_cast(v & 1); +} + +} // namespace snii diff --git a/be/src/snii/encoding/zstd_codec.h b/be/src/snii/encoding/zstd_codec.h new file mode 100644 index 00000000000000..838df9af41b617 --- /dev/null +++ b/be/src/snii/encoding/zstd_codec.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" + +namespace snii { + +// Thin ZSTD wrapper. Used for compressing large payloads such as .prx windows. Decompression requires the caller to supply the original uncompressed length (from the block header). +Status zstd_compress(Slice input, int level, std::vector* out); +Status zstd_decompress(Slice input, size_t expected_uncomp_len, std::vector* out); + +} // namespace snii diff --git a/be/src/snii/format/bootstrap_header.h b/be/src/snii/format/bootstrap_header.h new file mode 100644 index 00000000000000..1face0347596c6 --- /dev/null +++ b/be/src/snii/format/bootstrap_header.h @@ -0,0 +1,54 @@ +#pragma once + +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +// Fixed container header at the very start of a {rowset_id}_{seg_id}.idx file. 
+// Identifies the SNII container and carries basic compatibility info so a +// reader can fail fast before touching any streamed section or the tail meta +// region. +// +// On-disk layout (all multi-byte fields little-endian, fixed width; NOT framed +// by SectionFramer because it must be parseable without prior knowledge of the +// file): +// u32 magic == kContainerMagic +// u16 format_version == kFormatVersion +// u16 min_reader_version readers with kFormatVersion < this MUST refuse to read +// u32 flags container-level feature flags +// u32 header_length total bytes of this header including the checksum +// u8 tail_pointer_size size of the fixed tail pointer at EOF (hint for the reader) +// u32 header_checksum crc32c over all preceding header bytes +struct BootstrapHeader { + uint32_t magic = kContainerMagic; + uint16_t format_version = kFormatVersion; + uint16_t min_reader_version = kMinReaderVersion; + uint32_t flags = 0; + uint32_t header_length = 0; + uint8_t tail_pointer_size = 0; +}; + +// Total fixed on-disk size of the header, including the trailing crc32c. +inline constexpr uint32_t kBootstrapHeaderSize = + 4 /*magic*/ + 2 /*format_version*/ + 2 /*min_reader_version*/ + 4 /*flags*/ + + 4 /*header_length*/ + 1 /*tail_pointer_size*/ + 4 /*header_checksum*/; + +// Serializes the header to sink: writes header_length = kBootstrapHeaderSize +// and appends a crc32c over all preceding bytes. The caller's header_length +// field is ignored on input (it is always derived). Returns OK. +Status encode_bootstrap_header(const BootstrapHeader& header, ByteSink* sink); + +// Parses and validates a bootstrap header from the front of data. 
+// - too short / trailing bytes beyond the fixed header -> kCorruption +// - magic != kContainerMagic -> kCorruption +// - checksum mismatch -> kCorruption +// - format_version != kFormatVersion -> kUnsupported +// - min_reader_version > kFormatVersion -> kUnsupported +Status decode_bootstrap_header(Slice data, BootstrapHeader* out); + +} // namespace snii::format diff --git a/be/src/snii/format/bsbf.h b/be/src/snii/format/bsbf.h new file mode 100644 index 00000000000000..42a4e80f4dac12 --- /dev/null +++ b/be/src/snii/format/bsbf.h @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/io/file_reader.h" + +// Block-split bloom filter (BSBF) -- Apache Parquet split-block spec, with an +// S3-native on-demand single-block probe that none of the reference implementations +// (Apache Parquet, Doris storage, Doris format/parquet) ship. +// +// BIT FORMAT IS PARQUET-CANONICAL (interoperable with Apache Parquet / Doris +// format/parquet for the bitset bytes): +// - 256-bit (32-byte) blocks, 8 bits set per block. +// - key = XXH64(term, seed=0); high 32 bits select the block via FASTRANGE +// `block = ((hash>>32) * num_blocks) >> 32` (no power-of-2 requirement); low 32 +// bits select 8 in-block positions `1 << ((key * SALT[i]) >> 27)`. +// - num_bytes via Parquet OptimalNumOfBytes: power of 2 in [32, 128 MiB]. +// +// SNII WRAPPER (NOT Parquet's variable thrift header): a FIXED 28-byte header, then +// the contiguous, uncompressed, little-endian bitset. Because the header size is a +// constant, the bitset start is a constant offset (`section_base + 28`) and block i +// is at `section_base + 28 + i*32` -- so a single 32-byte block can be range-read on +// demand WITHOUT parsing a variable-length header and WITHOUT loading the whole blob. 
+namespace snii::format { + +constexpr uint32_t kBsbfBytesPerBlock = 32; // 256-bit block +constexpr uint32_t kBsbfBitsSetPerBlock = 8; // 8 uint32 words / block +constexpr uint32_t kBsbfMinBytes = 32; +constexpr uint32_t kBsbfMaxBytes = 128u * 1024 * 1024; // Parquet kMaximumBloomFilterBytes +constexpr uint32_t kBsbfHeaderSize = 28; // FIXED (constant bitset offset) +// L0/L1 tiering threshold (design section "fast filtering of non-existent terms"): a bsbf section whose total +// size is <= this is loaded WHOLE into the resident reader at open (L0 -> free +// in-memory probe, no per-lookup round); larger filters stay L1 (header-only, probed +// one 32-byte block on demand). 256 KiB fits in a single cloud FileCache block. +constexpr uint32_t kBsbfResidentMaxBytes = 256u * 1024; + +// Canonical Parquet/Doris split-block SALT (8 odd 32-bit constants). +extern const uint32_t kBsbfSalt[kBsbfBitsSetPerBlock]; + +// XXH64(term, seed=0) -- the Parquet-canonical key (NOT XXH3, NOT Doris murmur). +uint64_t bsbf_hash(std::string_view term); + +// Parquet OptimalNumOfBytes(ndv, fpp): power of 2 in [32, 128 MiB]. +uint32_t bsbf_optimal_num_bytes(uint32_t ndv, double fpp); + +// Fastrange block index from a 64-bit hash and the block count. +inline uint32_t bsbf_block_index(uint64_t hash, uint32_t num_blocks) { + return static_cast(((hash >> 32) * num_blocks) >> 32); +} + +// Pure 32-byte-block kernel: does `block` contain the key's 8 bits? SIMD (AVX2) +// accelerated at runtime when available, scalar otherwise. Returns true => the term +// MAY be present (could be a false positive); false => DEFINITELY ABSENT. +bool bsbf_block_contains(uint64_t hash, const uint8_t block[kBsbfBytesPerBlock]); + +// In-memory builder + serializer. +class BsbfBuilder { +public: + BsbfBuilder() = default; + + // Sizes the filter for `ndv` distinct keys at target `fpp`. fpp in (0,1). + static Status create(uint32_t ndv, double fpp, BsbfBuilder* out); + + // Insert a key / term. SIMD-accelerated. 
+ void insert(uint64_t hash); + void insert_term(std::string_view term) { insert(bsbf_hash(term)); } + + // In-memory probe over the resident bitset (build/warm path). SIMD-accelerated. + bool maybe_contains(uint64_t hash) const; + bool maybe_contains_term(std::string_view term) const { + return maybe_contains(bsbf_hash(term)); + } + + // Serialize [28-byte header][contiguous LE bitset] into `sink`. The header carries + // magic/version/hash+index strategy/num_bytes/num_blocks/ndv + header & bitset + // crc32c. The bitset is Parquet-canonical bytes. + Status serialize(ByteSink* sink) const; + + uint32_t num_bytes() const { return num_bytes_; } + uint32_t num_blocks() const { return num_blocks_; } + +private: + std::vector words_; // num_bytes_/4, blocks of 8 words + uint32_t num_bytes_ = 0; + uint32_t num_blocks_ = 0; + uint32_t ndv_ = 0; +}; + +// Resident header (28 bytes), parsed once at open. Validates magic/version/crc/bounds. +struct BsbfHeader { + uint32_t num_bytes = 0; + uint32_t num_blocks = 0; + uint32_t bitset_crc = 0; // stored crc32c of the bitset body (for L0 verification) + uint64_t bitset_base = 0; // absolute file offset of block 0 = section_base + 28 + + // Parse a 28-byte header located at `section_base` in the file. The bitset_base + // is set to section_base + kBsbfHeaderSize. + static Status parse(Slice header28, uint64_t section_base, BsbfHeader* out); + + // Absolute file offset of the 32-byte block this hash maps to. + uint64_t block_offset(uint64_t hash) const { + return bitset_base + + static_cast(bsbf_block_index(hash, num_blocks)) * kBsbfBytesPerBlock; + } +}; + +// On-demand probe: read EXACTLY ONE 32-byte block via `reader`, then test. No whole +// blob load, no deep copy. *maybe_present=false means DEFINITELY ABSENT. 
+Status bsbf_probe(snii::io::FileReader* reader, const BsbfHeader& header, uint64_t hash, + bool* maybe_present); + +} // namespace snii::format diff --git a/be/src/snii/format/dict_block.h b/be/src/snii/format/dict_block.h new file mode 100644 index 00000000000000..82ae2476c53561 --- /dev/null +++ b/be/src/snii/format/dict_block.h @@ -0,0 +1,144 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" + +// DICT block -- a positioning unit mapping term → postings read plan, and also +// the unit for remote on-demand fetching, caching, and CRC checksum +// verification (see docs/design/SNII-design-spec.source.md "DICT block" and +// "dict lookup flow summary" sections). +// +// Byte layout (strictly implemented; multi-byte fixed-width fields are +// little-endian, variable-length integers use LEB128): +// header: +// n_entries varint +// entry_format_ver u8 # = kDictBlockFormatVer +// block_flags u8 # bit0 = has_positions (consistency check against +// # the value passed to reader) +// frq_base varint64 +// prx_base varint64 # present only when has_positions is set +// entries[n_entries] # variable-length DictEntry, front-coded in lexicographic order +// anchor_offsets[n_anchors] # u32 * n_anchors, byte offset of each anchor entry within the block +// n_anchors u32 +// crc32c u32 # covers [header .. n_anchors], detects corruption (sole CRC layer) +// +// Anchor rule: every anchor_interval entries, one "term anchor" is forced — +// that entry is encoded with prev_term="" (prefix_len=0, storing the full +// term), and its byte offset is recorded in anchor_offsets; non-anchor entries +// use the preceding entry's term as prev_term for front coding. 
The reader can +// start from any anchor and scan independently without needing earlier terms, +// enabling anchor binary search + local scan for exact term lookup. +namespace snii::format { + +// DICT block entry_format_ver: self-describing version of the DictEntry +// encoding. Reader rejects a mismatch so a query-only run cannot silently read +// an older dict-entry layout as the current one. +inline constexpr uint8_t kDictBlockFormatVer = 2; + +// block_flags bit definitions. +namespace dict_block_flags { +inline constexpr uint8_t kHasPositions = 1u << 0; // whether to write prx_base / .prx fields +// bit1-7 reserved +} // namespace dict_block_flags + +// DICT block writer: entries are added in lexicographic order via add_entry; +// internally maintains prev_term, determines anchors, accumulates size +// estimates, and on finish serializes header + entries + anchor table + CRC in +// one pass. +class DictBlockBuilder { +public: + DictBlockBuilder(IndexTier tier, bool has_positions, uint64_t frq_base, uint64_t prx_base, + uint32_t anchor_interval = 16); + + // Append one entry (caller must guarantee lexicographic term order). + // Internally decides whether it becomes an anchor. + void add_entry(const DictEntry& entry); + + // Upper-bound estimate of the serialized size of the current block (including + // header + entries + anchor table + CRC footer), used by the upper layer to + // decide when to cut a new block based on target_dict_block_bytes. + size_t estimated_bytes() const; + + // Number of entries. + uint32_t n_entries() const { return n_entries_; } + + // Serialize the entire block and append it to sink. 
+ void finish(ByteSink* sink) const; + +private: + bool is_anchor(uint32_t index) const { return index % anchor_interval_ == 0; } + + IndexTier tier_; + bool has_positions_; + uint64_t frq_base_; + uint64_t prx_base_; + uint32_t anchor_interval_; + + uint32_t n_entries_ = 0; + std::vector entries_; + std::string prev_term_; // term of the previous entry (front coding base) + size_t entries_est_ = 0; // accumulated byte estimate for the entries section + size_t n_anchors_ = 0; // number of anchors +}; + +// DICT block reader: on open, verifies the CRC and parses the header / anchor +// table; find_term uses anchor binary search + local scan to locate a +// DictEntry. Holds a byte view of the block (non-owning); lifetime is managed +// by the caller. +class DictBlockReader { +public: + DictBlockReader() = default; + + // Parse and verify the entire block. CRC mismatch / truncation / invalid + // structure → Corruption; has_positions in the header inconsistent with the + // supplied argument → InvalidArgument. + static Status open(Slice block, IndexTier tier, bool has_positions, DictBlockReader* out); + + // Anchor binary search + local scan to locate target. Hit → *found=true and + // *out is filled; miss (including out-of-range, gap) → *found=false. + // Structural error → non-OK Status. + Status find_term(std::string_view target, bool* found, DictEntry* out) const; + + // Decodes EVERY entry in the block in lexicographic order into *out (each a + // self-contained DictEntry, owning its term). Used for ordered term + // enumeration (prefix / range scans). Resets the front-coding base at each + // anchor segment. + Status decode_all(std::vector* out) const; + + uint64_t frq_base() const { return frq_base_; } + uint64_t prx_base() const { return prx_base_; } + uint32_t n_entries() const { return n_entries_; } + +private: + // Sequentially scan from anchor anchor_idx to the end of that anchor segment, + // searching for target. 
+ Status scan_from_anchor(size_t anchor_idx, std::string_view target, bool* found, + DictEntry* out) const; + + // Find the last anchor index where first_term(anchor) <= target; return false + // if none exists. + bool locate_anchor(std::string_view target, size_t* anchor_idx) const; + + Slice block_; // [header .. crc) full block view + IndexTier tier_ = IndexTier::kT1; + bool has_positions_ = false; + uint64_t frq_base_ = 0; + uint64_t prx_base_ = 0; + uint32_t n_entries_ = 0; + + size_t entries_begin_ = 0; // absolute offset of the start of the entries section + std::vector anchor_offsets_; // byte offset within the block for each anchor entry + std::vector + anchor_terms_; // full term of each anchor entry (used for binary search) +}; + +} // namespace snii::format diff --git a/be/src/snii/format/dict_block_directory.h b/be/src/snii/format/dict_block_directory.h new file mode 100644 index 00000000000000..a1d70e9ed5aec9 --- /dev/null +++ b/be/src/snii/format/dict_block_directory.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +namespace snii::format { + +// BlockRef.flags bit definitions. +namespace block_ref_flags { +// bit0: the on-disk block bytes are zstd(uncompressed_block). When set, the +// directory also stores uncomp_len, and the reader zstd-decompresses the fetched +// [offset, offset+length) range to uncomp_len before parsing the dict block. The +// block-level crc32c (and BlockRef.checksum) cover the UNCOMPRESSED bytes, so a +// zstd block shrinks the bytes fetched from S3 while keeping the same integrity +// guarantees after decompression in RAM. +inline constexpr uint8_t kZstd = 1u << 0; +} // namespace block_ref_flags + +// Physical location and checksum info for a single DICT block. 
Aligned with SampledTermIndex by ordinal: +// SampledTermIndex[i]'s first_term corresponds to DictBlockDirectory[i] (see design spec +// "sampled dict index"). The read path issues a single range read over [offset, offset+length). +struct BlockRef { + uint64_t offset = 0; // absolute byte offset of the block within the container + uint64_t length = 0; // ON-DISK byte length of the block (compressed when kZstd) + uint32_t n_entries = 0; // number of DictEntry records within this block + uint8_t flags = 0; // block-level flags (block_ref_flags::*) + uint32_t checksum = 0; // crc32c of the block's UNCOMPRESSED content (verified after read) + uint64_t uncomp_len = 0; // uncompressed block byte length (stored only when kZstd set) +}; + +// DICT block directory: block ordinal → physical location mapping. +// +// on-disk layout (framed by SectionFramer with a unified type+len+crc32c wrapper): +// [u8 type=kDictBlockDirectory][varint64 payload_len][payload][fixed32 crc32c] +// payload = varint32 n_blocks +// then n_blocks × block_ref{ +// varint64 offset, varint64 length, varint32 n_entries, +// u8 flags, fixed32 checksum } +// NOTE(review): BlockRef documents uncomp_len as stored when flags has kZstd, but it is missing from the block_ref field list above; confirm its encoding and position against the encoder. +// Section-level crc detects truncation/corruption; block_ref.checksum is the per-block crc. +class DictBlockDirectoryBuilder { +public: + void add(const BlockRef& ref) { refs_.push_back(ref); } + + // Encodes as a kDictBlockDirectory framed section (with embedded crc32c) and appends to sink. + void finish(ByteSink* sink) const; + +private: + std::vector refs_; +}; + +// Reads and verifies a kDictBlockDirectory framed section; provides ordinal → BlockRef lookup. +// After parsing, all block_refs reside in the reader (entering the searcher cache along with meta). +class DictBlockDirectoryReader { +public: + // Verifies the section crc and deserializes all block_refs. + // crc mismatch / truncation / trailing bytes → kCorruption; wrong section type → kInvalidArgument. 
+ static Status open(Slice section, DictBlockDirectoryReader* out); + + uint32_t n_blocks() const { return static_cast(refs_.size()); } + + // Returns the ordinal-th block_ref; ordinal >= n_blocks → kNotFound. + Status get(uint32_t ordinal, BlockRef* out) const; + +private: + std::vector refs_; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/dict_entry.h b/be/src/snii/format/dict_entry.h new file mode 100644 index 00000000000000..e2b434ece3a22f --- /dev/null +++ b/be/src/snii/format/dict_entry.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/format_constants.h" +#include "snii/format/frq_pod.h" + +// DictEntry —— on-disk encoding/decoding of a dict entry. +// +// Byte layout (see docs/design/SNII-design-spec.source.md "dict entry" +// section): +// entry_len varint # byte length of entry body, allowing reader to skip +// unknown extensions or fast-skip entries +// --- entry body begins here, covered by entry_len --- +// prefix_len varint # length of shared prefix with prev_term +// suffix_len varint # number of suffix bytes +// suffix u8[] # suffix bytes that differ from prev_term +// flags u8 # bit0 kind / bit1 enc / bit2 has_sb / bit3 +// has_champion(=0) / bit4 offsets_ref(=0) df varint ttf_delta varint +// # only when tier>=T2 max_freq varint # only when tier>=T2 locator: +// pod_ref: frq_off_delta varint, frq_len varint, +// [prelude_len varint, frq_docs_len varint when enc=windowed] +// # docs-only prefix [prelude][dd-block]; windowed entries +// carry # per-window region metadata in the prelude. +// [frq_docs_len varint, slim region meta when enc=slim]: +// # frq_docs_len == dd region on-disk length; the docs-only +// prefix # [frq_off, frq_off+frq_docs_len) a docid-only reader +// fetches # without the freq region. 
win_mode u8 (bit0 +// dd_zstd, bit1 freq_zstd) dd_uncomp_len varint, crc_dd u32 +// [freq_uncomp_len varint, crc_freq u32 when tier>=T2] +// # The single slim window is [dd_region][freq_region]; +// dd_disk_len # = frq_docs_len, freq_disk_len = frq_len - +// frq_docs_len. +// [prx_off_delta varint, prx_len varint when tier>=T2] +// inline: frq_len varint, frq_bytes u8[], # frq_bytes = +// [dd_region][freq_region] +// slim region meta (as above, sans frq_docs_len which == dd disk +// len +// carried as inline_dd_disk_len varint), +// [prx_len varint, prx_bytes u8[] when tier>=T2] +// --- entry body ends --- +// +// CRC verification is performed at the DICT block level (covering block header +// + all entries + anchor offset table), no per-entry CRC to keep slim/inline +// low-frequency terms compact (spec §DICT block line 330/348). tier and +// positions capability are provided by per-index meta (not stored redundantly +// inside entries): when tier>=T2, ttf_delta / max_freq and .prx locator/bytes +// are written. +namespace snii::format { + +// Dict entry: inline or pod-ref (two states), self-described length, supports +// intra-block front coding. +struct DictEntry { + // term key (front coding relative to prev_term is applied during + // encode/decode; full term stored here). + std::string term; + + // flags. + DictEntryKind kind = DictEntryKind::kPodRef; + DictEntryEnc enc = DictEntryEnc::kSlim; + bool has_sb = false; + + // term stats. + uint32_t df = 0; + uint64_t ttf_delta = 0; // only when tier>=T2 + uint64_t max_freq = 0; // only when tier>=T2 + + // pod_ref locator. + uint64_t frq_off_delta = 0; + uint64_t frq_len = 0; + uint64_t prelude_len = 0; // only when enc=windowed + uint64_t frq_docs_len = 0; // pod_ref docs-only prefix length + uint64_t prx_off_delta = 0; // only when tier>=T2 + uint64_t prx_len = 0; // only when tier>=T2 + + // slim/inline single-window region codecs. The window is + // [dd_region][freq_region] (no self-describing header). 
dd_meta drives the + // docs-only decode; freq_meta the scoring decode (only when tier>=T2). For + // slim pod_ref dd_meta.disk_len == frq_docs_len; for inline it is stored as + // inline_dd_disk_len. + FrqRegionMeta dd_meta; + FrqRegionMeta freq_meta; // only when tier>=T2 + uint64_t inline_dd_disk_len = 0; // only for inline: dd region on-disk length + + // inline payload. + std::vector frq_bytes; // = [dd_region][freq_region] + std::vector prx_bytes; // only when tier>=T2 +}; + +// Encodes an entry into sink (appending) using the layout above, with front +// coding relative to prev_term. tier determines whether optional fields are +// written. +Status encode_dict_entry(const DictEntry& entry, std::string_view prev_term, IndexTier tier, + ByteSink* sink); + +// Decodes one entry from the current position of src; term is reconstructed +// from prev_term + suffix. Verifies the trailing CRC (NOTE(review): the layout comment in this file says entries carry no per-entry CRC and that integrity is checked at the DICT block level, so this CRC wording looks stale -- confirm against the implementation); out-of-range / CRC +// mismatch / invalid prefix_len all return Corruption. +Status decode_dict_entry(ByteSource* src, std::string_view prev_term, IndexTier tier, + DictEntry* out); + +// Skips one entry using only entry_len (does not parse internal fields or +// verify CRC). +Status skip_dict_entry(ByteSource* src); + +} // namespace snii::format diff --git a/be/src/snii/format/format_constants.h b/be/src/snii/format/format_constants.h new file mode 100644 index 00000000000000..188266d02910cf --- /dev/null +++ b/be/src/snii/format/format_constants.h @@ -0,0 +1,111 @@ +#pragma once + +#include + +// SNII container and per-section on-disk contract constants. +// Once published, these values are format semantics; changes require bumping +// format_version and maintaining a compatibility policy. All multi-byte +// fixed-width fields are little-endian; variable-length integers use LEB128 +// (see snii/encoding/varint.h). +namespace snii::format { + +// ---- Container-level magic / version ---- +// "SNII" reads as 0x49494E53 in little-endian.
+inline constexpr uint32_t kContainerMagic = 0x49494E53u; // 'S''N''I''I' +inline constexpr uint32_t kTailMagic = 0x4C494154u; // 'T''A''I''L' +inline constexpr uint16_t kFormatVersion = 2; +inline constexpr uint16_t kMinReaderVersion = 2; +// Self-describing version of the meta layout (the per-index meta header AND the +// tail meta region share this single constant; a reader fails fast with +// Corruption on any mismatch). This is a from-scratch, pre-launch format: there +// is exactly ONE meta layout, so the value is 1. Bump it only AFTER launch, +// when a real on-disk change must coexist with already-written indexes -- +// pre-launch changes just fold into v1. +inline constexpr uint16_t kMetaFormatVersion = 1; + +// ---- SectionFramer section type ids (within per-index meta / tail region) +// ---- +enum class SectionType : uint8_t { + kStatsBlock = 1, + kSampledTermIndex = 2, + kDictBlockDirectory = 3, + kXFilter = 4, // reserved: legacy embedded XFilter; meta no longer emits/reads it + kSectionRefs = 5, + kPerIndexMetaHeader = 6, + kLogicalIndexDirectory = 7, + kTailMetaHeader = 8, + kFeatureBits = 9, +}; + +// ---- Logical index postings storage content configuration (fixed per logical +// index, not per-term) ---- Determines whether to write freq / positions / +// norms+stats. +enum class IndexConfig : uint8_t { + kDocsOnly = 0, // docid only: term/match filtering + kDocsPositions = 1, // docid+freq+positions: MATCH_PHRASE + kDocsPositionsScoring = 2, // + norms + stats: phrase + BM25 + kPositionsOffsets = 3, // reserved (highlight/RAG), not implemented in this release +}; + +// term stats / postings capability tiers: only tier>=kT2 writes +// ttf_delta/max_freq and .prx. +enum class IndexTier : uint8_t { + kT1 = 1, // docs-only + kT2 = 2, // docs-positions + kT3 = 3, // docs-positions-scoring +}; + +inline constexpr IndexTier tier_of(IndexConfig cfg) { + return cfg == IndexConfig::kDocsOnly ? IndexTier::kT1 + : cfg == IndexConfig::kDocsPositions ? 
IndexTier::kT2 + : IndexTier::kT3; // scoring / offsets +} +inline constexpr bool has_positions(IndexConfig cfg) { + return cfg != IndexConfig::kDocsOnly; +} +inline constexpr bool has_scoring(IndexConfig cfg) { + return cfg == IndexConfig::kDocsPositionsScoring; +} + +// ---- DictEntry flags bit definitions ---- +namespace dict_flags { +inline constexpr uint8_t kKind = 1u << 0; // 0=pod_ref / 1=inline +inline constexpr uint8_t kEnc = 1u << 1; // 0=slim / 1=windowed +inline constexpr uint8_t kHasSb = 1u << 2; // posting prelude includes sub-block directory +inline constexpr uint8_t kHasChampion = 1u << 3; // v1 always 0 +inline constexpr uint8_t kOffsetsRef = 1u << 4; // v1 always 0 +// bit5-7 reserved +} // namespace dict_flags + +enum class DictEntryKind : uint8_t { kPodRef = 0, kInline = 1 }; +enum class DictEntryEnc : uint8_t { kSlim = 0, kWindowed = 1 }; + +// ---- .prx window codec (codec byte bit0-5) ---- +// kRaw : plaintext varint payload (doc_count, per-doc pos_count + position +// deltas). kZstd : zstd-compressed plaintext payload (legacy reader still +// supported). kPfor : doc_count + per-doc pos_count (varint), then position +// deltas bit-packed +// as PFOR runs (kFrqBaseUnit each). No entropy coding -> far cheaper +// build CPU than zstd while staying competitive on size for ascending +// deltas. +enum class PrxCodec : uint8_t { + kRaw = 0, + kZstd = 1, + kPfor = 2 /* bit7 cont-reserved */ +}; + +// ---- Build-time parameters (not format semantics; may be tuned against real +// metrics) ---- +inline constexpr uint32_t kFrqBaseUnit = 256; // window base unit +inline constexpr uint32_t kSlimDfThreshold = 512; // df < this → slim +inline constexpr uint32_t kDefaultInlineThreshold = 256; // slim encoded bytes ≤ this → inline +// Adaptive window sizing (design #4): high-df windowed terms use larger windows +// to cut prelude rows + per-window header/crc overhead. 
Windows remain a whole +// multiple of kFrqBaseUnit so .prx alignment and win_base/last_docid semantics +// are preserved. A term whose df >= kAdaptiveWindowDfThreshold splits into +// kAdaptiveWindowDocs-sized windows instead of kFrqBaseUnit-sized ones. +inline constexpr uint32_t kAdaptiveWindowDfThreshold = 8192; // df >= this -> larger windows +inline constexpr uint32_t kAdaptiveWindowDocs = 1024; // larger window size (4 * base unit) +inline constexpr uint32_t kDefaultTargetDictBlockBytes = 64 * 1024; + +} // namespace snii::format diff --git a/be/src/snii/format/frq_pod.h b/be/src/snii/format/frq_pod.h new file mode 100644 index 00000000000000..aa3b36b23a4af5 --- /dev/null +++ b/be/src/snii/format/frq_pod.h @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +// .frq region codec (FrqPod): doc-delta (dd) and freq postings, columnar + PFOR +// (see docs/design SNII "frq design" and the read-byte-optimizations +// design 1.6). +// +// PHASE D (posting-level dd/freq grouping): windows are NO LONGER +// self-describing. A windowed .frq payload is laid out as +// [prelude][dd-block][freq-block] +// where the dd-block concatenates every window's dd_region and the freq-block +// concatenates every window's freq_region. Each region is independently encoded +// (raw or zstd, chosen by size) and the per-window codec metadata (mode, +// lengths, crc, offsets) is hoisted into the frq_prelude rows -- the region +// bytes carry NO header. This makes the docs-only prefix ([prelude][dd-block]) +// ONE contiguous run a docid-only / phrase reader can fetch in a single range, +// skipping the freq-block entirely. +// +// dd_region plaintext = VInt n ++ PFOR_runs(doc_delta) # n = doc count +// dd[0] = first_docid - win_base; dd[i] = docid[i] - docid[i-1]; win_base is +// the previous window's last docid (first window = 0). 
+// freq_region plaintext = PFOR_runs(freq) # present iff +// has_freq PFOR runs are segmented at 256 docs (kFrqBaseUnit); a partial +// segment writes the remainder. Variable-length integers reuse +// snii/encoding/varint; PFOR reuses snii/encoding/pfor; crc32c covers each +// region's ON-DISK bytes. +namespace snii::format { + +// Codec metadata for ONE encoded region (dd or freq), hoisted into the prelude. +// The region's on-disk bytes are pure payload (no header); these fields drive +// the decode. crc covers the on-disk (disk_len) bytes. +struct FrqRegionMeta { + bool zstd = false; // true => disk bytes are zstd(plaintext); false => raw + uint64_t uncomp_len = 0; // plaintext byte length (== disk_len when raw) + uint64_t disk_len = 0; // on-disk byte length of this region + uint32_t crc = 0; // crc32c of the on-disk (disk_len) bytes + // When false, decode_*_region SKIPS the per-region crc check (and the writer + // omits the 4-byte crc from the dict entry). Set false for INLINE entries: + // their region bytes live inside the dict block, whose own block-level crc32c + // already covers them, so a per-region crc is fully redundant. POD-ref + // regions (slim/windowed) live in the separately-fetched .frq POD -- their + // crc stays. + bool verify_crc = true; +}; + +// Encodes a window's dd_region plaintext (VInt n ++ PFOR_runs(doc_delta)) into +// raw or zstd (per zstd_level_or_neg_for_auto), APPENDS the on-disk bytes to +// out, and fills meta (mode/uncomp_len/disk_len/crc). The region carries no +// header. docids_ascending: ascending docids in this window (single doc or +// empty allowed). win_base: previous window's last docid (first window = 0); +// requires docids[0] >= win_base. zstd_level_or_neg_for_auto: <0 auto (zstd +// when large enough, else raw); 0 force +// raw; >0 force zstd at that level. +// Non-ascending docids / first_docid < win_base / null out returns +// InvalidArgument. 
+Status build_dd_region(std::span docids_ascending, uint64_t win_base, + int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta); + +// Vector convenience overload (forwards a span view; no copy of the elements). +inline Status build_dd_region(const std::vector& docids_ascending, uint64_t win_base, + int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta) { + return build_dd_region(std::span(docids_ascending), win_base, + zstd_level_or_neg_for_auto, out, meta); +} + +// Encodes a window's freq_region plaintext (PFOR_runs(freq)) into raw or zstd, +// APPENDS the on-disk bytes to out, and fills meta. Empty freqs yields a +// zero-length region. Null out returns InvalidArgument. +Status build_freq_region(std::span freqs, int zstd_level_or_neg_for_auto, + ByteSink* out, FrqRegionMeta* meta); + +// Vector convenience overload (forwards a span view; no copy of the elements). +inline Status build_freq_region(const std::vector& freqs, int zstd_level_or_neg_for_auto, + ByteSink* out, FrqRegionMeta* meta) { + return build_freq_region(std::span(freqs), zstd_level_or_neg_for_auto, out, + meta); +} + +// Decodes a dd_region from its on-disk slice (exactly disk_len bytes) + meta + +// win_base, reconstructing ascending docids. Verifies meta.crc against the +// slice. crc mismatch / wrong slice length / truncation / decompression / +// oversized count all return a non-OK Status. The freq region is irrelevant +// here (docs-only path). +Status decode_dd_region(Slice dd_disk, const FrqRegionMeta& meta, uint64_t win_base, + std::vector* docids); + +// Decodes a freq_region from its on-disk slice (exactly disk_len bytes) + meta, +// producing doc_count freqs. Verifies meta.crc. doc_count == 0 yields empty +// freqs (and requires a zero-length region). crc mismatch / wrong slice length +// / etc. return a non-OK Status. 
+Status decode_freq_region(Slice freq_disk, const FrqRegionMeta& meta, size_t doc_count, + std::vector* freqs); + +} // namespace snii::format diff --git a/be/src/snii/format/frq_prelude.h b/be/src/snii/format/frq_prelude.h new file mode 100644 index 00000000000000..848e2bf0e2926b --- /dev/null +++ b/be/src/snii/format/frq_prelude.h @@ -0,0 +1,178 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +// FrqPrelude: a TWO-LEVEL (super-block -> window) skippable directory that +// precedes a windowed .frq posting whose payload is laid out (PHASE D, design +// 1.6) with dd and freq regions GROUPED at posting level: +// windowed .frq payload = [prelude][dd-block][freq-block] +// dd-block = dd_region_0 ++ dd_region_1 ++ ... ++ dd_region_{N-1} +// freq-block = freq_region_0 ++ ... ++ freq_region_{N-1} (iff has_freq) +// Windows are NOT self-describing: each window's full codec metadata (region +// offsets, on-disk/uncompressed lengths, modes, crcs) lives in the prelude rows. +// The docs-only prefix [prelude][dd-block] is therefore ONE contiguous run a +// docid-only / phrase reader fetches in a single range, skipping the freq-block. +// +// DictEntry records prelude_len, frq_len (whole payload) and frq_docs_len +// (= prelude_len + dd_block_len) so a reader can range-fetch the prelude first, +// then fetch either the contiguous dd-block (docs-only) or both blocks (scoring). 
+// +// On-disk layout (strict; all multi-byte fixed fields little-endian, VInt = +// LEB128 via snii/encoding): +// header: +// u8 flags # bit0 has_freq, bit1 has_prx +// VInt N # number of .frq windows +// VInt G # windows per super-block (group_size; >=1) +// VInt n_super # = ceil(N / G); 0 when N==0 +// VInt sbdir_len # byte length of the super_block_dir region +// u32 crc32c # covers header + super_block_dir (NOT the window blocks) +// super_block_dir[n_super]: # small, resident: one row per super-block +// VInt sb_last_docid_delta # cumulative across super-blocks => absolute last +// # docid of the super-block's last window +// VInt sb_block_off # byte offset of this super-block's window block, +// # measured from the start of the window_dir region +// VInt sb_block_len # byte length of this super-block's window block +// window_dir: n_super self-contained blocks, each holding <=G window rows. +// per window row: +// VInt last_docid_delta # cumulative WITHIN the block => absolute last docid +// # (previous window's absolute last docid = win_base; +// # first window of first block: win_base = 0) +// VInt doc_count # number of docs in the window (frq_pod needs it) +// u8 win_mode # bit0 dd_zstd, bit1 freq_zstd +// VInt dd_off # dd_region byte offset within the dd-block +// VInt dd_disk_len # dd_region on-disk byte length +// VInt dd_uncomp_len # dd_region plaintext byte length +// u32 crc_dd # crc32c of the dd_region on-disk bytes +// VInt freq_off # freq_region offset within the freq-block (has_freq) +// VInt freq_disk_len # freq_region on-disk byte length (has_freq) +// VInt freq_uncomp_len # freq_region plaintext byte length (has_freq) +// u32 crc_freq # crc32c of the freq_region on-disk bytes (has_freq) +// VInt prx_off # .prx payload byte offset (present iff has_prx) +// VInt prx_len # .prx payload byte length (present iff has_prx) +// VInt max_freq # window max term frequency (WAND block-max) +// u8 max_norm # window score-max norm (WAND); 0 acceptable 
+// +// Reconstructing win_base / absolute last_docid (READER CONTRACT) is unchanged: +// the writer chains absolute last docids across windows; each row stores the delta +// of its absolute last docid from the previous window, and sb_last_docid seeds +// each block, so super-block binary search then in-block window binary search +// locate the window covering any docid without decoding the .frq blocks. +// +// The trailing crc32c covers only header + super_block_dir; every region carries +// its own crc (crc_dd / crc_freq) in the row. +namespace snii::format { + +namespace frq_prelude_flags { +inline constexpr uint8_t kHasFreq = 1u << 0; +inline constexpr uint8_t kHasPrx = 1u << 1; +} // namespace frq_prelude_flags + +// Per-window codec mode bits (win_mode byte). +namespace frq_win_mode { +inline constexpr uint8_t kDdZstd = 1u << 0; +inline constexpr uint8_t kFreqZstd = 1u << 1; +inline constexpr uint8_t kKnownBits = kDdZstd | kFreqZstd; +} // namespace frq_win_mode + +// Absolute, decoded metadata for one window (as the reader exposes it). The dd / +// freq region locators are offsets WITHIN the dd-block / freq-block respectively +// (both blocks follow the prelude). The reader derives the dd-block length from +// the last window's dd_off + dd_disk_len. +struct WindowMeta { + uint32_t last_docid = 0; // absolute last docid in the window + uint64_t win_base = 0; // absolute last docid of the previous window (0 for w==0) + uint32_t doc_count = 0; + + // dd_region locator (within the dd-block). + bool dd_zstd = false; + uint64_t dd_off = 0; + uint64_t dd_disk_len = 0; + uint64_t dd_uncomp_len = 0; + uint32_t crc_dd = 0; + + // freq_region locator (within the freq-block); valid only when has_freq. 
+ bool freq_zstd = false; + uint64_t freq_off = 0; + uint64_t freq_disk_len = 0; + uint64_t freq_uncomp_len = 0; + uint32_t crc_freq = 0; + + uint64_t prx_off = 0; // valid only when has_prx + uint64_t prx_len = 0; // valid only when has_prx + uint32_t max_freq = 0; + uint8_t max_norm = 0; + + // In-memory only (NOT serialized in the prelude row). When false, the dd/freq + // region decode skips crc verification -- used when these region bytes are + // covered by an enclosing crc (e.g. an INLINE entry inside its dict block). + // Windowed/slim POD-ref rows leave this true (their regions carry a crc). + bool verify_crc = true; +}; + +// Builder input: one fully-computed WindowMeta per window, in term order, plus the +// super-block grouping factor. The writer fills last_docid (absolute), doc_count, +// the region locators/crcs, prx locator, max_freq and max_norm; win_base is derived +// during build (so callers may leave it 0). group_size must be >= 1. +struct FrqPreludeColumns { + bool has_freq = true; + bool has_prx = false; + uint32_t group_size = 64; // windows per super-block (G) + std::vector windows; +}; + +// Builds the prelude bytes and appends them to out. +// Returns InvalidArgument when out is null, group_size is 0, or the windows are +// not in non-decreasing last_docid order (a window's absolute last docid must be +// >= the previous window's). +Status build_frq_prelude(const FrqPreludeColumns& cols, ByteSink* out); + +// Reads and verifies a prelude buffer, exposing two-level skip access. The reader +// parses the header + super_block_dir on open (verifying the trailing crc) and +// eagerly decodes every window block into owned WindowMeta rows (the prelude is +// small relative to the postings). It does not retain the input. +class FrqPreludeReader { +public: + // Parses + verifies the prelude. crc mismatch / truncation / inconsistent + // offsets-or-lengths / oversized counts => kCorruption. 
+ static Status open(Slice prelude, FrqPreludeReader* out); + + uint32_t n_windows() const { return static_cast(windows_.size()); } + uint32_t n_super_blocks() const { return n_super_; } + bool has_freq() const { return has_freq_; } + bool has_prx() const { return has_prx_; } + + // Total on-disk byte length of the dd-block (== sum of dd_disk_len; the docs-only + // prefix after the prelude). 0 when there are no windows. + uint64_t dd_block_len() const { return dd_block_len_; } + // Total on-disk byte length of the freq-block (== sum of freq_disk_len). 0 when + // !has_freq or no windows. + uint64_t freq_block_len() const { return freq_block_len_; } + + // Returns the absolute WindowMeta for window w. Out-of-range => InvalidArgument. + Status window(uint32_t w, WindowMeta* out) const; + + // Locates the window covering docid via super-block binary search then window + // binary search. *found=false (with OK) when docid is past the term's last + // docid; otherwise *w is the index of the covering window (the first window + // whose absolute last_docid >= docid). + Status locate_window(uint32_t docid, bool* found, uint32_t* w) const; + +private: + bool has_freq_ = false; + bool has_prx_ = false; + uint32_t group_size_ = 1; + uint32_t n_super_ = 0; + uint64_t dd_block_len_ = 0; + uint64_t freq_block_len_ = 0; + // Absolute last docid at each super-block boundary (size n_super_). + std::vector sb_last_docid_; + // All windows decoded with absolute fields, in term order (size N). 
+ std::vector windows_; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/logical_index_directory.h b/be/src/snii/format/logical_index_directory.h new file mode 100644 index 00000000000000..3cfddbd7227bb8 --- /dev/null +++ b/be/src/snii/format/logical_index_directory.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +namespace snii::format { + +// Container-level directory entry: maps a logical index identity (index_id, index_suffix) +// to the physical location of its per-index meta block. Aligned with Doris key system +// (see design spec "footer meta region" logical index directory). The reader issues a +// single range read over [meta_off, meta_off + meta_len) to load that per-index meta. +struct LogicalIndexRef { + uint64_t index_id = 0; // logical index id (matches Doris InvertedIndexDescriptor key) + std::string index_suffix; // UTF-8 sub-index suffix; may be empty for the primary index + uint64_t meta_off = 0; // absolute byte offset of the per-index meta block in the container + uint64_t meta_len = 0; // byte length of the per-index meta block +}; + +// Logical index directory: (index_id, index_suffix) -> per-index meta block reference. +// +// on-disk layout (framed by SectionFramer with a unified type+len+crc32c wrapper): +// [u8 type=kLogicalIndexDirectory][varint64 payload_len][payload][fixed32 crc32c] +// payload = varint32 n_entries +// then n_entries x { +// varint64 index_id, +// varint32 suffix_len, suffix_bytes, +// varint64 per_index_meta_off, +// varint64 per_index_meta_len } +// The section-level crc covers the whole directory, so no per-entry crc is stored +// (the spec lists a per-entry crc32c as optional; it is folded into the framer crc here). 
+class LogicalIndexDirectoryBuilder { +public: + void add(const LogicalIndexRef& ref) { refs_.push_back(ref); } + + // Encodes as a kLogicalIndexDirectory framed section (with embedded crc32c) and appends to sink. + void finish(ByteSink* sink) const; + +private: + std::vector refs_; +}; + +// Reads and verifies a kLogicalIndexDirectory framed section; provides ordinal access and +// (index_id, suffix) lookup. After parsing, all entries reside in the reader (entering the +// searcher cache along with the rest of the tail meta region). +class LogicalIndexDirectoryReader { +public: + // Verifies the section crc and deserializes all entries. + // crc mismatch / truncation / trailing bytes / oversized counts -> kCorruption; + // wrong section type -> kInvalidArgument; null out -> kInvalidArgument. + static Status open(Slice framed, LogicalIndexDirectoryReader* out); + + uint32_t size() const { return static_cast(refs_.size()); } + + // Returns the i-th entry in encounter order; i >= size -> kNotFound. + Status get(uint32_t i, LogicalIndexRef* out) const; + + // Looks up the entry for (index_id, suffix). On match, *found=true and *out is populated; + // when absent, *found=false and *out is left untouched. Returns kInvalidArgument on null + // output pointers. The pair (index_id, suffix) is the unique key. 
+ Status find(uint64_t index_id, std::string_view suffix, bool* found, + LogicalIndexRef* out) const; + +private: + std::vector refs_; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/norms_pod.h b/be/src/snii/format/norms_pod.h new file mode 100644 index 00000000000000..6580b1df2ffcc1 --- /dev/null +++ b/be/src/snii/format/norms_pod.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +namespace snii::format { + +// norms POD: per logical index / field stores 1-byte encoded doc length per doc, +// used by BM25 length normalization (SniiStatsProvider::encoded_norm) for per-docid lookup. +// +// On-disk layout (the whole section is framed by SectionFramer, which adds a type+len+crc32c envelope): +// framer payload = [varint64 doc_count][bytes encoded_norm[doc_count]] +// framer envelope = [u8 type][varint64 payload_len][payload][fixed32 crc32c] +// The encoding of encoded_norm (length -> 1B) is out of scope for this module; here we only handle raw byte storage and retrieval. +class NormsPodWriter { +public: + // Appends the encoded_norm for the next docid (docid is implicit, assigned in append order starting from 0). + void add(uint8_t encoded_norm) { norms_.push_back(encoded_norm); } + + // Number of docs accumulated so far (i.e., the next docid to be assigned). + size_t count() const { return norms_.size(); } + + // Writes [doc_count][bytes] framed by SectionFramer into sink (appends; does not clear sink). + void finish(ByteSink* sink) const; + +private: + std::vector norms_; +}; + +// Read-only view: on open, verifies the framer CRC and checks that doc_count/payload length are consistent, +// afterwards encoded_norm(docid) is O(1) direct indexing (zero-copy, borrows the underlying buffer). +class NormsPodReader { +public: + NormsPodReader() = default; + + // Parses the entire section (including the framer envelope). 
Returns Corruption on CRC mismatch, truncation, or length inconsistency. + // On success, *out borrows the memory backing `framed` (zero-copy); the caller must keep that buffer alive for as long as *out is used. + static Status open(Slice framed, NormsPodReader* out); + + uint32_t doc_count() const { return doc_count_; } + + // Precondition (hard contract): docid < doc_count(). Semantics match std::vector::operator[]: + // the caller is responsible for guaranteeing this (docid comes from trusted postings decoded internally by SNII). Asserts in debug builds; + // no check in Release (NDEBUG). Use try_encoded_norm when the docid is untrusted and needs validation. + uint8_t encoded_norm(uint32_t docid) const { + assert(docid < doc_count_); + return norms_[docid]; + } + + // Checked access: returns InvalidArgument if docid is out of range; never reads out-of-range memory. out must be non-null: it is written through without a null check. + Status try_encoded_norm(uint32_t docid, uint8_t* out) const { + if (docid >= doc_count_) return Status::InvalidArgument("norms: docid out of range"); + *out = norms_[docid]; + return Status::OK(); + } + +private: + const uint8_t* norms_ = nullptr; + uint32_t doc_count_ = 0; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/null_bitmap.h b/be/src/snii/format/null_bitmap.h new file mode 100644 index 00000000000000..21c6f92be59709 --- /dev/null +++ b/be/src/snii/format/null_bitmap.h @@ -0,0 +1,90 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +// Forward-declare the CRoaring C++ bitmap so this header stays free of the +// (large) roaring include; the concrete type is only needed in the .cpp. +namespace roaring { +class Roaring; +} // namespace roaring + +namespace snii::format { + +// SectionFramer type byte for the null-bitmap POD. There is no dedicated +// SectionType enum value yet, so we use a documented literal (0x20) outside the +// currently allocated enum range (1..9) to avoid colliding with existing types.
+inline constexpr uint8_t kNullBitmapSectionType = 0x20; + +// NullBitmap POD: per logical index, a Roaring bitmap of null docids (docs whose +// value is NULL / not indexed). It decouples per-doc NULL information from the +// per-term dictionary / postings so NULL handling can pull only this side POD. +// +// On-disk layout (the whole section is framed by SectionFramer, which adds a +// type + varint64 len + payload + fixed32 crc32c envelope): +// framer payload = [varint64 doc_count][varint64 roaring_size][roaring_bytes] +// roaring_bytes is the portable CRoaring serialization (Roaring::write). +class NullBitmapWriter { +public: + NullBitmapWriter(); + ~NullBitmapWriter(); + + NullBitmapWriter(const NullBitmapWriter&) = delete; + NullBitmapWriter& operator=(const NullBitmapWriter&) = delete; + + // Marks docid as NULL (adding the same docid twice is idempotent). + void add_null(uint32_t docid); + + // Number of distinct null docids accumulated so far. + uint32_t null_count() const; + + // Serializes [doc_count][roaring_size][roaring_bytes] framed by SectionFramer + // and appends it to sink (does not clear sink). doc_count is the total number + // of docs in the logical index (recorded so the reader can round-trip it). + void finish(uint32_t doc_count, ByteSink* sink) const; + +private: + std::unique_ptr bitmap_; +}; + +// Read-only view: on open, SectionFramer verifies the CRC and truncation; this +// class then guards roaring_size against the remaining payload bytes before +// deserializing the Roaring bitmap (anti-DoS), so a corrupt size cannot trigger +// an oversized allocation/read. is_null() is then an O(1) membership test. 
+class NullBitmapReader { +public: + NullBitmapReader(); + ~NullBitmapReader(); + + NullBitmapReader(const NullBitmapReader&) = delete; + NullBitmapReader& operator=(const NullBitmapReader&) = delete; + NullBitmapReader(NullBitmapReader&&) noexcept; + NullBitmapReader& operator=(NullBitmapReader&&) noexcept; + + // Parses the entire section (framer envelope + payload). Returns Corruption on + // CRC mismatch, truncation, doc_count overflow, or an oversized roaring_size. + static Status open(Slice framed, NullBitmapReader* out); + + // True iff docid was marked NULL. docids outside the null set (including those + // >= doc_count) return false. + bool is_null(uint32_t docid) const; + + // Number of distinct null docids in the bitmap. + uint32_t null_count() const; + + // Copies the decoded bitmap into the caller-owned Roaring object. + void copy_to(roaring::Roaring* out) const; + + // Total doc count of the logical index, as recorded by the writer. + uint32_t doc_count() const { return doc_count_; } + +private: + std::unique_ptr bitmap_; + uint32_t doc_count_ = 0; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/per_index_meta.h b/be/src/snii/format/per_index_meta.h new file mode 100644 index 00000000000000..1a89a8710fbd7a --- /dev/null +++ b/be/src/snii/format/per_index_meta.h @@ -0,0 +1,150 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/format_constants.h" +#include "snii/format/stats_block.h" + +// PerIndexMeta -- the per-logical-index metadata block that enters the searcher +// cache. It COMPOSES already-built sub-sections (StatsBlock, SampledTermIndex, +// DICT block directory, optional XFilter) plus the physical SectionRefs into a +// single contiguous block. See design spec "Per-index meta block". 
+// +// On-disk layout: +// PerIndexMetaHeader (fixed prefix, self-checksummed): +// u16 meta_format_version (== kMetaFormatVersion), little-endian +// varint64 index_id +// varint32 suffix_len +// u8[] suffix_bytes +// u32 flags (fixed32, little-endian) # feature bits, e.g. kHasBsbf +// u32 crc32c (fixed32) over all preceding header bytes +// then framed sub-sections (each via SectionFramer, type+len+payload+crc32c): +// StatsBlock (kStatsBlock, built here) +// SampledTermIndex (kSampledTermIndex, embedded already-framed bytes) +// DICT block directory (kDictBlockDirectory,embedded already-framed bytes) +// SectionRefs (kSectionRefs, built here; carries the bsbf ref) +// (+ any extra raw framed sections appended by add_raw_section) +// +// Design choice: the SampledTermIndex / DICT block directory / XFilter +// sub-sections are EMBEDDED as their producers' already-framed output (the raw +// SectionFramer frame), not re-framed. This lets the reader hand the exact frame +// Slice straight back to each sub-module's open() (which expects a full frame), +// and reuses the framer instead of re-implementing sub-section parsing. +namespace snii::format { + +// Physical reference to a contiguous region within the container. (0, 0) means +// the region is absent (e.g. no norms POD for a non-scoring index). A present- +// but-empty region (e.g. an all-INLINE index's posting_region) is (off, 0). +struct RegionRef { + uint64_t offset = 0; + uint64_t length = 0; +}; + +// Physical references to the data sections / side PODs of one logical index. +// Each RegionRef is encoded as varint64 offset followed by varint64 length, in +// the field order below. +// +// posting_region is the single interleaved [prx][frq] posting region (it replaced +// the former two separate frq_pod + prx_pod refs). Each pod_ref term writes its +// prx span first then its frq span, contiguously, in term order; both +// frq_off_delta and prx_off_delta now index into this one region. 
NO positions +// capability is inferred from posting_region.length -- it is non-zero for any +// docs-only index with a pod_ref term, and zero for an all-INLINE positional +// index; capability lives in the header kHasPositions flag instead. +struct SectionRefs { + RegionRef dict_region; + RegionRef posting_region; // interleaved [prx][frq] per term; was frq_pod + prx_pod + RegionRef norms; + RegionRef null_bitmap; + // Block-split bloom XFilter section ([28B header][bitset]); {0,0} when absent. + // A PHYSICAL section (not embedded in the resident meta) so a single 32-byte block + // can be probed on demand without loading the whole filter at open. + RegionRef bsbf; +}; + +// Builds a per-index meta block by composing already-built sub-sections. +class PerIndexMetaBuilder { +public: + // Header flags / feature bits. + static constexpr uint32_t kHasPositions = 1u << 0; // index is positions-capable (tier>=T2) + static constexpr uint32_t kHasBsbf = 1u << 1; // block-split bloom XFilter (section ref) + + PerIndexMetaBuilder(uint64_t index_id, std::string index_suffix, uint32_t flags); + + void set_stats(const StatsBlock& stats); + + // Raw output of SampledTermIndexBuilder::finish (a full kSampledTermIndex frame). + void set_sampled_term_index(Slice framed_bytes); + + // Raw output of DictBlockDirectoryBuilder::finish (a full kDictBlockDirectory frame). + void set_dict_block_directory(Slice framed_bytes); + + void set_section_refs(const SectionRefs& refs); + + // Appends an arbitrary already-framed section verbatim. Used for forward-compat + // optional sections; the reader skips unrecognized types. + void add_raw_section(Slice framed_bytes); + + // Serializes the header and all sub-sections into sink. + // sink == nullptr -> kInvalidArgument. 
+ Status finish(ByteSink* sink) const; + +private: + uint64_t index_id_; + std::string index_suffix_; + uint32_t flags_; + StatsBlock stats_; + std::vector sampled_term_index_; + std::vector dict_block_directory_; + SectionRefs section_refs_; + std::vector> extra_sections_; +}; + +// Parses a per-index meta block: verifies the header crc, then walks the framed +// sub-sections (each crc-verified by the framer), capturing the full frame Slice +// of each known sub-section so callers can re-open it with the sub-module reader. +// Unrecognized optional section types are skipped. +class PerIndexMetaReader { +public: + PerIndexMetaReader() = default; + + // block == the full per-index meta block bytes; out must be non-null. + // Header crc mismatch / truncation / a sub-section crc mismatch -> kCorruption; + // missing a required sub-section -> kCorruption; out == nullptr -> kInvalidArgument. + static Status open(Slice block, PerIndexMetaReader* out); + + uint64_t index_id() const { return index_id_; } + const std::string& index_suffix() const { return index_suffix_; } + uint32_t flags() const { return flags_; } + + const StatsBlock& stats() const { return stats_; } + const SectionRefs& section_refs() const { return section_refs_; } + + // Full kSampledTermIndex frame Slice, ready for SampledTermIndexReader::open. + Slice sampled_term_index_bytes() const { return sampled_term_index_; } + // Full kDictBlockDirectory frame Slice, ready for DictBlockDirectoryReader::open. + Slice dict_block_directory_bytes() const { return dict_block_directory_; } + + // Block-split bloom XFilter: present iff a non-empty bsbf section ref exists. + bool has_bsbf() const { return section_refs_.bsbf.length > 0; } + + // Positions capability, read from the persisted header flag (NOT from any region + // length). True iff the index was built as docs-positions(+scoring) (tier>=T2). 
+    bool has_positions() const { return (flags_ & PerIndexMetaBuilder::kHasPositions) != 0; }
+
+private:
+    uint64_t index_id_ = 0;
+    std::string index_suffix_;
+    uint32_t flags_ = 0;
+    StatsBlock stats_;
+    SectionRefs section_refs_;
+    Slice sampled_term_index_;
+    Slice dict_block_directory_;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/prx_pod.h b/be/src/snii/format/prx_pod.h
new file mode 100644
index 00000000000000..50c8536acb4cfe
--- /dev/null
+++ b/be/src/snii/format/prx_pod.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+
+// .prx position window (PrxPod): stores term position information for several
+// docs within one window.
+//
+// Single-window on-disk byte layout (see docs/design SNII "prx design"):
+//   u8    codec       # PrxCodec: 0=raw / 1=zstd / 2=pfor (bit7 cont-reserved)
+//   VInt  uncomp_len  # payload length (raw/pfor: on-disk payload bytes; zstd: plaintext)
+//   VInt  comp_len    # present only when codec==zstd
+//   u32   crc32c      # covers header (codec..comp_len) + payload
+//   bytes payload     # raw: varint plaintext; zstd: compressed; pfor: bit-packed
+//
+// raw/zstd plaintext payload (self-describing per-doc boundaries):
+//   VInt doc_count
+//   per doc: VInt pos_count, followed by pos_count position deltas (VInt)
+//   positions within a doc are ascending, stored as deltas (first absolute).
+//
+// pfor payload (default build codec; no entropy coding):
+//   VInt doc_count
+//   VInt total_pos               # sum of all pos_counts
+//   per doc: VInt pos_count
+//   PFOR_runs(position_deltas)   # total_pos deltas, kFrqBaseUnit per run,
+//                                # flat doc order (first delta of each doc
+//                                # is absolute)
+//
+// Multi-byte fixed-length fields are little-endian; variable-length integers
+// reuse snii/encoding/varint. crc32c checksum at window tail detects
+// corruption.
+namespace snii::format {
+
+// Build a .prx window and append it to sink.
+// per_doc_positions[d] is the position list for the d-th doc within this +// window; must be ascending (duplicates allowed). +// zstd_level_or_negative_for_auto: +// <0 → auto: use ZSTD (default level) when payload is large enough, +// otherwise raw. 0 → force raw (no compression). >0 → force ZSTD with the +// given level. +// Non-ascending positions within a doc return InvalidArgument. +Status build_prx_window(std::span> per_doc_positions, + int zstd_level_or_negative_for_auto, ByteSink* sink); + +// Vector convenience overload (forwards a span view over the window's per-doc +// lists; the writer can pass a slice of its flat positions WITHOUT deep-copying +// the inner vectors into a fresh std::vector> per +// window). +inline Status build_prx_window(const std::vector>& per_doc_positions, + int zstd_level_or_negative_for_auto, ByteSink* sink) { + return build_prx_window(std::span>(per_doc_positions), + zstd_level_or_negative_for_auto, sink); +} + +// FLAT-positions builder: byte-identical output to build_prx_window above, but +// reads the window's positions from a single flat span partitioned per-doc by +// `freqs` (doc d owns the next freqs[d] entries; freqs.size() == doc count and +// sum(freqs) == positions_flat.size()). Lets the writer pass a subspan of the +// term's flat positions/freqs with NO vector-of-vectors materialization. +Status build_prx_window_flat(std::span positions_flat, + std::span freqs, int zstd_level_or_negative_for_auto, + ByteSink* sink); + +// Read and verify a .prx window from source, reconstructing the per-doc +// position list. CRC mismatch / invalid codec / truncation / decompression +// failure all return a non-OK Status. +Status read_prx_window(ByteSource* source, std::vector>* per_doc_positions); + +// CSR variant of read_prx_window: decodes ALL docs' positions into one flat +// buffer `pos_flat` with per-doc offsets `pos_off` (size doc_count+1, +// pos_off[0]==0), so doc d's positions are pos_flat[pos_off[d] .. 
+// pos_off[d+1]). Avoids the per-doc std::vector allocation of read_prx_window +// -- both output vectors are flat uint32 buffers whose capacity a caller can +// retain (clear()) across windows/queries. +Status read_prx_window_csr(ByteSource* source, std::vector* pos_flat, + std::vector* pos_off); + +// Selective CSR variant: decodes positions only for the requested local doc +// ordinals within this PRX window. `doc_ordinals` must be strictly ascending. +// The output uses the same CSR shape, but has doc_ordinals.size()+1 offsets. +Status read_prx_window_csr_selective(ByteSource* source, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off); + +} // namespace snii::format diff --git a/be/src/snii/format/sampled_term_index.h b/be/src/snii/format/sampled_term_index.h new file mode 100644 index 00000000000000..b4348dd74eccd9 --- /dev/null +++ b/be/src/snii/format/sampled_term_index.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/format_constants.h" + +// SampledTermIndex -- resident metadata for locating a query term to a candidate DICT block. +// +// Sampling granularity is per DICT block (not a fixed term count): each time the writer produces a DICT block, +// it writes the block's first_term into this index. Size grows proportionally to block count. At read time it is +// loaded into the searcher cache together with SniiLogicalIndexReader. See design spec "Sampled Term Index". 
+// +// On-disk layout (framed by SectionFramer, uniform type+len+crc32c): +// [u8 type=kSampledTermIndex][varint64 payload_len][payload][fixed32 crc32c] +// payload = +// n_blocks varint32 +// min_term len(varint32) + bytes # == sample_terms[0], omitted when n_blocks=0 +// max_term len(varint32) + bytes # == sample_terms[n-1], omitted when n_blocks=0 +// sample_terms[n_blocks]: # first_term of each block, in ascending order +// prefix_len varint32 # shared prefix length with the previous sample_term +// suffix_len varint32 +// suffix u8[suffix_len] +// +// Term bytes are compared as unsigned byte order (UTF-8 friendly, binary-safe). Front coding reuses +// the same prefix/suffix primitives as DictEntry; do not reimplement. +namespace snii::format { + +// Builder: appends the first_term of each DICT block in block ordinal order (must be strictly ascending), +// and serializes the entire set into a single kSampledTermIndex framed section on finish. +class SampledTermIndexBuilder { +public: + // Appends the first_term of the next DICT block. Call order determines block ordinal order. + void add_block_first_term(std::string_view first_term); + + // Serializes and appends to sink. An empty collection (no blocks) is valid; n_blocks=0. + void finish(ByteSink* sink); + +private: + std::vector first_terms_; +}; + +// Reader: verifies the checksum and materializes all sample_terms on open; subsequent locate calls are pure in-memory binary search. +class SampledTermIndexReader { +public: + SampledTermIndexReader() = default; + + // Parses a kSampledTermIndex framed section. + // CRC mismatch / truncation / field overrun → kCorruption; type != kSampledTermIndex → kInvalidArgument. + static Status open(Slice section, SampledTermIndexReader* out); + + // Binary-search locate: returns the block ordinal of the last sample_term <= target. + // target < min_term or target > max_term (including empty index) → *maybe_present=false (out of range, term is definitely absent). 
+ // Otherwise *maybe_present=true and *block_ordinal is the ordinal of the matching block. + Status locate(std::string_view target, bool* maybe_present, uint32_t* block_ordinal) const; + + uint32_t n_blocks() const { return static_cast(sample_terms_.size()); } + +private: + std::vector sample_terms_; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/stats_block.h b/be/src/snii/format/stats_block.h new file mode 100644 index 00000000000000..20ef0c6613f85d --- /dev/null +++ b/be/src/snii/format/stats_block.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +// Statistics block within the per-index meta block. Carries only the counting stats +// needed for query planning and BM25; section location info is stored separately in SectionRefs (see design spec "Per-index meta block"). +// +// On-disk layout (framed by SectionFramer with unified type+len+crc32c): +// [u8 type=kStatsBlock][varint64 payload_len][payload][fixed32 crc32c] +// payload = varint64{ doc_count, indexed_doc_count, term_count, +// sum_total_term_freq, null_count } +// For field semantics see design spec "Scoring statistics design". +struct StatsBlock { + uint64_t doc_count = 0; // total doc count at segment level (including unindexed/NULL) + uint64_t indexed_doc_count = 0; // number of docs actually indexed (denominator for avgdl) + uint64_t term_count = 0; // number of unique terms in this index + uint64_t sum_total_term_freq = 0; // total token count across all indexed docs + uint64_t null_count = 0; // number of NULL / not-indexed docs +}; + +// Encodes into a kStatsBlock framed section (with built-in crc32c checksum) and appends to sink. 
+void encode_stats_block(const StatsBlock& sb, ByteSink* sink); + +// Reads and verifies a kStatsBlock framed section from src, populates out. +// CRC mismatch / truncation → kCorruption; type is not kStatsBlock → kInvalidArgument. +Status decode_stats_block(ByteSource* src, StatsBlock* out); + +} // namespace snii::format diff --git a/be/src/snii/format/tail_meta_region.h b/be/src/snii/format/tail_meta_region.h new file mode 100644 index 00000000000000..21fd737e55cf30 --- /dev/null +++ b/be/src/snii/format/tail_meta_region.h @@ -0,0 +1,74 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/logical_index_directory.h" + +namespace snii::format { + +// TailMetaRegion: the container's tail metadata region, located via the fixed +// tail pointer and read in one range. It bundles the per-logical-index meta +// blocks and the logical index directory so a reader can, after a single read, +// map (index_id, index_suffix) -> per-index meta block. See spec "footer meta +// region". +// +// On-disk layout (offsets are relative to the region start; the region is read +// whole into memory, so internal refs need not be file-absolute): +// TailMetaHeader: +// u32 meta_format_version (== kMetaFormatVersion) +// u32 flags +// u64 meta_region_len (== total region byte length) +// u32 n_logical_indexes +// u64 directory_offset (offset of the logical index directory in-region) +// u64 directory_length +// u32 header_crc32c (covers the header fields above) +// [per-index meta block #0][per-index meta block #1]... (opaque payloads) +// [logical index directory] (framed via LogicalIndexDirectory) +// u32 meta_region_checksum (crc32c over everything before it) +class TailMetaRegionBuilder { +public: + // Adds a per-index meta block (already serialized by PerIndexMetaBuilder) keyed + // by (index_id, index_suffix). Bytes are copied. 
+ void add_index(uint64_t index_id, std::string index_suffix, Slice per_index_meta_bytes); + + // Serializes the whole region and appends it to sink. + void finish(ByteSink* sink) const; + +private: + struct Entry { + uint64_t index_id; + std::string suffix; + std::vector bytes; + }; + std::vector entries_; +}; + +class TailMetaRegionReader { +public: + TailMetaRegionReader() = default; + + // Parses and validates the region (header crc + region checksum + directory). + // region must outlive this reader (find() returns sub-views of it). + static Status open(Slice region, TailMetaRegionReader* out); + + uint32_t n_logical_indexes() const { return n_; } + const LogicalIndexDirectoryReader& directory() const { return dir_; } + + // Locates the per-index meta block bytes for (index_id, suffix). On match, + // *found=true and *per_index_meta_bytes views into the region; else *found=false. + Status find(uint64_t index_id, std::string_view suffix, bool* found, + Slice* per_index_meta_bytes) const; + +private: + Slice region_; + LogicalIndexDirectoryReader dir_; + uint32_t n_ = 0; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/tail_pointer.h b/be/src/snii/format/tail_pointer.h new file mode 100644 index 00000000000000..655635bf071fb8 --- /dev/null +++ b/be/src/snii/format/tail_pointer.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +namespace snii::format { + +// Fixed-size entry written at the very end of a segment's .idx file. It lets a +// reader locate the tail meta region with a single read of the trailing +// tail_pointer_size() bytes (see design spec "fixed tail pointer"). 
+// +// On-disk layout (all multi-byte fields little-endian, FIXED total size so the +// reader can read exactly the last tail_pointer_size() bytes): +// [u32 magic = kTailMagic] +// [u16 format_version = kFormatVersion] +// [u64 meta_region_offset] +// [u64 meta_region_length] +// [u64 hot_off] (offset of the hot region [hot_off, EOF); +// 0 if absent) +// [u32 meta_region_checksum] +// [u32 bootstrap_header_checksum] +// [u8 tail_pointer_size] (== tail_pointer_size()) +// [u32 tail_checksum] (crc32c over all preceding tail-pointer bytes) +// +// The fixed layout deliberately does NOT use the SectionFramer (which is +// variable-length): a footer needs a constant trailing size the reader knows up +// front. +struct TailPointer { + uint64_t meta_region_offset = 0; + uint64_t meta_region_length = 0; + uint64_t hot_off = 0; + uint32_t meta_region_checksum = 0; + uint32_t bootstrap_header_checksum = 0; +}; + +// Constant on-disk size of the tail pointer, so the reader knows how many +// trailing bytes to read. +size_t tail_pointer_size(); + +// Appends the fixed-layout tail-pointer bytes (magic / version / fields / size / +// tail_checksum) to sink. Returns Internal if the encoded size would not fit the +// fixed-size contract (a programming error, never expected at runtime). +Status encode_tail_pointer(const TailPointer& tp, ByteSink* sink); + +// Parses the trailing tail-pointer bytes. last_bytes must be exactly +// tail_pointer_size() bytes long. Verifies magic and tail_checksum, then fills +// out with the parsed fields. Wrong magic / checksum mismatch / wrong length -> +// Corruption. 
+Status decode_tail_pointer(Slice last_bytes, TailPointer* out); + +} // namespace snii::format diff --git a/be/src/snii/io/batch_range_fetcher.h b/be/src/snii/io/batch_range_fetcher.h new file mode 100644 index 00000000000000..c9fc7bd083558e --- /dev/null +++ b/be/src/snii/io/batch_range_fetcher.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" + +namespace snii::io { + +// Collects the byte ranges a query plan needs, coalesces overlapping/adjacent +// ranges into physical reads, and fetches them in a single batch (one serial +// I/O round on a MeteredFileReader). Callers retrieve each requested range by +// the handle returned from add(). This is the SNII read path's batching layer: +// it front-loads range planning so reads are issued concurrently rather than +// cursor-by-cursor. +class BatchRangeFetcher { +public: + // coalesce_gap: requests separated by a gap <= this many bytes are merged into + // one physical read (reads a few extra bytes to save a request). 0 merges only + // overlapping/adjacent ranges. + explicit BatchRangeFetcher(FileReader* reader, uint64_t coalesce_gap = 0); + + // Registers a desired range; returns a handle usable with get() after fetch(). + size_t add(uint64_t offset, uint64_t len); + + // Coalesces and issues one batched read; fills internal buffers. + Status fetch(); + + // Bytes for handle h (valid only after a successful fetch(), until clear()). 
+ Slice get(size_t h) const; + + size_t pending() const { return reqs_.size(); } + void clear(); + +private: + struct Req { + uint64_t offset; + uint64_t len; + size_t len_size = 0; // validated size_t length after successful fetch() + size_t phys_idx = 0; // index into phys_ after fetch + size_t sub_offset = 0; // byte offset of this req within its physical read + }; + + FileReader* reader_; + uint64_t coalesce_gap_; + std::vector reqs_; + std::vector> phys_; // physical read buffers after fetch +}; + +} // namespace snii::io diff --git a/be/src/snii/io/file_reader.h b/be/src/snii/io/file_reader.h new file mode 100644 index 00000000000000..b8aae0c9957d1a --- /dev/null +++ b/be/src/snii/io/file_reader.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/io_metrics.h" + +namespace snii::io { + +// One logical read request (offset, length). +struct Range { + uint64_t offset = 0; + size_t len = 0; +}; + +// The single physical-read primitive (a BE-internal read_at). All higher layers +// route reads through this so I/O can be accounted and backed by local files or +// object storage interchangeably. +class FileReader { +public: + virtual ~FileReader() = default; + + // Reads exactly len bytes starting at offset into *out (which is resized to + // len). Reading past EOF is an error (Corruption/IoError). + virtual Status read_at(uint64_t offset, size_t len, std::vector* out) = 0; + + // Reads a batch of ranges that may be served concurrently. The default is a + // sequential loop; backends that model concurrency (MeteredFileReader) or + // perform real parallel fetches (object storage) override this. 
+    // Default implementation: serves the ranges one at a time through
+    // read_at(), in request order. `outs` is resized to ranges.size() up front;
+    // the first failing read returns its error immediately, so the remaining
+    // ranges are not read. outs == nullptr -> InvalidArgument instead of a
+    // null dereference.
+    virtual Status read_batch(const std::vector& ranges,
+                              std::vector>* outs) {
+        if (outs == nullptr) return Status::InvalidArgument("read_batch: outs is null");
+        outs->resize(ranges.size());
+        for (size_t i = 0; i < ranges.size(); ++i) {
+            SNII_RETURN_IF_ERROR(read_at(ranges[i].offset, ranges[i].len, &(*outs)[i]));
+        }
+        return Status::OK();
+    }
+
+    // Total size of the underlying object in bytes.
+    virtual uint64_t size() const = 0;
+
+    // Optional live metrics. Readers that do not account I/O return nullptr.
+    virtual const IoMetrics* io_metrics() const { return nullptr; }
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/file_writer.h b/be/src/snii/io/file_writer.h
new file mode 100644
index 00000000000000..a216898423c209
--- /dev/null
+++ b/be/src/snii/io/file_writer.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+
+namespace snii::io {
+
+// Append-only writer (no seek-back), so the format can be produced in a single
+// streaming pass compatible with S3FileWriter / StreamSinkFileWriter / packed
+// writer. All container bytes are written front-to-back; back-references are
+// resolved by writing metadata last.
+class FileWriter {
+public:
+    virtual ~FileWriter() = default;
+
+    virtual Status append(Slice data) = 0;
+    virtual Status finalize() = 0;
+    virtual uint64_t bytes_written() const = 0;
+};
+
+} // namespace snii::io
diff --git a/be/src/snii/io/io_metrics.h b/be/src/snii/io/io_metrics.h
new file mode 100644
index 00000000000000..27e4d21bb0c2f8
--- /dev/null
+++ b/be/src/snii/io/io_metrics.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include
+
+namespace snii::io {
+
+// Object-storage access metrics collected at FileReader boundaries.
+struct IoMetrics {
+    uint64_t read_at_calls = 0;       // BE-internal logical read requests issued
+    uint64_t serial_rounds = 0;       // dependent serial I/O rounds
+    uint64_t range_gets = 0;          // remote range GETs after cache coalescing
+    uint64_t remote_bytes = 0;        // bytes fetched from remote
+    uint64_t total_request_bytes = 0; // sum of requested lengths before cache
+};
+
+// Per-counter difference `after - before` between two snapshots. Each counter
+// is clamped at 0 when `before` exceeds `after` (e.g. the counters were reset
+// between the snapshots); a plain unsigned subtraction would wrap to ~2^64.
+inline IoMetrics delta(const IoMetrics& after, const IoMetrics& before) {
+    const auto sub = [](uint64_t a, uint64_t b) -> uint64_t { return a > b ? a - b : 0; };
+    IoMetrics out;
+    out.read_at_calls = sub(after.read_at_calls, before.read_at_calls);
+    out.serial_rounds = sub(after.serial_rounds, before.serial_rounds);
+    out.range_gets = sub(after.range_gets, before.range_gets);
+    out.remote_bytes = sub(after.remote_bytes, before.remote_bytes);
+    out.total_request_bytes = sub(after.total_request_bytes, before.total_request_bytes);
+    return out;
+}
+
+} // namespace snii::io
diff --git a/be/src/snii/io/local_file.h b/be/src/snii/io/local_file.h
new file mode 100644
index 00000000000000..a67477750c2be3
--- /dev/null
+++ b/be/src/snii/io/local_file.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/file_reader.h"
+#include "snii/io/file_writer.h"
+
+namespace snii::io {
+
+// Local-filesystem FileReader. Uses pread for positional, thread-safe reads
+// (so concurrent batch fetches do not contend on a shared file offset).
+class LocalFileReader : public FileReader {
+public:
+    LocalFileReader() = default;
+    ~LocalFileReader() override;
+
+    LocalFileReader(const LocalFileReader&) = delete;
+    LocalFileReader& operator=(const LocalFileReader&) = delete;
+
+    Status open(const std::string& path);
+    Status read_at(uint64_t offset, size_t len, std::vector* out) override;
+    uint64_t size() const override { return size_; }
+
+private:
+    int fd_ = -1;
+    uint64_t size_ = 0;
+};
+
+// Local-filesystem append-only FileWriter.
Appends accumulate in a fixed +// userspace buffer and are flushed to the fd in large chunks, collapsing the +// many tiny per-append ::write() syscalls of the build path (e.g. ~53k writes +// averaging ~683 B each) into a handful of big writes. The produced file is +// byte-identical to the unbuffered path; only the syscall count drops. +class LocalFileWriter : public FileWriter { +public: + LocalFileWriter() = default; + ~LocalFileWriter() override; + + LocalFileWriter(const LocalFileWriter&) = delete; + LocalFileWriter& operator=(const LocalFileWriter&) = delete; + + Status open(const std::string& path); + Status append(Slice data) override; + Status finalize() override; + uint64_t bytes_written() const override { return bytes_written_; } + +private: + // Userspace write buffer size. 256 KiB amortizes the write() syscall cost over + // many appends while keeping transient RAM negligible vs the index sections. + static constexpr size_t kBufCapacity = 256u * 1024; + + // Flushes the userspace buffer to the fd with a robust partial-write loop. + Status flush_buffer(); + // Writes a raw byte span straight to the fd (used for spans larger than the + // buffer, bypassing a needless copy). + Status write_all(const uint8_t* data, size_t len); + + int fd_ = -1; + uint64_t bytes_written_ = 0; + std::vector buf_; +}; + +} // namespace snii::io diff --git a/be/src/snii/io/metered_file_reader.h b/be/src/snii/io/metered_file_reader.h new file mode 100644 index 00000000000000..41fed3eb7ac49a --- /dev/null +++ b/be/src/snii/io/metered_file_reader.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/io/file_reader.h" +#include "snii/io/io_metrics.h" + +namespace snii::io { + +// A FileReader decorator that models an object-storage FileCache: reads are +// aligned to fixed (default 1MiB) blocks; only not-yet-resident blocks become +// remote range GETs (adjacent misses are coalesced). 
It is the single shared +// "yardstick" through which both single blocking reads and batched concurrent +// reads are measured. +// +// - read_at(): a single blocking read. Any cache miss => +1 serial round +// (the cursor must wait for bytes before the next offset is known). +// - read_batch(): all ranges submitted concurrently => the whole batch is at +// most one serial round (+1 iff any range misses). +class MeteredFileReader : public FileReader { +public: + explicit MeteredFileReader(FileReader* inner, size_t block_size = (1u << 20)); + + Status read_at(uint64_t offset, size_t len, std::vector* out) override; + Status read_batch(const std::vector& ranges, + std::vector>* outs) override; + uint64_t size() const override { return inner_->size(); } + + const IoMetrics& metrics() const { return metrics_; } + const IoMetrics* io_metrics() const override { return &metrics_; } + // Clears counters AND the resident block set, modelling a cold (cache-empty) query. + void reset_metrics(); + +private: + Status validate_range(uint64_t offset, size_t len) const; + + // Accounts the cache effect of touching [offset, offset+len): records misses, + // coalesced GETs, and remote bytes. Returns true iff at least one block missed. + bool account_blocks(uint64_t offset, size_t len); + + FileReader* inner_; + size_t block_size_; + std::unordered_set resident_; + IoMetrics metrics_; +}; + +} // namespace snii::io diff --git a/be/src/snii/io/s3_object_store.h b/be/src/snii/io/s3_object_store.h new file mode 100644 index 00000000000000..2cf2270d751bb6 --- /dev/null +++ b/be/src/snii/io/s3_object_store.h @@ -0,0 +1,122 @@ +#pragma once + +// S3 / OSS object-storage backend for snii::io. +// +// ISOLATION: the ENTIRE body of this header (and its .cpp) is guarded by +// SNII_WITH_S3. When the option is OFF the translation unit compiles to nothing +// and pulls in NO aws-sdk headers, so core stays free of any aws dependency by +// default. 
Only when CMake is configured with -DSNII_WITH_S3=ON is the macro +// defined and aws linked. +#ifdef SNII_WITH_S3 + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" + +// Forward declarations only -- aws types are pimpl'd in the .cpp so that this +// header never leaks aws-sdk includes to its consumers. +namespace Aws::S3 { +class S3Client; +} // namespace Aws::S3 + +namespace snii::io { + +// Connection / addressing parameters for an S3-compatible endpoint (tested +// against Aliyun OSS, which requires virtual-hosted addressing). +struct S3Config { + std::string endpoint; // e.g. "oss-cn-hongkong.aliyuncs.com" + std::string region; // e.g. "cn-hongkong" + std::string bucket; // e.g. "doris-community-test" + std::string prefix; // object key prefix (no trailing slash required) + std::string ak; // access key id + std::string sk; // secret access key + long connect_timeout_ms = 10000; + long request_timeout_ms = 180000; + long http_request_timeout_ms = 180000; +}; + +// Process-wide aws InitAPI / ShutdownAPI lifecycle guard. +// +// aws-sdk-cpp requires Aws::InitAPI to be called exactly once before any client +// is used and Aws::ShutdownAPI once at teardown. Construct a single +// AwsApiGuard (e.g. on the stack of main, or as a static) that lives for the +// whole duration during which S3FileReader / S3FileWriter are used. The guard is +// reference counted, so nested guards are safe; the underlying InitAPI runs only +// for the first live instance and ShutdownAPI when the last one is destroyed. +class AwsApiGuard { +public: + AwsApiGuard(); + ~AwsApiGuard(); + + AwsApiGuard(const AwsApiGuard&) = delete; + AwsApiGuard& operator=(const AwsApiGuard&) = delete; +}; + +// Read-only FileReader backed by an S3/OSS object. Range reads use a ranged +// GetObject; size() is the object length cached from a HeadObject at open(). 
+class S3FileReader : public FileReader { +public: + S3FileReader() = default; + ~S3FileReader() override; + + S3FileReader(const S3FileReader&) = delete; + S3FileReader& operator=(const S3FileReader&) = delete; + S3FileReader(S3FileReader&&) noexcept; + S3FileReader& operator=(S3FileReader&&) noexcept; + + // Opens the object (prefix + "/" + key) and caches its size via HeadObject. + static Status open(const S3Config& cfg, const std::string& key, S3FileReader* out); + + Status read_at(uint64_t offset, size_t len, std::vector* out) override; + // Concurrent batch: issues the ranges' GetObjects in parallel (bounded), so a + // planned read round costs ~one round-trip instead of the sum of all GETs. + Status read_batch(const std::vector& ranges, + std::vector>* outs) override; + uint64_t size() const override { return size_; } + +private: + std::shared_ptr client_; + std::string bucket_; + std::string object_key_; // full key (prefix + "/" + key) + uint64_t size_ = 0; +}; + +// Append-only FileWriter backed by an S3/OSS object. Appends are buffered in +// memory; finalize() flushes the whole buffer in a single PutObject. Multipart +// upload is a future optimization. +class S3FileWriter : public FileWriter { +public: + S3FileWriter() = default; + ~S3FileWriter() override; + + S3FileWriter(const S3FileWriter&) = delete; + S3FileWriter& operator=(const S3FileWriter&) = delete; + S3FileWriter(S3FileWriter&&) noexcept; + S3FileWriter& operator=(S3FileWriter&&) noexcept; + + // Opens a writer targeting object (prefix + "/" + key). 
+ Status open(const S3Config& cfg, const std::string& key); + + Status append(Slice data) override; + Status finalize() override; + uint64_t bytes_written() const override { return bytes_written_; } + +private: + std::shared_ptr client_; + std::string bucket_; + std::string object_key_; // full key (prefix + "/" + key) + std::vector buffer_; + uint64_t bytes_written_ = 0; + bool finalized_ = false; +}; + +} // namespace snii::io + +#endif // SNII_WITH_S3 diff --git a/be/src/snii/query/bm25_scorer.h b/be/src/snii/query/bm25_scorer.h new file mode 100644 index 00000000000000..85df67d3f5e1be --- /dev/null +++ b/be/src/snii/query/bm25_scorer.h @@ -0,0 +1,63 @@ +#pragma once + +#include + +// Bm25Scorer -- classic Okapi BM25 relevance scoring over SNII native stats. +// +// Per query term, idf is precomputed once from the collection statistics: +// idf = log(1 + (N - df + 0.5) / (df + 0.5)) +// where N = indexed doc count and df = the term's document frequency. The +// per-document contribution of a term then is: +// score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl)) +// where tf is the in-doc term frequency, dl the document length decoded from the +// 1-byte encoded norm, and avgdl the average document length. +// +// Norm encode/decode (DOCUMENTED CONTRACT): the writer stores doc length as a +// byte-quantized value floor-clamped to [1, 255]; decode is the identity map +// back to a double length. encode_norm(len) = clamp(len, 1, 255); +// decode_norm(b) = (b == 0 ? 1.0 : (double)b). This keeps short docs (len <= 255) +// exact and saturates longer docs at 255, matching the reference oracle. +namespace snii::query { + +// BM25 free parameters. Defaults are the classic Lucene/Elasticsearch values. +struct Bm25Params { + double k1 = 1.2; + double b = 0.75; +}; + +// Decodes a 1-byte encoded norm into a document length. byte 0 maps to 1.0 to +// avoid a zero-length divisor; otherwise it is the byte value itself. 
+double decode_norm(uint8_t encoded); + +// Encodes a document length into a 1-byte norm (clamped to [1, 255]). Provided +// so writers and test oracles share one quantization. +uint8_t encode_norm(uint64_t doc_length); + +// Per-term scoring context: the precomputed idf and the term's df. Built once per +// query term, then reused for every candidate document of that term. +class ScorerContext { +public: + // Builds the context from collection size n (indexed doc count) and the term's + // document frequency df. avgdl and params are supplied per score call. + static ScorerContext make(uint64_t n, uint64_t df); + + double idf() const { return idf_; } + uint64_t df() const { return df_; } + + // Scores one document occurrence: tf is the in-doc term frequency, encoded_norm + // the doc's 1-byte length norm, avgdl the collection average length. + double score(uint32_t tf, uint8_t encoded_norm, double avgdl, const Bm25Params& params) const; + + // Upper bound on score() over any document, given a window's maximum tf and the + // shortest doc length in the window (smallest dl maximizes the score). Used by + // the WAND-style block-max pruner. max_freq is the window's max tf; min_norm is + // the smallest encoded norm (=> smallest dl => largest score). + double max_score(uint32_t max_freq, uint8_t min_norm, double avgdl, + const Bm25Params& params) const; + +private: + double idf_ = 0.0; + uint64_t df_ = 0; +}; + +} // namespace snii::query diff --git a/be/src/snii/query/boolean_query.h b/be/src/snii/query/boolean_query.h new file mode 100644 index 00000000000000..f9cba6485eb37c --- /dev/null +++ b/be/src/snii/query/boolean_query.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// boolean_or -- MATCH_ANY semantics: return the sorted docid set containing at +// least one query term. 
Empty terms or all-absent terms produce an empty +// result. Duplicate input terms are ignored semantically and do not duplicate +// output docids. +namespace snii::query { + +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, DocIdSink* sink); + +// boolean_and (MATCH all-terms): sorted docid set of docs containing EVERY +// term, no positional constraint. Valid on docs-only indexes. Empty terms or +// any absent term -> empty result. +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); + +} // namespace snii::query diff --git a/be/src/snii/query/docid_sink.h b/be/src/snii/query/docid_sink.h new file mode 100644 index 00000000000000..9fc5dc2d9739d3 --- /dev/null +++ b/be/src/snii/query/docid_sink.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/status.h" + +namespace snii::query { + +// Bulk docid handoff for query operators. Each span is sorted ascending; callers +// that need a single vector can use VectorDocIdSink. 
+class DocIdSink { +public: + virtual ~DocIdSink() = default; + virtual Status append_sorted(std::span docids) = 0; + virtual Status append_range(uint32_t first, uint64_t last_exclusive) = 0; +}; + +class VectorDocIdSink final : public DocIdSink { +public: + explicit VectorDocIdSink(std::vector& docids) : docids_(docids) {} + + Status append_sorted(std::span docids) override { + docids_.insert(docids_.end(), docids.begin(), docids.end()); + return Status::OK(); + } + + Status append_range(uint32_t first, uint64_t last_exclusive) override { + if (last_exclusive <= first) { + return Status::OK(); + } + if (last_exclusive > static_cast(std::numeric_limits::max()) + 1) { + return Status::InvalidArgument("docid_sink: range exceeds uint32 docid space"); + } + const uint64_t count = last_exclusive - first; + if (count > static_cast(docids_.max_size() - docids_.size())) { + return Status::InvalidArgument("docid_sink: range too large"); + } + docids_.reserve(docids_.size() + static_cast(count)); + for (uint64_t docid = first; docid < last_exclusive; ++docid) { + docids_.push_back(static_cast(docid)); + } + return Status::OK(); + } + +private: + std::vector& docids_; +}; + +} // namespace snii::query diff --git a/be/src/snii/query/internal/docid_conjunction.h b/be/src/snii/query/internal/docid_conjunction.h new file mode 100644 index 00000000000000..3cb6cc42f5a294 --- /dev/null +++ b/be/src/snii/query/internal/docid_conjunction.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/dict_entry.h" +#include "snii/format/frq_prelude.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +struct ResolvedQueryTerm { + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; +}; + +struct TermPlan { + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + 
uint32_t df = 0; + size_t order = 0; + size_t frq_handle = 0; + size_t prx_handle = 0; + size_t prelude_handle = 0; + bool pod_ref = false; + bool windowed = false; + snii::format::FrqPreludeReader prelude; +}; + +struct DocidChunk { + std::vector docids; + std::vector prx_doc_ordinals; + uint32_t prx_doc_count = 0; + bool windowed = false; + uint32_t window = 0; +}; + +struct DocidSource { + std::vector chunks; + bool docids_are_final_candidates = false; +}; + +Status resolve_query_term(const snii::reader::LogicalIndexReader& idx, const std::string& term, + ResolvedQueryTerm* resolved, bool* found); + +Status plan_terms(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, snii::io::BatchRangeFetcher* fetcher, + std::vector* plans, bool* all_present, bool need_positions); + +Status plan_resolved_terms(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, + snii::io::BatchRangeFetcher* fetcher, std::vector* plans, + bool need_positions); + +Status open_preludes(const snii::io::BatchRangeFetcher& fetcher, std::vector* plans, + bool need_positions); + +Status inline_dd_region(const snii::format::DictEntry& entry, Slice* out); + +Status build_docid_only_conjunction(const snii::reader::LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates); + +Status build_docid_only_conjunction(const snii::reader::LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates, + std::vector* sources); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/docid_posting_reader.h b/be/src/snii/query/internal/docid_posting_reader.h new file mode 100644 index 00000000000000..bf5927b5857335 --- /dev/null +++ b/be/src/snii/query/internal/docid_posting_reader.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/format/dict_entry.h" +#include 
"snii/query/docid_sink.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +struct ResolvedDocidPosting { + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; +}; + +// Decodes the docid-only posting for a resolved term. The caller owns term +// lookup and can batch/plan lookups independently; this module owns only the +// three posting encodings (inline, slim pod_ref, windowed pod_ref). +Status read_docid_posting(const snii::reader::LogicalIndexReader& idx, + const snii::format::DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, std::vector* docids); + +Status read_docid_posting(const snii::reader::LogicalIndexReader& idx, + const snii::format::DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, snii::query::DocIdSink* sink); + +// Batch counterpart for multi-term docid-only operators. Windowed terms share one +// prelude fetch round and one docid fetch round, so OR-style operators pay by +// stage rather than by term. 
+Status read_docid_postings_batched(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, + std::vector>* docids); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/docid_set_ops.h b/be/src/snii/query/internal/docid_set_ops.h new file mode 100644 index 00000000000000..8aae88b90fa974 --- /dev/null +++ b/be/src/snii/query/internal/docid_set_ops.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace snii::query::internal { + +std::vector intersect_sorted(const std::vector& a, + const std::vector& b); + +void union_sorted_into(std::vector* acc, const std::vector& next); + +std::vector union_sorted_many(const std::vector>& lists); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/docid_union.h b/be/src/snii/query/internal/docid_union.h new file mode 100644 index 00000000000000..89c53f103d2343 --- /dev/null +++ b/be/src/snii/query/internal/docid_union.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +// Reads already-resolved docid postings in planned batches, merges them as a +// sorted deduplicated union, then emits one bulk span to the sink. 
+Status build_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, + std::vector* out); + +Status emit_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, DocIdSink* sink); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/position_math.h b/be/src/snii/query/internal/position_math.h new file mode 100644 index 00000000000000..04e964a67b6e7e --- /dev/null +++ b/be/src/snii/query/internal/position_math.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include +#include + +namespace snii::query::internal { + +inline bool build_position_offsets(size_t count, std::vector* out) { + if (count >= std::numeric_limits::max()) { + return false; + } + out->clear(); + out->reserve(count); + uint32_t offset = 0; + while (out->size() < count) { + out->push_back(offset); + ++offset; + } + return true; +} + +inline bool add_position_offset(uint32_t start, uint32_t offset, uint32_t* out) { + if (start > std::numeric_limits::max() - offset) return false; + *out = start + offset; + return true; +} + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/term_expansion.h b/be/src/snii/query/internal/term_expansion.h new file mode 100644 index 00000000000000..3393c31dc8457a --- /dev/null +++ b/be/src/snii/query/internal/term_expansion.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +using TermMatcher = std::function; + +// Enumerates dictionary terms from `enum_prefix`, filters them with `matches`, +// and emits the sorted docid union for matching entries. PrefixHit carries the +// DictEntry and block bases, so callers avoid a second lookup per expanded term. 
+Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx, + std::string_view enum_prefix, const TermMatcher& matches, + DocIdSink* const sink, int32_t max_expansions = 0); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/phrase_query.h b/be/src/snii/query/phrase_query.h new file mode 100644 index 00000000000000..0de44c1fdbd921 --- /dev/null +++ b/be/src/snii/query/phrase_query.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// phrase_query -- MATCH_PHRASE: return the sorted docid set in which the terms +// occur consecutively (for some start position pos, every term k appears at position pos+k in +// the same doc). It first builds the docid conjunction with docs-only posting +// reads, then fetches PRX only for chunks that can contain final candidates: +// 1. read preludes / docs-only posting ranges and intersect per-term docids; +// 2. fetch retained PRX chunks and stream positions for survivors; +// 3. for each surviving doc, check that some position p exists with +// term[0]@p, term[1]@p+1, ... term[n-1]@p+(n-1). +// An empty term list -> empty result. Any term absent -> empty result. +namespace snii::query { + +Status phrase_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status phrase_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); + +// phrase_prefix_query -- MATCH_PHRASE_PREFIX: the last item in `terms` is a +// term prefix and preceding items are exact terms. For example {"quick", "bro"} +// matches "quick brown" and "quick bronze". Empty terms -> empty result.
+Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, + std::vector* const docids, int32_t max_expansions = 0); +Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); + +} // namespace snii::query diff --git a/be/src/snii/query/prefix_query.h b/be/src/snii/query/prefix_query.h new file mode 100644 index 00000000000000..cd8dc5559f3232 --- /dev/null +++ b/be/src/snii/query/prefix_query.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// prefix_query -- MATCH_PREFIX semantics: enumerate dictionary terms with the +// requested prefix, then return the sorted docid set containing any enumerated +// term. Empty prefix enumerates all terms. No matching terms -> empty result. 
+namespace snii::query { + +Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, + std::vector* const docids, int32_t max_expansions = 0); +Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); +Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, + DocIdSink* const sink, int32_t max_expansions = 0); + +} // namespace snii::query diff --git a/be/src/snii/query/query_profile.h b/be/src/snii/query/query_profile.h new file mode 100644 index 00000000000000..a4988f6a80c8d1 --- /dev/null +++ b/be/src/snii/query/query_profile.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +#include "snii/io/io_metrics.h" + +namespace snii::io { +class FileReader; +} + +namespace snii::query { + +struct QueryProfile { + uint64_t elapsed_ns = 0; + bool has_io_metrics = false; + snii::io::IoMetrics io_before; + snii::io::IoMetrics io_after; + snii::io::IoMetrics io_delta; +}; + +class QueryProfileScope { +public: + QueryProfileScope(snii::io::FileReader* reader, QueryProfile* profile); + ~QueryProfileScope(); + QueryProfileScope(const QueryProfileScope&) = delete; + QueryProfileScope& operator=(const QueryProfileScope&) = delete; + + void finish(); + +private: + snii::io::FileReader* reader_ = nullptr; + QueryProfile* profile_ = nullptr; + std::chrono::steady_clock::time_point start_; + bool finished_ = false; +}; + +} // namespace snii::query diff --git a/be/src/snii/query/regexp_query.h b/be/src/snii/query/regexp_query.h new file mode 100644 index 00000000000000..a088ed42dcc1f8 --- /dev/null +++ b/be/src/snii/query/regexp_query.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// regexp_query -- MATCH_REGEXP semantics 
over dictionary terms. The pattern is +// evaluated with std::regex_match, so it must match the whole term. Matching +// terms are executed as a sorted deduplicated docid union. +namespace snii::query { + +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* const docids, int32_t max_expansions = 0); +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* const sink, int32_t max_expansions = 0); + +} // namespace snii::query diff --git a/be/src/snii/query/scoring_query.h b/be/src/snii/query/scoring_query.h new file mode 100644 index 00000000000000..dc2ea75f0751e7 --- /dev/null +++ b/be/src/snii/query/scoring_query.h @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/bm25_scorer.h" +#include "snii/reader/logical_index_reader.h" +#include "snii/stats/snii_stats_provider.h" + +// scoring_query -- top-K BM25 scored retrieval over one logical index for one or +// more query terms. All entry points below produce IDENTICAL rankings; the base two are: +// - scoring_query_exhaustive(): scores every candidate document (the baseline +// correctness oracle). +// - scoring_query_wand(): a block-max / WAND-style optimization that uses the +// per-window max_freq / max_norm columns from the frq_prelude to bound each +// window's best possible score and SKIP windows that cannot enter the +// current top-K. A window without block-max stats (slim/inline entries or a +// missing prelude) is never pruned, so the result still equals the +// exhaustive ranking. +// +// Results are sorted by score descending; ties are broken by ascending docid so +// the ordering is deterministic and all paths compare equal. +namespace snii::query { + +// One scored hit.
+struct ScoredDoc { + uint32_t docid = 0; + double score = 0.0; +}; + +// Exhaustive baseline: score every doc that contains any query term, return the +// top-k by score. params controls k1/b. Unknown terms are skipped. +Status scoring_query_exhaustive(const snii::reader::LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out); + +// WAND-style block-max pruning. MUST return the same top-k as the exhaustive +// path. Windows whose block-max upper bound cannot beat the current k-th score +// are skipped; windows lacking block-max stats are scored fully. +Status scoring_query_wand(const snii::reader::LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out); + +// SELECTIVE-FETCH block-max WAND (design spec section 5, "Phase C"). Same WAND / +// theta / >= tie machinery as scoring_query_wand, but it DEFERS the .frq window +// fetch: for each windowed term it first reads ONLY the frq_prelude (block-max +// columns), then fetches a term's .frq window lazily and at most once -- and ONLY +// when the running block-max bound proves a doc in that window can still reach the +// top-K (bound >= theta). A window the bound rules out is never fetched. The +// result (top-K docids AND scores, INCLUDING ties) is byte-identical to +// scoring_query_exhaustive / scoring_query_wand; only the bytes read differ. +// Slim/inline terms (no prelude) are fetched fully, exactly as today. 
+Status scoring_query_wand_selective(const snii::reader::LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out); + +} // namespace snii::query diff --git a/be/src/snii/query/term_query.h b/be/src/snii/query/term_query.h new file mode 100644 index 00000000000000..c804405a2ec104 --- /dev/null +++ b/be/src/snii/query/term_query.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// term_query -- the simplest SNII query: return the sorted docid set that +// contains term. It runs the term lookup on the logical index, then issues a +// single batched .frq range read (one serial round) to decode the postings. +// Absent term -> empty result (OK status). +namespace snii::query { + +Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, + std::vector* docids); +Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, + DocIdSink* sink); +Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, + std::vector* docids, QueryProfile* profile); + +} // namespace snii::query diff --git a/be/src/snii/query/wildcard_query.h b/be/src/snii/query/wildcard_query.h new file mode 100644 index 00000000000000..1cb0d5551dcf09 --- /dev/null +++ b/be/src/snii/query/wildcard_query.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// wildcard_query -- MATCH_WILDCARD semantics over dictionary terms. `*` matches +// any byte sequence, `?` matches one byte, and all other bytes match literally. +// Matching terms are executed as a sorted deduplicated docid union. 
+namespace snii::query { + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* const docids, int32_t max_expansions = 0); +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* const sink, int32_t max_expansions = 0); + +} // namespace snii::query diff --git a/be/src/snii/reader/logical_index_reader.h b/be/src/snii/reader/logical_index_reader.h new file mode 100644 index 00000000000000..b10a5d7c7791f5 --- /dev/null +++ b/be/src/snii/reader/logical_index_reader.h @@ -0,0 +1,129 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/bsbf.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/per_index_meta.h" +#include "snii/format/sampled_term_index.h" +#include "snii/format/stats_block.h" +#include "snii/io/file_reader.h" + +// LogicalIndexReader -- read-side counterpart of LogicalIndexWriter for one +// logical index. It owns the resident per-index meta sub-readers (XFilter, +// SampledTermIndex, DICT block directory, StatsBlock, SectionRefs) parsed from +// the per-index meta block, and resolves a query term to its DictEntry through +// the documented lookup flow: +// XFilter (reject absent) -> SampledTermIndex (candidate block ordinal) -> +// DICT block directory (block range) -> resident small-DICT block or one +// range read of the DICT block -> DictBlockReader::find_term. 
+// +// lookup() also returns the block's frq_base/prx_base (captured by the +// DictBlockReader) so callers can resolve a pod_ref entry's absolute .frq/.prx +// offsets via the writer's contract. Both deltas index into the SAME +// interleaved posting region (prx_base == frq_base; the prx span precedes the +// frq span): +// abs_frq = posting_region.offset + frq_base + entry.frq_off_delta +// abs_prx = posting_region.offset + prx_base + entry.prx_off_delta +// +// The meta block bytes must outlive this reader (they are owned by the parent +// SniiSegmentReader's resident meta region). +namespace snii::reader { + +class LogicalIndexReader { +public: + LogicalIndexReader() = default; + + // Parses the per-index meta block and binds the reader to file_reader. + // file_reader / meta_block must outlive this reader. + static Status open(snii::io::FileReader* file_reader, snii::format::IndexTier tier, + bool has_positions, Slice meta_block, LogicalIndexReader* out); + + // Resolves term to a DictEntry. *found=false when the term is absent (XFilter + // rejection, out-of-range sample, or DICT-block miss). On a hit, *entry is + // filled and *frq_base / *prx_base carry the candidate block's bases. + Status lookup(std::string_view term, bool* found, snii::format::DictEntry* entry, + uint64_t* frq_base, uint64_t* prx_base) const; + + // One enumerated term whose key has the requested prefix, with its DictEntry + // and the owning DICT block's frq/prx bases (for posting resolution). + struct PrefixHit { + std::string term; + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + }; + + using PrefixHitVisitor = std::function; + + // Ordered term enumeration: every term with `prefix`, in lexicographic order, + // by seeking the start DICT block via the SampledTermIndex and scanning + // forward across contiguous blocks until the terms pass the prefix range. + // Empty prefix enumerates all terms. 
This is the contiguous-DICT-block design + // the term-anchor layout was built for (MATCH_PHRASE_PREFIX / prefix / range + // queries). The visitor form avoids materializing all hits when callers only + // need a bounded expansion. + Status visit_prefix_terms(std::string_view prefix, const PrefixHitVisitor& visitor) const; + Status prefix_terms(std::string_view prefix, std::vector* const out, + int32_t max_terms = 0) const; + + // Resolves a pod_ref entry's absolute .frq / .prx window byte range, + // validating the locator against the posting_region length (defends against + // corrupt entries: prelude_len > frq_len underflow, or off_delta+len past the + // region). Both windows resolve against the single posting_region. *abs_off + // is the absolute file offset of the window (after prelude); *len its byte + // length. + Status resolve_frq_window(const snii::format::DictEntry& entry, uint64_t frq_base, + uint64_t* abs_off, uint64_t* len) const; + Status resolve_prx_window(const snii::format::DictEntry& entry, uint64_t prx_base, + uint64_t* abs_off, uint64_t* len) const; + + const snii::format::SectionRefs& section_refs() const { return meta_.section_refs(); } + const snii::format::StatsBlock& stats() const { return meta_.stats(); } + snii::format::IndexTier tier() const { return tier_; } + bool has_positions() const { return has_positions_; } + snii::io::FileReader* reader() const { return reader_; } + +private: + snii::io::FileReader* reader_ = nullptr; + snii::format::IndexTier tier_ = snii::format::IndexTier::kT1; + bool has_positions_ = false; + snii::format::PerIndexMetaReader meta_; + snii::format::SampledTermIndexReader sti_; + snii::format::DictBlockDirectoryReader dbd_; + snii::format::BsbfHeader bsbf_header_; // resident header (from section ref) + bool has_bsbf_ = false; + // L0 tiering: when the bsbf section is small (<= kBsbfResidentMaxBytes) its + // whole bitset is loaded here at open -> in-memory probe, no per-lookup + // round. 
Empty => L1 (on-demand single-block probe via bsbf_probe). + bool bsbf_resident_ = false; + std::vector bsbf_resident_bitset_; + + // Small DICT blocks are opened once with the index so exact lookups avoid an + // otherwise serial S3 round for the term dictionary. Empty means the + // dictionary exceeded the resident threshold and lookup/prefix enumeration + // read blocks on demand. Each DictBlockReader holds a Slice into the owning + // bytes. + struct ResidentDictBlock { + std::vector bytes; + snii::format::DictBlockReader reader; + }; + struct OnDemandDictBlock { + std::vector bytes; + snii::format::DictBlockReader reader; + }; + Status load_resident_dict_blocks(); + Status dict_block_reader_for_ordinal(uint32_t ordinal, OnDemandDictBlock* on_demand, + const snii::format::DictBlockReader** out) const; + std::vector resident_dict_blocks_; +}; + +} // namespace snii::reader diff --git a/be/src/snii/reader/snii_segment_reader.h b/be/src/snii/reader/snii_segment_reader.h new file mode 100644 index 00000000000000..fc725889a03f94 --- /dev/null +++ b/be/src/snii/reader/snii_segment_reader.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/tail_meta_region.h" +#include "snii/io/file_reader.h" +#include "snii/reader/logical_index_reader.h" + +// SniiSegmentReader -- entry point for the SNII segment read path. It opens a +// single .idx container through a (possibly metered) io::FileReader and exposes +// its logical indexes. open() performs the minimal bootstrap reads: +// 1. the fixed bootstrap header (front of the file), +// 2. the fixed tail pointer (last tail_pointer_size() bytes), and +// 3. the tail meta region (one range read located via the tail pointer). +// The meta region bytes are held resident by the reader so per-index meta blocks +// (returned as sub-views) remain valid for the reader's lifetime. 
+// +// open_index() then materializes one LogicalIndexReader from the per-index meta +// block of a given (index_id, suffix); query functions operate on that reader. +namespace snii::reader { + +class SniiSegmentReader { +public: + SniiSegmentReader() = default; + + // Reads bootstrap header + tail pointer + tail meta region from reader. + // reader must outlive the returned SniiSegmentReader and every + // LogicalIndexReader opened from it. reader == nullptr / out == nullptr -> + // InvalidArgument; structural problems -> Corruption / Unsupported. + static Status open(snii::io::FileReader* reader, SniiSegmentReader* out); + + uint32_t n_logical_indexes() const { return region_reader_.n_logical_indexes(); } + + // Loads the per-index meta block for (index_id, suffix) and builds a + // LogicalIndexReader bound to the same FileReader. Absent index -> NotFound. + Status open_index(uint64_t index_id, std::string_view suffix, LogicalIndexReader* out) const; + + snii::io::FileReader* reader() const { return reader_; } + +private: + snii::io::FileReader* reader_ = nullptr; + std::vector meta_region_; // owned resident copy of the tail meta region + snii::format::TailMetaRegionReader region_reader_; +}; + +} // namespace snii::reader diff --git a/be/src/snii/reader/windowed_posting.h b/be/src/snii/reader/windowed_posting.h new file mode 100644 index 00000000000000..e02e6e2831e05b --- /dev/null +++ b/be/src/snii/reader/windowed_posting.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/dict_entry.h" +#include "snii/format/frq_prelude.h" +#include "snii/reader/logical_index_reader.h" + +// WindowedPostingReader -- shared read-side decode of a windowed term's posting +// from its two-level frq_prelude + GROUPED dd-block / freq-block (design 1.6). 
+// +// A windowed pod_ref entry's .frq payload is laid out +// [prelude][dd-block][freq-block] +// where the dd-block concatenates every window's dd_region and the freq-block +// every window's freq_region. The docs-only prefix [prelude][dd-block] is ONE +// contiguous run. This helper: +// 1. range-fetches the prelude (prelude_len bytes) and parses the directory, +// 2. range-fetches the WHOLE dd-block in ONE contiguous range (and, for +// scoring, the whole freq-block in one more range), +// 3. decodes each window's dd region (and freq region) from the in-memory +// blocks via the prelude metadata (dd_off/dd_disk_len, +// freq_off/freq_disk_len), and concatenates the per-window docids / +// freqs / positions. +// +// The slim/inline single-window path is handled by the term/phrase/scoring +// callers directly; this helper is for enc=windowed entries only. +namespace snii::reader { + +// Coalesce gap (bytes) used when batch-fetching MULTIPLE dd sub-ranges of the +// SAME term (the phrase window-skip path): dd regions of one term are +// contiguous in the dd-block, so merging reads separated by <= this gap into +// one physical Range GET trades a little over-read for fewer remote GETs (the +// design's higher-priority metric). Only applied to same-term multi-window +// batches, never across different terms. With the current value of 0, only +// sub-ranges with no bytes between them are merged, so nothing is over-read. +inline constexpr uint64_t kSameTermCoalesceGap = 0; + +// Full decoded posting for one windowed term (docids ascending across windows). +struct DecodedPosting { + std::vector docids; + std::vector freqs; // aligned with docids + std::vector> positions; // aligned; empty when no prx +}; + +// Decodes the entire windowed posting. want_positions requires the index to +// have positions (and the entry to carry prx). 
want_freq selects whether the +// freq-block is fetched + decoded: when false ONLY the contiguous +// [prelude][dd-block] prefix is fetched (docid-only / phrase callers) and +// DecodedPosting.freqs stays empty; when true the freq-block is additionally +// fetched (scoring). Returns Corruption on any prelude/block inconsistency +// (doc-count mismatch, out-of-range offsets). +Status read_windowed_posting(const LogicalIndexReader& idx, const snii::format::DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, bool want_positions, + bool want_freq, DecodedPosting* out); + +// --- Sub-block (window) skipping helpers (shared with phrase / selective WAND) +// -- +// +// These expose the per-window dd/freq/prx addressing within the grouped blocks +// so the skip path can fetch ONLY the windows covering candidate docids (their +// dd sub-ranges within the dd-block, near-contiguous and coalesce-friendly) +// instead of the whole posting, without duplicating the offset arithmetic. + +// Absolute file byte ranges of one window's regions. dd is always valid; freq +// is valid only when want_freq; prx is valid only when want_positions (and +// has_prx). +struct WindowAbsRange { + uint64_t dd_off = 0; + uint64_t dd_len = 0; + uint64_t freq_off = 0; + uint64_t freq_len = 0; + uint64_t prx_off = 0; + uint64_t prx_len = 0; +}; + +// Fetches + parses the two-level prelude of a windowed entry (one batched +// read). +Status fetch_windowed_prelude(const LogicalIndexReader& idx, const snii::format::DictEntry& entry, + uint64_t frq_base, snii::format::FrqPreludeReader* prelude); + +// Computes the absolute file ranges of window w's dd region (and freq region +// when want_freq, and .prx window when want_positions), fully validated against +// the POD sections (anti-DoS: rejects out-of-range offsets and overflowing +// locators). 
+Status windowed_window_range(const LogicalIndexReader& idx, const snii::format::DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, + const snii::format::FrqPreludeReader& prelude, uint32_t w, + bool want_positions, bool want_freq, WindowAbsRange* out); + +// Decodes one window's docids (and per-doc positions when want_positions, and +// per-doc freqs when want_freq) from already-fetched byte slices: dd_region is +// the window's dd sub-slice; freq_region its freq sub-slice (ignored when +// !want_freq); prx_window its .prx bytes. The decoded docids are absolute +// (win_base applied). Returns Corruption on any doc-count mismatch between the +// prelude, dd/freq and prx. +Status decode_window_slices(const snii::format::WindowMeta& meta, Slice dd_region, + Slice freq_region, Slice prx_window, bool want_positions, + bool want_freq, std::vector* docids, + std::vector* freqs, + std::vector>* positions); + +} // namespace snii::reader diff --git a/be/src/snii/stats/snii_stats_provider.h b/be/src/snii/stats/snii_stats_provider.h new file mode 100644 index 00000000000000..12fdfa607bf0bd --- /dev/null +++ b/be/src/snii/stats/snii_stats_provider.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/format/norms_pod.h" +#include "snii/reader/logical_index_reader.h" + +// SniiStatsProvider -- exposes the native SNII scoring statistics required by +// BM25, sourced directly from the on-disk structures of one logical index: +// - segment-level counts (doc_count, indexed_doc_count, sum_total_term_freq) +// from the StatsBlock embedded in the per-index meta block. +// - per-term df / ttf from the term's DictEntry (resolved through the reader's +// lookup flow). The LogicalIndexWriter stores ttf directly in ttf_delta for +// tier>=T2 entries, so total_term_freq returns entry.ttf_delta. 
+// - per-doc length normalization byte (encoded_norm) from the norms POD, +// range-read once at open via section_refs().norms and parsed with +// NormsPodReader. +// +// avgdl() = sum_total_term_freq / max(1, indexed_doc_count): the average document +// length used by BM25 length normalization. The provider performs no scoring; it +// only surfaces the statistics so snii::query::Bm25Scorer can combine them. +namespace snii::stats { + +class SniiStatsProvider { +public: + SniiStatsProvider() = default; + + // Binds to idx and materializes the norms POD (one range read) when the index + // carries scoring norms. idx must outlive this provider. A scoring index + // without a norms section, or a corrupt norms POD, returns a non-OK Status. + static Status open(const snii::reader::LogicalIndexReader* idx, SniiStatsProvider* out); + + // Segment-level counts (direct StatsBlock fields). + uint64_t doc_count() const { return doc_count_; } + uint64_t indexed_doc_count() const { return indexed_doc_count_; } + uint64_t sum_total_term_freq() const { return sum_total_term_freq_; } + + // Average document length: sum_total_term_freq / max(1, indexed_doc_count). + double avgdl() const; + + // Per-term document frequency. Absent term -> *df = 0 (OK status). + Status doc_freq(std::string_view term, uint64_t* df) const; + + // Per-term total term frequency (ttf = df + ttf_delta at tier>=T2). Absent + // term -> *ttf = 0 (OK status). + // NOTE(review): the file header instead says the writer stores ttf directly in + // ttf_delta and that this returns entry.ttf_delta; the two descriptions + // disagree -- confirm which one matches the implementation. + Status total_term_freq(std::string_view term, uint64_t* ttf) const; + + // 1-byte encoded doc-length norm for docid (raw byte from the norms POD). + // Out-of-range docid -> InvalidArgument; index without norms -> InvalidArgument. 
+ Status encoded_norm(uint32_t docid, uint8_t* out) const; + + bool has_norms() const { return has_norms_; } + +private: + const snii::reader::LogicalIndexReader* idx_ = nullptr; + uint64_t doc_count_ = 0; + uint64_t indexed_doc_count_ = 0; + uint64_t sum_total_term_freq_ = 0; + bool has_norms_ = false; + // Owned copy of the framed norms section bytes; norms_reader_ borrows from it. + std::vector norms_bytes_; + snii::format::NormsPodReader norms_reader_; +}; + +} // namespace snii::stats diff --git a/be/src/snii/version.h b/be/src/snii/version.h new file mode 100644 index 00000000000000..dd2bdef2af8e3e --- /dev/null +++ b/be/src/snii/version.h @@ -0,0 +1,4 @@ +#pragma once +#define SNII_VERSION_MAJOR 0 +#define SNII_VERSION_MINOR 1 +#define SNII_VERSION_STRING "0.1.0" diff --git a/be/src/snii/writer/compact_posting_pool.h b/be/src/snii/writer/compact_posting_pool.h new file mode 100644 index 00000000000000..ceeb150faffc4f --- /dev/null +++ b/be/src/snii/writer/compact_posting_pool.h @@ -0,0 +1,180 @@ +#pragma once + +#include +#include +#include + +namespace snii::writer { + +// SEGMENTED BYTE ARENA with per-term SLICED runs (a ByteBlockPool, after Lucene). +// +// WHY: the SPIMI accumulator's bulk memory is the per-term posting bytes. Backing +// each term with its own std::vector pays two taxes that dominate peak +// RSS at scale: (1) geometric-growth doubling slack (~1.17x of the live payload), +// and (2) a 24-32 B vector/struct header per term (hundreds of thousands of +// terms). This pool removes both: all term bytes live in a few large fixed-size +// blocks (so slack is ~one block, amortized to ~1.05x), and a term needs only two +// 32-bit cursors of live state (chain head for reads + write head for appends). +// +// HOW (slices): a term's bytes are not stored contiguously. They live in a chain +// of SLICES of geometrically growing payload capacity (the kSliceSizes schedule: +// 4, 8, 16, ... bytes of payload). 
Each slice is laid out as +// [ payload bytes ... ][ 4-byte forward pointer ] +// The forward pointer holds the absolute offset of the next slice's first payload +// byte (0 while the slice is still the tail of the chain). When a slice's payload +// region fills, the writer allocates a larger slice, stores its head into the old +// slice's 4 pointer bytes, and keeps appending. A reader walks the chain by +// reading payload bytes until a slice boundary, then following the pointer. +// +// Both writer and reader recompute each slice's capacity from the chain's slice +// INDEX (0, 1, 2, ...) via the deterministic schedule, so neither needs to store +// per-slice sizes. The writer carries the current slice's end offset in its +// SliceWriter handle; the reader recomputes capacities as it advances. +// +// Offsets are GLOBAL absolute byte indices into the logical concatenation of all +// blocks: offset = block_index * kBlockSize + byte_in_block. kBlockSize is a power +// of two, so offset -> (block, byte) is a shift/mask. +class CompactPostingPool { +public: + // Block size (power of two). 32 KiB blocks keep per-block tail waste tiny (it + // matters at the smaller 1M scale where the whole arena is only tens of MiB) and + // bound the outer vector header cost; at the 5M scale a few thousand + // blocks is still cheap. Empirically the lowest peak across both scales. + static constexpr uint32_t kBlockShift = 15; + static constexpr uint32_t kBlockSize = 1u << kBlockShift; // 32 KiB + static constexpr uint32_t kBlockMask = kBlockSize - 1; + + // Per-slice forward-pointer width (absolute uint32 next-slice offset). + static constexpr uint32_t kPtrBytes = 4; + + // Geometric slice payload-capacity schedule and the level transition. Level i + // slices hold kSliceSizes[i] payload bytes; on overflow the chain advances to + // kNextLevel[i] (capping at the largest level). 
A GENTLE (~1.5x) many-level + // schedule starting small minimizes the over-allocated final slice (the + // dominant arena overhead) while keeping the per-slice forward-pointer count + // bounded for high-df chains. + static constexpr int kLevelCount = 16; + + CompactPostingPool(); + + CompactPostingPool(const CompactPostingPool&) = delete; + CompactPostingPool& operator=(const CompactPostingPool&) = delete; + + // Payload capacity (bytes) of a fresh level-0 slice. Exposed for tests that need + // to fill exactly one slice without hardcoding the schedule. + static uint32_t kSliceSizes_level0(); + + // Payload capacity of the slice at `level`, and the level a chain advances to when + // that slice overflows. Exposed (like kSliceSizes_level0) so tests can simulate the + // arena's bump allocator exactly -- e.g. to construct an EXACT block-boundary fill -- + // without hardcoding the private schedule. `level` must be in [0, kLevelCount). + static uint32_t kSliceSize_at(int level); + static uint8_t kNextLevel_at(int level); + + // Live append handle for one term's chain. POD, 8 bytes: the absolute write + // cursor and the absolute end of the current slice's payload region. The chain's + // current slice LEVEL is kept by the caller (a uint8, packed alongside its other + // flags) so this handle stays 8 bytes -- shaving the per-term accumulator. `head` + // (the chain's first payload offset) is also stored by the CALLER (the read entry + // point); start_chain returns it. + struct SliceWriter { + uint32_t cur = 0; // next byte to write (absolute) + uint32_t slice_end = 0; // one-past-last payload byte of the current slice + }; + + // Begins a fresh chain, initializing `w` to its first (level-0) slice and + // *level to 0, and returns the chain head (absolute first payload offset). 
+ uint32_t start_chain(SliceWriter* w, uint8_t* level); + + // Appends one payload byte to the chain described by `w` / `*level`, growing the + // chain with a new linked slice (and advancing *level) when the current slice's + // payload region is exhausted. + void append_byte(SliceWriter* w, uint8_t* level, uint8_t value); + + // Total live payload bytes ever written across all chains (excludes slice + // forward-pointer overhead). Drives the spill-threshold estimate only. + uint64_t payload_bytes() const { return payload_bytes_; } + + // Bytes the arena currently occupies (block_count * kBlockSize). The pool + // addresses bytes with a uint32 offset (next_offset_), so the arena MUST stay + // below 4 GiB or alloc_run wraps and silently aliases block 0. The accumulator + // watches this to force a safety spill before the wrap; alloc_run also enforces it + // directly (throws std::overflow_error on a would-be wrap) so a direct user of the + // pool fails loudly rather than silently corrupting. + // Hard invariant: a single CompactPostingPool never exceeds UINT32_MAX bytes. + uint64_t arena_bytes() const { return static_cast(blocks_.size()) << kBlockShift; } + + // Releases ALL blocks back to the OS. Called after the accumulator is fully + // drained (or before a spill's next fill) so no input-side bytes stay resident. + void reset(); + + // ---- Reader ---------------------------------------------------------------- + // Forward cursor over one term's chain, yielding its payload bytes in write + // order by walking the slice forward pointers. + // + // CONTRACT of the `budget` ctor argument (single, unambiguous meaning): + // `budget` is an UPPER BOUND on the number of bytes this cursor may yield. It + // is NOT required to equal the exact payload length: passing the exact length + // is fine, and so is passing any value >= it (the production caller passes the + // chain's write-head offset, which always bounds the payload from above). 
The + // cursor is SELF-TERMINATING: once it walks off the last written byte it sees + // the tail slice's zero forward pointer and stops, regardless of how much + // budget remains. So an over-large budget can never make next() read past the + // chain (no aliasing of block 0, no off-chain access) -- the budget is purely a + // secondary cap. has_next() is therefore a reliable "more bytes remain" + // predicate for ANY budget >= the true length: it becomes false at the smaller + // of (budget exhausted, chain tail reached). + class Cursor { + public: + Cursor(const CompactPostingPool* pool, uint32_t head, uint64_t budget); + + // True while the cursor can still yield a REAL payload byte: the budget is not + // spent AND the cursor has not reached the chain tail. It peeks the tail forward + // pointer at a slice boundary so it never reports a phantom trailing byte, making + // has_next()/next() a safe loop for any budget >= the true payload length. + bool has_next() const; + // Yields the next payload byte. Returns 0 (and yields no more) once the chain + // tail is reached or the budget is spent -- never reads past the chain. + uint8_t next(); + + private: + const CompactPostingPool* pool_; + uint32_t cur_; // absolute read cursor + uint32_t slice_end_; // one-past-last payload byte of the current slice + uint32_t level_; // current slice level + uint64_t budget_; // remaining byte budget (upper bound on bytes to yield) + }; + + // Builds a cursor over the chain at `head`. `budget` is an UPPER BOUND on bytes to + // read (see Cursor's contract): the exact payload length or anything larger. The + // production caller passes the write-head offset, which always bounds the payload + // from above; the cursor self-terminates at the chain tail regardless. 
+ Cursor cursor(uint32_t head, uint64_t budget) const { return Cursor(this, head, budget); } + +private: + static const uint32_t kSliceSizes[kLevelCount]; + static const uint8_t kNextLevel[kLevelCount]; + + uint8_t* at(uint32_t off) { return &blocks_[off >> kBlockShift][off & kBlockMask]; } + const uint8_t* at(uint32_t off) const { return &blocks_[off >> kBlockShift][off & kBlockMask]; } + + // Reads/writes the 4-byte forward pointer at the END of a slice whose payload + // region ends at `slice_end` (pointer occupies [slice_end, slice_end+4)). + uint32_t read_ptr(uint32_t slice_end) const; + void write_ptr(uint32_t slice_end, uint32_t next_head); + + // Reserves `bytes` contiguous bytes from the arena tail (a fresh block if the + // current tail cannot hold them) and returns the first reserved absolute offset. + // `bytes` must be <= kBlockSize. + uint32_t alloc_run(uint32_t bytes); + + // Allocates a slice at `level` (payload region + 4 pointer bytes), zeroes its + // forward pointer, and returns the first payload offset; sets *slice_end. 
+ uint32_t alloc_slice(int level, uint32_t* slice_end); + + std::vector> blocks_; // fixed kBlockSize blocks + uint32_t next_offset_ = 0; // global bump pointer (absolute) into the tail block + uint64_t payload_bytes_ = 0; +}; + +} // namespace snii::writer diff --git a/be/src/snii/writer/logical_index_writer.h b/be/src/snii/writer/logical_index_writer.h new file mode 100644 index 00000000000000..03fbe7994918a7 --- /dev/null +++ b/be/src/snii/writer/logical_index_writer.h @@ -0,0 +1,238 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/per_index_meta.h" +#include "snii/format/sampled_term_index.h" +#include "snii/format/stats_block.h" +#include "snii/io/file_writer.h" +#include "snii/writer/memory_reporter.h" +#include "snii/writer/spillable_byte_buffer.h" +#include "snii/writer/spimi_term_buffer.h" + +// LogicalIndexWriter -- builds the per-logical-index section bytes (interleaved +// posting region + DICT block region) and the meta sub-sections (SampledTermIndex, +// DICT block directory, StatsBlock, XFilter) for ONE logical index. It owns the +// in-memory section bytes and the metadata needed by the container orchestrator +// (SniiCompoundWriter) to resolve absolute offsets and emit the per-index meta +// block. +// +// This module deliberately produces ONLY relative bytes/structures: it has no +// knowledge of the absolute file position where the sections will land. The +// orchestrator stitches the absolute offsets in afterward (append-only, no +// seek-back). See snii_compound_writer.h for the precise offset contract. +// +// POSTING REGION (single interleaved sink): the former separate .frq POD and .prx +// POD are merged into ONE posting region. 
For each pod_ref term, in term order, the +// writer appends its prx span FIRST then its frq span, contiguously: +// posting region = concat over pod_ref terms of [prx span][frq span]. +// The prx span is empty when !has_prx (docs-only / keyword tier). INLINE terms +// append NOTHING to the posting region. +// +// Per-term encoding policy (v1): +// df >= kSlimDfThreshold (512): WINDOWED pod_ref. The term's [prx windows] are +// appended to the posting region first, then its [prelude][dd-block][freq-block] +// frq span. The DictEntry records frq/prx off_delta+len relative to +// frq_base/prx_base (see below). +// df < kSlimDfThreshold: SLIM. The postings are encoded as a single .frq +// window (and .prx window). If the encoded .frq bytes are small +// (<= kDefaultInlineThreshold), they are stored INLINE inside the DictEntry +// (kind=inline); otherwise the term's [prx][frq] spans are appended to the +// posting region as a slim pod_ref (kind=pod_ref, enc=slim, no prelude). +// +// frq_base / prx_base convention (DOCUMENTED CONTRACT): +// For each DICT block, frq_base == prx_base == the running byte offset into THIS +// index's posting region at the moment the block opens (the posting-region size +// when the block's first POD-backed entry is appended). A windowed/slim pod_ref +// entry then sets frq_off_delta = (offset of its frq span within the posting +// region) - frq_base, so the reader computes the absolute file offset as +// section_refs.posting_region.offset + frq_base + frq_off_delta. +// prx_base / prx_off_delta follow the identical rule against the SAME region. +// Because [prx][frq] are written contiguously per term, a writer-side property +// holds when has_prx: frq_off_delta == prx_off_delta + prx_len. The reader does +// NOT rely on it -- each delta is resolved independently. +// Inline entries carry no off_delta (bytes live in the entry). +namespace snii::writer { + +// Inputs describing one logical index to be written. 
+struct SniiIndexInput { + uint64_t index_id = 0; + std::string index_suffix; + snii::format::IndexConfig config = snii::format::IndexConfig::kDocsPositions; + uint32_t doc_count = 0; + std::vector null_docids; + // Per-doc 1-byte encoded norm (length doc_count); only consumed when the + // config has scoring. May be empty otherwise. + std::vector encoded_norms; + // Lexicographically sorted terms with ascending-docid postings. Used when + // `term_source` is null (callers that already hold a materialized vector, + // e.g. unit tests). The writer reads but does not copy these: + // LogicalIndexWriter keeps const-reference members for its term and norm + // vectors (presumably bound to `terms` / `encoded_norms` -- confirm in the + // constructor), so this SniiIndexInput must outlive the writer's use of them. + std::vector terms; + // Optional streaming term source. When non-null, the writer DRAINS it via + // SpimiTermBuffer::for_each_term_sorted so that only one term's postings is + // materialized at a time (avoiding the full TermPostings vector and its + // second-copy peak). `terms` is ignored when this is set. The buffer is + // consumed (emptied) by build(); the caller must keep it alive until build() + // returns and must not reuse it afterwards. + SpimiTermBuffer* term_source = nullptr; + // Target DICT block size in bytes; a block is cut once its estimate reaches + // this. 0 uses kDefaultTargetDictBlockBytes. Smaller values yield more blocks + // (and a finer-grained sampled-term index). + uint32_t target_dict_block_bytes = 0; + // Optional writer-level build-RAM reporter (one per SniiCompoundWriter = one + // segment inverted index). When non-null, the dict buffer reports its REAL + // resident-byte deltas (positive on grow, negative on spill). The SPIMI side + // (arena + slot index) reports through the SAME reporter, injected directly at + // the term_source's construction by the caller. null in bench / unit tests -> no + // reporting. NEVER report live_bytes_ (a gated estimate); report + // arena_bytes()+slot_of_+dict ram_bytes_. + MemoryReporter* mem_reporter = nullptr; +}; + +// Builds and holds the section bytes + meta sub-sections for one logical index. 
+class LogicalIndexWriter { +public: + explicit LogicalIndexWriter(const SniiIndexInput& in); + + // Builds DICT blocks, the interleaved posting region, sampled-term index, dict + // directory, stats and bsbf. The posting region is written STRAIGHT into + // `posting_out` as terms are produced (no temp round-trip for the bulk); the + // orchestrator captures its absolute offset/length from posting_out->bytes_written() + // around this call. Must be called once before the accessors below. Returns + // InvalidArgument on a null sink or inconsistent input (e.g. norms/doc_count + // mismatch when scoring is enabled, or non-ascending docids). + Status build(snii::io::FileWriter* posting_out); + + // DICT region byte length (relative; orchestrator decides its absolute offset). The + // DICT region (zstd-compressed blocks) is built into a tiered buffer during build() + // -- it must land contiguously AFTER the posting region (streamed concurrently), so + // it cannot stream directly. The buffer stays in RAM while small (spill-only build) + // and spills to a temp once it crosses the RAM cap (bounded peak RSS for a huge + // dict). Its bytes are emitted via stream_dict_region_into below. The posting region + // went straight to the output during build(), so it has no length accessor here -- + // the orchestrator measures it directly. norms stays in RAM (1 byte/doc). + uint64_t dict_region_size() const { return dict_buf_.size(); } + const std::vector& norms_bytes() const { return norms_section_; } + const std::vector& null_bitmap_bytes() const { return null_bitmap_section_; } + // Block-split bloom XFilter blob ([28B header][bitset]); empty when no terms. + const std::vector& bsbf_bytes() const { return bsbf_bytes_; } + bool has_bsbf() const { return !bsbf_bytes_.empty(); } + bool has_null_bitmap() const { return !null_bitmap_section_.empty(); } + + // Streams the DICT region (RAM or spilled temp) into the append-only container + // after its posting region. 
+ Status stream_dict_region_into(snii::io::FileWriter* out) const { + return dict_buf_.stream_into(out); + } + + bool has_prx() const { return has_prx_; } + bool has_norms() const { return has_norms_; } + snii::format::IndexTier tier() const { return tier_; } + uint64_t index_id() const { return index_id_; } + const std::string& index_suffix() const { return index_suffix_; } + + // Builds the per-index meta block bytes given the resolved ABSOLUTE section + // refs (filled by the orchestrator), appending them to out. The DICT block + // directory entries are rebased to absolute offsets using dict_region_offset. + Status finish_meta(const snii::format::SectionRefs& abs_refs, uint64_t dict_region_offset, + ByteSink* out) const; + +private: + // One DICT block's directory record. The block's serialized bytes are appended to + // the in-RAM dict buffer as soon as the block is cut; only this compact summary + // (offset within the dict region + length + entry count + checksum) is kept to + // build the DICT block directory at finish_meta time. The absolute file offset is + // computed as dict_region_offset + rel_offset. + struct BlockRecord { + uint64_t rel_offset = 0; // byte offset of this block within the dict region + uint64_t length = 0; // ON-DISK block length (compressed when flags&kZstd) + uint32_t n_entries = 0; + uint32_t checksum = 0; // crc32c of the UNCOMPRESSED block bytes + uint8_t flags = 0; // block_ref_flags::* (kZstd when block is compressed) + uint64_t uncomp_len = 0; // uncompressed block length (when flags&kZstd) + std::string first_term; + }; + + // Validates one term's shape (parallel lengths, strictly ascending docids). + Status validate_term(const TermPostings& tp) const; + // Iterates terms (from the streaming source or the materialized vector), + // splitting DICT blocks by target size and filling PODs + blocks_. 
+ Status build_blocks(); + // Per-term driver shared by both the streaming and materialized paths: + // validates the term, opens a block if needed, builds its DictEntry, and cuts + // the block once it reaches the target size. Mutates the running block state. + struct BlockState; + // `tp` is taken by mutable reference: the encode FREES the term's large flat + // arrays (docids/freqs/positions_flat) as soon as they are consumed, so the + // widest term's source does not co-exist with its encoded output at peak RSS. + Status process_term(TermPostings& tp, BlockState* st); + // Region-relative byte count of the posting bytes written so far (the offset basis + // for frq_base/prx_base + frq_off_delta/prx_off_delta). During build() the only + // writes to posting_out_ are this index's posting region, so the count is the + // output offset advanced since the region began. + // Precondition: posting_out_ must be non-null (it defaults to nullptr and is + // only borrowed for the duration of build()); this accessor dereferences it + // unconditionally. + uint64_t posting_size() const { return posting_out_->bytes_written() - posting_off0_; } + // Builds one DictEntry (inline or pod_ref), growing the posting region as needed. + Status build_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + snii::format::DictEntry* e); + // Builds a windowed (df >= kSlimDfThreshold) entry: multi-window + two-level + // prelude. The term's [prx span][frq span] is appended to the posting region. + Status build_windowed_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + snii::format::DictEntry* e); + // Builds a slim (df < kSlimDfThreshold) entry: single window, inline or + // pod_ref, no prelude. + Status build_slim_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + snii::format::DictEntry* e); + // Serializes the current open block, stages its bytes in dict_buf_ (the + // DICT-region buffer, which stays in RAM until it spills to a temp), and + // records a compact BlockRecord directory entry (no block bytes retained). 
+ Status flush_block(snii::format::DictBlockBuilder* block, std::string first_term); + + uint64_t index_id_; + std::string index_suffix_; + snii::format::IndexTier tier_; + bool has_prx_; + bool has_freq_; // tier >= T2: a freq region is encoded per window + bool has_norms_; + uint32_t doc_count_; + std::vector null_docids_; + const std::vector& terms_; // materialized fallback (may be empty) + SpimiTermBuffer* term_source_; // streaming source (null => use terms_) + uint64_t term_count_ = 0; // distinct terms actually consumed + const std::vector& encoded_norms_; + + uint32_t target_dict_block_bytes_; + // The DICT region (zstd-compressed blocks) is staged here as blocks flush. It must + // land contiguously AFTER the posting region (which streams concurrently to the + // output), so it cannot stream directly; the orchestrator streams it into the + // container right after the posting region. It has NO independent local cap -- it + // spills to a temp via the writer's UNIFIED gate-2 cap (the MemoryReporter from + // SniiIndexInput, null off-Doris), the same single cap the SPIMI arena uses, so one + // threshold bounds the writer's total build RAM. The dict self-reports its ram_bytes_ + // deltas; the SPIMI term_source self-reports its arena+slot deltas (its reporter is + // injected at the source's own construction by the caller). + SpillableByteBuffer dict_buf_; + // The interleaved [prx][frq] posting region streams STRAIGHT into the container + // output during build() -- no temp. posting_out_ is the container writer (borrowed + // for the duration of build); posting_off0_ is its absolute offset when this index's + // region began, so posting_size() = bytes_written() - posting_off0_. 
namespace snii::writer {

// Per-WRITER byte counter for build-time RAM (one per SniiCompoundWriter = one per
// segment's inverted index). Modules report their own resident-byte deltas;
// current_bytes() is the running sum of those deltas. OBSERVE-ONLY -- SNII never makes
// a flush decision from the counter itself (gate 1 belongs to Doris; gate 2 is the
// internal threshold exposed through over_cap()). consume_release mirrors every delta
// into Doris's LOAD MemTracker so the inverted-index RAM is counted by
// MemTableMemoryLimiter's pressure decision; it is null off-Doris (bench / unit
// tests), where only the local atomic is updated.
class MemoryReporter {
public:
    // Callback receiving each signed delta; null off-Doris.
    using ConsumeReleaseFn = std::function<void(int64_t)>;

    // cap_bytes is the UNIFIED gate-2 buffer cap for the WHOLE writer (e.g. Doris's
    // 512 MiB inverted-index buffer config); 0 = unlimited. Every build buffer of this
    // writer (SPIMI arena + dict) self-spills when over_cap() is true -- one threshold
    // on the unified total, not a separate per-buffer threshold.
    explicit MemoryReporter(ConsumeReleaseFn consume_release = nullptr, uint64_t cap_bytes = 0)
            : consume_release_(std::move(consume_release)), cap_bytes_(cap_bytes) {}

    MemoryReporter(const MemoryReporter&) = delete;
    MemoryReporter& operator=(const MemoryReporter&) = delete;

    // delta > 0 grows, delta < 0 shrinks/frees. Exactly one report per change site.
    // The local counter is updated first, then the delta is forwarded unchanged to the
    // consume_release callback when one was supplied.
    void report(int64_t delta) {
        current_.fetch_add(delta, std::memory_order_relaxed);
        if (consume_release_) consume_release_(delta); // mirror into Doris load tracker
    }

    // Running sum of all reported deltas (may be transiently negative if a release is
    // reported before its matching grow).
    int64_t current_bytes() const { return current_.load(std::memory_order_relaxed); }

    // True once the writer's UNIFIED total build RAM (arena + slot index + dict + ...)
    // reaches the cap. The single gate-2 trigger shared by every buffer of the writer.
    // Always false when cap_bytes == 0 (unlimited).
    //
    // The comparison is done in the UNSIGNED domain: casting cap_bytes_ to int64_t
    // would turn any cap above INT64_MAX (e.g. UINT64_MAX used as "effectively
    // unlimited") into a negative number, making the gate fire at 0 live bytes. A
    // non-positive live count can never reach a non-zero cap, so it is rejected first,
    // which also makes the int64 -> uint64 conversion below value-preserving.
    bool over_cap() const {
        if (cap_bytes_ == 0) return false;
        const int64_t live = current_bytes();
        return live > 0 && static_cast<uint64_t>(live) >= cap_bytes_;
    }

    // The configured unified cap in bytes (0 = unlimited).
    uint64_t cap_bytes() const { return cap_bytes_; }

private:
    std::atomic<int64_t> current_ {0};
    ConsumeReleaseFn consume_release_;
    uint64_t cap_bytes_ = 0;
};

} // namespace snii::writer
+// +// CONTAINER LAYOUT PRODUCED (this is the on-disk contract the reader matches): +// [bootstrap_header] (kBootstrapHeaderSize bytes) +// for each logical index, in add order: +// [posting region] interleaved [prx][frq] per pod_ref term, term order +// (prx span empty when !has_prx) +// [DICT blocks region] concatenated DICT blocks, split by +// target_dict_block_bytes +// for each logical index, in add order: +// [norms POD] NormsPodWriter::finish (scoring only; else absent) +// [null bitmap POD] NullBitmapWriter::finish (when nulls exist) +// [tail_meta_region] one per_index_meta block per index + directory +// [tail_pointer] encode_tail_pointer at EOF +// +// (The posting region is streamed BEFORE the DICT region per index: postings are +// the large append-only term-ordered stream; the DICT region is the compact +// compressed trailer.) +// +// OFFSET CONVENTIONS (ABSOLUTE file offsets unless stated otherwise): +// - SectionRefs in each per_index_meta record ABSOLUTE file offset+length of +// that index's posting_region, dict_region, norms. Absent regions are (0,0) +// (e.g. norms for a docs-positions index; null_bitmap is always (0,0) in v1). +// A present-but-empty posting_region (all-INLINE index) is (off, 0). +// - DictBlockDirectory entries record each DICT block's ABSOLUTE file offset + +// length. +// - A windowed/slim pod_ref entry's absolute .frq offset = +// section_refs.posting_region.offset + frq_base + frq_off_delta +// where frq_base is the posting-region-relative running offset captured at the +// block's open (see logical_index_writer.h). prx follows the identical rule +// against the SAME region (prx_base == frq_base). +// - tail_pointer.meta_region_offset/length point at the tail_meta_region; +// hot_off = 0 (no hot region in v1). +namespace snii::writer { + +class SniiCompoundWriter { +public: + explicit SniiCompoundWriter(snii::io::FileWriter* out); + + // Buffers one logical index: builds its section bytes and meta sub-sections. 
+ // The actual file writing happens in finish() (single front-to-back pass). + Status add_logical_index(const SniiIndexInput& in); + + // Writes bootstrap header + all index sections + norms + tail meta region + + // tail pointer, then finalizes the underlying writer. May be called once. + Status finish(); + +private: + // Absolute placement of one index's sections, resolved during finish(). + struct Placement { + uint64_t dict_off = 0; + uint64_t dict_len = 0; + uint64_t post_off = 0; // interleaved [prx][frq] posting region (was frq + prx) + uint64_t post_len = 0; + uint64_t norms_off = 0; + uint64_t norms_len = 0; + uint64_t null_off = 0; + uint64_t null_len = 0; + uint64_t bsbf_off = 0; + uint64_t bsbf_len = 0; + }; + + Status ensure_bootstrap(); + Status write_bootstrap(); + Status write_norms(); + Status write_tail(); + Status append(const std::vector& bytes); + + snii::io::FileWriter* out_; + std::vector> indexes_; + // Per-index placement; post_off/post_len are filled as each index's posting region + // streams in during add_logical_index, the rest during finish(). The absolute write + // offset is out_->bytes_written() (the single source of truth -- no separate cursor). + std::vector placements_; + bool bootstrap_written_ = false; + bool finished_ = false; +}; + +} // namespace snii::writer diff --git a/be/src/snii/writer/spill_run_codec.h b/be/src/snii/writer/spill_run_codec.h new file mode 100644 index 00000000000000..d79381aa67184f --- /dev/null +++ b/be/src/snii/writer/spill_run_codec.h @@ -0,0 +1,181 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/writer/spimi_term_buffer.h" + +namespace snii::writer { + +// On-disk SPIMI "run" codec for the spill / k-way-merge out-of-core build path. +// +// A RUN is a self-describing file holding a sequence of terms keyed by TERM-ID, +// each followed by its postings, in this exact wire layout. 
The file is produced +// and consumed by THIS module only (a private temp file -- the on-disk INDEX is +// unaffected), so the format is chosen for cheap I/O: docids, freqs and positions +// are ALL RAW fixed-width little-endian u32 BLOCKS (bulk memcpy on both ends, +// ~10x cheaper than per-value varint -- which cost ~1.5s of encode CPU over the +// 5M build's ~60M docids and compressed those streams poorly anyway). Decode +// still validates every length against the file size. +// +// run := record* (term-ids ordered by vocab string, +// strictly ascending within a run) +// record := +// VInt term_id (index into the shared vocabulary; the +// string is NOT stored -- smaller runs, +// no per-record string IO) +// VInt n_docs +// u32 docid * n_docs (RAW LE block, memcpy; ABSOLUTE ascending +// docids -- the merge concatenates across +// runs and re-deltas at index encode time) +// u32 freq * n_docs (RAW LE block, memcpy; each >= 1) +// VInt n_pos (== sum(freqs) when has_positions, else 0) +// u32 position * n_pos (RAW LE block, document-order, partitioned +// by freqs) +// +// Decode is fully STREAMED: a RunReader reads a small fixed buffer at a time and +// materializes only the CURRENT term's postings, never the whole run. The k-way +// merge keeps one heap slot per run (each holding only its current term-id + +// that term's postings), so peak memory is bounded by the widest single term +// summed across the runs that contain it -- not by total postings. The merge +// orders runs by the term-id's VOCAB STRING (resolved via the shared vocabulary) +// so the merged stream is lexicographic. + +// Writes a sorted sequence of terms (by id) to one run file. Term-ids must be +// handed to write_term in vocab-string ascending order (the spill caller sorts +// before spilling). RAII: the file is flushed and closed on close(); the partial +// file is left for the owning SpimiTermBuffer to delete on its temp-path list. 
+class RunWriter { +public: + RunWriter() = default; + ~RunWriter(); + + RunWriter(const RunWriter&) = delete; + RunWriter& operator=(const RunWriter&) = delete; + + // Opens `path` for writing (truncating). Returns IoError on failure. + Status open(const std::string& path); + + // Appends one term's postings under `term_id`. `tp.positions_flat` must be empty + // iff !has_positions (and otherwise hold sum(freqs) entries in doc order). + // Caller guarantees ascending docids and parallel docids/freqs lengths. + Status write_term(uint32_t term_id, const TermPostings& tp); + + // Flushes the buffer and closes the file. Safe to call once; idempotent. + Status close(); + +private: + Status flush(); + + int fd_ = -1; + std::vector buf_; // staging buffer; flushed in fixed-size chunks +}; + +// Streamed reader over one run file. After open() the first term is loaded; +// current()/current_id() expose it; advance() loads the next (or marks +// exhausted). Only the current term's postings live in memory at a time. The +// current record's `term` string is left EMPTY -- runs store only the id; the +// owner resolves the string via the shared vocabulary. +// +// LAZY POSITIONS (peak-RSS optimization for the widest merged term): advance() +// loads term_id / docids / freqs and the position-block COUNT, but does NOT read +// the position bytes -- it leaves the decode window cursor parked at the start of +// the position block. The owner then chooses, per term: +// * materialize_positions(): bulk-reads the block into current().positions_flat +// (the default; behaves exactly as the old eager reader). +// * stream_positions(dst, n): pulls the next n positions straight from the +// window in 64 KiB chunks, never materializing the whole block -- used by the +// k-way merge's wide-term position pump so the widest term's tens-of-MiB +// positions buffer is never resident. 
+// advance() drains any positions left unread from the previous term before the +// next record, so a partly-streamed (or skipped) term still lands at the right +// record boundary. The yielded byte sequence is identical either way. +class RunReader { +public: + RunReader() = default; + ~RunReader(); + + RunReader(const RunReader&) = delete; + RunReader& operator=(const RunReader&) = delete; + + // Opens `path`, loading the first record (if any). has_positions must match + // the writer's setting so n_pos is interpreted consistently. + Status open(const std::string& path, bool has_positions); + + bool exhausted() const { return exhausted_; } + const TermPostings& current() const { return current_; } + uint32_t current_id() const { return current_id_; } + + // Number of positions in the current term's (lazily-loaded) position block. + uint64_t current_pos_count() const { return pos_count_; } + // True once the current term's positions have been materialized OR fully + // streamed (i.e. nothing remains to read before advance()). + bool positions_drained() const { return pos_remaining_ == 0; } + + // Materializes the current term's position block into current().positions_flat + // (bulk read). Idempotent within a term: a no-op once positions are drained. + Status materialize_positions(); + // Streams the next `n` positions of the current term into dst[0..n) directly + // from the decode window (64 KiB chunks topped up on demand). Caller must not + // request more than positions_remaining(); each call advances the cursor. + Status stream_positions(uint32_t* dst, size_t n); + uint64_t positions_remaining() const { return pos_remaining_; } + + // Loads the next record into current(); sets exhausted() at end of file. Any + // positions of the current term left unread are skipped first. 
+ Status advance(); + +private: + size_t available() const; // buffered bytes from pos_ to window end + Status fill(); // tops up the decode window from disk + Status ensure(size_t n); // guarantees >= n buffered bytes (or eof) + Status read_varint(uint64_t* v); // bounds-checked streamed varint + // Bulk-reads `count` RAW little-endian u32s from the window into `out` (resized + // to count). Bounds-checked against the run's true length (Corruption on EOF). + Status read_raw_u32(size_t count, std::vector* out); + // Streams `count` raw u32s from the window into dst (caller-owned, sized by the + // caller); shared by read_raw_u32 (into a vector) and stream_positions. + Status pull_raw_u32(uint8_t* dst, size_t count); + // Drains (and discards) any remaining positions of the current term so the + // window cursor lands at the next record boundary. + Status skip_remaining_positions(); + + int fd_ = -1; + bool has_positions_ = false; + bool exhausted_ = false; + uint64_t file_size_ = 0; // total run byte size (fstat at open); bounds lengths + std::vector window_; // sliding decode window + size_t pos_ = 0; // consumed offset within window_ + bool eof_ = false; // no more bytes on disk + uint32_t current_id_ = 0; // current record's term-id + uint64_t pos_count_ = 0; // current term's total position count (from n_pos) + uint64_t pos_remaining_ = 0; // positions still unread in the current block + TermPostings current_; +}; + +// K-way merges the given run files into a single term stream ordered by the +// term-id's VOCAB STRING (lexicographic), invoking `fn` once per distinct +// term-id with its postings concatenated across all runs that contain it (in +// run order -> docids stay ascending) and its `term` resolved from `vocab`. +// Only one merged term is materialized at a time. Returns IoError/Corruption on +// bad run data. has_positions must match how the runs were written. `vocab` +// maps term-id -> string and is borrowed. 
+// +// allow_stream_positions (peak-RSS optimization): when true (the streaming-writer +// path), a WIDE merged term's positions are NOT materialized into positions_flat; +// instead the TermPostings carries a pos_pump that streams positions in document +// order straight from the run readers (which stay parked at this term's blocks +// for the duration of the fn() call). `fn` MUST therefore consume each term +// SYNCHRONOUSLY and must NOT retain the TermPostings past the call (the pump +// references live readers freed when the merge advances). Callers that retain the +// term (e.g. finalize_sorted) MUST pass false, so positions are always fully +// materialized. The produced bytes are identical either way. +Status MergeRuns(const std::vector& run_paths, const std::vector& vocab, + bool has_positions, const std::function& fn, + bool allow_stream_positions = true); + +} // namespace snii::writer diff --git a/be/src/snii/writer/spillable_byte_buffer.h b/be/src/snii/writer/spillable_byte_buffer.h new file mode 100644 index 00000000000000..0f5737e2bdd2f1 --- /dev/null +++ b/be/src/snii/writer/spillable_byte_buffer.h @@ -0,0 +1,158 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/local_file.h" +#include "snii/writer/memory_reporter.h" +#include "snii/writer/temp_dir.h" + +namespace snii::writer { + +// A tiered append buffer for one build-time section. While resident it holds the +// bytes as a CHAIN OF CHUNKS (one per append) rather than a single growing vector: +// each append owns a right-sized allocation, so there is NO geometric-doubling +// realloc transient and NO power-of-two capacity slack -- the resident cost is +// exactly the bytes appended, for any section size. 
Once the running size crosses +// `cap_bytes` the buffer SPILLS to a temp file (resolve_temp_dir()) and routes later +// appends there, so a huge section stays RSS-bounded at ~cap_bytes while a small one +// is RAM-only (zero disk, spill-only build). append order/bytes are identical +// wherever they land; stream_into() reproduces the section in order. RAII-removes the +// temp. (cap_bytes == UINT64_MAX disables spilling -> always RAM.) +class SpillableByteBuffer { +public: + // `reporter` is an OPTIONAL writer-level build-RAM reporter (null off-Doris / + // unit tests). When non-null, every change to ram_bytes_ (the RESIDENT tier) is + // mirrored to it as a signed delta: a positive delta per RAM append, and a single + // negative delta == prior ram_bytes_ when the buffer spills (the resident chunks + // are dropped and the bytes move to disk, so they must NOT be counted as RSS). + // Spilled bytes live on disk and are never reported. + SpillableByteBuffer(uint64_t cap_bytes, std::string tag, MemoryReporter* reporter = nullptr) + : cap_bytes_(cap_bytes), tag_(std::move(tag)), reporter_(reporter) {} + ~SpillableByteBuffer() { + // Balance the reporter: on the common un-spilled path the resident ram_bytes_ was + // reported as positive on append but never released, so release it now (a missed + // negative would leak into Doris's MemTracker). After a spill, spill_to_disk() + // already reported the negative and ram_bytes_ no longer counts as resident. + if (reporter_ && !spilled_ && ram_bytes_ > 0) { + reporter_->report(-static_cast(ram_bytes_)); + } + if (!temp_path_.empty()) std::remove(temp_path_.c_str()); + } + SpillableByteBuffer(const SpillableByteBuffer&) = delete; + SpillableByteBuffer& operator=(const SpillableByteBuffer&) = delete; + + // Total bytes appended so far (the offset basis for callers recording sub-offsets). + uint64_t size() const { return spilled_ ? 
spilled_bytes_ : ram_bytes_; } + + // Copying append (the Slice bytes are copied into a fresh chunk). + Status append(Slice bytes) { + if (spilled_) { + SNII_RETURN_IF_ERROR(temp_.append(bytes)); + spilled_bytes_ += bytes.size(); + return Status::OK(); + } + if (!bytes.empty()) { + chunks_.emplace_back(bytes.data(), bytes.data() + bytes.size()); + ram_bytes_ += bytes.size(); + if (reporter_) reporter_->report(static_cast(bytes.size())); + } + if (over_cap()) return spill_to_disk(); + return Status::OK(); + } + + // Move append: the section ADOPTS the caller's vector (no copy, no slack). The + // common dict path -- each flushed block is handed off by move. + Status append_move(std::vector&& v) { + if (spilled_) { + SNII_RETURN_IF_ERROR(temp_.append(Slice(v))); + spilled_bytes_ += v.size(); + return Status::OK(); + } + if (!v.empty()) { + ram_bytes_ += v.size(); + if (reporter_) reporter_->report(static_cast(v.size())); + chunks_.push_back(std::move(v)); + } + if (over_cap()) return spill_to_disk(); + return Status::OK(); + } + + // Must be called once after the last append, before stream_into(): flushes the temp + // (if spilled) so it can be read back. A no-op for a RAM-resident buffer. + Status seal() { + if (spilled_ && !sealed_) { + SNII_RETURN_IF_ERROR(temp_.finalize()); + sealed_ = true; + } + return Status::OK(); + } + + // Streams the whole section (RAM chunks or sealed temp) into `out`, in append order. 
+ Status stream_into(snii::io::FileWriter* out) const { + if (!spilled_) { + for (const auto& c : chunks_) { + if (!c.empty()) SNII_RETURN_IF_ERROR(out->append(Slice(c))); + } + return Status::OK(); + } + snii::io::LocalFileReader r; + SNII_RETURN_IF_ERROR(r.open(temp_path_)); + constexpr uint64_t kChunk = 1u << 20; // fixed copy window (no whole-section reload) + std::vector buf; + for (uint64_t off = 0; off < spilled_bytes_; off += kChunk) { + const uint64_t n = std::min(kChunk, spilled_bytes_ - off); + SNII_RETURN_IF_ERROR(r.read_at(off, n, &buf)); + SNII_RETURN_IF_ERROR(out->append(Slice(buf))); + } + return Status::OK(); + } + + bool spilled() const { return spilled_; } + +private: + // Gate-2 spill condition (UNIFIED): spill when the writer's TOTAL build RAM crosses + // the one shared cap (reporter_->over_cap()), with the local cap_bytes_ kept only as + // a defensive per-buffer hard ceiling (e.g. when no reporter is attached). + bool over_cap() const { + return (reporter_ != nullptr && reporter_->over_cap()) || ram_bytes_ >= cap_bytes_; + } + Status spill_to_disk() { + temp_path_ = resolve_temp_dir() + "/snii_" + tag_ + "_" + std::to_string(::getpid()) + "_" + + std::to_string(reinterpret_cast(this)) + ".tmp"; + SNII_RETURN_IF_ERROR(temp_.open(temp_path_)); + for (const auto& c : chunks_) { + if (!c.empty()) SNII_RETURN_IF_ERROR(temp_.append(Slice(c))); + } + spilled_bytes_ = ram_bytes_; + // The resident tier is freed: report the full negative delta == prior ram_bytes_ + // so the writer-level RAM counter (and Doris's LOAD tracker) no longer counts + // these bytes as RSS -- they now live on disk. This single negative balances the + // sum of all prior positive append deltas (net-zero RAM after spill). 
+ if (reporter_) reporter_->report(-static_cast(ram_bytes_)); + std::vector>().swap(chunks_); // reclaim the RAM immediately + spilled_ = true; + return Status::OK(); + } + + uint64_t cap_bytes_; + std::string tag_; + MemoryReporter* reporter_ = nullptr; // optional build-RAM reporter (null off-Doris) + std::vector> chunks_; // resident tier: one chunk per append + uint64_t ram_bytes_ = 0; + bool spilled_ = false; + bool sealed_ = false; + snii::io::LocalFileWriter temp_; + std::string temp_path_; + uint64_t spilled_bytes_ = 0; +}; + +} // namespace snii::writer diff --git a/be/src/snii/writer/spimi_term_buffer.h b/be/src/snii/writer/spimi_term_buffer.h new file mode 100644 index 00000000000000..d2b617ccfb4c69 --- /dev/null +++ b/be/src/snii/writer/spimi_term_buffer.h @@ -0,0 +1,362 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/writer/compact_posting_pool.h" +#include "snii/writer/memory_reporter.h" + +namespace snii::writer { + +// One term's posting list: docids ascending, with parallel freqs and (when +// positions are enabled) a single FLAT positions buffer. +// +// positions_flat holds every position for the term in document order, partitioned +// by freqs: doc i owns the next freqs[i] entries. This is the SAME layout the +// accumulator stores natively, so no per-doc vector-of-vectors is ever built on +// the build/merge hot path (that vector-of-vectors was the dominant peak-RSS +// driver for high-df terms). doc_positions(i) returns a non-owning span view of +// doc i's positions for consumers that want per-doc access (e.g. the prx window +// builder, tests). positions_flat is empty when positions are disabled. +struct TermPostings { + std::string term; + std::vector docids; + std::vector freqs; + std::vector positions_flat; // empty when positions disabled + + // OPTIONAL streamed-positions source (peak-RSS optimization for very-high-df + // terms). 
When set, positions_flat is left EMPTY and the writer pulls positions + // SEQUENTIALLY in document order via pos_pump(dst, n) -- filling `dst[0..n)` with + // the next n positions -- one window at a time, so the term's full flat positions + // buffer (tens of MiB for the widest term) is never materialized. The yielded + // bytes are byte-identical to building from positions_flat (same values, same + // order). pos_total is the total number of positions the pump will yield (== + // sum(freqs)); it lets the writer validate without a flat buffer. When pos_pump + // is null, positions come from positions_flat as before. Only the writer's prx + // builders consume this; all other consumers use positions_flat. + // + // OWNERSHIP CONTRACT (synchronous-consume-once): a streamed pos_pump captures + // references into the producer's stack and its parked run readers/arena, valid ONLY + // for the duration of the synchronous fn(TermPostings&&) call that delivered this + // TermPostings. The consumer MUST pull all positions inside fn() and MUST NOT store + // the TermPostings or invoke pos_pump after fn() returns. Callers that retain the + // TermPostings pass allow_stream_positions=false, which materializes positions into + // positions_flat instead (no pump). As a safety net, a deferred call to a streamed + // pump throws std::logic_error rather than dereferencing freed state. + std::function pos_pump; + uint64_t pos_total = 0; + + // Byte offset of doc i's first position within positions_flat (prefix sum of + // freqs). O(i) -- callers iterating all docs should track a running offset. + size_t pos_offset(size_t doc_index) const { + size_t off = 0; + for (size_t i = 0; i < doc_index; ++i) off += freqs[i]; + return off; + } + // Non-owning view of doc i's positions (length freqs[i]) into positions_flat. 
+ std::span doc_positions(size_t doc_index) const { + const size_t off = pos_offset(doc_index); + return std::span(positions_flat.data() + off, freqs[doc_index]); + } + + // Rebuilds the per-doc position lists (for callers/tests wanting per-doc access) + // from positions_flat partitioned by freqs. O(total positions); allocates. + std::vector> positions_per_doc() const { + std::vector> out(freqs.size()); + size_t off = 0; + for (size_t i = 0; i < freqs.size(); ++i) { + out[i].assign(positions_flat.begin() + off, positions_flat.begin() + off + freqs[i]); + off += freqs[i]; + } + return out; + } + + // Sets the flat positions from per-doc lists (convenience for tests / callers + // that produce per-doc positions). Does NOT touch freqs; the caller is expected + // to keep freqs[i] == per_doc[i].size() consistent (the writer validates this). + void set_positions_per_doc(const std::vector>& per_doc) { + positions_flat.clear(); + for (const auto& d : per_doc) + positions_flat.insert(positions_flat.end(), d.begin(), d.end()); + } +}; + +// In-memory SPIMI (Single-Pass In-Memory Indexing) accumulator for one logical +// index. Records term occurrences and produces lexicographically sorted terms +// with ascending-docid posting lists. +// +// TERM-ID ACCUMULATION (no per-token string work): tokens are accumulated by an +// INTEGER term-id, not by hashing/constructing a std::string per token. The +// caller supplies a VOCABULARY mapping term-id -> term string; the buffer keeps +// a DENSE std::vector indexed by term-id, so the hot add_token path is a +// vector index + a couple of pushes -- no hashing, no allocation per token. The +// vocabulary is resolved to strings only once per distinct term at finalize. +// +// Two construction modes: +// * BORROWED vocab (the fast path): pass a non-null `vocab` that the caller +// owns and keeps alive; add_token(term_id, ...) indexes straight into it. 
+// * OWNED vocab (compatibility): pass a null `vocab`; the string-keyed +// add_token(string_view, ...) interns each new term into an internal owned +// vocabulary (assigning ids in first-seen order) and forwards to the id +// path. Existing callers that feed strings keep working unchanged. +// +// SPILL / K-WAY MERGE (out-of-core, bounds input RAM): when a non-zero +// spill_threshold_bytes is set, the REAL resident accumulator size (the posting +// arena + the vocab-sized slot index, pool_.arena_bytes() + slot_of_.capacity()*4) +// is compared against the threshold as tokens arrive; once it crosses the +// threshold the buffer SORTS its current terms, +// writes a self-describing sorted RUN to a temp file, and CLEARS memory. Each +// run record is keyed by the TERM-ID (varint); the k-way merge orders runs by +// the id's VOCAB STRING so the merged stream stays lexicographic. Because +// tokens arrive in globally ascending docid order, a term that reappears in a +// later run only covers strictly-later docids, so concatenating its postings in +// run order during the final merge keeps docids ascending. for_each_term_sorted +// flushes the residual buffer as a final run, then k-way merges all runs +// materializing only ONE merged term at a time -> peak memory stays bounded by +// the threshold (plus the widest single term), NOT by total postings. With the +// default threshold 0 (unlimited) the path is exactly the in-memory behavior. +// +// Internal representation is a COMPACT TAGGED VARINT byte stream per term, held in +// a shared SEGMENTED ARENA (CompactPostingPool), NOT per-term uint32 vectors. Each +// term owns ONE arena chain holding a stream of per-TOKEN entries in arrival +// order: every token contributes varint((pos << 1) | new_doc_bit); when new_doc_bit +// is set, the token's doc differs from the previous one, so a zigzag-varint(docid - +// prev_docid) immediately follows. 
Frequencies are NOT stored -- a doc's freq is +// the count of consecutive same-doc tokens, recovered while decoding. This drops +// the entire freq stream and the second (positions) chain versus a freq/prox split, +// so the payload is ~3.4x smaller than raw uint32 docids/freqs/positions, and the +// shared arena removes per-vector doubling slack and per-term vector headers. Each +// append writes straight into the chain (no deferred per-doc flush): the only live +// per-term state is the current doc id (to detect a doc change) and the delta base. +// to_postings() decodes a term's chain back to the SAME flat TermPostings the +// writer consumes, so the produced .idx is BYTE-IDENTICAL. positions_flat stays +// empty (and pos is tagged as 0) when positions are disabled; freq still counts. +// +// Duplicate vocab strings: the vocab is assumed to map each id to a DISTINCT +// string (a dense vocabulary). If two ids share a string they sort adjacently +// but are emitted as two separate terms; callers must not rely on coalescing. +class SpimiTermBuffer { +public: + // BORROWED-vocab constructor: `vocab` maps term-id -> term string and is + // borrowed (NOT owned) -- the caller must keep it alive for the buffer's + // lifetime. add_token(term_id, ...) accumulates by id with no string work. + // spill_threshold_bytes is the gate-2 internal buffer cap (e.g. 512 MiB), + // sourced from config; == 0 means unlimited (pure in-memory, default). A + // positive value caps the REAL resident accumulator size (pool_.arena_bytes() + + // slot_of_.capacity()*4), triggering a spill when that crosses the cap -- NOT the + // old per-token estimate. + // `reporter` is the OPTIONAL writer-level build-RAM reporter (null off-Doris / + // unit tests). When non-null, the accumulator reports its REAL resident-byte + // deltas -- pool_.arena_bytes() + slot_of_.capacity()*4 -- positive on grow, + // negative on every reset/free, exactly once. 
NEVER reports an estimate: the old gated + // live_bytes_ estimate is retired and the spill threshold uses this same REAL size. + explicit SpimiTermBuffer(const std::vector* vocab, bool has_positions, + size_t spill_threshold_bytes = 0, MemoryReporter* reporter = nullptr); + + // OWNED-vocab (compatibility) constructor: no external vocab. The string-keyed + // add_token interns terms into an internal vocabulary on first occurrence. + explicit SpimiTermBuffer(bool has_positions, size_t spill_threshold_bytes = 0, + MemoryReporter* reporter = nullptr); + + ~SpimiTermBuffer(); + + SpimiTermBuffer(const SpimiTermBuffer&) = delete; + SpimiTermBuffer& operator=(const SpimiTermBuffer&) = delete; + + // Records one token by TERM-ID: term `term_id` occurs in `docid` at `pos`. + // `term_id` must be in [0, vocab_size). An out-of-range id latches an + // InvalidArgument into status() and is ignored. For a given term, docids are + // expected to arrive in non-decreasing order, and positions within a docid in + // ascending order; out-of-order docids (INCLUDING a REVISITED docid -- the same + // docid appearing again after a different one) are tolerated and reordered at + // finalize: SortByDocid stably sorts by docid and COALESCES same-docid groups + // (summing freqs, concatenating positions in document order), so the emitted + // postings have exactly ONE strictly-ascending entry per docid -- matching the + // k-way merge path and the writer's strictly-ascending precondition. + void add_token(uint32_t term_id, uint32_t docid, uint32_t pos); + + // Compatibility overload: records one token by TERM STRING. Valid ONLY on an + // OWNED-vocab buffer (constructed without an external vocab); interns `term` + // into the internal vocabulary on first occurrence, then forwards by id. Called + // on a BORROWED-vocab buffer it is REJECTED (latches InvalidArgument, token + // ignored) -- interning would grow the owned vocab out of step with the borrowed + // one and corrupt the build.
It also allocates a std::string per call, so the + // hot path is the id overload; prefer that and reserve this for tests / legacy + // string-fed callers. + void add_token(std::string_view term, uint32_t docid, uint32_t pos); + + // Number of DISTINCT terms accumulated so far (touched ids still resident). + size_t unique_terms() const; + uint64_t total_tokens() const { return total_tokens_; } + bool has_positions() const { return has_positions_; } + + // OK unless an add_token validation error (out-of-range term-id, wrong vocab + // mode) was latched. for_each_term_sorted now returns its own I/O Status + // directly; callers that use add_token's latch-and-report pattern MUST check + // this after draining to surface input-side validation errors. + [[nodiscard]] Status status() const { return spill_status_; } + + // TEST-ONLY: number of spill run files written so far (== 0 in pure in-memory + // mode). Lets tests assert that a gate-2 spill actually fired once the REAL + // resident size crossed the configured cap. Not part of the production API. + size_t run_count_for_test() const { return run_paths_.size(); } + + // Materializes all terms sorted lexicographically; each term's docids are + // ascending. Convenience wrapper around for_each_term_sorted that keeps the + // whole result alive at once. Prefer for_each_term_sorted for low peak memory. + // MUST be called at most once: it drains internal state. A SECOND drain (a + // repeat call, or a finalize_sorted after a for_each_term_sorted, or vice versa) + // returns EMPTY and latches an error into status() rather than re-emitting. + std::vector finalize_sorted(); + + // Streams terms to `fn` in lexicographic order, building ONE transient + // TermPostings at a time and freeing that term's accumulated arrays before + // moving to the next. This keeps at most a single term's postings duplicated, + // avoiding the input+output coexistence peak. MUST be called at most once: it + // drains internal state. 
A SECOND drain invokes `fn` zero times and returns + // an Internal error (a re-merge of the still-present run files would otherwise + // re-emit every term). Returns non-OK on spill/merge I/O or corruption errors, + // or if a prior add_token latched a validation error into status(). + Status for_each_term_sorted(const std::function& fn); + +private: + // Compact per-term accumulator: ONE tagged-varint arena chain plus a few cursors. + // Every token is appended immediately (no deferred flush), so the only running + // state is the current doc id and the delta base. A sentinel chain head of + // kNoChain marks a term that has not started its chain yet (so an all-empty term + // costs no arena bytes). ntok bounds the decode loop and sizes reserves. + // The declared fields sum to 23 B (~24-32 B per live term with padding). + static constexpr uint32_t kNoChain = 0xFFFFFFFFu; + struct Term { + uint32_t head = kNoChain; // chain read entry point + CompactPostingPool::SliceWriter w; // append cursor for the chain (8 B) + uint32_t ntok = 0; // total tokens (entries) in the chain + uint32_t cur_docid = 0; // most-recent doc id: detects doc change AND + // is the zigzag delta base for the next doc + uint8_t level = 0; // current slice level of w (packed here, not in w) + bool started = false; // false until the first token is appended + bool sorted = true; // false if a docid arrived out of ascending order + }; + static_assert(sizeof(CompactPostingPool::SliceWriter) == 8, + "SliceWriter must stay 8 bytes to keep Term compact"); + + // The active vocabulary (term-id -> string): either the borrowed pointer or, + // in owned mode, &owned_vocab_. Always non-null after construction. + const std::vector& vocab() const { return *vocab_; } + + // Accumulates one already-validated token into the per-id Term.
+ void accumulate(uint32_t term_id, uint32_t docid, uint32_t pos); + + // Decodes `t`'s compact chain into a TermPostings (the exact docids/freqs/ + // positions the writer consumes), sorting by docid first if `t.sorted` is false. + // When `allow_stream_positions` is true (the in-memory drain path), a large + // sorted term's positions are provided via TermPostings::pos_pump instead of a + // materialized positions_flat (peak-RSS win). The spill path passes false so the + // run codec always sees a fully-materialized positions_flat. + TermPostings to_postings(std::string term, Term&& t, bool allow_stream_positions) const; + + // Returns the touched term-ids sorted by their vocab string (lexicographic). + // Sorts by a PRECOMPUTED integer string-rank (term-id -> lexicographic rank), + // not by full std::string compare: a single std::string sort over the whole + // vocabulary is amortized across every spill, so each spill's sort is an + // integer compare instead of paying a fresh O(touched * strcmp) on every spill. + std::vector sorted_ids() const; + // Builds string_rank_ (term-id -> lexicographic rank) once, lazily. Idempotent. + void ensure_string_rank() const; + // Streams the in-memory terms in sorted order, draining the slot pool (the + // in-memory single-pass path). When `allow_stream_positions` is true, large + // sorted terms stream positions via pos_pump (valid only because the callback + // consumes each term synchronously while the arena is still resident); callers + // that RETAIN the TermPostings past the drain (finalize_sorted) must pass false. + Status drain_sorted(const std::function& fn, bool allow_stream_positions); + // Spills the current buffer to a fresh sorted run file and clears memory. + Status spill_to_run(); + // Writes all current terms (sorted) to an already-open RunWriter, draining. + Status drain_to_writer(class RunWriter* w); + // REAL resident accumulator bytes: pool_.arena_bytes() + slot_of_.capacity()*4. 
+ // The single source of truth for both the gate-2 spill trigger and the spill + // space-precheck -- replaces the old gated live_bytes_ estimate. + uint64_t resident_bytes() const; + // Reports the signed change in REAL resident bytes (pool_.arena_bytes() + + // slot_of_.capacity()*4) to mem_reporter_ since the previous call, then caches the + // new total. Single-source diff: every grow/reset/free emits EXACTLY ONE delta + // (self-balancing -> impossible to double-count or miss a negative). No-op when + // mem_reporter_ is null. + void report_arena_delta(); + // Final k-way merge over the spilled runs (+ the residual flushed as a run). + // When `allow_stream_positions` is true (the streaming for_each path), a wide + // merged term streams positions via pos_pump (valid only because fn consumes + // synchronously while the run readers stay parked); callers that RETAIN the + // TermPostings past the merge (finalize_sorted) MUST pass false. + Status merge_runs(const std::function& fn, bool allow_stream_positions); + // Deletes every temp run file; called from the destructor (RAII cleanup). + void cleanup_runs(); + // Frees a drained term's accumulator (id leaves the touched set). + void release_term(uint32_t term_id); + + const std::vector* vocab_; // active vocab (borrowed or &owned_) + std::vector owned_vocab_; // owned mode: interned term strings + // Owned mode only: term string -> term-id, for interning on first occurrence. + std::unordered_map intern_; + + bool has_positions_; + size_t spill_threshold_bytes_; // 0 => unlimited (no spilling) + uint64_t total_tokens_ = 0; + + // POOLED accumulators (replaces a dense vocab-sized std::vector, which + // cost ~80 B per vocab id even for the ~empty majority -- the single largest + // input-phase memory line). slot_of_ is the only vocab-sized array: a 4 B index + // per id (0 == no live Term; otherwise slot index + 1). 
slots_ holds ONE Term + // per CURRENTLY-LIVE id, so its size tracks the live touched count, not the + // vocabulary. On first touch an id claims a slot (reusing a freed one from + // free_slots_ when available, else appending). release_term frees the slot back + // to the pool and clears slot_of_[id]. touched_ids_ lists every live id so + // finalize/spill iterate touched ids without scanning the whole vocabulary. + // An id is present iff slot_of_[id] != 0. The hot add path is still a vector + // index + a couple of pushes: no hashing, no per-token allocation. + std::vector slot_of_; // vocab-sized: id -> slot index + 1 (0=empty) + std::vector slots_; // live Term pool (size ~ live touched count) + std::vector free_slots_; // recycled slot indices (drained terms) + std::vector touched_ids_; + size_t live_term_count_ = 0; // present (non-drained) terms; == unique_terms() + + // Shared arena backing every live term's single tagged-varint byte chain. Holds + // the bulk of the accumulator's memory in a few large blocks (no per-term vector + // headers, no per-vector doubling slack) -- the compact-RSS win. + CompactPostingPool pool_; + + // Optional writer-level build-RAM reporter (null off-Doris / unit tests) and the + // last resident-byte total it was told about. report_arena_delta() diffs the live + // total (arena_bytes() + slot_of_.capacity()*4) against reported_resident_. + MemoryReporter* mem_reporter_ = nullptr; + int64_t reported_resident_ = 0; + + // Returns the live Term for `term_id`, claiming a pool slot on first touch. + Term& term_slot(uint32_t term_id, bool* new_term); + + // Appends one byte / one varint to a term's tagged chain, lazily starting the + // chain on first use (so an untouched term costs no arena bytes).
+ void put_byte(Term* t, uint8_t b); + void put_varint(Term* t, uint64_t v); + + std::vector run_paths_; // spilled run temp files (deleted in dtor) + Status spill_status_; // first spill / range error, at finalize + bool drained_ = false; // set once finalize_sorted/for_each_term_sorted has run; + // a second drain would (spilled path) re-merge the run + // files and re-emit every term, or (in-memory path) emit + // nothing -- both wrong. Guard against the double-drain. + + // Lazily-built vocab-sized map: term-id -> its lexicographic rank among all + // vocab strings. Computed once (one full std::string sort of the vocabulary) + // on the first sorted_ids() call, then reused by every spill's id sort. mutable + // so the const sorted_ids() can fill it on demand. + mutable std::vector string_rank_; +}; + +} // namespace snii::writer diff --git a/be/src/snii/writer/temp_dir.h b/be/src/snii/writer/temp_dir.h new file mode 100644 index 00000000000000..36d51d578a5e2a --- /dev/null +++ b/be/src/snii/writer/temp_dir.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +#include +#include +#include + +namespace snii::writer { + +// Scratch directory for spill runs and section temp files. Resolution order: +// SNII_TEMP_DIR (explicit config) -> TMPDIR (POSIX default) -> /tmp (fallback). +// +// Point SNII_TEMP_DIR / TMPDIR at a REAL disk (SSD/NVMe). /tmp is often tmpfs (a +// RAM-backed filesystem) on modern systems, where spilling does NOT reduce RSS -- +// it just moves bytes from heap to tmpfs, defeating the purpose of spilling. +inline std::string resolve_temp_dir() { + for (const char* var : {"SNII_TEMP_DIR", "TMPDIR"}) { + const char* v = std::getenv(var); + if (v != nullptr && v[0] != '\0') { + std::string d(v); + while (d.size() > 1 && d.back() == '/') d.pop_back(); // strip trailing '/' + return d; + } + } + return "/tmp"; +} + +// Best-effort free bytes on the filesystem backing `dir`. 
Returns UINT64_MAX when +// statvfs fails, so a caller's space pre-check never false-positives on an +// unstattable path. CAVEATS: this is best-effort only -- it is subject to TOCTOU +// (free space can drop before/while the write runs), and on tmpfs it reports +// RAM-backed space (use the temp-dir config to avoid tmpfs in the first place). +inline uint64_t temp_dir_available_bytes(const std::string& dir) { + struct statvfs vfs; + if (::statvfs(dir.c_str(), &vfs) != 0) return UINT64_MAX; + return static_cast(vfs.f_bavail) * static_cast(vfs.f_frsize); +} + +} // namespace snii::writer diff --git a/be/src/storage/CMakeLists.txt b/be/src/storage/CMakeLists.txt index e7a82b486dbe63..3aee9b6a87bae2 100644 --- a/be/src/storage/CMakeLists.txt +++ b/be/src/storage/CMakeLists.txt @@ -28,6 +28,7 @@ file(GLOB_RECURSE SRC_FILES CONFIGURE_DEPENDS *.cpp) # files in the ann_index directory. They are compiled separately as a .a library # and linked by Storage. list(FILTER SRC_FILES EXCLUDE REGEX ".*/storage/index/ann/.*\\.cpp$") +list(FILTER SRC_FILES EXCLUDE REGEX ".*/storage/index/snii/core/src/io/s3_object_store\\.cpp$") if (ENABLE_VARIANT_NESTED_GROUP) list(REMOVE_ITEM SRC_FILES diff --git a/be/src/storage/compaction/compaction.cpp b/be/src/storage/compaction/compaction.cpp index df2fee8b1146d8..5f040fae3ac00f 100644 --- a/be/src/storage/compaction/compaction.cpp +++ b/be/src/storage/compaction/compaction.cpp @@ -1221,6 +1221,12 @@ static bool check_rowset_has_inverted_index(const RowsetSharedPtr& src_rs, int32 } void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { + if (_cur_tablet_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::SNII) { + LOG(INFO) << "tablet[" << _tablet->tablet_id() + << "] uses SNII inverted index storage format, skip CLucene index compaction"; + return; + } for (const auto& index : _cur_tablet_schema->inverted_indexes()) { auto col_unique_ids = index->col_unique_ids(); // check if column unique 
ids is empty to avoid crash diff --git a/be/src/storage/index/index_file_reader.cpp b/be/src/storage/index/index_file_reader.cpp index 348e1399421e5a..e90d642b56c57b 100644 --- a/be/src/storage/index/index_file_reader.cpp +++ b/be/src/storage/index/index_file_reader.cpp @@ -20,6 +20,8 @@ #include #include +#include "common/cast_set.h" +#include "common/config.h" #include "storage/index/inverted/inverted_index_compound_reader.h" #include "storage/index/inverted/inverted_index_fs_directory.h" #include "storage/tablet/tablet_schema.h" @@ -31,7 +33,9 @@ Status IndexFileReader::init(int32_t read_buffer_size, const io::IOContext* io_c std::unique_lock lock(_mutex); // Lock for writing if (!_inited) { _read_buffer_size = read_buffer_size; - if (_storage_format >= InvertedIndexStorageFormatPB::V2) { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + RETURN_IF_ERROR(_init_snii(io_ctx)); + } else if (_storage_format >= InvertedIndexStorageFormatPB::V2) { RETURN_IF_ERROR(_init_from(read_buffer_size, io_ctx)); } _inited = true; @@ -136,7 +140,35 @@ Status IndexFileReader::_init_from(int32_t read_buffer_size, const io::IOContext return Status::OK(); } +Status IndexFileReader::_init_snii(const io::IOContext* io_ctx) { + auto index_file_full_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); + int64_t file_size = -1; + if (_idx_file_info.has_index_size()) { + file_size = _idx_file_info.index_size(); + } + file_size = file_size == 0 ? -1 : file_size; + + io::FileReaderOptions opts; + opts.cache_type = config::enable_file_cache ? 
io::FileCachePolicy::FILE_BLOCK_CACHE + : io::FileCachePolicy::NO_CACHE; + opts.is_doris_table = true; + opts.file_size = file_size; + opts.tablet_id = _tablet_id; + io::FileReaderSPtr reader; + RETURN_IF_ERROR(_fs->open_file(index_file_full_path, &reader, &opts)); + _snii_file_reader = std::make_shared(std::move(reader)); + _snii_segment_reader = std::make_unique(); + snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(io_ctx); + RETURN_IF_ERROR(snii_doris::to_doris_status(snii::reader::SniiSegmentReader::open( + _snii_file_reader.get(), _snii_segment_reader.get()))); + return Status::OK(); +} + Result IndexFileReader::get_all_directories() { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return ResultError(Status::Error( + "SNII format does not expose CLucene directories")); + } InvertedIndexDirectoryMap res; std::shared_lock lock(_mutex); // Lock for reading for (auto& [index, _] : _indices_entries) { @@ -155,6 +187,11 @@ Result> IndexFileReader:: int64_t index_id, const std::string& index_suffix, const io::IOContext* io_ctx) const { std::unique_ptr compound_reader; + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return ResultError(Status::Error( + "SNII format does not open CLucene compound readers")); + } + if (_storage_format == InvertedIndexStorageFormatPB::V1) { auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_id, index_suffix); @@ -229,6 +266,26 @@ Result> IndexFileReader:: return compound_reader; } +Result> IndexFileReader::open_snii_index( + const TabletIndex* index_meta) const { + DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII); + std::shared_lock lock(_mutex); + if (_snii_segment_reader == nullptr) { + return ResultError(Status::Error( + "SNII index file {} is not opened", + InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix))); + } + auto logical_reader = std::make_unique(); + auto status = + 
_snii_segment_reader->open_index(cast_set(index_meta->index_id()), + index_meta->get_index_suffix(), logical_reader.get()); + auto doris_status = snii_doris::to_doris_status(status); + if (!doris_status.ok()) { + return ResultError(doris_status); + } + return logical_reader; +} + Result> IndexFileReader::open( const TabletIndex* index_meta, const io::IOContext* io_ctx) const { auto index_id = index_meta->index_id(); @@ -254,6 +311,28 @@ Status IndexFileReader::index_file_exist(const TabletIndex* index_meta, bool* re auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_meta->index_id(), index_meta->get_index_suffix()); return _fs->exists(index_file_path, res); + } else if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); + RETURN_IF_ERROR(_fs->exists(index_file_path, res)); + if (!*res) { + return Status::OK(); + } + std::shared_lock lock(_mutex); + if (_snii_segment_reader == nullptr) { + *res = false; + return Status::OK(); + } + auto logical_reader = std::make_unique(); + auto status = _snii_segment_reader->open_index(cast_set(index_meta->index_id()), + index_meta->get_index_suffix(), + logical_reader.get()); + if (status.code() == snii::StatusCode::kNotFound) { + *res = false; + return Status::OK(); + } + RETURN_IF_ERROR(snii_doris::to_doris_status(status)); + *res = true; + return Status::OK(); } else { std::shared_lock lock(_mutex); // Lock for reading if (_stream == nullptr) { @@ -279,6 +358,11 @@ Status IndexFileReader::has_null(const TabletIndex* index_meta, bool* res) const *res = true; return Status::OK(); } + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + auto logical_reader = DORIS_TRY(open_snii_index(index_meta)); + *res = logical_reader->section_refs().null_bitmap.length > 0; + return Status::OK(); + } std::shared_lock lock(_mutex); // Lock for reading if (_stream == nullptr) { return 
Status::Error( diff --git a/be/src/storage/index/index_file_reader.h b/be/src/storage/index/index_file_reader.h index fb4ec2b9a62fe3..896c8bd51745ff 100644 --- a/be/src/storage/index/index_file_reader.h +++ b/be/src/storage/index/index_file_reader.h @@ -33,8 +33,11 @@ #include "common/be_mock_util.h" #include "common/config.h" #include "io/fs/file_system.h" +#include "snii/reader/logical_index_reader.h" +#include "snii/reader/snii_segment_reader.h" #include "storage/index/index_file_writer.h" #include "storage/index/inverted/inverted_index_desc.h" +#include "storage/index/snii/snii_doris_adapter.h" namespace doris { class TabletIndex; @@ -60,7 +63,7 @@ class IndexFileReader { : _fs(std::move(fs)), _index_path_prefix(std::move(index_path_prefix)), _storage_format(storage_format), - _idx_file_info(idx_file_info), + _idx_file_info(std::move(idx_file_info)), _tablet_id(tablet_id) {} virtual ~IndexFileReader() = default; @@ -68,6 +71,8 @@ class IndexFileReader { const io::IOContext* io_ctx = nullptr); MOCK_FUNCTION Result> open( const TabletIndex* index_meta, const io::IOContext* io_ctx = nullptr) const; + Result> open_snii_index( + const TabletIndex* index_meta) const; void debug_file_entries(); std::string get_index_file_cache_key(const TabletIndex* index_meta) const; std::string get_index_file_path(const TabletIndex* index_meta) const; @@ -75,12 +80,19 @@ class IndexFileReader { Status has_null(const TabletIndex* index_meta, bool* res) const; Result get_all_directories(); // open file v2, init _stream - int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 : _stream->length(); } + int64_t get_inverted_file_size() const { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return _snii_file_reader == nullptr ? 0 : _snii_file_reader->size(); + } + return _stream == nullptr ? 
0 : _stream->length(); + } const std::string& get_index_path_prefix() const { return _index_path_prefix; } + InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } friend IndexFileWriter; protected: Status _init_from(int32_t read_buffer_size, const io::IOContext* io_ctx); + Status _init_snii(const io::IOContext* io_ctx); Result> _open( int64_t index_id, const std::string& index_suffix, const io::IOContext* io_ctx = nullptr) const; @@ -88,6 +100,8 @@ class IndexFileReader { private: IndicesEntriesMap _indices_entries; std::unique_ptr _stream = nullptr; + std::shared_ptr _snii_file_reader; + std::unique_ptr _snii_segment_reader; const io::FileSystemSPtr _fs; std::string _index_path_prefix; int32_t _read_buffer_size = -1; @@ -99,4 +113,4 @@ class IndexFileReader { }; } // namespace segment_v2 -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/storage/index/index_file_writer.cpp b/be/src/storage/index/index_file_writer.cpp index afd09c84620bb5..665cb185d4aae7 100644 --- a/be/src/storage/index/index_file_writer.cpp +++ b/be/src/storage/index/index_file_writer.cpp @@ -22,6 +22,7 @@ #include #include +#include "common/cast_set.h" #include "common/status.h" #include "io/fs/packed_file_writer.h" #include "io/fs/s3_file_writer.h" @@ -34,6 +35,7 @@ #include "storage/index/inverted/inverted_index_desc.h" #include "storage/index/inverted/inverted_index_fs_directory.h" #include "storage/index/inverted/inverted_index_reader.h" +#include "storage/index/snii/snii_doris_adapter.h" #include "storage/tablet/tablet_schema.h" namespace doris::segment_v2 { @@ -56,7 +58,7 @@ IndexFileWriter::IndexFileWriter(io::FileSystemSPtr fs, std::string index_path_p _tmp_dir = tmp_file_dir.native(); if (_storage_format == InvertedIndexStorageFormatPB::V1) { _index_storage_format = std::make_unique(this); - } else { + } else if (_storage_format != InvertedIndexStorageFormatPB::SNII) { _index_storage_format = std::make_unique(this); 
} } @@ -84,6 +86,10 @@ Status IndexFileWriter::_insert_directory_into_map(int64_t index_id, } Result> IndexFileWriter::open(const TabletIndex* index_meta) { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return ResultError(Status::Error( + "SNII format does not open CLucene directories")); + } auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( _tmp_dir, _rowset_id, _seg_id, index_meta->index_id(), index_meta->get_index_suffix()); auto dir = std::shared_ptr(DorisFSDirectoryFactory::getDirectory( @@ -97,6 +103,43 @@ Result> IndexFileWriter::open(const TabletInde return dir; } +Status IndexFileWriter::add_snii_index(const TabletIndex* index_meta, uint32_t doc_count, + std::vector null_docids, + snii::writer::SpimiTermBuffer* const term_buffer, + snii::format::IndexConfig index_config, + snii::writer::MemoryReporter* const mem_reporter) { + DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII); + DCHECK(index_meta != nullptr); + DCHECK(term_buffer != nullptr); + if (_idx_v2_writer == nullptr) { + return Status::Error( + "SNII index file writer is null for {}", _index_path_prefix); + } + if (_snii_file_writer == nullptr) { + _snii_file_writer = std::make_unique(_idx_v2_writer.get()); + _snii_compound_writer = + std::make_unique(_snii_file_writer.get()); + } + + snii::writer::SniiIndexInput input; + input.index_id = cast_set(index_meta->index_id()); + input.index_suffix = index_meta->get_index_suffix(); + input.config = index_config; + input.doc_count = doc_count; + input.null_docids = std::move(null_docids); + input.term_source = term_buffer; + input.mem_reporter = mem_reporter; + RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->add_logical_index(input))); + ++_snii_index_count; + return Status::OK(); +} + +void IndexFileWriter::retain_snii_memory_reporter( + std::unique_ptr mem_reporter) { + DCHECK(mem_reporter != nullptr); + _snii_memory_reporters.push_back(std::move(mem_reporter)); +} + Status 
IndexFileWriter::delete_index(const TabletIndex* index_meta) { DBUG_EXECUTE_IF("IndexFileWriter::delete_index_index_meta_nullptr", { index_meta = nullptr; }); if (!index_meta) { @@ -123,6 +166,9 @@ Status IndexFileWriter::delete_index(const TabletIndex* index_meta) { } Status IndexFileWriter::add_into_searcher_cache() { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return Status::OK(); + } auto index_file_reader = std::make_unique( _fs, _index_path_prefix, _storage_format, InvertedIndexFileInfo(), _tablet_id); auto st = index_file_reader->init(); @@ -196,6 +242,21 @@ Result> IndexFileWriter::_construct_index_ Status IndexFileWriter::begin_close() { DCHECK(!_closed) << debug_string(); _closed = true; + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + if (_snii_compound_writer == nullptr) { + if (_idx_v2_writer == nullptr) { + return Status::OK(); + } + _snii_file_writer = + std::make_unique(_idx_v2_writer.get()); + _snii_compound_writer = + std::make_unique(_snii_file_writer.get()); + } + RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->finish())); + _total_file_size = _idx_v2_writer == nullptr ? 
0 : _idx_v2_writer->bytes_appended(); + _file_info.set_index_size(_total_file_size); + return Status::OK(); + } if (_indices_dirs.empty()) { // An empty file must still be created even if there are no indexes to write if (dynamic_cast(_idx_v2_writer.get()) != nullptr || @@ -238,6 +299,12 @@ Status IndexFileWriter::begin_close() { Status IndexFileWriter::finish_close() { DCHECK(_closed) << debug_string(); + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + if (_idx_v2_writer != nullptr && _idx_v2_writer->state() != io::FileWriter::State::CLOSED) { + RETURN_IF_ERROR(_idx_v2_writer->close(false)); + } + return Status::OK(); + } if (_indices_dirs.empty()) { // An empty file must still be created even if there are no indexes to write if (dynamic_cast(_idx_v2_writer.get()) != nullptr || diff --git a/be/src/storage/index/index_file_writer.h b/be/src/storage/index/index_file_writer.h index a303de8b68c156..7f16d19cb90e74 100644 --- a/be/src/storage/index/index_file_writer.h +++ b/be/src/storage/index/index_file_writer.h @@ -24,21 +24,34 @@ #include #include +#include #include "common/be_mock_util.h" #include "io/fs/file_system.h" #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" +#include "snii/format/format_constants.h" +#include "snii/writer/snii_compound_writer.h" #include "storage/index/index_storage_format.h" #include "storage/index/inverted/inverted_index_common.h" #include "storage/index/inverted/inverted_index_compound_reader.h" #include "storage/index/inverted/inverted_index_searcher.h" +#include "storage/index/snii/snii_doris_adapter.h" + +namespace snii::writer { +class MemoryReporter; +class SpimiTermBuffer; +class SniiCompoundWriter; +} // namespace snii::writer namespace doris { class TabletIndex; namespace segment_v2 { class DorisFSDirectory; +namespace snii_doris { +class DorisSniiFileWriter; +} // namespace snii_doris using InvertedIndexDirectoryMap = std::map, std::shared_ptr>; @@ -55,6 +68,12 @@ class IndexFileWriter { 
virtual ~IndexFileWriter() = default; MOCK_FUNCTION Result> open(const TabletIndex* index_meta); + Status add_snii_index(const TabletIndex* index_meta, uint32_t doc_count, + std::vector null_docids, + snii::writer::SpimiTermBuffer* const term_buffer, + snii::format::IndexConfig config, + snii::writer::MemoryReporter* const mem_reporter); + void retain_snii_memory_reporter(std::unique_ptr mem_reporter); Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); Status add_into_searcher_cache(); @@ -113,6 +132,10 @@ class IndexFileWriter { IndexStorageFormatPtr _index_storage_format; int64_t _tablet_id = -1; + std::unique_ptr _snii_file_writer; + std::vector> _snii_memory_reporters; + std::unique_ptr _snii_compound_writer; + size_t _snii_index_count = 0; friend class IndexStorageFormatV1; friend class IndexStorageFormatV2; diff --git a/be/src/storage/index/index_writer.cpp b/be/src/storage/index/index_writer.cpp index 2325d280471337..6fb23c3c107e51 100644 --- a/be/src/storage/index/index_writer.cpp +++ b/be/src/storage/index/index_writer.cpp @@ -18,6 +18,7 @@ #include "common/exception.h" #include "storage/index/ann/ann_index_writer.h" #include "storage/index/inverted/inverted_index_writer.h" +#include "storage/index/snii/snii_index_writer.h" #include "storage/tablet/tablet_schema.h" #include "storage/types.h" @@ -80,6 +81,22 @@ Status IndexColumnWriter::create(const TabletColumn* column, } } + if (storage_format == InvertedIndexStorageFormatPB::SNII) { + if (!is_string_type(type)) { + return Status::Error( + "SNII inverted index storage format does not support BKD index type {}", + type); + } + *res = std::make_unique(index_file_writer, index_meta, + single_field); + auto st = (*res)->init(); + if (!st.ok()) { + (*res)->close_on_error(); + return st; + } + return Status::OK(); + } + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_unsupported_type_for_inverted_index", { type = FieldType::OLAP_FIELD_TYPE_JSONB; 
}) switch (type) { diff --git a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp index e65025b25a4fc7..d03cdf38a9abf1 100644 --- a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp +++ b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp @@ -179,16 +179,15 @@ void DorisFSDirectory::FSIndexInput::close() { } void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) { + const bool is_index_data = _io_ctx.is_index_data; if (io_ctx) { const auto& ctx = static_cast(io_ctx); - _io_ctx.reader_type = ctx->reader_type; - _io_ctx.query_id = ctx->query_id; - _io_ctx.file_cache_stats = ctx->file_cache_stats; + _io_ctx = *ctx; } else { - _io_ctx.reader_type = ReaderType::UNKNOWN; - _io_ctx.query_id = nullptr; - _io_ctx.file_cache_stats = nullptr; + _io_ctx = io::IOContext {}; } + _io_ctx.is_index_data = is_index_data; + _io_ctx.is_inverted_index = true; } const void* DorisFSDirectory::FSIndexInput::getIoContext() { @@ -247,6 +246,10 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) if (_io_ctx.file_cache_stats != nullptr) { _io_ctx.file_cache_stats->inverted_index_io_timer += inverted_index_io_timer; + _io_ctx.file_cache_stats->inverted_index_request_bytes += len; + _io_ctx.file_cache_stats->inverted_index_read_bytes += len; + ++_io_ctx.file_cache_stats->inverted_index_range_read_count; + ++_io_ctx.file_cache_stats->inverted_index_serial_read_rounds; } } diff --git a/be/src/storage/index/inverted/inverted_index_reader.h b/be/src/storage/index/inverted/inverted_index_reader.h index 0e2f6a120d41e3..a2aa0533f2bf7b 100644 --- a/be/src/storage/index/inverted/inverted_index_reader.h +++ b/be/src/storage/index/inverted/inverted_index_reader.h @@ -230,9 +230,9 @@ class InvertedIndexReader : public IndexReader { const Field& query_value, InvertedIndexQueryType query_type, size_t* count) = 0; - Status read_null_bitmap(const 
IndexQueryContextPtr& context, - InvertedIndexQueryCacheHandle* cache_handle, - lucene::store::Directory* dir = nullptr); + virtual Status read_null_bitmap(const IndexQueryContextPtr& context, + InvertedIndexQueryCacheHandle* cache_handle, + lucene::store::Directory* dir = nullptr); virtual InvertedIndexReaderType type() = 0; @@ -335,7 +335,6 @@ class InvertedIndexVisitor : public lucene::util::bkd::bkd_reader::intersect_vis std::string query_min; std::string query_max; -public: InvertedIndexVisitor(const void* io_ctx, lucene::util::bkd::bkd_reader* r, roaring::Roaring* hits, bool only_count = false); ~InvertedIndexVisitor() override = default; diff --git a/be/src/storage/index/snii/core/src/common/status.cpp b/be/src/storage/index/snii/core/src/common/status.cpp new file mode 100644 index 00000000000000..d8f66b4a68cd98 --- /dev/null +++ b/be/src/storage/index/snii/core/src/common/status.cpp @@ -0,0 +1,24 @@ +#include "snii/common/status.h" + +#include +#include + +namespace snii { +namespace { + +// Name table in the same order as the StatusCode enum, to avoid a long switch chain in to_string. 
+constexpr std::array kCodeNames = { + "OK", "Corruption", "NotFound", "InvalidArgument", "IoError", "Unsupported", "Internal"}; + +} // namespace + +std::string Status::to_string() const { + std::string out = kCodeNames[static_cast(code_)]; + if (!message_.empty()) { + out += ": "; + out += message_; + } + return out; +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp b/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp new file mode 100644 index 00000000000000..fc5c70d6b5569d --- /dev/null +++ b/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp @@ -0,0 +1,39 @@ +#include "snii/encoding/byte_sink.h" + +#include "snii/encoding/varint.h" + +namespace snii { + +void ByteSink::put_fixed16(uint16_t v) { + for (int i = 0; i < 2; ++i) buf_.push_back(static_cast(v >> (8 * i))); +} + +void ByteSink::put_fixed32(uint32_t v) { + for (int i = 0; i < 4; ++i) buf_.push_back(static_cast(v >> (8 * i))); +} + +void ByteSink::put_fixed64(uint64_t v) { + for (int i = 0; i < 8; ++i) buf_.push_back(static_cast(v >> (8 * i))); +} + +void ByteSink::put_varint32(uint32_t v) { + uint8_t tmp[5]; + size_t n = encode_varint32(v, tmp); + buf_.insert(buf_.end(), tmp, tmp + n); +} + +void ByteSink::put_varint64(uint64_t v) { + uint8_t tmp[10]; + size_t n = encode_varint64(v, tmp); + buf_.insert(buf_.end(), tmp, tmp + n); +} + +void ByteSink::put_zigzag(int64_t v) { + put_varint64(zigzag_encode(v)); +} + +void ByteSink::put_bytes(Slice s) { + buf_.insert(buf_.end(), s.data(), s.data() + s.size()); +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/byte_source.cpp b/be/src/storage/index/snii/core/src/encoding/byte_source.cpp new file mode 100644 index 00000000000000..d75d4945ff7f9d --- /dev/null +++ b/be/src/storage/index/snii/core/src/encoding/byte_source.cpp @@ -0,0 +1,70 @@ +#include "snii/encoding/byte_source.h" + +#include "snii/encoding/varint.h" + +namespace snii { + +Status 
ByteSource::get_u8(uint8_t* v) { + if (remaining() < 1) return Status::Corruption("get_u8 overrun"); + *v = s_[pos_++]; + return Status::OK(); +} + +Status ByteSource::get_fixed16(uint16_t* v) { + if (remaining() < 2) return Status::Corruption("get_fixed16 overrun"); + uint16_t r = 0; + for (int i = 0; i < 2; ++i) r |= static_cast(s_[pos_ + i]) << (8 * i); + pos_ += 2; + *v = r; + return Status::OK(); +} + +Status ByteSource::get_fixed32(uint32_t* v) { + if (remaining() < 4) return Status::Corruption("get_fixed32 overrun"); + uint32_t r = 0; + for (int i = 0; i < 4; ++i) r |= static_cast(s_[pos_ + i]) << (8 * i); + pos_ += 4; + *v = r; + return Status::OK(); +} + +Status ByteSource::get_fixed64(uint64_t* v) { + if (remaining() < 8) return Status::Corruption("get_fixed64 overrun"); + uint64_t r = 0; + for (int i = 0; i < 8; ++i) r |= static_cast(s_[pos_ + i]) << (8 * i); + pos_ += 8; + *v = r; + return Status::OK(); +} + +Status ByteSource::get_varint64(uint64_t* v) { + const uint8_t* p = s_.data() + pos_; + const uint8_t* next = nullptr; + SNII_RETURN_IF_ERROR(decode_varint64(p, s_.data() + s_.size(), v, &next)); + pos_ = static_cast(next - s_.data()); + return Status::OK(); +} + +Status ByteSource::get_varint32(uint32_t* v) { + uint64_t tmp; + SNII_RETURN_IF_ERROR(get_varint64(&tmp)); + if (tmp > 0xFFFFFFFFu) return Status::Corruption("varint32 overflow"); + *v = static_cast(tmp); + return Status::OK(); +} + +Status ByteSource::get_zigzag(int64_t* v) { + uint64_t tmp; + SNII_RETURN_IF_ERROR(get_varint64(&tmp)); + *v = zigzag_decode(tmp); + return Status::OK(); +} + +Status ByteSource::get_bytes(size_t n, Slice* out) { + if (remaining() < n) return Status::Corruption("get_bytes overrun"); + *out = s_.subslice(pos_, n); + pos_ += n; + return Status::OK(); +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/crc32c.cpp b/be/src/storage/index/snii/core/src/encoding/crc32c.cpp new file mode 100644 index 00000000000000..811ef86a697152 --- 
/dev/null +++ b/be/src/storage/index/snii/core/src/encoding/crc32c.cpp @@ -0,0 +1,111 @@ +#include "snii/encoding/crc32c.h" + +#include +#include +#include + +#if defined(__x86_64__) || defined(_M_X64) +#define SNII_CRC32C_X86 1 +#include +#include // _mm_crc32_u8/u32/u64 (SSE4.2) +#endif + +namespace snii { +namespace { + +// Bit-reflected Castagnoli polynomial (CRC32C / iSCSI). +constexpr uint32_t kPoly = 0x82F63B78u; + +// Builds the slice-by-8 lookup tables. Column 0 is the classic byte table; each +// successive column folds in one more byte of look-ahead, letting the inner loop +// consume 8 bytes per iteration with 8 table reads + XORs instead of 8 dependent +// shift/lookup steps. The checksum value is identical to the byte-at-a-time loop. +std::array, 8> make_slice8_table() { + std::array, 8> t {}; + for (uint32_t i = 0; i < 256; ++i) { + uint32_t c = i; + for (int k = 0; k < 8; ++k) c = (c & 1) ? (kPoly ^ (c >> 1)) : (c >> 1); + t[0][i] = c; + } + for (uint32_t i = 0; i < 256; ++i) { + uint32_t c = t[0][i]; + for (int s = 1; s < 8; ++s) { + c = t[0][c & 0xFF] ^ (c >> 8); + t[s][i] = c; + } + } + return t; +} + +const std::array, 8> kSlice8 = make_slice8_table(); + +inline uint32_t load_le32(const uint8_t* p) { + return static_cast(p[0]) | (static_cast(p[1]) << 8) | + (static_cast(p[2]) << 16) | (static_cast(p[3]) << 24); +} + +// Pure software slice-by-8 (used as the portable path and the hardware fallback). 
+uint32_t crc32c_slice8(uint32_t crc, const uint8_t* p, size_t n) { + while (n >= 8) { + crc ^= load_le32(p); + const uint32_t hi = load_le32(p + 4); + crc = kSlice8[7][crc & 0xFF] ^ kSlice8[6][(crc >> 8) & 0xFF] ^ + kSlice8[5][(crc >> 16) & 0xFF] ^ kSlice8[4][crc >> 24] ^ kSlice8[3][hi & 0xFF] ^ + kSlice8[2][(hi >> 8) & 0xFF] ^ kSlice8[1][(hi >> 16) & 0xFF] ^ kSlice8[0][hi >> 24]; + p += 8; + n -= 8; + } + while (n--) { + crc = kSlice8[0][(crc ^ *p++) & 0xFF] ^ (crc >> 8); + } + return crc; +} + +#if SNII_CRC32C_X86 +// Hardware CRC32C via the SSE4.2 crc32 instruction. The intrinsics operate on the +// same bit-reflected Castagnoli polynomial as the tables, so the result is +// byte-identical. This TU is compiled without -msse4.2, so gate the intrinsics +// behind a function-level target attribute and a runtime CPUID check. +__attribute__((target("sse4.2"))) uint32_t crc32c_hw(uint32_t crc, const uint8_t* p, size_t n) { + while (n >= 8) { + uint64_t v; + std::memcpy(&v, p, sizeof(v)); // unaligned-safe; x86 folds to a plain load + crc = static_cast(_mm_crc32_u64(crc, v)); + p += 8; + n -= 8; + } + if (n >= 4) { + crc = _mm_crc32_u32(crc, load_le32(p)); + p += 4; + n -= 4; + } + while (n--) crc = _mm_crc32_u8(crc, *p++); + return crc; +} + +bool detect_sse42() { + unsigned eax = 0, ebx = 0, ecx = 0, edx = 0; + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false; + return (ecx & bit_SSE4_2) != 0; +} + +const bool kHasSse42 = detect_sse42(); +#endif + +} // namespace + +uint32_t crc32c_extend(uint32_t crc, Slice data) { + const uint8_t* p = data.data(); + const size_t n = data.size(); + crc = ~crc; +#if SNII_CRC32C_X86 + if (kHasSse42) { + crc = crc32c_hw(crc, p, n); + return ~crc; + } +#endif + crc = crc32c_slice8(crc, p, n); + return ~crc; +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/pfor.cpp b/be/src/storage/index/snii/core/src/encoding/pfor.cpp new file mode 100644 index 00000000000000..5cdf8fdb57f9d6 --- /dev/null +++ 
b/be/src/storage/index/snii/core/src/encoding/pfor.cpp @@ -0,0 +1,360 @@ +#include "snii/encoding/pfor.h" + +#include +#include +#include +#include + +#include "snii/common/slice.h" + +namespace snii { +namespace { + +// Unaligned little-endian 64-bit load from a raw byte pointer (single +// instruction on x86; memcpy is the portable, UB-free spelling the compiler +// folds to a mov). +inline uint64_t load_u64_le(const uint8_t* p) { + uint64_t v; + std::memcpy(&v, p, sizeof(v)); +#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + v = __builtin_bswap64(v); +#endif + return v; +} + +uint8_t bits_for(uint32_t v) { + uint8_t b = 0; + while (v) { + ++b; + v >>= 1; + } + return b; +} + +// Choose the bit_width that minimizes total bytes (packed + exceptions). +// Exception cost estimated at ~6 bytes each. +uint8_t choose_width(const uint32_t* v, size_t n) { + uint8_t maxw = 0; + for (size_t i = 0; i < n; ++i) { + maxw = std::max(maxw, bits_for(v[i])); + } + uint8_t best = maxw; + size_t best_cost = SIZE_MAX; + for (uint8_t w = 0; w <= maxw; ++w) { + size_t exc = 0; + for (size_t i = 0; i < n; ++i) { + if (bits_for(v[i]) > w) { + ++exc; + } + } + size_t cost = (static_cast(w) * n + 7) / 8 + exc * 6; + if (cost < best_cost) { + best_cost = cost; + best = w; + } + } + return best; +} + +uint32_t low_mask(uint8_t w) { + return (w >= 32) ? 
0xFFFFFFFFU : ((1U << w) - 1U); +} + +void bitpack(const uint32_t* v, size_t n, uint8_t w, ByteSink* out) { + if (w == 0) { + return; + } + uint64_t acc = 0; + int filled = 0; + for (size_t i = 0; i < n; ++i) { + acc |= static_cast(v[i] & low_mask(w)) << filled; + filled += w; + while (filled >= 8) { + out->put_u8(static_cast(acc)); + acc >>= 8; + filled -= 8; + } + } + if (filled > 0) { + out->put_u8(static_cast(acc)); + } +} + +void bitunpack_tail(const uint8_t* base, size_t packed, size_t n, uint8_t w, size_t i, + uint64_t mask, uint32_t* out) { + for (; i < n; ++i) { + const size_t bit_off = static_cast(w) * i; + const size_t byte_off = bit_off >> 3; + uint64_t word = 0; + for (size_t b = byte_off; b < packed && b < byte_off + 8; ++b) { + word |= static_cast(base[b]) << ((b - byte_off) * 8); + } + out[i] = static_cast((word >> (bit_off & 7)) & mask); + } +} + +void bitunpack_w1(const uint8_t* base, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 1U; + out[i + 1] = (v >> 1) & 1U; + out[i + 2] = (v >> 2) & 1U; + out[i + 3] = (v >> 3) & 1U; + out[i + 4] = (v >> 4) & 1U; + out[i + 5] = (v >> 5) & 1U; + out[i + 6] = (v >> 6) & 1U; + out[i + 7] = (v >> 7) & 1U; + } + if (i < n) { + const uint8_t v = base[byte]; + for (uint8_t bit = 0; i < n; ++i, ++bit) { + out[i] = (v >> bit) & 1U; + } + } +} + +void bitunpack_w2(const uint8_t* base, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 4 <= n; i += 4, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 3U; + out[i + 1] = (v >> 2) & 3U; + out[i + 2] = (v >> 4) & 3U; + out[i + 3] = (v >> 6) & 3U; + } + if (i < n) { + const uint8_t v = base[byte]; + for (uint8_t shift = 0; i < n; ++i, shift += 2) { + out[i] = (v >> shift) & 3U; + } + } +} + +void bitunpack_w3(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, byte += 3) 
{ + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + out[i] = b0 & 7U; + out[i + 1] = (b0 >> 3) & 7U; + out[i + 2] = ((b0 >> 6) | (b1 << 2)) & 7U; + out[i + 3] = (b1 >> 1) & 7U; + out[i + 4] = (b1 >> 4) & 7U; + out[i + 5] = ((b1 >> 7) | (b2 << 1)) & 7U; + out[i + 6] = (b2 >> 2) & 7U; + out[i + 7] = (b2 >> 5) & 7U; + } + bitunpack_tail(base, packed, n, 3, i, 7U, out); +} + +void bitunpack_w4(const uint8_t* base, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 2 <= n; i += 2, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 15U; + out[i + 1] = (v >> 4) & 15U; + } + if (i < n) { + out[i] = base[byte] & 15U; + } +} + +void bitunpack_w5(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, byte += 5) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + const uint32_t b3 = base[byte + 3]; + const uint32_t b4 = base[byte + 4]; + out[i] = b0 & 31U; + out[i + 1] = ((b0 >> 5) | (b1 << 3)) & 31U; + out[i + 2] = (b1 >> 2) & 31U; + out[i + 3] = ((b1 >> 7) | (b2 << 1)) & 31U; + out[i + 4] = ((b2 >> 4) | (b3 << 4)) & 31U; + out[i + 5] = (b3 >> 1) & 31U; + out[i + 6] = ((b3 >> 6) | (b4 << 2)) & 31U; + out[i + 7] = (b4 >> 3) & 31U; + } + bitunpack_tail(base, packed, n, 5, i, 31U, out); +} + +void bitunpack_w6(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 4 <= n; i += 4, byte += 3) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + out[i] = b0 & 63U; + out[i + 1] = ((b0 >> 6) | (b1 << 2)) & 63U; + out[i + 2] = ((b1 >> 4) | (b2 << 4)) & 63U; + out[i + 3] = (b2 >> 2) & 63U; + } + bitunpack_tail(base, packed, n, 6, i, 63U, out); +} + +void bitunpack_w7(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + 
size_t byte = 0; + for (; i + 8 <= n; i += 8, byte += 7) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + const uint32_t b3 = base[byte + 3]; + const uint32_t b4 = base[byte + 4]; + const uint32_t b5 = base[byte + 5]; + const uint32_t b6 = base[byte + 6]; + out[i] = b0 & 127U; + out[i + 1] = ((b0 >> 7) | (b1 << 1)) & 127U; + out[i + 2] = ((b1 >> 6) | (b2 << 2)) & 127U; + out[i + 3] = ((b2 >> 5) | (b3 << 3)) & 127U; + out[i + 4] = ((b3 >> 4) | (b4 << 4)) & 127U; + out[i + 5] = ((b4 >> 3) | (b5 << 5)) & 127U; + out[i + 6] = ((b5 >> 2) | (b6 << 6)) & 127U; + out[i + 7] = (b6 >> 1) & 127U; + } + bitunpack_tail(base, packed, n, 7, i, 127U, out); +} + +void bitunpack_w8(const uint8_t* base, size_t n, uint32_t* out) { + for (size_t i = 0; i < n; ++i) { + out[i] = base[i]; + } +} + +void bitunpack_generic(const uint8_t* base, size_t packed, size_t n, uint8_t w, uint32_t* out) { + const uint64_t mask = low_mask(w); + size_t i = 0; + if (packed >= 8) { + const size_t last_safe_byte = packed - 8; + for (; i < n; ++i) { + const size_t bit_off = static_cast(w) * i; + const size_t byte_off = bit_off >> 3; + if (byte_off > last_safe_byte) { + break; + } + out[i] = static_cast((load_u64_le(base + byte_off) >> (bit_off & 7)) & mask); + } + } + bitunpack_tail(base, packed, n, w, i, mask, out); +} + +Status bitunpack(ByteSource* src, size_t n, uint8_t w, uint32_t* out) { + if (w == 0) { + std::memset(out, 0, n * sizeof(uint32_t)); + return Status::OK(); + } + // Pull the packed run once and unpack from the contiguous slice; this keeps + // the hot decode path free of per-byte ByteSource calls. 
+ const size_t packed = (static_cast(w) * n + 7) / 8; + Slice buf; + SNII_RETURN_IF_ERROR(src->get_bytes(packed, &buf)); + const uint8_t* base = buf.data(); + + switch (w) { + case 1: + bitunpack_w1(base, n, out); + break; + case 2: + bitunpack_w2(base, n, out); + break; + case 3: + bitunpack_w3(base, packed, n, out); + break; + case 4: + bitunpack_w4(base, n, out); + break; + case 5: + bitunpack_w5(base, packed, n, out); + break; + case 6: + bitunpack_w6(base, packed, n, out); + break; + case 7: + bitunpack_w7(base, packed, n, out); + break; + case 8: + bitunpack_w8(base, n, out); + break; + default: + bitunpack_generic(base, packed, n, w, out); + break; + } + return Status::OK(); +} + +} // namespace + +void pfor_encode(const uint32_t* values, size_t n, ByteSink* out) { + uint8_t w = choose_width(values, n); + std::vector> exc; // (index, full value) + std::vector low(values, values + n); + for (size_t i = 0; i < n; ++i) { + if (bits_for(values[i]) > w) { + exc.emplace_back(static_cast(i), values[i]); + low[i] = 0; // Write 0 as placeholder at exception position; true value + // stored in exception table + } + } + out->put_u8(w); + out->put_varint32(static_cast(exc.size())); + bitpack(low.data(), n, w, out); + uint32_t prev = 0; + for (const auto& e : exc) { + out->put_varint32(e.first - prev); + out->put_varint32(e.second); + prev = e.first; + } +} + +Status pfor_decode(ByteSource* src, size_t n, uint32_t* out) { + uint8_t w; + SNII_RETURN_IF_ERROR(src->get_u8(&w)); + uint32_t n_exc; + SNII_RETURN_IF_ERROR(src->get_varint32(&n_exc)); + SNII_RETURN_IF_ERROR(bitunpack(src, n, w, out)); + uint32_t idx = 0; + for (uint32_t i = 0; i < n_exc; ++i) { + uint32_t d, val; + SNII_RETURN_IF_ERROR(src->get_varint32(&d)); + SNII_RETURN_IF_ERROR(src->get_varint32(&val)); + idx += d; + if (idx >= n) { + return Status::Corruption("pfor exception index out of range"); + } + out[idx] = val; + } + return Status::OK(); +} + +Status pfor_skip(ByteSource* src, size_t n) { + uint8_t w 
= 0; + SNII_RETURN_IF_ERROR(src->get_u8(&w)); + uint32_t n_exc = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&n_exc)); + const size_t packed = (static_cast(w) * n + 7) / 8; + Slice unused; + SNII_RETURN_IF_ERROR(src->get_bytes(packed, &unused)); + uint32_t idx = 0; + for (uint32_t i = 0; i < n_exc; ++i) { + uint32_t d = 0; + uint32_t val = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&d)); + SNII_RETURN_IF_ERROR(src->get_varint32(&val)); + idx += d; + if (idx >= n) { + return Status::Corruption("pfor exception index out of range"); + } + } + return Status::OK(); +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/section_framer.cpp b/be/src/storage/index/snii/core/src/encoding/section_framer.cpp new file mode 100644 index 00000000000000..99d086c79e705c --- /dev/null +++ b/be/src/storage/index/snii/core/src/encoding/section_framer.cpp @@ -0,0 +1,37 @@ +#include "snii/encoding/section_framer.h" + +#include "snii/encoding/crc32c.h" + +namespace snii { + +void SectionFramer::write(ByteSink& sink, uint8_t section_type, Slice payload) { + // Assemble type+len+payload in a temporary sink, compute crc over the whole thing, then write it all out. 
+ ByteSink framed; + framed.put_u8(section_type); + framed.put_varint64(payload.size()); + framed.put_bytes(payload); + uint32_t crc = crc32c(framed.view()); + sink.put_bytes(framed.view()); + sink.put_fixed32(crc); +} + +Status SectionFramer::read(ByteSource& src, FramedSection* out) { + size_t start = src.position(); + uint8_t type; + SNII_RETURN_IF_ERROR(src.get_u8(&type)); + uint64_t len; + SNII_RETURN_IF_ERROR(src.get_varint64(&len)); + Slice payload; + SNII_RETURN_IF_ERROR(src.get_bytes(static_cast(len), &payload)); + size_t framed_len = src.position() - start; + uint32_t stored; + SNII_RETURN_IF_ERROR(src.get_fixed32(&stored)); + if (crc32c(src.slice_from(start, framed_len)) != stored) { + return Status::Corruption("section crc mismatch"); + } + out->type = type; + out->payload = payload; + return Status::OK(); +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/varint.cpp b/be/src/storage/index/snii/core/src/encoding/varint.cpp new file mode 100644 index 00000000000000..12877f972cb089 --- /dev/null +++ b/be/src/storage/index/snii/core/src/encoding/varint.cpp @@ -0,0 +1,53 @@ +#include "snii/encoding/varint.h" + +namespace snii { + +size_t varint_len(uint64_t v) { + size_t n = 1; + while (v >= 0x80) { + v >>= 7; + ++n; + } + return n; +} + +size_t encode_varint64(uint64_t v, uint8_t* out) { + size_t i = 0; + while (v >= 0x80) { + out[i++] = static_cast(v) | 0x80; + v >>= 7; + } + out[i++] = static_cast(v); + return i; +} + +size_t encode_varint32(uint32_t v, uint8_t* out) { + return encode_varint64(v, out); +} + +Status decode_varint64(const uint8_t* p, const uint8_t* end, uint64_t* v, const uint8_t** next) { + uint64_t result = 0; + int shift = 0; + while (p < end) { + uint8_t b = *p++; + result |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) { + *v = result; + *next = p; + return Status::OK(); + } + shift += 7; + if (shift >= 64) return Status::Corruption("varint64 overflow"); + } + return Status::Corruption("varint 
truncated"); +} + +Status decode_varint32(const uint8_t* p, const uint8_t* end, uint32_t* v, const uint8_t** next) { + uint64_t tmp; + SNII_RETURN_IF_ERROR(decode_varint64(p, end, &tmp, next)); + if (tmp > 0xFFFFFFFFu) return Status::Corruption("varint32 overflow"); + *v = static_cast(tmp); + return Status::OK(); +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp b/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp new file mode 100644 index 00000000000000..abb01981d63450 --- /dev/null +++ b/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp @@ -0,0 +1,32 @@ +#include "snii/encoding/zstd_codec.h" + +#include + +#include + +namespace snii { + +Status zstd_compress(Slice input, int level, std::vector* out) { + size_t bound = ZSTD_compressBound(input.size()); + out->resize(bound); + size_t n = ZSTD_compress(out->data(), bound, input.data(), input.size(), level); + if (ZSTD_isError(n)) { + return Status::Internal(std::string("zstd compress: ") + ZSTD_getErrorName(n)); + } + out->resize(n); + return Status::OK(); +} + +Status zstd_decompress(Slice input, size_t expected_uncomp_len, std::vector* out) { + out->resize(expected_uncomp_len); + size_t n = ZSTD_decompress(out->data(), expected_uncomp_len, input.data(), input.size()); + if (ZSTD_isError(n)) { + return Status::Corruption(std::string("zstd decompress: ") + ZSTD_getErrorName(n)); + } + if (n != expected_uncomp_len) { + return Status::Corruption("zstd decompressed length mismatch"); + } + return Status::OK(); +} + +} // namespace snii diff --git a/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp b/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp new file mode 100644 index 00000000000000..e65c4817d1c6dc --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp @@ -0,0 +1,91 @@ +#include "snii/format/bootstrap_header.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" + 
+namespace snii::format { + +namespace { + +// Number of bytes covered by header_checksum: everything except the trailing +// crc32c. +constexpr size_t kChecksumCoverage = kBootstrapHeaderSize - 4; + +// Writes all fixed fields except the trailing checksum. Field order is the +// on-disk contract; reuse ByteSink fixed-width primitives, never hand-assemble +// bytes. +void encode_fields(const BootstrapHeader& header, ByteSink* sink) { + sink->put_fixed32(header.magic); + sink->put_fixed32((static_cast(header.min_reader_version) << 16) | + header.format_version); + sink->put_fixed32(header.flags); + sink->put_fixed32(kBootstrapHeaderSize); // header_length is always derived + sink->put_u8(header.tail_pointer_size); +} + +} // namespace + +Status encode_bootstrap_header(const BootstrapHeader& header, ByteSink* sink) { + if (sink == nullptr) { + return Status::InvalidArgument("bootstrap_header: null sink"); + } + ByteSink fields; + encode_fields(header, &fields); + const uint32_t checksum = crc32c(fields.view()); + sink->put_bytes(fields.view()); + sink->put_fixed32(checksum); + return Status::OK(); +} + +Status decode_bootstrap_header(Slice data, BootstrapHeader* out) { + if (out == nullptr) { + return Status::InvalidArgument("bootstrap_header: null out"); + } + // Reject any size other than the exact fixed header: short input is + // truncation, longer input means stray trailing bytes the parser would + // otherwise ignore. 
+ if (data.size() != kBootstrapHeaderSize) { + return Status::Corruption("bootstrap_header: wrong header size"); + } + + ByteSource src(data); + uint32_t magic = 0; + uint32_t version_pair = 0; + uint32_t flags = 0; + uint32_t header_length = 0; + uint8_t tail_pointer_size = 0; + uint32_t stored_checksum = 0; + SNII_RETURN_IF_ERROR(src.get_fixed32(&magic)); + SNII_RETURN_IF_ERROR(src.get_fixed32(&version_pair)); + SNII_RETURN_IF_ERROR(src.get_fixed32(&flags)); + SNII_RETURN_IF_ERROR(src.get_fixed32(&header_length)); + SNII_RETURN_IF_ERROR(src.get_u8(&tail_pointer_size)); + SNII_RETURN_IF_ERROR(src.get_fixed32(&stored_checksum)); + + if (magic != kContainerMagic) { + return Status::Corruption("bootstrap_header: bad container magic"); + } + const uint32_t computed = crc32c(data.subslice(0, kChecksumCoverage)); + if (computed != stored_checksum) { + return Status::Corruption("bootstrap_header: checksum mismatch"); + } + + const auto min_reader_version = static_cast((version_pair >> 16) & 0xFFFFu); + const auto format_version = static_cast(version_pair & 0xFFFFu); + if (format_version != kFormatVersion) { + return Status::Unsupported("bootstrap_header: unsupported container format_version"); + } + if (min_reader_version > kFormatVersion) { + return Status::Unsupported("bootstrap_header: container requires a newer reader version"); + } + + out->magic = magic; + out->format_version = format_version; + out->min_reader_version = min_reader_version; + out->flags = flags; + out->header_length = header_length; + out->tail_pointer_size = tail_pointer_size; + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/bsbf.cpp b/be/src/storage/index/snii/core/src/format/bsbf.cpp new file mode 100644 index 00000000000000..adfe5e445c2dce --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/bsbf.cpp @@ -0,0 +1,218 @@ +#include "snii/format/bsbf.h" + +#include + +#include "snii/encoding/crc32c.h" + +#if defined(__x86_64__) 
|| defined(_M_X64) +#include +#define SNII_BSBF_X86 1 +#endif + +#define XXH_INLINE_ALL +#include "xxhash.h" + +namespace snii::format { + +const uint32_t kBsbfSalt[kBsbfBitsSetPerBlock] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, + 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, + 0x9efc4947U, 0x5c6bfb31U}; + +namespace { + +void store_le32(uint8_t* p, uint32_t v) { + p[0] = static_cast(v); + p[1] = static_cast(v >> 8); + p[2] = static_cast(v >> 16); + p[3] = static_cast(v >> 24); +} +uint32_t load_le32(const uint8_t* p) { + return static_cast(p[0]) | (static_cast(p[1]) << 8) | + (static_cast(p[2]) << 16) | (static_cast(p[3]) << 24); +} + +bool cpu_has_avx2() { +#if defined(SNII_BSBF_X86) + static const bool v = __builtin_cpu_supports("avx2"); + return v; +#else + return false; +#endif +} + +// --- scalar kernels --- +inline void masks_scalar(uint32_t key, uint32_t m[8]) { + for (int i = 0; i < 8; ++i) m[i] = 1u << ((key * kBsbfSalt[i]) >> 27); +} +bool block_contains_scalar(uint64_t hash, const uint8_t* block) { + const uint32_t* w = reinterpret_cast(block); // LE + uint32_t m[8]; + masks_scalar(static_cast(hash), m); + for (int i = 0; i < 8; ++i) + if ((load_le32(reinterpret_cast(w + i)) & m[i]) != m[i]) return false; + return true; +} +void insert_scalar(uint32_t* words, uint32_t block, uint32_t key) { + uint32_t m[8]; + masks_scalar(key, m); + for (int i = 0; i < 8; ++i) words[block * 8 + i] |= m[i]; +} +bool find_scalar(const uint32_t* words, uint32_t block, uint32_t key) { + uint32_t m[8]; + masks_scalar(key, m); + for (int i = 0; i < 8; ++i) + if ((words[block * 8 + i] & m[i]) != m[i]) return false; + return true; +} + +#if defined(SNII_BSBF_X86) +// --- AVX2 kernels: a 256-bit block is one YMM register --- +__attribute__((target("avx2"))) __m256i mask_avx2(uint32_t key) { + const __m256i salt = + _mm256_setr_epi32(static_cast(kBsbfSalt[0]), static_cast(kBsbfSalt[1]), + static_cast(kBsbfSalt[2]), static_cast(kBsbfSalt[3]), + static_cast(kBsbfSalt[4]), 
static_cast(kBsbfSalt[5]), + static_cast(kBsbfSalt[6]), static_cast(kBsbfSalt[7])); + const __m256i prod = _mm256_mullo_epi32(_mm256_set1_epi32(static_cast(key)), salt); + const __m256i shifts = _mm256_srli_epi32(prod, 27); // top 5 bits -> 0..31 + return _mm256_sllv_epi32(_mm256_set1_epi32(1), shifts); +} +__attribute__((target("avx2"))) bool block_contains_avx2(uint64_t hash, const uint8_t* block) { + const __m256i m = mask_avx2(static_cast(hash)); + const __m256i b = _mm256_loadu_si256(reinterpret_cast(block)); + return _mm256_testc_si256(b, m) != 0; // (~b & m) == 0 -> b contains m +} +__attribute__((target("avx2"))) void insert_avx2(uint32_t* words, uint32_t block, uint32_t key) { + __m256i* p = reinterpret_cast<__m256i*>(words + block * 8); + _mm256_storeu_si256(p, _mm256_or_si256(_mm256_loadu_si256(p), mask_avx2(key))); +} +__attribute__((target("avx2"))) bool find_avx2(const uint32_t* words, uint32_t block, + uint32_t key) { + const __m256i m = mask_avx2(key); + const __m256i b = _mm256_loadu_si256(reinterpret_cast(words + block * 8)); + return _mm256_testc_si256(b, m) != 0; +} +#endif + +} // namespace + +uint64_t bsbf_hash(std::string_view term) { + return XXH64(term.data(), term.size(), /*seed=*/0); +} + +uint32_t bsbf_optimal_num_bytes(uint32_t ndv, double fpp) { + // Parquet OptimalNumOfBits, then >>3 for bytes. 
+ const double m = -8.0 * ndv / std::log(1 - std::pow(fpp, 1.0 / 8)); + uint32_t num_bits; + if (m < 0 || m > static_cast(kBsbfMaxBytes) * 8) { + num_bits = kBsbfMaxBytes << 3; + } else { + num_bits = static_cast(m); + } + if (num_bits < (kBsbfMinBytes << 3)) num_bits = kBsbfMinBytes << 3; + if (num_bits & (num_bits - 1)) { // next power of 2 + uint32_t p = 1; + while (p < num_bits) p <<= 1; + num_bits = p; + } + if (num_bits > (kBsbfMaxBytes << 3)) num_bits = kBsbfMaxBytes << 3; + return num_bits >> 3; +} + +bool bsbf_block_contains(uint64_t hash, const uint8_t block[kBsbfBytesPerBlock]) { +#if defined(SNII_BSBF_X86) + if (cpu_has_avx2()) return block_contains_avx2(hash, block); +#endif + return block_contains_scalar(hash, block); +} + +Status BsbfBuilder::create(uint32_t ndv, double fpp, BsbfBuilder* out) { + if (out == nullptr) return Status::InvalidArgument("bsbf: null out"); + if (!(fpp > 0.0 && fpp < 1.0)) return Status::InvalidArgument("bsbf: fpp out of (0,1)"); + if (ndv == 0) ndv = 1; + out->num_bytes_ = bsbf_optimal_num_bytes(ndv, fpp); + out->num_blocks_ = out->num_bytes_ / kBsbfBytesPerBlock; + out->ndv_ = ndv; + out->words_.assign(out->num_bytes_ / 4, 0u); + return Status::OK(); +} + +void BsbfBuilder::insert(uint64_t hash) { + const uint32_t block = bsbf_block_index(hash, num_blocks_); + const uint32_t key = static_cast(hash); +#if defined(SNII_BSBF_X86) + if (cpu_has_avx2()) { + insert_avx2(words_.data(), block, key); + return; + } +#endif + insert_scalar(words_.data(), block, key); +} + +bool BsbfBuilder::maybe_contains(uint64_t hash) const { + const uint32_t block = bsbf_block_index(hash, num_blocks_); + const uint32_t key = static_cast(hash); +#if defined(SNII_BSBF_X86) + if (cpu_has_avx2()) return find_avx2(words_.data(), block, key); +#endif + return find_scalar(words_.data(), block, key); +} + +Status BsbfBuilder::serialize(ByteSink* sink) const { + if (sink == nullptr) return Status::InvalidArgument("bsbf: null sink"); + if (num_bytes_ == 0) 
return Status::InvalidArgument("bsbf: not built"); + uint8_t hdr[kBsbfHeaderSize] = {0}; + hdr[0] = 'B'; + hdr[1] = 'S'; + hdr[2] = 'B'; + hdr[3] = 'F'; + hdr[4] = 1; // version + hdr[5] = 0; // hash strategy: XXH64 seed 0 + hdr[6] = 0; // index strategy: fastrange + hdr[7] = 0; // pad + store_le32(hdr + 8, num_bytes_); + store_le32(hdr + 12, num_blocks_); + store_le32(hdr + 16, ndv_); + store_le32(hdr + 20, crc32c(Slice(hdr, 20))); // header crc over [0,20) + const uint8_t* bits = reinterpret_cast(words_.data()); + store_le32(hdr + 24, crc32c(Slice(bits, num_bytes_))); // bitset crc + sink->put_bytes(Slice(hdr, kBsbfHeaderSize)); + sink->put_bytes(Slice(bits, num_bytes_)); // contiguous, uncompressed, LE + return Status::OK(); +} + +Status BsbfHeader::parse(Slice h, uint64_t section_base, BsbfHeader* out) { + if (out == nullptr) return Status::InvalidArgument("bsbf: null out"); + if (h.size() < kBsbfHeaderSize) return Status::Corruption("bsbf: short header"); + const uint8_t* p = h.data(); + if (p[0] != 'B' || p[1] != 'S' || p[2] != 'B' || p[3] != 'F') + return Status::Corruption("bsbf: bad magic"); + if (p[4] != 1) return Status::Corruption("bsbf: bad version"); + if (p[5] != 0) return Status::Corruption("bsbf: unsupported hash strategy"); + if (p[6] != 0) return Status::Corruption("bsbf: unsupported index strategy"); + if (crc32c(Slice(p, 20)) != load_le32(p + 20)) + return Status::Corruption("bsbf: header crc mismatch"); + const uint32_t nb = load_le32(p + 8); + const uint32_t nblk = load_le32(p + 12); + if (nb < kBsbfMinBytes || nb > kBsbfMaxBytes || (nb & (nb - 1)) != 0) + return Status::Corruption("bsbf: num_bytes out of range or not power of 2"); + if (nblk != nb / kBsbfBytesPerBlock) return Status::Corruption("bsbf: num_blocks mismatch"); + out->num_bytes = nb; + out->num_blocks = nblk; + out->bitset_crc = load_le32(p + 24); + out->bitset_base = section_base + kBsbfHeaderSize; + return Status::OK(); +} + +Status bsbf_probe(snii::io::FileReader* reader, 
const BsbfHeader& header, uint64_t hash, + bool* maybe_present) { + if (reader == nullptr || maybe_present == nullptr) + return Status::InvalidArgument("bsbf: null arg"); + std::vector blk; + SNII_RETURN_IF_ERROR(reader->read_at(header.block_offset(hash), kBsbfBytesPerBlock, &blk)); + if (blk.size() < kBsbfBytesPerBlock) return Status::Corruption("bsbf: short block read"); + *maybe_present = bsbf_block_contains(hash, blk.data()); + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/dict_block.cpp b/be/src/storage/index/snii/core/src/format/dict_block.cpp new file mode 100644 index 00000000000000..375414df96f264 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/dict_block.cpp @@ -0,0 +1,293 @@ +#include "snii/format/dict_block.h" + +#include + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/varint.h" + +namespace snii::format { + +namespace { + +constexpr size_t kFooterBytes = sizeof(uint32_t); // trailing crc32c +constexpr size_t kNAnchorsBytes = sizeof(uint32_t); // n_anchors u32 +constexpr size_t kAnchorOffBytes = sizeof(uint32_t); // per-anchor offset u32 + +// Estimate the encoded upper-bound byte size of one entry (no actual encoding; used by estimated_bytes). +// Take the maximum varint width of each variable-length field plus payload bytes to guarantee an upper bound. 
+size_t estimate_entry_bytes(const DictEntry& e) { + size_t body = 0; + body += varint_len(static_cast(e.term.size())); // prefix_len upper bound + body += varint_len(static_cast(e.term.size())); // suffix_len upper bound + body += e.term.size(); // suffix bytes upper bound + body += 1; // flags + body += 10; // df + ttf + max_freq upper bound + body += 10; // ttf_delta + body += 10; // max_freq + if (e.kind == DictEntryKind::kInline) { + body += 10 + e.frq_bytes.size(); + body += 10 + e.prx_bytes.size(); + } else { + body += 10 * 5; // frq_off/frq_len/prelude/prx_off/prx_len upper bound + } + return varint_len(static_cast(body)) + body; // entry_len + body +} + +} // namespace + +// ---- DictBlockBuilder ---- + +DictBlockBuilder::DictBlockBuilder(IndexTier tier, bool has_positions, uint64_t frq_base, + uint64_t prx_base, uint32_t anchor_interval) + : tier_(tier), + has_positions_(has_positions), + frq_base_(frq_base), + prx_base_(prx_base), + anchor_interval_(anchor_interval == 0 ? 1 : anchor_interval) {} + +void DictBlockBuilder::add_entry(const DictEntry& entry) { + if (is_anchor(n_entries_)) ++n_anchors_; + entries_est_ += estimate_entry_bytes(entry); + entries_.push_back(entry); + prev_term_ = entry.term; + ++n_entries_; +} + +size_t DictBlockBuilder::estimated_bytes() const { + size_t header = varint_len(static_cast(n_entries_)) + 2; // +ver +flags + header += varint_len(frq_base_); + if (has_positions_) header += varint_len(prx_base_); + const size_t anchors = n_anchors_ * kAnchorOffBytes + kNAnchorsBytes; + return header + entries_est_ + anchors + kFooterBytes; +} + +void DictBlockBuilder::finish(ByteSink* sink) const { + ByteSink body; // header + entries + anchor_offsets + n_anchors (crc covered region) + + // header. + body.put_varint64(static_cast(n_entries_)); + body.put_u8(kDictBlockFormatVer); + body.put_u8(has_positions_ ? 
dict_block_flags::kHasPositions : 0u); + body.put_varint64(frq_base_); + if (has_positions_) body.put_varint64(prx_base_); + + // entries: anchor entries use prev_term="" and record their byte offset within the block. + std::vector anchor_offsets; + anchor_offsets.reserve(n_anchors_); + std::string prev; + for (uint32_t i = 0; i < n_entries_; ++i) { + const bool anchor = is_anchor(i); + if (anchor) { + anchor_offsets.push_back(static_cast(body.size())); + } + const std::string_view prev_term = anchor ? std::string_view {} : std::string_view(prev); + encode_dict_entry(entries_[i], prev_term, tier_, &body); + prev = entries_[i].term; + } + + // anchor_offsets[] + n_anchors. + for (uint32_t off : anchor_offsets) body.put_fixed32(off); + body.put_fixed32(static_cast(anchor_offsets.size())); + + // Write the entire block (including crc footer) to sink. + sink->put_bytes(body.view()); + sink->put_fixed32(crc32c(body.view())); +} + +// ---- DictBlockReader ---- + +namespace { + +// Verify the block length is sufficient and validate the trailing crc; return a Slice of the covered region (excluding crc footer). +Status verify_crc(Slice block, Slice* covered) { + if (block.size() < kFooterBytes + kNAnchorsBytes) { + return Status::Corruption("dict_block: block too short to contain footer"); + } + const size_t covered_len = block.size() - kFooterBytes; + *covered = block.subslice(0, covered_len); + + ByteSource crc_src(block.subslice(covered_len, kFooterBytes)); + uint32_t stored = 0; + SNII_RETURN_IF_ERROR(crc_src.get_fixed32(&stored)); + if (crc32c(*covered) != stored) { + return Status::Corruption("dict_block: crc32c checksum mismatch"); + } + return Status::OK(); +} + +// Read and verify that block_flags is consistent with has_positions. 
+Status check_flags(uint8_t flags, bool has_positions) { + const bool flag_pos = (flags & dict_block_flags::kHasPositions) != 0; + if (flag_pos != has_positions) { + return Status::InvalidArgument("dict_block: has_positions inconsistent with block_flags"); + } + return Status::OK(); +} + +} // namespace + +Status DictBlockReader::open(Slice block, IndexTier tier, bool has_positions, + DictBlockReader* out) { + if (out == nullptr) return Status::InvalidArgument("dict_block: out is null"); + *out = DictBlockReader {}; + + Slice covered; + SNII_RETURN_IF_ERROR(verify_crc(block, &covered)); + out->block_ = covered; + out->tier_ = tier; + out->has_positions_ = has_positions; + + // header. + ByteSource src(covered); + uint64_t n_entries = 0; + SNII_RETURN_IF_ERROR(src.get_varint64(&n_entries)); + uint8_t ver = 0; + uint8_t flags = 0; + SNII_RETURN_IF_ERROR(src.get_u8(&ver)); + SNII_RETURN_IF_ERROR(src.get_u8(&flags)); + if (ver != kDictBlockFormatVer) { + return Status::Unsupported("dict_block: unsupported entry_format_ver"); + } + SNII_RETURN_IF_ERROR(check_flags(flags, has_positions)); + SNII_RETURN_IF_ERROR(src.get_varint64(&out->frq_base_)); + if (has_positions) SNII_RETURN_IF_ERROR(src.get_varint64(&out->prx_base_)); + + out->n_entries_ = static_cast(n_entries); + out->entries_begin_ = src.position(); + + // The anchor table is at the tail of covered: [... anchor_offsets[n] n_anchors(u32)]. 
+ if (covered.size() < kNAnchorsBytes) { + return Status::Corruption("dict_block: missing n_anchors"); + } + ByteSource na_src(covered.subslice(covered.size() - kNAnchorsBytes, kNAnchorsBytes)); + uint32_t n_anchors = 0; + SNII_RETURN_IF_ERROR(na_src.get_fixed32(&n_anchors)); + + const size_t anchor_table_bytes = static_cast(n_anchors) * kAnchorOffBytes; + if (covered.size() < kNAnchorsBytes + anchor_table_bytes || + out->entries_begin_ + anchor_table_bytes + kNAnchorsBytes > covered.size()) { + return Status::Corruption("dict_block: anchor table out of range"); + } + const size_t anchor_table_begin = covered.size() - kNAnchorsBytes - anchor_table_bytes; + + ByteSource at_src(covered.subslice(anchor_table_begin, anchor_table_bytes)); + out->anchor_offsets_.resize(n_anchors); + out->anchor_terms_.resize(n_anchors); + for (uint32_t i = 0; i < n_anchors; ++i) { + uint32_t off = 0; + SNII_RETURN_IF_ERROR(at_src.get_fixed32(&off)); + if (off >= anchor_table_begin) { + return Status::Corruption("dict_block: anchor offset out of range"); + } + // Anchor offsets must be strictly monotonically increasing, and the first anchor must be exactly the start of the entries region (entry 0 is always an anchor). + // Otherwise scan_from_anchor's segment-length computation seg_end-seg_begin would underflow as size_t and cause an out-of-range read, + // guarding against non-monotonic offset tables with a re-stamped crc (remote on-demand read / cache misalignment scenarios). + if (i == 0) { + if (off != out->entries_begin_) { + return Status::Corruption( + "dict_block: first anchor offset is not the start of entries"); + } + } else if (off <= out->anchor_offsets_[i - 1]) { + return Status::Corruption("dict_block: anchor offsets are not strictly increasing"); + } + out->anchor_offsets_[i] = off; + // Anchor entries are encoded with prev_term="" and can be decoded independently to retrieve their term. 
+ ByteSource e_src(covered.subslice(off, anchor_table_begin - off)); + DictEntry probe; + SNII_RETURN_IF_ERROR(decode_dict_entry(&e_src, std::string_view {}, tier, &probe)); + out->anchor_terms_[i] = std::move(probe.term); + } + return Status::OK(); +} + +bool DictBlockReader::locate_anchor(std::string_view target, size_t* anchor_idx) const { + if (anchor_terms_.empty()) return false; + if (target < std::string_view(anchor_terms_.front())) return false; + // The last anchor_term <= target. + size_t lo = 0; + size_t hi = anchor_terms_.size(); // open interval + while (lo + 1 < hi) { + const size_t mid = lo + (hi - lo) / 2; + if (std::string_view(anchor_terms_[mid]) <= target) { + lo = mid; + } else { + hi = mid; + } + } + *anchor_idx = lo; + return true; +} + +Status DictBlockReader::decode_all(std::vector* out) const { + if (out == nullptr) return Status::InvalidArgument("dict_block: out is null"); + out->clear(); + out->reserve(n_entries_); + for (size_t a = 0; a < anchor_offsets_.size(); ++a) { + const size_t seg_begin = anchor_offsets_[a]; + const bool is_last = a + 1 == anchor_offsets_.size(); + const size_t seg_end = is_last ? 
(block_.size() - kNAnchorsBytes - + anchor_offsets_.size() * kAnchorOffBytes) + : anchor_offsets_[a + 1]; + if (seg_end < seg_begin || seg_end > block_.size()) { + return Status::Corruption("dict_block: anchor segment range invalid"); + } + ByteSource src(block_.subslice(seg_begin, seg_end - seg_begin)); + std::string prev; // first entry of a segment is an anchor (prev_term="") + while (!src.eof()) { + DictEntry e; + SNII_RETURN_IF_ERROR(decode_dict_entry(&src, std::string_view(prev), tier_, &e)); + prev = e.term; + out->push_back(std::move(e)); + } + } + if (out->size() != n_entries_) { + return Status::Corruption("dict_block: decoded entry count mismatch"); + } + return Status::OK(); +} + +Status DictBlockReader::scan_from_anchor(size_t anchor_idx, std::string_view target, bool* found, + DictEntry* out) const { + // Byte range of this anchor segment: [anchor_offset, next anchor offset or anchor table start). + const size_t seg_begin = anchor_offsets_[anchor_idx]; + const bool is_last = anchor_idx + 1 == anchor_offsets_.size(); + const size_t seg_end = + is_last ? 
(block_.size() - kNAnchorsBytes - anchor_offsets_.size() * kAnchorOffBytes) + : anchor_offsets_[anchor_idx + 1]; + + // Fallback: open() has already verified anchor monotonicity; this additionally guards against seg_end block_.size()) { + return Status::Corruption("dict_block: anchor segment range invalid"); + } + ByteSource src(block_.subslice(seg_begin, seg_end - seg_begin)); + std::string prev; // the first entry in the segment is an anchor, prev_term="" + while (!src.eof()) { + DictEntry e; + SNII_RETURN_IF_ERROR(decode_dict_entry(&src, std::string_view(prev), tier_, &e)); + if (e.term == target) { + *found = true; + *out = std::move(e); + return Status::OK(); + } + if (std::string_view(e.term) > target) { + *found = false; // already past target; entries are sorted so it does not exist + return Status::OK(); + } + prev = std::move(e.term); + } + *found = false; + return Status::OK(); +} + +Status DictBlockReader::find_term(std::string_view target, bool* found, DictEntry* out) const { + if (found == nullptr || out == nullptr) { + return Status::InvalidArgument("dict_block: found / out is null"); + } + *found = false; + size_t anchor_idx = 0; + if (!locate_anchor(target, &anchor_idx)) return Status::OK(); + return scan_from_anchor(anchor_idx, target, found, out); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp b/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp new file mode 100644 index 00000000000000..05f73814c32d2d --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp @@ -0,0 +1,89 @@ +#include "snii/format/dict_block_directory.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +namespace { + +// Each block_ref has a fixed field order; reuse ByteSink varint/fixed primitives — do not hand-craft bytes manually. 
+// uncomp_len trails only when the kZstd flag is set, so uncompressed-block +// directories keep their compact (v1-identical) per-ref byte layout. +void encode_ref(const BlockRef& ref, ByteSink* payload) { + payload->put_varint64(ref.offset); + payload->put_varint64(ref.length); + payload->put_varint32(ref.n_entries); + payload->put_u8(ref.flags); + payload->put_fixed32(ref.checksum); + if (ref.flags & block_ref_flags::kZstd) payload->put_varint64(ref.uncomp_len); +} + +Status decode_ref(ByteSource* ps, BlockRef* ref) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->offset)); + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->length)); + SNII_RETURN_IF_ERROR(ps->get_varint32(&ref->n_entries)); + SNII_RETURN_IF_ERROR(ps->get_u8(&ref->flags)); + SNII_RETURN_IF_ERROR(ps->get_fixed32(&ref->checksum)); + if (ref->flags & block_ref_flags::kZstd) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->uncomp_len)); + } + return Status::OK(); +} + +Status decode_payload(Slice payload, std::vector* refs) { + ByteSource ps(payload); + uint32_t n_blocks = 0; + SNII_RETURN_IF_ERROR(ps.get_varint32(&n_blocks)); + // Guard against a corrupted, inflated count from untrusted bytes: each BlockRef + // needs >= 8 bytes (flags u8 + checksum u32 + >= 1 byte for each of 3 varints), + // so cap before reserve to avoid a huge allocation. 
+ constexpr size_t kMinRefBytes = 8; + if (n_blocks > ps.remaining() / kMinRefBytes) { + return Status::Corruption("dict_block_directory: n_blocks exceeds payload capacity"); + } + refs->clear(); + refs->reserve(n_blocks); + for (uint32_t i = 0; i < n_blocks; ++i) { + BlockRef ref {}; + SNII_RETURN_IF_ERROR(decode_ref(&ps, &ref)); + refs->push_back(ref); + } + if (!ps.eof()) { + return Status::Corruption("dict_block_directory: trailing bytes in payload"); + } + return Status::OK(); +} + +} // namespace + +void DictBlockDirectoryBuilder::finish(ByteSink* sink) const { + ByteSink payload; + payload.put_varint32(static_cast(refs_.size())); + for (const auto& ref : refs_) { + encode_ref(ref, &payload); + } + SectionFramer::write(*sink, static_cast(SectionType::kDictBlockDirectory), + payload.view()); +} + +Status DictBlockDirectoryReader::open(Slice section, DictBlockDirectoryReader* out) { + ByteSource src(section); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + if (sec.type != static_cast(SectionType::kDictBlockDirectory)) { + return Status::InvalidArgument("dict_block_directory: unexpected section type"); + } + return decode_payload(sec.payload, &out->refs_); +} + +Status DictBlockDirectoryReader::get(uint32_t ordinal, BlockRef* out) const { + if (ordinal >= refs_.size()) { + return Status::NotFound("dict_block_directory: ordinal out of range"); + } + *out = refs_[ordinal]; + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/dict_entry.cpp b/be/src/storage/index/snii/core/src/format/dict_entry.cpp new file mode 100644 index 00000000000000..3b7a189e2c276b --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/dict_entry.cpp @@ -0,0 +1,293 @@ +#include "snii/format/dict_entry.h" + +#include + +#include "snii/common/slice.h" + +namespace snii::format { + +namespace { + +// Pure-function assembly / parsing of flags bits; avoids a long inline if-else +// chain. 
+uint8_t pack_flags(const DictEntry& e) { + uint8_t f = 0; + if (e.kind == DictEntryKind::kInline) f |= dict_flags::kKind; + if (e.enc == DictEntryEnc::kWindowed) f |= dict_flags::kEnc; + if (e.has_sb) f |= dict_flags::kHasSb; + // bit3 has_champion / bit4 offsets_ref are always 0 in v1. + return f; +} + +void apply_flags(uint8_t f, DictEntry* e) { + e->kind = (f & dict_flags::kKind) ? DictEntryKind::kInline : DictEntryKind::kPodRef; + e->enc = (f & dict_flags::kEnc) ? DictEntryEnc::kWindowed : DictEntryEnc::kSlim; + e->has_sb = (f & dict_flags::kHasSb) != 0; +} + +// Length of the longest common prefix between term and prev_term. +uint32_t common_prefix_len(std::string_view term, std::string_view prev) { + uint32_t n = 0; + const uint32_t lim = static_cast(std::min(term.size(), prev.size())); + while (n < lim && term[n] == prev[n]) ++n; + return n; +} + +bool tier_has_stats(IndexTier tier) { + return tier >= IndexTier::kT2; +} + +// ---- Encode entry body (excluding entry_len and trailing crc) ---- + +void write_term_key(const DictEntry& e, std::string_view prev, ByteSink* sink) { + const uint32_t prefix = common_prefix_len(e.term, prev); + const std::string_view suffix = std::string_view(e.term).substr(prefix); + sink->put_varint32(prefix); + sink->put_varint32(static_cast(suffix.size())); + sink->put_bytes(Slice(suffix)); +} + +void write_stats(const DictEntry& e, IndexTier tier, ByteSink* sink) { + sink->put_varint32(e.df); + if (!tier_has_stats(tier)) return; + sink->put_varint64(e.ttf_delta); + sink->put_varint64(e.max_freq); +} + +// Per-window codec mode byte shared by slim/inline single-window regions. +uint8_t pack_win_mode(const DictEntry& e) { + uint8_t mode = 0; + if (e.dd_meta.zstd) mode |= 1u << 0; // dd_zstd + if (e.freq_meta.zstd) mode |= 1u << 1; // freq_zstd + return mode; +} + +// Writes the slim/inline region codec metadata (dd always; freq when tier>=T2). 
+// store_crc=false (INLINE entries, format v2) omits the redundant per-region +// crc32c: the inline bytes already sit inside the dict block, whose own +// block-level crc32c covers them. POD-ref entries pass store_crc=true (their +// regions live in the separately-fetched .frq POD, uncovered by the block crc). +void write_region_meta(const DictEntry& e, IndexTier tier, bool store_crc, ByteSink* sink) { + sink->put_u8(pack_win_mode(e)); + sink->put_varint64(e.dd_meta.uncomp_len); + if (store_crc) sink->put_fixed32(e.dd_meta.crc); + if (!tier_has_stats(tier)) return; + sink->put_varint64(e.freq_meta.uncomp_len); + if (store_crc) sink->put_fixed32(e.freq_meta.crc); +} + +void write_pod_ref(const DictEntry& e, IndexTier tier, ByteSink* sink) { + sink->put_varint64(e.frq_off_delta); + sink->put_varint64(e.frq_len); + if (e.enc == DictEntryEnc::kWindowed) { + sink->put_varint64(e.prelude_len); + sink->put_varint64(e.frq_docs_len); + } else { + sink->put_varint64(e.frq_docs_len); // slim pod_ref: dd region on-disk length + // POD-ref regions live in the .frq POD (not covered by the block crc): keep + // crc. + write_region_meta(e, tier, /*store_crc=*/true, sink); + } + if (!tier_has_stats(tier)) return; + sink->put_varint64(e.prx_off_delta); + sink->put_varint64(e.prx_len); +} + +void write_inline(const DictEntry& e, IndexTier tier, ByteSink* sink) { + sink->put_varint64(static_cast(e.frq_bytes.size())); + sink->put_bytes(Slice(e.frq_bytes)); + sink->put_varint64(e.inline_dd_disk_len); + // INLINE bytes are covered by the dict block crc32c: omit the redundant + // per-region crc. 
+ write_region_meta(e, tier, /*store_crc=*/false, sink); + if (!tier_has_stats(tier)) return; + sink->put_varint64(static_cast(e.prx_bytes.size())); + sink->put_bytes(Slice(e.prx_bytes)); +} + +void write_body(const DictEntry& e, std::string_view prev, IndexTier tier, ByteSink* sink) { + write_term_key(e, prev, sink); + sink->put_u8(pack_flags(e)); + write_stats(e, tier, sink); + if (e.kind == DictEntryKind::kInline) { + write_inline(e, tier, sink); + } else { + write_pod_ref(e, tier, sink); + } +} + +// ---- Decode entry body ---- + +Status read_term_key(ByteSource* src, std::string_view prev, DictEntry* out) { + uint32_t prefix = 0; + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&prefix)); + SNII_RETURN_IF_ERROR(src->get_varint32(&suffix_len)); + if (prefix > prev.size()) { + return Status::Corruption("dict_entry: prefix_len exceeds prev_term length"); + } + Slice suffix; + SNII_RETURN_IF_ERROR(src->get_bytes(suffix_len, &suffix)); + out->term.assign(prev.substr(0, prefix)); + out->term.append(reinterpret_cast(suffix.data()), suffix.size()); + return Status::OK(); +} + +Status read_stats(ByteSource* src, IndexTier tier, DictEntry* out) { + SNII_RETURN_IF_ERROR(src->get_varint32(&out->df)); + if (!tier_has_stats(tier)) return Status::OK(); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->ttf_delta)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->max_freq)); + return Status::OK(); +} + +// Reads the slim/inline region codec metadata (mode/uncomp/[crc]) and fills the +// dd/freq region disk_len from the supplied total/split lengths. has_crc=false +// (INLINE entries, format v2) means no per-region crc was stored: the on-disk +// crc field is absent and region decode must skip crc verification (verify_crc= +// false) since the dict block's own crc32c already covers the inline bytes. 
+Status read_region_meta(ByteSource* src, IndexTier tier, bool has_crc, uint64_t dd_disk_len, + uint64_t freq_disk_len, DictEntry* out) { + uint8_t mode = 0; + SNII_RETURN_IF_ERROR(src->get_u8(&mode)); + if ((mode & ~0x3u) != 0) { + return Status::Corruption("dict_entry: unknown win_mode bits"); + } + out->dd_meta.zstd = (mode & (1u << 0)) != 0; + out->dd_meta.disk_len = dd_disk_len; + out->dd_meta.verify_crc = has_crc; + SNII_RETURN_IF_ERROR(src->get_varint64(&out->dd_meta.uncomp_len)); + if (has_crc) SNII_RETURN_IF_ERROR(src->get_fixed32(&out->dd_meta.crc)); + if (!tier_has_stats(tier)) { + if (mode & (1u << 1)) { + return Status::Corruption("dict_entry: freq mode set without freq tier"); + } + return Status::OK(); + } + out->freq_meta.zstd = (mode & (1u << 1)) != 0; + out->freq_meta.disk_len = freq_disk_len; + out->freq_meta.verify_crc = has_crc; + SNII_RETURN_IF_ERROR(src->get_varint64(&out->freq_meta.uncomp_len)); + if (has_crc) SNII_RETURN_IF_ERROR(src->get_fixed32(&out->freq_meta.crc)); + return Status::OK(); +} + +Status read_pod_ref(ByteSource* src, IndexTier tier, DictEntry* out) { + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_off_delta)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_len)); + if (out->enc == DictEntryEnc::kWindowed) { + SNII_RETURN_IF_ERROR(src->get_varint64(&out->prelude_len)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_docs_len)); + if (out->prelude_len == 0 || out->prelude_len > out->frq_docs_len || + out->frq_docs_len > out->frq_len) { + return Status::Corruption("dict_entry: invalid windowed docs prefix"); + } + } else { + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_docs_len)); + if (out->frq_docs_len > out->frq_len) { + return Status::Corruption("dict_entry: frq_docs_len exceeds frq_len"); + } + SNII_RETURN_IF_ERROR(read_region_meta(src, tier, /*has_crc=*/true, out->frq_docs_len, + out->frq_len - out->frq_docs_len, out)); + } + if (!tier_has_stats(tier)) return Status::OK(); + 
SNII_RETURN_IF_ERROR(src->get_varint64(&out->prx_off_delta)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->prx_len)); + return Status::OK(); +} + +Status read_byte_blob(ByteSource* src, std::vector* out) { + uint64_t len = 0; + SNII_RETURN_IF_ERROR(src->get_varint64(&len)); + Slice bytes; + SNII_RETURN_IF_ERROR(src->get_bytes(static_cast(len), &bytes)); + out->assign(bytes.data(), bytes.data() + bytes.size()); + return Status::OK(); +} + +Status read_inline(ByteSource* src, IndexTier tier, DictEntry* out) { + SNII_RETURN_IF_ERROR(read_byte_blob(src, &out->frq_bytes)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->inline_dd_disk_len)); + if (out->inline_dd_disk_len > out->frq_bytes.size()) { + return Status::Corruption("dict_entry: inline_dd_disk_len exceeds frq_bytes"); + } + const uint64_t freq_disk_len = + static_cast(out->frq_bytes.size()) - out->inline_dd_disk_len; + // INLINE entries store no per-region crc (covered by the block crc): + // has_crc=false. + SNII_RETURN_IF_ERROR(read_region_meta(src, tier, /*has_crc=*/false, out->inline_dd_disk_len, + freq_disk_len, out)); + if (!tier_has_stats(tier)) return Status::OK(); + SNII_RETURN_IF_ERROR(read_byte_blob(src, &out->prx_bytes)); + return Status::OK(); +} + +Status read_locator(ByteSource* src, IndexTier tier, DictEntry* out) { + if (out->kind == DictEntryKind::kInline) return read_inline(src, tier, out); + return read_pod_ref(src, tier, out); +} + +// Read entry_len (= body length) and verify that src has enough remaining +// bytes. 
+Status read_entry_len(ByteSource* src, uint64_t* total) { + SNII_RETURN_IF_ERROR(src->get_varint64(total)); + if (*total > src->remaining()) { + return Status::Corruption("dict_entry: entry_len out of range"); + } + return Status::OK(); +} + +} // namespace + +Status encode_dict_entry(const DictEntry& entry, std::string_view prev_term, IndexTier tier, + ByteSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("dict_entry: sink is null"); + + // Serialize the body into a temporary buffer first to obtain the exact + // length, then write entry_len + body. CRC verification is done uniformly at + // the DICT block level (covering block header + all entries + anchor table); + // CRC is not repeated at the entry level, to keep slim/inline low-frequency + // terms maximally compact (spec §DICT block/§dict entry). + ByteSink body; + write_body(entry, prev_term, tier, &body); + sink->put_varint64(static_cast(body.size())); + sink->put_bytes(body.view()); + return Status::OK(); +} + +Status decode_dict_entry(ByteSource* src, std::string_view prev_term, IndexTier tier, + DictEntry* out) { + if (src == nullptr || out == nullptr) { + return Status::InvalidArgument("dict_entry: src / out is null"); + } + *out = DictEntry {}; + + uint64_t total = 0; + SNII_RETURN_IF_ERROR(read_entry_len(src, &total)); + const size_t body_start = src->position(); + + SNII_RETURN_IF_ERROR(read_term_key(src, prev_term, out)); + uint8_t flags = 0; + SNII_RETURN_IF_ERROR(src->get_u8(&flags)); + apply_flags(flags, out); + SNII_RETURN_IF_ERROR(read_stats(src, tier, out)); + SNII_RETURN_IF_ERROR(read_locator(src, tier, out)); + + // The body must consume exactly entry_len bytes; otherwise the structure is + // inconsistent with the tier. 
+ const size_t consumed = src->position() - body_start; + if (consumed != static_cast(total)) { + return Status::Corruption("dict_entry: body length does not match entry_len"); + } + return Status::OK(); +} + +Status skip_dict_entry(ByteSource* src) { + if (src == nullptr) return Status::InvalidArgument("dict_entry: src is null"); + uint64_t total = 0; + SNII_RETURN_IF_ERROR(read_entry_len(src, &total)); + Slice unused; + return src->get_bytes(static_cast(total), &unused); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/frq_pod.cpp b/be/src/storage/index/snii/core/src/format/frq_pod.cpp new file mode 100644 index 00000000000000..1dc28fb9eea696 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/frq_pod.cpp @@ -0,0 +1,196 @@ +#include "snii/format/frq_pod.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/pfor.h" +#include "snii/encoding/zstd_codec.h" +#include "snii/format/format_constants.h" + +namespace snii::format { +namespace { + +// Auto-compression threshold: use raw when a region is smaller than this byte +// count (zstd gain is negligible and metadata overhead is relatively large). +inline constexpr size_t kAutoZstdMinBytes = 512; +// Default zstd level for auto mode. +inline constexpr int kDefaultZstdLevel = 3; +// Maximum decompressed byte size for a single region. Guards against a +// corrupted uncomp_len read from S3 that inflated to a huge value: sanity-check +// before allocating/decompressing to avoid GB-scale allocations. Windows are +// 256-doc aligned and normally far smaller than this. +inline constexpr uint32_t kMaxRegionUncompBytes = 256u * 1024 * 1024; +// Maximum doc count per .frq window (guards against a corrupted n). Window +// baseline is 256, practical combined cap is 2048, so this is a loose but +// astronomically-large-number-blocking upper bound. 
+inline constexpr uint32_t kMaxWindowDocs = 1u << 24; + +// Encode a uint32 array into multiple PFOR runs, each of 256 (kFrqBaseUnit) +// elements. n / run count is not written: the number of runs is derived from +// total length n and kFrqBaseUnit, and the decoder computes it the same way. +void encode_pfor_runs(std::span values, ByteSink* out) { + size_t n = values.size(); + for (size_t off = 0; off < n; off += kFrqBaseUnit) { + size_t run = (n - off < kFrqBaseUnit) ? (n - off) : kFrqBaseUnit; + pfor_encode(values.data() + off, run, out); + } +} + +// Decode n uint32 values from source (multiple PFOR runs of 256 each). +Status decode_pfor_runs(ByteSource* src, size_t n, std::vector* out) { + out->assign(n, 0); + for (size_t off = 0; off < n; off += kFrqBaseUnit) { + size_t run = (n - off < kFrqBaseUnit) ? (n - off) : kFrqBaseUnit; + SNII_RETURN_IF_ERROR(pfor_decode(src, run, out->data() + off)); + } + return Status::OK(); +} + +// Verifies docids are ascending and the first entry is not below win_base. +Status validate_docs(std::span docs, uint64_t win_base) { + if (docs.empty()) return Status::OK(); + if (static_cast(docs.front()) < win_base) { + return Status::InvalidArgument("frq: first docid below win_base"); + } + for (size_t i = 1; i < docs.size(); ++i) { + if (docs[i] < docs[i - 1]) { + return Status::InvalidArgument("frq: docids must be ascending"); + } + } + return Status::OK(); +} + +// Decision: given level and plaintext length, determine whether to compress. +bool should_compress(int level, size_t plain_len) { + if (level == 0) return false; // force raw + if (level > 0) return true; // force zstd + return plain_len >= kAutoZstdMinBytes; // auto +} + +// Encodes one region's plaintext into raw or zstd, appends the on-disk bytes to +// out, and fills meta (mode/uncomp_len/disk_len/crc). The region carries no +// header. 
+Status emit_region(Slice plain, int level, ByteSink* out, FrqRegionMeta* meta) { + if (out == nullptr || meta == nullptr) { + return Status::InvalidArgument("frq: null region out"); + } + meta->uncomp_len = plain.size(); + std::vector disk; + if (should_compress(level, plain.size())) { + meta->zstd = true; + SNII_RETURN_IF_ERROR(zstd_compress(plain, level > 0 ? level : kDefaultZstdLevel, &disk)); + } else { + meta->zstd = false; + disk.assign(plain.data(), plain.data() + plain.size()); + } + meta->disk_len = static_cast(disk.size()); + meta->crc = crc32c(Slice(disk)); + out->put_bytes(Slice(disk)); + return Status::OK(); +} + +// Materializes a region's plaintext (raw borrows the view; zstd decompresses) +// and verifies its crc + slice length against meta. +Status open_region(Slice disk, const FrqRegionMeta& meta, std::vector* holder, + Slice* plain) { + if (disk.size() != static_cast(meta.disk_len)) { + return Status::Corruption("frq: region slice length mismatch"); + } + if (meta.uncomp_len > kMaxRegionUncompBytes) { + return Status::Corruption("frq: region uncomp_len exceeds sane cap"); + } + // Inline entries (verify_crc=false) carry no per-region crc: their on-disk + // bytes are covered by the enclosing dict block's block-level crc32c, so the + // region crc would be redundant. POD-ref regions keep their own crc check. 
+ if (meta.verify_crc && crc32c(disk) != meta.crc) { + return Status::Corruption("frq: region crc mismatch"); + } + if (!meta.zstd) { + if (meta.uncomp_len != meta.disk_len) { + return Status::Corruption("frq: raw region length inconsistent"); + } + *plain = disk; + return Status::OK(); + } + SNII_RETURN_IF_ERROR(zstd_decompress(disk, static_cast(meta.uncomp_len), holder)); + *plain = Slice(*holder); + return Status::OK(); +} + +} // namespace + +Status build_dd_region(std::span docids_ascending, uint64_t win_base, + int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta) { + if (out == nullptr || meta == nullptr) { + return Status::InvalidArgument("frq: null dd region out"); + } + SNII_RETURN_IF_ERROR(validate_docs(docids_ascending, win_base)); + ByteSink plain; // VInt n ++ PFOR_runs(doc_delta) + std::vector dd(docids_ascending.size()); + uint64_t prev = win_base; + for (size_t i = 0; i < docids_ascending.size(); ++i) { + dd[i] = static_cast(static_cast(docids_ascending[i]) - prev); + prev = docids_ascending[i]; + } + plain.put_varint32(static_cast(docids_ascending.size())); + encode_pfor_runs(dd, &plain); + return emit_region(plain.view(), zstd_level_or_neg_for_auto, out, meta); +} + +Status build_freq_region(std::span freqs, int zstd_level_or_neg_for_auto, + ByteSink* out, FrqRegionMeta* meta) { + if (out == nullptr || meta == nullptr) { + return Status::InvalidArgument("frq: null freq region out"); + } + ByteSink plain; + encode_pfor_runs(freqs, &plain); + return emit_region(plain.view(), zstd_level_or_neg_for_auto, out, meta); +} + +Status decode_dd_region(Slice dd_disk, const FrqRegionMeta& meta, uint64_t win_base, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("frq: null docids out"); + std::vector holder; + Slice plain; + SNII_RETURN_IF_ERROR(open_region(dd_disk, meta, &holder, &plain)); + ByteSource src(plain); + uint32_t n = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&n)); + if (n > kMaxWindowDocs) return 
Status::Corruption("frq: doc count exceeds sane cap"); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, n, docids)); + if (!src.eof()) { + return Status::Corruption("frq: trailing bytes after dd region payload"); + } + uint64_t cur = win_base; + for (uint32_t i = 0; i < n; ++i) { + cur += (*docids)[i]; + (*docids)[i] = static_cast(cur); + } + return Status::OK(); +} + +Status decode_freq_region(Slice freq_disk, const FrqRegionMeta& meta, size_t doc_count, + std::vector* freqs) { + if (freqs == nullptr) return Status::InvalidArgument("frq: null freqs out"); + std::vector holder; + Slice plain; + SNII_RETURN_IF_ERROR(open_region(freq_disk, meta, &holder, &plain)); + if (doc_count == 0) { + if (meta.uncomp_len != 0) { + return Status::Corruption("frq: empty freq region expected"); + } + freqs->clear(); + return Status::OK(); + } + ByteSource src(plain); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, freqs)); + if (!src.eof()) { + return Status::Corruption("frq: trailing bytes after freq region payload"); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/frq_prelude.cpp b/be/src/storage/index/snii/core/src/format/frq_prelude.cpp new file mode 100644 index 00000000000000..568fda00f2f854 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/frq_prelude.cpp @@ -0,0 +1,470 @@ +#include "snii/format/frq_prelude.h" + +#include +#include +#include + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" + +namespace snii::format { + +namespace { + +// Anti-DoS: a segment holds at most ~15M docs (>=1 doc/window), so 1<<24 +// windows is a generous ceiling that still prevents multi-GB allocations from a +// crafted N. (crc32c is not a MAC and cannot defend a re-stamped inflated count.) 
+constexpr uint64_t kMaxWindows = 1ull << 24;
+
+// Integer division of a by b, rounded up. b must be non-zero.
+uint64_t ceil_div(uint64_t a, uint64_t b) {
+    const uint64_t biased = a + b - 1;
+    return biased / b;
+}
+
+// Packs the column-presence booleans into the prelude's flags byte.
+uint8_t make_flags(const FrqPreludeColumns& cols) {
+    const uint8_t freq_bit =
+            cols.has_freq ? static_cast<uint8_t>(frq_prelude_flags::kHasFreq) : uint8_t {0};
+    const uint8_t prx_bit =
+            cols.has_prx ? static_cast<uint8_t>(frq_prelude_flags::kHasPrx) : uint8_t {0};
+    return static_cast<uint8_t>(freq_bit | prx_bit);
+}
+
+// Packs a window's codec choices into its mode byte; the freq bit is only ever set when
+// the prelude carries a freq column.
+uint8_t make_win_mode(const WindowMeta& m, bool has_freq) {
+    const bool freq_is_zstd = has_freq && m.freq_zstd;
+    const uint8_t dd_bit = m.dd_zstd ? static_cast<uint8_t>(frq_win_mode::kDdZstd) : uint8_t {0};
+    const uint8_t freq_bit =
+            freq_is_zstd ? static_cast<uint8_t>(frq_win_mode::kFreqZstd) : uint8_t {0};
+    return static_cast<uint8_t>(dd_bit | freq_bit);
+}
+
+// Stores lhs + rhs in *out, or returns Corruption(message) if the sum would wrap.
+Status checked_add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) {
+    const uint64_t headroom = std::numeric_limits<uint64_t>::max() - lhs;
+    if (rhs <= headroom) {
+        *out = lhs + rhs;
+        return Status::OK();
+    }
+    return Status::Corruption(message);
+}
+
+// Narrows value into *out, or returns Corruption(message) if it does not fit in 32 bits.
+Status checked_u32(uint64_t value, const char* message, uint32_t* out) {
+    if (value <= std::numeric_limits<uint32_t>::max()) {
+        *out = static_cast<uint32_t>(value);
+        return Status::OK();
+    }
+    return Status::Corruption(message);
+}
+
+// Checks that a window claiming doc_count docs can fit between its lowest possible doc id
+// (0 for the first window, win_base + 1 otherwise) and last_docid inclusive.
+Status validate_window_doc_count(bool first_window, uint64_t win_base, uint64_t last_docid,
+                                 uint64_t doc_count) {
+    uint64_t lowest_docid = 0;
+    if (!first_window) {
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                win_base, 1, "frq_prelude: window base exceeds docid range", &lowest_docid));
+    }
+    if (lowest_docid > last_docid) {
+        return Status::Corruption("frq_prelude: invalid window docid range");
+    }
+    const uint64_t capacity = last_docid - lowest_docid + 1;
+    if (capacity < doc_count) {
+        return Status::Corruption("frq_prelude: doc_count exceeds window width");
+    }
+    return Status::OK();
+}
+
+// Validates builder input: non-null sink, group_size>=1, sane count, and
+// non-decreasing absolute last_docid across windows.
+Status validate_input(const FrqPreludeColumns& cols, ByteSink* out) {
+    if (out == nullptr) return Status::InvalidArgument("frq_prelude: null sink");
+    if (cols.group_size == 0) {
+        return Status::InvalidArgument("frq_prelude: group_size must be >= 1");
+    }
+    const auto& wins = cols.windows;
+    const size_t total = wins.size();
+    if (total > kMaxWindows) {
+        return Status::InvalidArgument("frq_prelude: window count exceeds cap");
+    }
+    // NOTE(review): equal consecutive last_docid values pass here, but the reader's
+    // validate_window_doc_count rejects a non-first window whose last docid is not
+    // greater than the previous one -- confirm which side is intended.
+    for (size_t cur = 1; cur < total; ++cur) {
+        const auto before = wins[cur - 1].last_docid;
+        if (wins[cur].last_docid < before) {
+            return Status::InvalidArgument("frq_prelude: last_docid not monotonic");
+        }
+    }
+    return Status::OK();
+}
+
+// Encodes one window row into a per-block sink. last_docid_delta is the row's
+// absolute last_docid minus prev_last (the previous window's absolute last).
+void encode_window_row(const WindowMeta& m, bool has_freq, bool has_prx, uint64_t prev_last,
+                       ByteSink* block) {
+    ByteSink& row = *block;
+    const uint64_t last_docid_delta = static_cast<uint64_t>(m.last_docid) - prev_last;
+
+    // Fixed leading fields: docid delta, doc count, codec mode byte.
+    row.put_varint64(last_docid_delta);
+    row.put_varint64(m.doc_count);
+    row.put_u8(make_win_mode(m, has_freq));
+
+    // dd region locator + crc (always present).
+    row.put_varint64(m.dd_off);
+    row.put_varint64(m.dd_disk_len);
+    row.put_varint64(m.dd_uncomp_len);
+    row.put_fixed32(m.crc_dd);
+
+    // freq region locator + crc (only when the prelude has a freq column).
+    if (has_freq) {
+        row.put_varint64(m.freq_off);
+        row.put_varint64(m.freq_disk_len);
+        row.put_varint64(m.freq_uncomp_len);
+        row.put_fixed32(m.crc_freq);
+    }
+
+    // prx locator (only when the prelude has a prx column).
+    if (has_prx) {
+        row.put_varint64(m.prx_off);
+        row.put_varint64(m.prx_len);
+    }
+
+    // Trailing per-window maxima.
+    row.put_varint64(m.max_freq);
+    row.put_u8(m.max_norm);
+}
+
+// One super-block's serialized window block plus its directory fields.
+struct SuperBlock {
+    ByteSink block;
+    uint64_t last_docid = 0; // absolute last docid of this super-block's last window
+};
+
+// Builds every super-block's window block (row-encoded) and records the running
+// absolute last docid at each super-block boundary.
+std::vector encode_super_blocks(const FrqPreludeColumns& cols) { + const uint32_t g = cols.group_size; + const size_t n = cols.windows.size(); + std::vector blocks; + blocks.reserve(static_cast(ceil_div(n, g))); + uint64_t prev_last = 0; // previous window's absolute last docid (chains across blocks) + for (size_t start = 0; start < n; start += g) { + const size_t end = std::min(n, start + g); + SuperBlock sb; + for (size_t w = start; w < end; ++w) { + encode_window_row(cols.windows[w], cols.has_freq, cols.has_prx, prev_last, &sb.block); + prev_last = cols.windows[w].last_docid; + } + sb.last_docid = prev_last; + blocks.push_back(std::move(sb)); + } + return blocks; +} + +// Serializes the super_block_dir (one row per super-block) into dir_sink, using +// each block's byte length to compute its offset within the window_dir region. +void encode_super_block_dir(const std::vector& blocks, ByteSink* dir_sink) { + uint64_t prev_last = 0; + uint64_t block_off = 0; + for (const SuperBlock& sb : blocks) { + dir_sink->put_varint64(sb.last_docid - prev_last); + dir_sink->put_varint64(block_off); + dir_sink->put_varint64(sb.block.size()); + prev_last = sb.last_docid; + block_off += sb.block.size(); + } +} + +} // namespace + +Status build_frq_prelude(const FrqPreludeColumns& cols, ByteSink* out) { + SNII_RETURN_IF_ERROR(validate_input(cols, out)); + + const std::vector blocks = encode_super_blocks(cols); + ByteSink dir_sink; + encode_super_block_dir(blocks, &dir_sink); + + // covered = header + super_block_dir (the crc covers exactly this region). 
+ ByteSink covered; + covered.put_u8(make_flags(cols)); + covered.put_varint64(cols.windows.size()); + covered.put_varint64(cols.group_size); + covered.put_varint64(blocks.size()); + covered.put_varint64(dir_sink.size()); + covered.put_bytes(dir_sink.view()); + + out->put_bytes(covered.view()); + out->put_fixed32(crc32c(covered.view())); + for (const SuperBlock& sb : blocks) out->put_bytes(sb.block.view()); + return Status::OK(); +} + +namespace { + +// Decoded header fields shared between parse phases. +struct Header { + bool has_freq = false; + bool has_prx = false; + uint64_t n = 0; + uint64_t group_size = 0; + uint64_t n_super = 0; + uint64_t sbdir_len = 0; +}; + +// Verifies the trailing crc covers [start of buffer .. end of super_block_dir]. +// covered_len = header bytes (up to and including sbdir_len) + sbdir_len. +Status verify_covered_crc(Slice prelude, size_t header_end, uint64_t sbdir_len) { + const size_t covered = header_end + static_cast(sbdir_len); + if (covered + sizeof(uint32_t) > prelude.size()) { + return Status::Corruption("frq_prelude: buffer too short for crc region"); + } + uint32_t stored = 0; + ByteSource crc_src(prelude.subslice(covered, sizeof(uint32_t))); + SNII_RETURN_IF_ERROR(crc_src.get_fixed32(&stored)); + if (crc32c(prelude.subslice(0, covered)) != stored) { + return Status::Corruption("frq_prelude: crc32c mismatch"); + } + return Status::OK(); +} + +// Parses + validates the header (counts capped before any later reserve). 
+Status parse_header(ByteSource* src, Header* h) {
+    // Byte 0: column-presence flags.
+    uint8_t flag_byte = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&flag_byte));
+    h->has_freq = (flag_byte & frq_prelude_flags::kHasFreq) != 0;
+    h->has_prx = (flag_byte & frq_prelude_flags::kHasPrx) != 0;
+
+    // Four varints in fixed order: N, group_size, n_super, sbdir_len.
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->n));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->group_size));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->n_super));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->sbdir_len));
+
+    // Cap both counts first; the zero test on group_size must precede the division below.
+    const bool windows_over_cap = h->n > kMaxWindows;
+    const bool supers_over_cap = h->n_super > kMaxWindows;
+    if (windows_over_cap || supers_over_cap) {
+        return Status::Corruption("frq_prelude: window count exceeds sane cap");
+    }
+    if (h->group_size == 0) {
+        return Status::Corruption("frq_prelude: group_size is zero");
+    }
+    const uint64_t expected_supers = ceil_div(h->n, h->group_size);
+    if (expected_supers != h->n_super) {
+        return Status::Corruption("frq_prelude: n_super inconsistent with N/G");
+    }
+    return Status::OK();
+}
+
+// One super-block directory row.
+struct SbDirRow {
+    uint64_t last_docid = 0;
+    uint64_t block_off = 0;
+    uint64_t block_len = 0;
+};
+
+// Decodes the super_block_dir region into absolute-last-docid rows, validating
+// monotonic last docids and contiguous, in-bounds block offsets.
+Status decode_super_block_dir(Slice dir, const Header& h, std::vector<SbDirRow>* rows,
+                              uint64_t* window_region_len) {
+    ByteSource src(dir);
+    rows->clear();
+    // Every row is three varints, i.e. at least three bytes, so the directory itself bounds
+    // how many rows can really exist. Reserving by that bound keeps a crafted n_super from
+    // forcing a large allocation before a single row has been read.
+    const uint64_t max_rows_in_dir = static_cast<uint64_t>(dir.size()) / 3;
+    rows->reserve(static_cast<size_t>(std::min<uint64_t>(h.n_super, max_rows_in_dir)));
+    uint64_t prev_last = 0;
+    uint64_t expect_off = 0;
+    for (uint64_t s = 0; s < h.n_super; ++s) {
+        SbDirRow r;
+        uint64_t ldd = 0;
+        SNII_RETURN_IF_ERROR(src.get_varint64(&ldd));
+        SNII_RETURN_IF_ERROR(src.get_varint64(&r.block_off));
+        SNII_RETURN_IF_ERROR(src.get_varint64(&r.block_len));
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                prev_last, ldd, "frq_prelude: super-block last_docid overflow", &r.last_docid));
+        // Only the range check matters here; the narrowed value itself is not used.
+        uint32_t checked_last = 0;
+        SNII_RETURN_IF_ERROR(checked_u32(
+                r.last_docid, "frq_prelude: super-block last_docid exceeds u32", &checked_last));
+        if (r.last_docid < prev_last || r.block_off != expect_off) {
+            return Status::Corruption("frq_prelude: super-block dir inconsistent");
+        }
+        // block_len is untrusted: a wrapped running offset would make a later block appear
+        // contiguous and in-bounds, so accumulate with an explicit overflow check.
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                expect_off, r.block_len, "frq_prelude: super-block length overflow", &expect_off));
+        prev_last = r.last_docid;
+        rows->push_back(r);
+    }
+    if (!src.eof()) {
+        return Status::Corruption("frq_prelude: super-block dir has trailing bytes");
+    }
+    *window_region_len = expect_off;
+    return Status::OK();
+}
+
+// Validates a per-window codec mode byte against the known bits.
+Status check_win_mode(uint8_t mode, bool has_freq) {
+    if ((mode & ~frq_win_mode::kKnownBits) != 0) {
+        return Status::Corruption("frq_prelude: unknown win_mode bits");
+    }
+    if (!has_freq && (mode & frq_win_mode::kFreqZstd) != 0) {
+        return Status::Corruption("frq_prelude: freq mode set without has_freq");
+    }
+    return Status::OK();
+}
+
+// Decodes one window row, advancing prev_last to this window's absolute last.
+Status decode_window_row(ByteSource* src, bool has_freq, bool has_prx, bool first_window,
+                         uint64_t* prev_last, WindowMeta* m) {
+    uint64_t ldd = 0, doc_count = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&ldd));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&doc_count));
+    uint8_t mode = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&mode));
+    SNII_RETURN_IF_ERROR(check_win_mode(mode, has_freq));
+    m->dd_zstd = (mode & frq_win_mode::kDdZstd) != 0;
+    m->freq_zstd = has_freq && (mode & frq_win_mode::kFreqZstd) != 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_off));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_disk_len));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_uncomp_len));
+    SNII_RETURN_IF_ERROR(src->get_fixed32(&m->crc_dd));
+    if (has_freq) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_off));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_disk_len));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_uncomp_len));
+        SNII_RETURN_IF_ERROR(src->get_fixed32(&m->crc_freq));
+    }
+    if (has_prx) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->prx_off));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->prx_len));
+    }
+    uint64_t max_freq = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&max_freq));
+    SNII_RETURN_IF_ERROR(src->get_u8(&m->max_norm));
+    // All fields are read; now rebuild the absolute last docid and range-check the
+    // 64-bit wire values before narrowing them into the 32-bit WindowMeta fields.
+    uint64_t last_docid = 0;
+    SNII_RETURN_IF_ERROR(checked_add_u64(*prev_last, ldd, "frq_prelude: window last_docid overflow",
+                                         &last_docid));
+    SNII_RETURN_IF_ERROR(
+            validate_window_doc_count(first_window, *prev_last, last_docid, doc_count));
+    m->win_base = *prev_last;
+    SNII_RETURN_IF_ERROR(
+            checked_u32(last_docid, "frq_prelude: window last_docid exceeds u32", &m->last_docid));
+    SNII_RETURN_IF_ERROR(
+            checked_u32(doc_count, "frq_prelude: window doc_count exceeds u32", &m->doc_count));
+    SNII_RETURN_IF_ERROR(
+            checked_u32(max_freq, "frq_prelude: window max_freq exceeds u32", &m->max_freq));
+    *prev_last = last_docid;
+    return Status::OK();
+}
+
+// Decodes one super-block's window block (<=G rows) into the global window list,
+// seeding win_base from prev_last and re-checking the recorded sb last docid.
+Status decode_one_block(Slice block, const Header& h, uint64_t sb_last_docid, size_t row_count,
+                        uint64_t* prev_last, std::vector<WindowMeta>* windows) {
+    ByteSource src(block);
+    for (size_t i = 0; i < row_count; ++i) {
+        WindowMeta m;
+        // windows->empty() is true only for the very first window of the whole prelude.
+        SNII_RETURN_IF_ERROR(
+                decode_window_row(&src, h.has_freq, h.has_prx, windows->empty(), prev_last, &m));
+        windows->push_back(m);
+    }
+    if (!src.eof()) {
+        return Status::Corruption("frq_prelude: window block has trailing bytes");
+    }
+    if (*prev_last != sb_last_docid) {
+        return Status::Corruption("frq_prelude: window block last docid mismatch");
+    }
+    return Status::OK();
+}
+
+// Decodes all window blocks pointed to by the super_block_dir.
+Status decode_all_blocks(Slice window_region, const Header& h, const std::vector<SbDirRow>& dir,
+                         std::vector<WindowMeta>* windows) {
+    // Smallest possible encoded row (no freq, no prx): six one-byte varints (last_docid
+    // delta, doc_count, dd_off, dd_disk_len, dd_uncomp_len, max_freq) + mode byte +
+    // fixed32 crc_dd + max_norm byte = 12 bytes.
+    constexpr uint64_t kMinWindowRowBytes = 12;
+    windows->clear();
+    // Reserve by what the region can physically hold, not by the untrusted header count,
+    // so a crafted N cannot force a huge allocation ahead of any real row.
+    const uint64_t max_rows_in_region =
+            static_cast<uint64_t>(window_region.size()) / kMinWindowRowBytes;
+    windows->reserve(static_cast<size_t>(std::min<uint64_t>(h.n, max_rows_in_region)));
+    uint64_t prev_last = 0;
+    for (size_t s = 0; s < dir.size(); ++s) {
+        const SbDirRow& r = dir[s];
+        if (r.block_off + r.block_len > window_region.size() ||
+            r.block_off + r.block_len < r.block_off) {
+            return Status::Corruption("frq_prelude: window block out of region");
+        }
+        const uint64_t already = static_cast<uint64_t>(windows->size());
+        // Every block holds group_size rows except possibly the last one.
+        const uint64_t rows = std::min<uint64_t>(h.group_size, h.n - already);
+        Slice block = window_region.subslice(static_cast<size_t>(r.block_off),
+                                             static_cast<size_t>(r.block_len));
+        SNII_RETURN_IF_ERROR(decode_one_block(block, h, r.last_docid, static_cast<size_t>(rows),
+                                              &prev_last, windows));
+    }
+    if (windows->size() != h.n) {
+        return Status::Corruption("frq_prelude: decoded window count mismatch");
+    }
+    return Status::OK();
+}
+
+// Validates the dd/freq region locators tile the dd-block / freq-block contiguously
+// (each region starts where the previous one ended) and returns the block lengths.
+// Contiguity makes the docs-only prefix one solid run and bounds the read range.
+Status validate_region_layout(const Header& h, const std::vector& windows, + uint64_t* dd_block_len, uint64_t* freq_block_len) { + uint64_t dd_expect = 0; + uint64_t freq_expect = 0; + for (const WindowMeta& m : windows) { + if (m.dd_off != dd_expect) { + return Status::Corruption("frq_prelude: dd region not contiguous"); + } + if (m.dd_disk_len > m.dd_uncomp_len && !m.dd_zstd) { + return Status::Corruption("frq_prelude: raw dd region length inconsistent"); + } + if (dd_expect + m.dd_disk_len < dd_expect) { + return Status::Corruption("frq_prelude: dd block length overflow"); + } + dd_expect += m.dd_disk_len; + if (h.has_freq) { + if (m.freq_off != freq_expect) { + return Status::Corruption("frq_prelude: freq region not contiguous"); + } + if (freq_expect + m.freq_disk_len < freq_expect) { + return Status::Corruption("frq_prelude: freq block length overflow"); + } + freq_expect += m.freq_disk_len; + } + } + *dd_block_len = dd_expect; + *freq_block_len = freq_expect; + return Status::OK(); +} + +} // namespace + +Status FrqPreludeReader::open(Slice prelude, FrqPreludeReader* out) { + ByteSource src(prelude); + Header h; + SNII_RETURN_IF_ERROR(parse_header(&src, &h)); + const size_t header_end = src.position(); + SNII_RETURN_IF_ERROR(verify_covered_crc(prelude, header_end, h.sbdir_len)); + + if (header_end + static_cast(h.sbdir_len) > prelude.size()) { + return Status::Corruption("frq_prelude: sbdir_len past buffer"); + } + Slice dir = prelude.subslice(header_end, static_cast(h.sbdir_len)); + std::vector rows; + uint64_t window_region_len = 0; + SNII_RETURN_IF_ERROR(decode_super_block_dir(dir, h, &rows, &window_region_len)); + + const size_t region_start = header_end + static_cast(h.sbdir_len) + sizeof(uint32_t); + if (region_start + static_cast(window_region_len) > prelude.size()) { + return Status::Corruption("frq_prelude: window region past buffer"); + } + Slice window_region = prelude.subslice(region_start, static_cast(window_region_len)); + + out->has_freq_ = 
h.has_freq; + out->has_prx_ = h.has_prx; + out->group_size_ = static_cast(h.group_size); + out->n_super_ = static_cast(h.n_super); + out->sb_last_docid_.clear(); + out->sb_last_docid_.reserve(rows.size()); + for (const SbDirRow& r : rows) out->sb_last_docid_.push_back(r.last_docid); + SNII_RETURN_IF_ERROR(decode_all_blocks(window_region, h, rows, &out->windows_)); + return validate_region_layout(h, out->windows_, &out->dd_block_len_, &out->freq_block_len_); +} + +Status FrqPreludeReader::window(uint32_t w, WindowMeta* out) const { + if (out == nullptr) return Status::InvalidArgument("frq_prelude: null window out"); + if (w >= windows_.size()) { + return Status::InvalidArgument("frq_prelude: window index out of range"); + } + *out = windows_[w]; + return Status::OK(); +} + +Status FrqPreludeReader::locate_window(uint32_t docid, bool* found, uint32_t* w) const { + if (found == nullptr || w == nullptr) { + return Status::InvalidArgument("frq_prelude: null locate out"); + } + *found = false; + if (windows_.empty()) return Status::OK(); + if (docid > windows_.back().last_docid) return Status::OK(); + + // Level 1: first super-block whose absolute last docid >= docid. + const auto sb_it = std::lower_bound(sb_last_docid_.begin(), sb_last_docid_.end(), + static_cast(docid)); + const size_t sb = static_cast(sb_it - sb_last_docid_.begin()); + // Level 2: window binary search within [sb*G, min((sb+1)*G, N)). + const size_t lo = sb * group_size_; + const size_t hi = std::min(lo + group_size_, windows_.size()); + for (size_t i = lo; i < hi; ++i) { + if (docid <= windows_[i].last_docid) { + *found = true; + *w = static_cast(i); + return Status::OK(); + } + } + return Status::OK(); // unreachable when invariants hold; defensive miss. 
+} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp b/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp new file mode 100644 index 00000000000000..27ca75b8f6b9ec --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp @@ -0,0 +1,116 @@ +#include "snii/format/logical_index_directory.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +namespace { + +// Minimum payload bytes any entry can occupy: index_id (>=1) + suffix_len (>=1, value 0) + +// meta_off (>=1) + meta_len (>=1). Used as an anti-DoS lower bound before reserving. +constexpr size_t kMinEntryBytes = 4; + +// Encode one directory entry. Fixed field order; reuse ByteSink varint/bytes primitives. +void encode_entry(const LogicalIndexRef& ref, ByteSink* payload) { + payload->put_varint64(ref.index_id); + payload->put_varint32(static_cast(ref.index_suffix.size())); + payload->put_bytes(Slice(std::string_view(ref.index_suffix))); + payload->put_varint64(ref.meta_off); + payload->put_varint64(ref.meta_len); +} + +// Decode one directory entry, validating suffix_len against the remaining payload before copying. +Status decode_entry(ByteSource* ps, LogicalIndexRef* ref) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->index_id)); + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(ps->get_varint32(&suffix_len)); + // Anti-DoS: reject a suffix_len that cannot fit in the remaining bytes before allocating. 
+ if (suffix_len > ps->remaining()) { + return Status::Corruption("logical_index_directory: suffix_len exceeds payload"); + } + Slice suffix; + SNII_RETURN_IF_ERROR(ps->get_bytes(suffix_len, &suffix)); + ref->index_suffix.assign(reinterpret_cast(suffix.data()), suffix.size()); + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->meta_off)); + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->meta_len)); + return Status::OK(); +} + +Status decode_payload(Slice payload, std::vector* refs) { + ByteSource ps(payload); + uint32_t n_entries = 0; + SNII_RETURN_IF_ERROR(ps.get_varint32(&n_entries)); + // Anti-DoS: cap n_entries against the remaining payload before reserving, so a corrupted + // inflated count cannot trigger a huge allocation. + if (n_entries > ps.remaining() / kMinEntryBytes) { + return Status::Corruption("logical_index_directory: n_entries exceeds payload capacity"); + } + refs->clear(); + refs->reserve(n_entries); + for (uint32_t i = 0; i < n_entries; ++i) { + LogicalIndexRef ref {}; + SNII_RETURN_IF_ERROR(decode_entry(&ps, &ref)); + refs->push_back(std::move(ref)); + } + if (!ps.eof()) { + return Status::Corruption("logical_index_directory: trailing bytes in payload"); + } + return Status::OK(); +} + +} // namespace + +void LogicalIndexDirectoryBuilder::finish(ByteSink* sink) const { + ByteSink payload; + payload.put_varint32(static_cast(refs_.size())); + for (const auto& ref : refs_) { + encode_entry(ref, &payload); + } + SectionFramer::write(*sink, static_cast(SectionType::kLogicalIndexDirectory), + payload.view()); +} + +Status LogicalIndexDirectoryReader::open(Slice framed, LogicalIndexDirectoryReader* out) { + if (out == nullptr) { + return Status::InvalidArgument("logical_index_directory: out is null"); + } + ByteSource src(framed); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + if (sec.type != static_cast(SectionType::kLogicalIndexDirectory)) { + return Status::InvalidArgument("logical_index_directory: unexpected section 
type"); + } + return decode_payload(sec.payload, &out->refs_); +} + +Status LogicalIndexDirectoryReader::get(uint32_t i, LogicalIndexRef* out) const { + if (out == nullptr) { + return Status::InvalidArgument("logical_index_directory: out is null"); + } + if (i >= refs_.size()) { + return Status::NotFound("logical_index_directory: index out of range"); + } + *out = refs_[i]; + return Status::OK(); +} + +Status LogicalIndexDirectoryReader::find(uint64_t index_id, std::string_view suffix, bool* found, + LogicalIndexRef* out) const { + if (found == nullptr || out == nullptr) { + return Status::InvalidArgument("logical_index_directory: output pointer is null"); + } + *found = false; + for (const auto& ref : refs_) { + if (ref.index_id != index_id || std::string_view(ref.index_suffix) != suffix) { + continue; + } + *out = ref; + *found = true; + return Status::OK(); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/norms_pod.cpp b/be/src/storage/index/snii/core/src/format/norms_pod.cpp new file mode 100644 index 00000000000000..a6f80c03b1ebcd --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/norms_pod.cpp @@ -0,0 +1,46 @@ +#include "snii/format/norms_pod.h" + +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +void NormsPodWriter::finish(ByteSink* sink) const { + // Build inner payload: [varint64 doc_count][raw norm bytes]. + ByteSink payload; + payload.put_varint64(norms_.size()); + payload.put_bytes(Slice(norms_)); + // Delegate outer framing to SectionFramer to append type+len+crc32c, avoiding manual checksum assembly. 
+ SectionFramer::write(*sink, static_cast(SectionType::kStatsBlock), payload.view()); +} + +Status NormsPodReader::open(Slice framed, NormsPodReader* out) { + // framer handles CRC verify, truncation detection, and payload slicing. + ByteSource src(framed); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + + // Parse inner payload: [varint64 doc_count][bytes]. + ByteSource payload(sec.payload); + uint64_t doc_count = 0; + SNII_RETURN_IF_ERROR(payload.get_varint64(&doc_count)); + if (doc_count > std::numeric_limits::max()) { + return Status::Corruption("norms POD doc_count overflows uint32"); + } + // doc_count must exactly equal the remaining byte count (1 byte per doc). + if (payload.remaining() != doc_count) { + return Status::Corruption("norms POD length mismatch"); + } + + Slice bytes; + SNII_RETURN_IF_ERROR(payload.get_bytes(static_cast(doc_count), &bytes)); + out->doc_count_ = static_cast(doc_count); + out->norms_ = bytes.data(); + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/null_bitmap.cpp b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp new file mode 100644 index 00000000000000..d805cd2e945563 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp @@ -0,0 +1,107 @@ +#include "snii/format/null_bitmap.h" + +#include +#include + +#include "roaring/roaring.h" +#include "roaring/roaring.hh" +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" + +namespace snii::format { + +NullBitmapWriter:: + NullBitmapWriter() // NOLINT(modernize-use-equals-default): roaring type is incomplete in the header. 
+ : bitmap_(std::make_unique()) {} + +NullBitmapWriter::~NullBitmapWriter() = default; + +void NullBitmapWriter::add_null(uint32_t docid) { + bitmap_->add(docid); +} + +uint32_t NullBitmapWriter::null_count() const { + return static_cast(bitmap_->cardinality()); +} + +void NullBitmapWriter::finish(uint32_t doc_count, ByteSink* sink) const { + // Serialize the Roaring bitmap to its portable on-disk form. + const size_t roaring_size = bitmap_->getSizeInBytes(); + std::vector roaring_buf(roaring_size); + bitmap_->write(roaring_buf.data()); + + // Build inner payload: [varint64 doc_count][varint64 roaring_size][bytes]. + ByteSink payload; + payload.put_varint64(doc_count); + payload.put_varint64(roaring_size); + payload.put_bytes(Slice(reinterpret_cast(roaring_buf.data()), roaring_size)); + + // Delegate the type + len + crc32c envelope to SectionFramer. + SectionFramer::write(*sink, kNullBitmapSectionType, payload.view()); +} + +NullBitmapReader:: + NullBitmapReader() // NOLINT(modernize-use-equals-default): roaring type is incomplete in the header. + : bitmap_(std::make_unique()) {} + +NullBitmapReader::~NullBitmapReader() = default; + +NullBitmapReader::NullBitmapReader(NullBitmapReader&&) noexcept = default; +NullBitmapReader& NullBitmapReader::operator=(NullBitmapReader&&) noexcept = default; + +Status NullBitmapReader::open(Slice framed, NullBitmapReader* out) { + // SectionFramer handles CRC verification, truncation detection, and payload + // slicing. + ByteSource src(framed); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + + // Parse inner payload: [varint64 doc_count][varint64 roaring_size][bytes]. 
+ ByteSource payload(sec.payload); + uint64_t doc_count = 0; + SNII_RETURN_IF_ERROR(payload.get_varint64(&doc_count)); + if (doc_count > std::numeric_limits::max()) { + return Status::Corruption("null bitmap doc_count overflows uint32"); + } + + uint64_t roaring_size = 0; + SNII_RETURN_IF_ERROR(payload.get_varint64(&roaring_size)); + // Anti-DoS: the declared roaring_size must not exceed the bytes actually + // present, otherwise readSafe could be told to walk past the payload. + if (roaring_size > payload.remaining()) { + return Status::Corruption("null bitmap roaring_size exceeds payload"); + } + + Slice roaring_bytes; + SNII_RETURN_IF_ERROR(payload.get_bytes(static_cast(roaring_size), &roaring_bytes)); + + // Validate the Roaring container BEFORE deserializing. A CRC-valid frame can + // still carry malformed roaring bytes; Roaring::readSafe / read would then hit + // CRoaring's terminate-or-throw path (NULL -> ROARING_TERMINATE). The safe, + // non-throwing C probe returns the exact byte count a valid container would + // consume, or 0 on malformed/insufficient input. 
+ const char* rb = reinterpret_cast(roaring_bytes.data()); + const size_t probed = + roaring_bitmap_portable_deserialize_size(rb, static_cast(roaring_size)); + if (probed == 0 || probed != static_cast(roaring_size)) { + return Status::Corruption("null bitmap: malformed roaring container"); + } + *out->bitmap_ = roaring::Roaring::readSafe(rb, static_cast(roaring_size)); + out->doc_count_ = static_cast(doc_count); + return Status::OK(); +} + +bool NullBitmapReader::is_null(uint32_t docid) const { + return bitmap_->contains(docid); +} + +uint32_t NullBitmapReader::null_count() const { + return static_cast(bitmap_->cardinality()); +} + +void NullBitmapReader::copy_to(roaring::Roaring* out) const { + *out = *bitmap_; +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/per_index_meta.cpp b/be/src/storage/index/snii/core/src/format/per_index_meta.cpp new file mode 100644 index 00000000000000..31bb6e42445404 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/per_index_meta.cpp @@ -0,0 +1,191 @@ +#include "snii/format/per_index_meta.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/section_framer.h" + +namespace snii::format { + +namespace { + +// Upper bound on index_suffix length read from untrusted bytes, capped before +// allocation to avoid a DoS-inflated reserve. A logical index suffix is a short +// column/field name; 64 KiB is far beyond any legitimate value. +constexpr uint32_t kMaxSuffixLen = 64u * 1024u; + +void encode_region(const RegionRef& r, ByteSink* payload) { + payload->put_varint64(r.offset); + payload->put_varint64(r.length); +} + +Status decode_region(ByteSource* ps, RegionRef* r) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&r->offset)); + SNII_RETURN_IF_ERROR(ps->get_varint64(&r->length)); + return Status::OK(); +} + +// SectionRefs payload: five RegionRefs in fixed order, each as varint64 pair. 
+// Order: dict_region, posting_region, norms, null_bitmap, bsbf.
+void encode_section_refs(const SectionRefs& refs, ByteSink* sink) {
+    // The array fixes the on-disk field order in one place.
+    const RegionRef* const in_wire_order[] = {
+            &refs.dict_region, &refs.posting_region, &refs.norms, &refs.null_bitmap, &refs.bsbf,
+    };
+    ByteSink body;
+    for (const RegionRef* region : in_wire_order) {
+        encode_region(*region, &body);
+    }
+    SectionFramer::write(*sink, static_cast<uint8_t>(SectionType::kSectionRefs), body.view());
+}
+
+// Inverse of encode_section_refs: reads the five regions in the same order and rejects
+// any bytes left over after the last one.
+Status decode_section_refs(Slice payload, SectionRefs* out) {
+    RegionRef* const in_wire_order[] = {
+            &out->dict_region, &out->posting_region, &out->norms, &out->null_bitmap, &out->bsbf,
+    };
+    ByteSource reader(payload);
+    for (RegionRef* region : in_wire_order) {
+        SNII_RETURN_IF_ERROR(decode_region(&reader, region));
+    }
+    if (reader.eof()) {
+        return Status::OK();
+    }
+    return Status::Corruption("per_index_meta: trailing bytes in section_refs");
+}
+
+// Writes the self-checksummed header prefix. Layout matches the class comment.
+void encode_header(uint64_t index_id, const std::string& suffix, uint32_t flags, ByteSink* sink) {
+    // Stage the prefix separately so the crc can be computed over exactly these bytes.
+    ByteSink prefix;
+    prefix.put_fixed16(kMetaFormatVersion);
+    prefix.put_varint64(index_id);
+    const uint32_t suffix_len = static_cast<uint32_t>(suffix.size());
+    prefix.put_varint32(suffix_len);
+    prefix.put_bytes(Slice(suffix));
+    prefix.put_fixed32(flags);
+
+    sink->put_bytes(prefix.view());
+    sink->put_fixed32(crc32c(prefix.view()));
+}
+
+// Parses and crc-verifies the header prefix, advancing src past the crc field.
+Status decode_header(Slice block, ByteSource* src, uint64_t* index_id, std::string* suffix, + uint32_t* flags) { + size_t start = src->position(); + uint16_t version = 0; + SNII_RETURN_IF_ERROR(src->get_fixed16(&version)); + if (version != kMetaFormatVersion) { + return Status::Corruption("per_index_meta: unsupported meta_format_version"); + } + SNII_RETURN_IF_ERROR(src->get_varint64(index_id)); + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&suffix_len)); + if (suffix_len > kMaxSuffixLen || suffix_len > src->remaining()) { + return Status::Corruption("per_index_meta: suffix_len exceeds bounds"); + } + Slice suffix_view; + SNII_RETURN_IF_ERROR(src->get_bytes(suffix_len, &suffix_view)); + SNII_RETURN_IF_ERROR(src->get_fixed32(flags)); + size_t covered = src->position() - start; + uint32_t stored = 0; + SNII_RETURN_IF_ERROR(src->get_fixed32(&stored)); + if (crc32c(block.subslice(start, covered)) != stored) { + return Status::Corruption("per_index_meta: header crc mismatch"); + } + suffix->assign(reinterpret_cast(suffix_view.data()), suffix_view.size()); + return Status::OK(); +} + +// Reads one framed section, returning both its type and the FULL frame Slice +// (type+len+payload+crc) so it can be re-opened by a sub-module reader. The +// framer itself crc-verifies the frame. +Status read_frame(Slice block, ByteSource* src, uint8_t* type, Slice* frame) { + size_t start = src->position(); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(*src, &sec)); + *type = sec.type; + *frame = block.subslice(start, src->position() - start); + return Status::OK(); +} + +// Captures one frame into the matching reader field by section type. Returns +// false (via *handled) for unrecognized types so the caller skips them. +// Routes an optional sub-section frame to its slot. Unknown section types are +// intentionally ignored (forward compatibility: skip unknown optional sections). 
+void dispatch_frame(uint8_t type, Slice frame, Slice* sampled, Slice* dict) { + if (type == static_cast(SectionType::kSampledTermIndex)) { + *sampled = frame; + } else if (type == static_cast(SectionType::kDictBlockDirectory)) { + *dict = frame; + } +} + +} // namespace + +PerIndexMetaBuilder::PerIndexMetaBuilder(uint64_t index_id, std::string index_suffix, + uint32_t flags) + : index_id_(index_id), index_suffix_(std::move(index_suffix)), flags_(flags) {} + +void PerIndexMetaBuilder::set_stats(const StatsBlock& stats) { + stats_ = stats; +} + +void PerIndexMetaBuilder::set_sampled_term_index(Slice framed_bytes) { + sampled_term_index_.assign(framed_bytes.data(), framed_bytes.data() + framed_bytes.size()); +} + +void PerIndexMetaBuilder::set_dict_block_directory(Slice framed_bytes) { + dict_block_directory_.assign(framed_bytes.data(), framed_bytes.data() + framed_bytes.size()); +} + +void PerIndexMetaBuilder::set_section_refs(const SectionRefs& refs) { + section_refs_ = refs; +} + +void PerIndexMetaBuilder::add_raw_section(Slice framed_bytes) { + extra_sections_.emplace_back(framed_bytes.data(), framed_bytes.data() + framed_bytes.size()); +} + +Status PerIndexMetaBuilder::finish(ByteSink* sink) const { + if (sink == nullptr) { + return Status::InvalidArgument("per_index_meta: null sink"); + } + encode_header(index_id_, index_suffix_, flags_, sink); + encode_stats_block(stats_, sink); + sink->put_bytes(Slice(sampled_term_index_)); + sink->put_bytes(Slice(dict_block_directory_)); + encode_section_refs(section_refs_, sink); + for (const auto& extra : extra_sections_) { + sink->put_bytes(Slice(extra)); + } + return Status::OK(); +} + +Status PerIndexMetaReader::open(Slice block, PerIndexMetaReader* out) { + if (out == nullptr) { + return Status::InvalidArgument("per_index_meta: null reader"); + } + ByteSource src(block); + SNII_RETURN_IF_ERROR( + decode_header(block, &src, &out->index_id_, &out->index_suffix_, &out->flags_)); + bool have_stats = false; + bool have_refs 
= false; + while (!src.eof()) { + uint8_t type = 0; + Slice frame; + SNII_RETURN_IF_ERROR(read_frame(block, &src, &type, &frame)); + if (type == static_cast(SectionType::kStatsBlock)) { + ByteSource fs(frame); + SNII_RETURN_IF_ERROR(decode_stats_block(&fs, &out->stats_)); + have_stats = true; + } else if (type == static_cast(SectionType::kSectionRefs)) { + FramedSection sec; + ByteSource fs(frame); + SNII_RETURN_IF_ERROR(SectionFramer::read(fs, &sec)); + SNII_RETURN_IF_ERROR(decode_section_refs(sec.payload, &out->section_refs_)); + have_refs = true; + } else { + dispatch_frame(type, frame, &out->sampled_term_index_, &out->dict_block_directory_); + } + } + if (!have_stats || !have_refs) { + return Status::Corruption("per_index_meta: missing required sub-section"); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/prx_pod.cpp b/be/src/storage/index/snii/core/src/format/prx_pod.cpp new file mode 100644 index 00000000000000..7d90cb3ead5df6 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/prx_pod.cpp @@ -0,0 +1,738 @@ +#include "snii/format/prx_pod.h" + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/pfor.h" +#include "snii/encoding/zstd_codec.h" +#include "snii/format/format_constants.h" + +namespace snii::format { +namespace { + +// Auto-compression threshold: use raw when payload is smaller than this (zstd +// gain is negligible and metadata overhead is relatively large). +inline constexpr size_t kAutoZstdMinBytes = 512; +// Default zstd level in auto mode. +inline constexpr int kDefaultZstdLevel = 3; +// Maximum decompressed byte size for a single .prx window. Guards against a +// corrupted uncomp_len read from S3 inflated to a huge value: sanity-check +// before allocating/decompressing to avoid GB-scale allocations. 
Windows are +// 256-doc aligned and normally far below this limit. +inline constexpr uint32_t kMaxWindowUncompBytes = 256u * 1024 * 1024; +// Anti-DoS cap on position count decoded from a single window before +// allocation. +inline constexpr uint32_t kMaxWindowPositions = 1u << 26; // 64M positions/window +// Anti-DoS cap on doc count decoded from a single window before allocation. A +// corrupt doc_count is otherwise fed straight to assign()/reserve() -> +// bad_alloc. +inline constexpr uint32_t kMaxWindowDocs = 1u << 24; // 16M docs/window + +// Writer-side precondition for the FLAT builders: the per-doc partition `freqs` +// must address exactly the positions present in `flat`. If sum(freqs) overruns +// flat.size() a (positions_flat, freqs) mismatch would index flat[off+i] past +// the span end -- an out-of-bounds read on caller-supplied data. Reject it as +// InvalidArgument BEFORE any indexing so the bug surfaces as a clean Status, +// never UB. (sum < size leaves trailing positions unused, which is also a +// writer bug, so we require exact equality.) Uint64 accumulation cannot +// overflow for uint32 freqs. +Status check_flat_partition(std::span flat, std::span freqs) { + uint64_t sum = 0; + for (uint32_t fc : freqs) sum += fc; + if (sum != flat.size()) { + return Status::InvalidArgument("prx: sum(freqs) does not match positions_flat size"); + } + return Status::OK(); +} + +// Encode per-doc position lists into a self-describing plain payload (doc_count +// + per-doc delta stream). +Status encode_payload(std::span> per_doc, ByteSink* out) { + out->put_varint32(static_cast(per_doc.size())); + for (const auto& doc : per_doc) { + out->put_varint32(static_cast(doc.size())); + uint32_t prev = 0; + for (size_t i = 0; i < doc.size(); ++i) { + uint32_t pos = doc[i]; + if (i > 0 && pos < prev) { + return Status::InvalidArgument("prx: positions within a doc must be ascending"); + } + out->put_varint32(i == 0 ? 
pos : pos - prev); + prev = pos; + } + } + return Status::OK(); +} + +// FLAT-positions encoder: identical wire output to encode_payload above, but +// reads positions from a single flat span partitioned per-doc by `freqs` (doc d +// owns the next freqs[d] entries). This avoids materializing a +// vector-of-vectors for the window; freqs.size() is the doc count and +// sum(freqs) == flat.size(). +Status encode_payload_flat(std::span flat, std::span freqs, + ByteSink* out) { + SNII_RETURN_IF_ERROR(check_flat_partition(flat, freqs)); + out->put_varint32(static_cast(freqs.size())); + size_t off = 0; + for (uint32_t fc : freqs) { + out->put_varint32(fc); + uint32_t prev = 0; + for (uint32_t i = 0; i < fc; ++i) { + const uint32_t pos = flat[off + i]; + if (i > 0 && pos < prev) { + return Status::InvalidArgument("prx: positions within a doc must be ascending"); + } + out->put_varint32(i == 0 ? pos : pos - prev); + prev = pos; + } + off += fc; + } + return Status::OK(); +} + +// Encode a uint32 array into PFOR runs of kFrqBaseUnit (256) elements each. The +// run count is derived by the decoder from the total length, so it is not +// stored. +void encode_pfor_runs(std::span values, ByteSink* out) { + const size_t n = values.size(); + for (size_t off = 0; off < n; off += kFrqBaseUnit) { + const size_t run = (n - off < kFrqBaseUnit) ? (n - off) : kFrqBaseUnit; + pfor_encode(values.data() + off, run, out); + } +} + +// Decode n uint32 values (multiple PFOR runs of kFrqBaseUnit each) into out. +Status decode_pfor_runs(ByteSource* src, size_t n, std::vector* out) { + out->assign(n, 0); + for (size_t off = 0; off < n; off += kFrqBaseUnit) { + const size_t run = (n - off < kFrqBaseUnit) ? 
(n - off) : kFrqBaseUnit; + SNII_RETURN_IF_ERROR(pfor_decode(src, run, out->data() + off)); + } + return Status::OK(); +} + +// PFOR window payload (self-describing; no entropy coding): +// VInt doc_count +// VInt total_pos # sum of all pos_counts +// PFOR_runs(pos_counts) # doc_count values (bit-packed; mostly 1 -> ~1 +// bit) PFOR_runs(position_deltas) # total_pos deltas, flat across docs (first +// per +// # doc absolute, rest delta-within-doc) +// Bit-packing the per-doc pos_counts (vs one varint each) is the size win: in a +// uniform corpus most docs have freq 1, so the count column packs to ~1 +// bit/doc. Builds the payload from a flat positions span partitioned per-doc by +// `freqs`. +Status encode_pfor_payload_flat(std::span flat, std::span freqs, + ByteSink* out) { + SNII_RETURN_IF_ERROR(check_flat_partition(flat, freqs)); + out->put_varint32(static_cast(freqs.size())); + out->put_varint32(static_cast(flat.size())); + encode_pfor_runs(freqs, out); + std::vector deltas; + deltas.reserve(flat.size()); + size_t off = 0; + for (uint32_t fc : freqs) { + uint32_t prev = 0; + for (uint32_t i = 0; i < fc; ++i) { + const uint32_t pos = flat[off + i]; + if (i > 0 && pos < prev) { + return Status::InvalidArgument("prx: positions within a doc must be ascending"); + } + deltas.push_back(i == 0 ? pos : pos - prev); + prev = pos; + } + off += fc; + } + encode_pfor_runs(deltas, out); + return Status::OK(); +} + +// Builds the PFOR payload from per-doc lists (delegates through a flat view). +Status encode_pfor_payload(std::span> per_doc, ByteSink* out) { + std::vector flat, freqs; + freqs.reserve(per_doc.size()); + for (const auto& doc : per_doc) { + freqs.push_back(static_cast(doc.size())); + flat.insert(flat.end(), doc.begin(), doc.end()); + } + return encode_pfor_payload_flat(flat, freqs, out); +} + +// Decode per-doc position lists from a PFOR payload. 
+Status decode_pfor_payload(Slice plain, std::vector>* out) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + std::vector pos_counts; + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); + uint64_t sum = 0; + for (uint32_t d = 0; d < doc_count; ++d) sum += pos_counts[d]; + if (sum != total_pos) { + return Status::Corruption("prx: pos_count sum mismatch"); + } + std::vector deltas; + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, &deltas)); + out->clear(); + out->reserve(doc_count); + size_t off = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + std::vector doc; + doc.reserve(pos_counts[d]); + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_counts[d]; ++i) { + prev = (i == 0) ? deltas[off + i] : prev + deltas[off + i]; + doc.push_back(prev); + } + off += pos_counts[d]; + out->push_back(std::move(doc)); + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after pfor payload"); + return Status::OK(); +} + +// Writes a PFOR window: codec=pfor, payload, crc(header+payload). +void write_pfor(Slice payload, ByteSink* sink) { + ByteSink framed; + framed.put_u8(static_cast(PrxCodec::kPfor)); + framed.put_varint32(static_cast(payload.size())); + framed.put_bytes(payload); + sink->put_bytes(framed.view()); + sink->put_fixed32(crc32c(framed.view())); +} + +// Decode per-doc position lists from a plain payload. 
+Status decode_payload(Slice plain, std::vector>* out) { + ByteSource src(plain); + uint32_t doc_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + out->clear(); + out->reserve(doc_count); + for (uint32_t d = 0; d < doc_count; ++d) { + uint32_t pos_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count)); + std::vector doc; + doc.reserve(pos_count); + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t delta = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&delta)); + prev = (i == 0) ? delta : prev + delta; + doc.push_back(prev); + } + out->push_back(std::move(doc)); + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload"); + return Status::OK(); +} + +// CSR decode of a PFOR payload: all docs' positions into one flat buffer + +// per-doc offsets, with NO per-doc std::vector allocation. `pos_off` has +// doc_count+1 entries (pos_off[0]==0); doc d's positions are +// pos_flat[pos_off[d] .. pos_off[d+1]). 
+Status decode_pfor_payload_csr(Slice plain, std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + pos_off->clear(); + pos_off->reserve(static_cast(doc_count) + 1); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, pos_off)); + uint64_t sum = 0; + for (uint32_t d = 0; d < doc_count; ++d) sum += (*pos_off)[d]; + if (sum != total_pos) return Status::Corruption("prx: pos_count sum mismatch"); + pos_flat->reserve(total_pos); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, pos_flat)); + size_t off = 0; + uint32_t next_off = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + const uint32_t pos_count = (*pos_off)[d]; + (*pos_off)[d] = next_off; + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t& value = (*pos_flat)[off + i]; + prev = (i == 0) ? 
value : prev + value; + value = prev; + } + off += pos_count; + next_off += pos_count; + } + pos_off->push_back(next_off); + if (!src.eof()) return Status::Corruption("prx: trailing bytes after pfor payload"); + return Status::OK(); +} + +Status validate_doc_ordinals(std::span doc_ordinals, uint32_t doc_count) { + uint32_t prev = 0; + for (size_t i = 0; i < doc_ordinals.size(); ++i) { + const uint32_t doc = doc_ordinals[i]; + if (doc >= doc_count) { + return Status::Corruption("prx: selected doc ordinal out of range"); + } + if (i != 0 && doc <= prev) { + return Status::InvalidArgument("prx: selected doc ordinals must be strictly ascending"); + } + prev = doc; + } + return Status::OK(); +} + +struct SelectedRange { + SelectedRange(uint32_t begin_, uint32_t end_, uint32_t out_begin_) + : begin(begin_), end(end_), out_begin(out_begin_) {} + + uint32_t begin; + uint32_t end; + uint32_t out_begin; +}; + +uint32_t count_covered_pfor_runs(std::span selected, uint32_t total_pos) { + if (selected.empty() || total_pos == 0) { + return 0; + } + uint32_t runs = 0; + uint32_t next_run = 0; + for (const SelectedRange& range : selected) { + if (range.begin == range.end) { + continue; + } + const uint32_t first_run = range.begin / kFrqBaseUnit; + const uint32_t last_run = (range.end - 1) / kFrqBaseUnit; + const uint32_t counted_first = std::max(first_run, next_run); + if (counted_first <= last_run) { + runs += last_run - counted_first + 1; + next_run = last_run + 1; + } + } + return runs; +} + +bool should_decode_full_prx_positions(std::span selected, + uint32_t selected_pos_count, uint32_t total_pos) { + if (selected.empty() || total_pos == 0) { + return false; + } + if (selected_pos_count * 2 >= total_pos) { + return true; + } + const uint32_t total_runs = (total_pos + kFrqBaseUnit - 1) / kFrqBaseUnit; + const uint32_t covered_runs = count_covered_pfor_runs(selected, total_pos); + return covered_runs * 4 >= total_runs * 3; +} + +void compact_selected_pfor_positions(std::span 
selected, + std::vector& pos_flat, + std::vector& pos_off) { + size_t write_off = 0; + pos_off.clear(); + pos_off.reserve(selected.size() + 1); + pos_off.push_back(0); + for (const SelectedRange& range : selected) { + const uint32_t count = range.end - range.begin; + if (count == 1) { + pos_flat[write_off++] = pos_flat[range.begin]; + pos_off.push_back(static_cast(write_off)); + continue; + } + uint32_t prev = 0; + for (uint32_t i = 0; i < count; ++i) { + const uint32_t delta = pos_flat[range.begin + i]; + prev = (i == 0) ? delta : prev + delta; + pos_flat[write_off++] = prev; + } + pos_off.push_back(static_cast(write_off)); + } + pos_flat.resize(write_off); +} + +Status decode_selected_pfor_count_ranges(ByteSource* src, uint32_t doc_count, + std::span doc_ordinals, + std::vector& selected, + std::vector& pos_off, uint64_t* total_pos_count, + uint32_t* selected_pos_count) { + selected.clear(); + selected.reserve(doc_ordinals.size()); + pos_off.clear(); + pos_off.reserve(doc_ordinals.size() + 1); + pos_off.push_back(0); + + *selected_pos_count = 0; + uint32_t delta_begin = 0; + size_t next_doc = 0; + *total_pos_count = 0; + std::array run_buf {}; + for (uint32_t run_begin = 0; run_begin < doc_count; run_begin += kFrqBaseUnit) { + const uint32_t run_len = std::min(kFrqBaseUnit, doc_count - run_begin); + SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, run_buf.data())); + for (uint32_t i = 0; i < run_len; ++i) { + const uint32_t d = run_begin + i; + const uint32_t count = run_buf[i]; + *total_pos_count += count; + if (next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == d) { + selected.emplace_back(delta_begin, delta_begin + count, *selected_pos_count); + *selected_pos_count += count; + pos_off.push_back(*selected_pos_count); + ++next_doc; + } + delta_begin += count; + } + } + if (next_doc != doc_ordinals.size()) { + return Status::Corruption("prx: selected doc ordinal was not decoded"); + } + return Status::OK(); +} + +Status 
decode_sparse_selected_pfor_positions(ByteSource* src, uint32_t total_pos, + std::span selected, + std::span pos_flat) { + std::array run_buf {}; + size_t range_idx = 0; + for (uint32_t run_begin = 0; run_begin < total_pos; run_begin += kFrqBaseUnit) { + const uint32_t run_len = std::min(kFrqBaseUnit, total_pos - run_begin); + const uint32_t run_end = run_begin + run_len; + while (range_idx < selected.size() && selected[range_idx].end <= run_begin) { + ++range_idx; + } + if (range_idx == selected.size() || selected[range_idx].begin >= run_end) { + SNII_RETURN_IF_ERROR(pfor_skip(src, run_len)); + continue; + } + + SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, run_buf.data())); + for (size_t ri = range_idx; ri < selected.size() && selected[ri].begin < run_end; ++ri) { + const SelectedRange& range = selected[ri]; + const uint32_t copy_begin = std::max(range.begin, run_begin); + const uint32_t copy_end = std::min(range.end, run_end); + const uint32_t dst_begin = range.out_begin + copy_begin - range.begin; + std::copy_n(run_buf.data() + copy_begin - run_begin, copy_end - copy_begin, + pos_flat.data() + dst_begin); + } + } + return Status::OK(); +} + +void restore_selected_position_deltas(const std::vector& pos_off, + std::span pos_flat) { + for (size_t i = 0; i + 1 < pos_off.size(); ++i) { + uint32_t prev = 0; + for (uint32_t off = pos_off[i]; off < pos_off[i + 1]; ++off) { + uint32_t& value = pos_flat[off]; + prev = (off == pos_off[i]) ? 
value : prev + value; + value = prev; + } + } +} + +Status decode_pfor_payload_csr_selective(Slice plain, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); + + pos_flat->clear(); + + std::vector selected; + uint64_t sum = 0; + uint32_t selected_pos_count = 0; + SNII_RETURN_IF_ERROR(decode_selected_pfor_count_ranges(&src, doc_count, doc_ordinals, selected, + *pos_off, &sum, &selected_pos_count)); + if (sum != total_pos) { + return Status::Corruption("prx: pos_count sum mismatch"); + } + + if (should_decode_full_prx_positions(selected, selected_pos_count, total_pos)) { + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, pos_flat)); + compact_selected_pfor_positions(selected, *pos_flat, *pos_off); + if (!src.eof()) { + return Status::Corruption("prx: trailing bytes after pfor payload"); + } + return Status::OK(); + } + + pos_flat->resize(selected_pos_count); + SNII_RETURN_IF_ERROR(decode_sparse_selected_pfor_positions( + &src, total_pos, selected, std::span(pos_flat->data(), pos_flat->size()))); + + restore_selected_position_deltas(*pos_off, + std::span(pos_flat->data(), pos_flat->size())); + if (!src.eof()) { + return Status::Corruption("prx: trailing bytes after pfor payload"); + } + return Status::OK(); +} + +// CSR decode of a plain (raw) payload. See decode_pfor_payload_csr. 
+Status decode_payload_csr(Slice plain, std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + pos_flat->clear(); + pos_off->clear(); + pos_off->reserve(static_cast(doc_count) + 1); + pos_off->push_back(0); + uint64_t total_pos = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + uint32_t pos_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count)); + total_pos += pos_count; + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t delta = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&delta)); + prev = (i == 0) ? delta : prev + delta; + pos_flat->push_back(prev); + } + pos_off->push_back(static_cast(pos_flat->size())); + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload"); + return Status::OK(); +} + +Status decode_payload_csr_selective(Slice plain, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); + pos_flat->clear(); + pos_off->clear(); + pos_off->reserve(doc_ordinals.size() + 1); + pos_off->push_back(0); + size_t next_doc = 0; + uint64_t total_pos = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + uint32_t pos_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count)); + total_pos += pos_count; + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + const bool selected = next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == 
d; + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t delta = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&delta)); + if (!selected) continue; + prev = (i == 0) ? delta : prev + delta; + pos_flat->push_back(prev); + } + if (selected) { + pos_off->push_back(static_cast(pos_flat->size())); + ++next_doc; + } + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload"); + return Status::OK(); +} + +// Decision: given level and plain length, determine whether to compress. +bool should_compress(int level, size_t plain_len) { + if (level == 0) return false; // force raw + if (level > 0) return true; // force zstd + return plain_len >= kAutoZstdMinBytes; // auto +} + +// Write a raw window: codec=raw, uncomp_len, crc(header+payload), payload. +void write_raw(Slice plain, ByteSink* sink) { + ByteSink framed; + framed.put_u8(static_cast(PrxCodec::kRaw)); + framed.put_varint32(static_cast(plain.size())); + framed.put_bytes(plain); + sink->put_bytes(framed.view()); + sink->put_fixed32(crc32c(framed.view())); +} + +// Write a zstd window: codec=zstd, uncomp_len, comp_len, crc(header+payload), +// payload. +Status write_zstd(Slice plain, int level, ByteSink* sink) { + std::vector comp; + SNII_RETURN_IF_ERROR(zstd_compress(plain, level > 0 ? level : kDefaultZstdLevel, &comp)); + ByteSink framed; + framed.put_u8(static_cast(PrxCodec::kZstd)); + framed.put_varint32(static_cast(plain.size())); + framed.put_varint32(static_cast(comp.size())); + framed.put_bytes(Slice(comp)); + sink->put_bytes(framed.view()); + sink->put_fixed32(crc32c(framed.view())); + return Status::OK(); +} + +// Read header + payload, verify crc in retrospect, and return the payload view +// and uncomp_len to the caller. 
+Status read_framed(ByteSource* src, uint8_t* codec, uint32_t* uncomp_len, Slice* payload) { + size_t start = src->position(); + SNII_RETURN_IF_ERROR(src->get_u8(codec)); + if (*codec != static_cast(PrxCodec::kRaw) && + *codec != static_cast(PrxCodec::kZstd) && + *codec != static_cast(PrxCodec::kPfor)) { + return Status::Corruption("prx: unknown codec"); + } + SNII_RETURN_IF_ERROR(src->get_varint32(uncomp_len)); + if (*uncomp_len > kMaxWindowUncompBytes) { + return Status::Corruption("prx: uncomp_len exceeds sane window cap"); + } + size_t payload_len = *uncomp_len; + if (*codec == static_cast(PrxCodec::kZstd)) { + uint32_t comp_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&comp_len)); + payload_len = comp_len; + } + SNII_RETURN_IF_ERROR(src->get_bytes(payload_len, payload)); + size_t framed_len = src->position() - start; + uint32_t stored = 0; + SNII_RETURN_IF_ERROR(src->get_fixed32(&stored)); + if (crc32c(src->slice_from(start, framed_len)) != stored) { + return Status::Corruption("prx: window crc mismatch"); + } + return Status::OK(); +} + +} // namespace + +Status build_prx_window(std::span> per_doc_positions, + int zstd_level_or_negative_for_auto, ByteSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("prx: null sink"); + // Forced legacy codecs (level 0 = raw varint, level > 0 = zstd) are kept so + // the test/legacy paths still exercise them; the auto path (< 0) now emits + // PFOR bit-packed deltas -- no entropy coding, far cheaper build CPU than + // zstd-3. 
+ if (zstd_level_or_negative_for_auto >= 0) { + ByteSink plain; + SNII_RETURN_IF_ERROR(encode_payload(per_doc_positions, &plain)); + Slice plain_view = plain.view(); + if (!should_compress(zstd_level_or_negative_for_auto, plain_view.size())) { + write_raw(plain_view, sink); + return Status::OK(); + } + return write_zstd(plain_view, zstd_level_or_negative_for_auto, sink); + } + ByteSink payload; + SNII_RETURN_IF_ERROR(encode_pfor_payload(per_doc_positions, &payload)); + write_pfor(payload.view(), sink); + return Status::OK(); +} + +Status build_prx_window_flat(std::span positions_flat, + std::span freqs, int zstd_level_or_negative_for_auto, + ByteSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("prx: null sink"); + if (zstd_level_or_negative_for_auto >= 0) { + ByteSink plain; + SNII_RETURN_IF_ERROR(encode_payload_flat(positions_flat, freqs, &plain)); + Slice plain_view = plain.view(); + if (!should_compress(zstd_level_or_negative_for_auto, plain_view.size())) { + write_raw(plain_view, sink); + return Status::OK(); + } + return write_zstd(plain_view, zstd_level_or_negative_for_auto, sink); + } + ByteSink payload; + SNII_RETURN_IF_ERROR(encode_pfor_payload_flat(positions_flat, freqs, &payload)); + write_pfor(payload.view(), sink); + return Status::OK(); +} + +Status read_prx_window(ByteSource* source, std::vector>* per_doc_positions) { + if (source == nullptr || per_doc_positions == nullptr) { + return Status::InvalidArgument("prx: null arg"); + } + uint8_t codec = 0; + uint32_t uncomp_len = 0; + Slice payload; + SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload)); + if (codec == static_cast(PrxCodec::kPfor)) { + return decode_pfor_payload(payload, per_doc_positions); + } + if (codec == static_cast(PrxCodec::kRaw)) { + return decode_payload(payload, per_doc_positions); + } + std::vector plain; + SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &plain)); + return decode_payload(Slice(plain), per_doc_positions); +} + 
+Status read_prx_window_csr(ByteSource* source, std::vector* pos_flat, + std::vector* pos_off) { + if (source == nullptr || pos_flat == nullptr || pos_off == nullptr) { + return Status::InvalidArgument("prx: null arg"); + } + uint8_t codec = 0; + uint32_t uncomp_len = 0; + Slice payload; + SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload)); + if (codec == static_cast(PrxCodec::kPfor)) { + return decode_pfor_payload_csr(payload, pos_flat, pos_off); + } + if (codec == static_cast(PrxCodec::kRaw)) { + return decode_payload_csr(payload, pos_flat, pos_off); + } + std::vector plain; + SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &plain)); + return decode_payload_csr(Slice(plain), pos_flat, pos_off); +} + +Status read_prx_window_csr_selective(ByteSource* source, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + if (source == nullptr || pos_flat == nullptr || pos_off == nullptr) { + return Status::InvalidArgument("prx: null arg"); + } + uint8_t codec = 0; + uint32_t uncomp_len = 0; + Slice payload; + SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload)); + if (codec == static_cast(PrxCodec::kPfor)) { + return decode_pfor_payload_csr_selective(payload, doc_ordinals, pos_flat, pos_off); + } + if (codec == static_cast(PrxCodec::kRaw)) { + return decode_payload_csr_selective(payload, doc_ordinals, pos_flat, pos_off); + } + std::vector plain; + SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &plain)); + return decode_payload_csr_selective(Slice(plain), doc_ordinals, pos_flat, pos_off); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp b/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp new file mode 100644 index 00000000000000..1f7790e3aac84e --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp @@ -0,0 +1,154 @@ +#include "snii/format/sampled_term_index.h" + +#include + +#include 
"snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" + +namespace snii::format { + +namespace { + +// Longest common prefix length of term and prev (front coding primitive, consistent with dict_entry). +uint32_t common_prefix_len(std::string_view term, std::string_view prev) { + uint32_t n = 0; + const uint32_t lim = static_cast(std::min(term.size(), prev.size())); + while (n < lim && term[n] == prev[n]) ++n; + return n; +} + +// Write a front-coded term key (prefix_len + suffix_len + suffix). +void write_term_key(std::string_view term, std::string_view prev, ByteSink* sink) { + const uint32_t prefix = common_prefix_len(term, prev); + const std::string_view suffix = term.substr(prefix); + sink->put_varint32(prefix); + sink->put_varint32(static_cast(suffix.size())); + sink->put_bytes(Slice(suffix)); +} + +// Read a front-coded term key and reconstruct it into out from prev + suffix. +Status read_term_key(ByteSource* src, std::string_view prev, std::string* out) { + uint32_t prefix = 0; + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&prefix)); + SNII_RETURN_IF_ERROR(src->get_varint32(&suffix_len)); + if (prefix > prev.size()) { + return Status::Corruption("sampled_term_index: prefix_len exceeds prev_term length"); + } + Slice suffix; + SNII_RETURN_IF_ERROR(src->get_bytes(suffix_len, &suffix)); + out->assign(prev.substr(0, prefix)); + out->append(reinterpret_cast(suffix.data()), suffix.size()); + return Status::OK(); +} + +} // namespace + +void SampledTermIndexBuilder::add_block_first_term(std::string_view first_term) { + first_terms_.emplace_back(first_term); +} + +void SampledTermIndexBuilder::finish(ByteSink* sink) { + ByteSink payload; + payload.put_varint32(static_cast(first_terms_.size())); + // min_term / max_term are written only when non-empty (== first/last sample_term). 
+ if (!first_terms_.empty()) { + write_term_key(first_terms_.front(), std::string_view {}, &payload); + write_term_key(first_terms_.back(), std::string_view {}, &payload); + std::string_view prev {}; + for (const auto& t : first_terms_) { + write_term_key(t, prev, &payload); + prev = t; + } + } + SectionFramer::write(*sink, static_cast(SectionType::kSampledTermIndex), + payload.view()); +} + +namespace { + +// Parse n_blocks, min/max (not used directly; consumed for checksum alignment), and all sample_terms from payload. +Status parse_payload(Slice payload, std::vector* terms) { + ByteSource src(payload); + uint32_t n_blocks = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&n_blocks)); + if (n_blocks == 0) { + if (!src.eof()) { + return Status::Corruption("sampled_term_index: empty index contains trailing bytes"); + } + terms->clear(); + return Status::OK(); + } + + // min_term / max_term (do not drive binary search directly; must be consumed to verify structural alignment). + std::string min_term; + std::string max_term; + SNII_RETURN_IF_ERROR(read_term_key(&src, std::string_view {}, &min_term)); + SNII_RETURN_IF_ERROR(read_term_key(&src, std::string_view {}, &max_term)); + + std::vector out; + out.reserve(n_blocks); + std::string prev; + for (uint32_t i = 0; i < n_blocks; ++i) { + std::string term; + SNII_RETURN_IF_ERROR(read_term_key(&src, prev, &term)); + prev = term; + out.push_back(std::move(term)); + } + if (!src.eof()) { + return Status::Corruption("sampled_term_index: payload contains trailing bytes"); + } + if (out.front() != min_term || out.back() != max_term) { + return Status::Corruption("sampled_term_index: min/max inconsistent with sample_terms"); + } + *terms = std::move(out); + return Status::OK(); +} + +} // namespace + +Status SampledTermIndexReader::open(Slice section, SampledTermIndexReader* out) { + if (out == nullptr) { + return Status::InvalidArgument("sampled_term_index: out is null"); + } + ByteSource src(section); + FramedSection sec; + 
SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + if (sec.type != static_cast(SectionType::kSampledTermIndex)) { + return Status::InvalidArgument("sampled_term_index: not a kSampledTermIndex section"); + } + *out = SampledTermIndexReader {}; + return parse_payload(sec.payload, &out->sample_terms_); +} + +Status SampledTermIndexReader::locate(std::string_view target, bool* maybe_present, + uint32_t* block_ordinal) const { + if (maybe_present == nullptr || block_ordinal == nullptr) { + return Status::InvalidArgument("sampled_term_index: output pointer is null"); + } + *maybe_present = false; + *block_ordinal = 0; + if (sample_terms_.empty()) { + return Status::OK(); // empty index: always out of range. + } + // target < min_term (first block's first term) -> before the first block, so it + // cannot exist in any block. NOTE: a target GREATER than the last sample term is + // NOT out of range -- sample_terms_ holds each block's FIRST term, so the LAST + // block can contain terms greater than its first term. Such a target routes to + // the last block (upper_bound -> end()), where find_term confirms presence. + if (target < std::string_view(sample_terms_.front())) { + return Status::OK(); + } + // Last sample_term <= target: step back one position after upper_bound. For a + // target past every sample term, upper_bound returns end() and idx = n-1 (the + // last block), which is correct. + auto it = std::upper_bound( + sample_terms_.begin(), sample_terms_.end(), target, + [](std::string_view t, const std::string& s) { return t < std::string_view(s); }); + const auto idx = (it - sample_terms_.begin()) - 1; // it > begin (< min excluded). 
+ *maybe_present = true; + *block_ordinal = static_cast(idx); + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/stats_block.cpp b/be/src/storage/index/snii/core/src/format/stats_block.cpp new file mode 100644 index 00000000000000..527f4f98d43d79 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/stats_block.cpp @@ -0,0 +1,46 @@ +#include "snii/format/stats_block.h" + +namespace snii::format { + +namespace { + +// Field order within payload is fixed; reuse ByteSink varint primitives — do not hand-assemble bytes. +void encode_payload(const StatsBlock& sb, ByteSink* payload) { + payload->put_varint64(sb.doc_count); + payload->put_varint64(sb.indexed_doc_count); + payload->put_varint64(sb.term_count); + payload->put_varint64(sb.sum_total_term_freq); + payload->put_varint64(sb.null_count); +} + +Status decode_payload(Slice payload, StatsBlock* out) { + ByteSource ps(payload); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->doc_count)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->indexed_doc_count)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->term_count)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->sum_total_term_freq)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->null_count)); + if (!ps.eof()) { + return Status::Corruption("stats_block: trailing bytes in payload"); + } + return Status::OK(); +} + +} // namespace + +void encode_stats_block(const StatsBlock& sb, ByteSink* sink) { + ByteSink payload; + encode_payload(sb, &payload); + SectionFramer::write(*sink, static_cast(SectionType::kStatsBlock), payload.view()); +} + +Status decode_stats_block(ByteSource* src, StatsBlock* out) { + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(*src, &sec)); + if (sec.type != static_cast(SectionType::kStatsBlock)) { + return Status::InvalidArgument("stats_block: unexpected section type"); + } + return decode_payload(sec.payload, out); +} + +} // namespace snii::format diff --git 
a/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp b/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp new file mode 100644 index 00000000000000..ed781c4d82e667 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp @@ -0,0 +1,129 @@ +#include "snii/format/tail_meta_region.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/format/format_constants.h" + +namespace snii::format { +namespace { + +// Header field bytes (before header_crc): u32 ver + u32 flags + u64 meta_region_len +// + u32 n + u64 directory_offset + u64 directory_length. +constexpr size_t kHeaderFields = 4 + 4 + 8 + 4 + 8 + 8; // 36 +constexpr size_t kHeaderSize = kHeaderFields + 4; // + header_crc32c +constexpr size_t kRegionChecksumSize = 4; + +} // namespace + +void TailMetaRegionBuilder::add_index(uint64_t index_id, std::string index_suffix, + Slice per_index_meta_bytes) { + Entry e; + e.index_id = index_id; + e.suffix = std::move(index_suffix); + e.bytes.assign(per_index_meta_bytes.data(), + per_index_meta_bytes.data() + per_index_meta_bytes.size()); + entries_.push_back(std::move(e)); +} + +void TailMetaRegionBuilder::finish(ByteSink* sink) const { + // Lay out per-index meta blocks right after the header; build the directory + // with each block's in-region offset/length. 
+ LogicalIndexDirectoryBuilder dir; + uint64_t offset = kHeaderSize; + for (const Entry& e : entries_) { + LogicalIndexRef ref; + ref.index_id = e.index_id; + ref.index_suffix = e.suffix; + ref.meta_off = offset; + ref.meta_len = e.bytes.size(); + dir.add(ref); + offset += e.bytes.size(); + } + const uint64_t directory_offset = offset; + ByteSink dir_bytes; + dir.finish(&dir_bytes); + const uint64_t directory_length = dir_bytes.size(); + const uint64_t meta_region_len = directory_offset + directory_length + kRegionChecksumSize; + + ByteSink fields; + fields.put_fixed32(kMetaFormatVersion); + fields.put_fixed32(0); // flags + fields.put_fixed64(meta_region_len); + fields.put_fixed32(static_cast(entries_.size())); + fields.put_fixed64(directory_offset); + fields.put_fixed64(directory_length); + + ByteSink region; + region.put_bytes(fields.view()); + region.put_fixed32(crc32c(fields.view())); // header_crc32c + for (const Entry& e : entries_) region.put_bytes(Slice(e.bytes)); + region.put_bytes(dir_bytes.view()); + region.put_fixed32(crc32c(region.view())); // meta_region_checksum + + sink->put_bytes(region.view()); +} + +Status TailMetaRegionReader::open(Slice region, TailMetaRegionReader* out) { + if (out == nullptr) return Status::InvalidArgument("tail_meta_region: null out"); + if (region.size() < kHeaderSize + kRegionChecksumSize) { + return Status::Corruption("tail_meta_region: region too short"); + } + + // Verify the trailing region checksum. + const size_t covered = region.size() - kRegionChecksumSize; + ByteSource cs(region.subslice(covered, kRegionChecksumSize)); + uint32_t region_crc = 0; + SNII_RETURN_IF_ERROR(cs.get_fixed32(®ion_crc)); + if (crc32c(region.subslice(0, covered)) != region_crc) { + return Status::Corruption("tail_meta_region: meta_region_checksum mismatch"); + } + + // Parse + verify the header. 
+ ByteSource hs(region.subslice(0, kHeaderFields)); + uint32_t ver = 0, flags = 0, n = 0; + uint64_t meta_region_len = 0, directory_offset = 0, directory_length = 0; + SNII_RETURN_IF_ERROR(hs.get_fixed32(&ver)); + SNII_RETURN_IF_ERROR(hs.get_fixed32(&flags)); + SNII_RETURN_IF_ERROR(hs.get_fixed64(&meta_region_len)); + SNII_RETURN_IF_ERROR(hs.get_fixed32(&n)); + SNII_RETURN_IF_ERROR(hs.get_fixed64(&directory_offset)); + SNII_RETURN_IF_ERROR(hs.get_fixed64(&directory_length)); + ByteSource hc(region.subslice(kHeaderFields, 4)); + uint32_t header_crc = 0; + SNII_RETURN_IF_ERROR(hc.get_fixed32(&header_crc)); + if (crc32c(region.subslice(0, kHeaderFields)) != header_crc) { + return Status::Corruption("tail_meta_region: header crc mismatch"); + } + if (ver != kMetaFormatVersion) { + return Status::Unsupported("tail_meta_region: unsupported meta_format_version"); + } + if (meta_region_len != region.size()) { + return Status::Corruption("tail_meta_region: declared length mismatch"); + } + if (directory_offset + directory_length > region.size() || directory_offset < kHeaderSize) { + return Status::Corruption("tail_meta_region: directory out of range"); + } + + SNII_RETURN_IF_ERROR(LogicalIndexDirectoryReader::open( + region.subslice(directory_offset, directory_length), &out->dir_)); + if (out->dir_.size() != n) { + return Status::Corruption("tail_meta_region: directory size mismatch"); + } + out->region_ = region; + out->n_ = n; + return Status::OK(); +} + +Status TailMetaRegionReader::find(uint64_t index_id, std::string_view suffix, bool* found, + Slice* per_index_meta_bytes) const { + LogicalIndexRef ref; + SNII_RETURN_IF_ERROR(dir_.find(index_id, suffix, found, &ref)); + if (!*found) return Status::OK(); + if (ref.meta_off + ref.meta_len > region_.size()) { + return Status::Corruption("tail_meta_region: meta block out of range"); + } + *per_index_meta_bytes = region_.subslice(ref.meta_off, ref.meta_len); + return Status::OK(); +} + +} // namespace snii::format diff --git 
a/be/src/storage/index/snii/core/src/format/tail_pointer.cpp b/be/src/storage/index/snii/core/src/format/tail_pointer.cpp new file mode 100644 index 00000000000000..bc17f5652d4f82 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/tail_pointer.cpp @@ -0,0 +1,95 @@ +#include "snii/format/tail_pointer.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +namespace { + +// Byte widths of every fixed field, used to derive the constant on-disk size: +// u32 magic + u16 version + 3*u64 + 2*u32 + u8 size + u32 tail_checksum. +constexpr size_t kMagicBytes = 4; +constexpr size_t kVersionBytes = 2; +constexpr size_t kU64Bytes = 8; +constexpr size_t kU32Bytes = 4; +constexpr size_t kSizeByteBytes = 1; + +constexpr size_t kFixedSize = + kMagicBytes + kVersionBytes + 3 * kU64Bytes + 2 * kU32Bytes + kSizeByteBytes + kU32Bytes; +// tail_checksum is the trailing u32 and covers every byte before it. +constexpr size_t kChecksumCoverage = kFixedSize - kU32Bytes; + +// Serializes the checksum-covered region in fixed field order into covered. 
+void serialize_covered(const TailPointer& tp, ByteSink* covered) { + covered->put_fixed32(kTailMagic); + covered->put_fixed16(kFormatVersion); + covered->put_fixed64(tp.meta_region_offset); + covered->put_fixed64(tp.meta_region_length); + covered->put_fixed64(tp.hot_off); + covered->put_fixed32(tp.meta_region_checksum); + covered->put_fixed32(tp.bootstrap_header_checksum); + covered->put_u8(static_cast(kFixedSize)); +} + +} // namespace + +size_t tail_pointer_size() { + return kFixedSize; +} + +Status encode_tail_pointer(const TailPointer& tp, ByteSink* sink) { + ByteSink covered; + serialize_covered(tp, &covered); + if (covered.size() != kChecksumCoverage) { + return Status::Internal("tail_pointer: covered size mismatch"); + } + const uint32_t tail_checksum = crc32c(covered.view()); + sink->put_bytes(covered.view()); + sink->put_fixed32(tail_checksum); + return Status::OK(); +} + +Status decode_tail_pointer(Slice last_bytes, TailPointer* out) { + // Anti-DoS / framing: the tail pointer is a fixed-size footer, so reject any + // input that is not exactly the fixed size before touching its contents. + if (last_bytes.size() != kFixedSize) { + return Status::Corruption("tail_pointer: input is not the fixed size"); + } + // Verify the trailing tail_checksum over the covered region first; a mismatch + // means any parsed field would be untrustworthy. + const Slice covered = last_bytes.subslice(0, kChecksumCoverage); + ByteSource src(last_bytes); + + uint32_t magic = 0; + SNII_RETURN_IF_ERROR(src.get_fixed32(&magic)); + if (magic != kTailMagic) { + return Status::Corruption("tail_pointer: bad magic"); + } + + uint16_t format_version = 0; + SNII_RETURN_IF_ERROR(src.get_fixed16(&format_version)); + (void)format_version; // Read to advance the cursor; version policy lives in + // the bootstrap header, not here. 
+ SNII_RETURN_IF_ERROR(src.get_fixed64(&out->meta_region_offset)); + SNII_RETURN_IF_ERROR(src.get_fixed64(&out->meta_region_length)); + SNII_RETURN_IF_ERROR(src.get_fixed64(&out->hot_off)); + SNII_RETURN_IF_ERROR(src.get_fixed32(&out->meta_region_checksum)); + SNII_RETURN_IF_ERROR(src.get_fixed32(&out->bootstrap_header_checksum)); + + uint8_t on_disk_size = 0; + SNII_RETURN_IF_ERROR(src.get_u8(&on_disk_size)); + if (on_disk_size != kFixedSize) { + return Status::Corruption("tail_pointer: embedded size mismatch"); + } + + uint32_t tail_checksum = 0; + SNII_RETURN_IF_ERROR(src.get_fixed32(&tail_checksum)); + if (tail_checksum != crc32c(covered)) { + return Status::Corruption("tail_pointer: tail_checksum mismatch"); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp b/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp new file mode 100644 index 00000000000000..1292f8d4f09c2e --- /dev/null +++ b/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp @@ -0,0 +1,81 @@ +#include "snii/io/batch_range_fetcher.h" + +#include +#include + +namespace snii::io { +namespace { + +Status checked_end(uint64_t offset, uint64_t len, uint64_t* out) { + if (len > std::numeric_limits::max() - offset) { + return Status::Corruption("batch_range_fetcher: range end overflow"); + } + *out = offset + len; + return Status::OK(); +} + +Status checked_size(uint64_t len, size_t* out) { + if (len > static_cast(std::numeric_limits::max())) { + return Status::Corruption("batch_range_fetcher: physical range too large"); + } + *out = static_cast(len); + return Status::OK(); +} + +} // namespace + +BatchRangeFetcher::BatchRangeFetcher(FileReader* reader, uint64_t coalesce_gap) + : reader_(reader), coalesce_gap_(coalesce_gap) {} + +size_t BatchRangeFetcher::add(uint64_t offset, uint64_t len) { + reqs_.push_back(Req {offset, len}); + return reqs_.size() - 1; +} + +void BatchRangeFetcher::clear() { + 
reqs_.clear(); + phys_.clear(); +} + +Status BatchRangeFetcher::fetch() { + if (reader_ == nullptr) return Status::InvalidArgument("batch_range_fetcher: null reader"); + phys_.clear(); + if (reqs_.empty()) return Status::OK(); + + std::vector order(reqs_.size()); + for (size_t i = 0; i < order.size(); ++i) order[i] = i; + std::sort(order.begin(), order.end(), + [&](size_t a, size_t b) { return reqs_[a].offset < reqs_[b].offset; }); + + // Sweep in offset order, merging requests into physical segments. + std::vector segs; + uint64_t cur_start = 0; + uint64_t cur_end = 0; + for (size_t k = 0; k < order.size(); ++k) { + Req& r = reqs_[order[k]]; + uint64_t r_end = 0; + SNII_RETURN_IF_ERROR(checked_end(r.offset, r.len, &r_end)); + SNII_RETURN_IF_ERROR(checked_size(r.len, &r.len_size)); + const bool disjoint = r.offset > cur_end && r.offset - cur_end > coalesce_gap_; + if (segs.empty() || disjoint) { + segs.push_back(Range {r.offset, 0}); // length finalized below + cur_start = r.offset; + cur_end = r_end; + } else { + cur_end = std::max(cur_end, r_end); + } + r.phys_idx = segs.size() - 1; + SNII_RETURN_IF_ERROR(checked_size(r.offset - cur_start, &r.sub_offset)); + SNII_RETURN_IF_ERROR(checked_size(cur_end - cur_start, &segs.back().len)); + } + + return reader_->read_batch(segs, &phys_); +} + +Slice BatchRangeFetcher::get(size_t h) const { + const Req& r = reqs_[h]; + const std::vector& buf = phys_[r.phys_idx]; + return Slice(buf.data() + r.sub_offset, r.len_size); +} + +} // namespace snii::io diff --git a/be/src/storage/index/snii/core/src/io/local_file.cpp b/be/src/storage/index/snii/core/src/io/local_file.cpp new file mode 100644 index 00000000000000..af64664fe6ad30 --- /dev/null +++ b/be/src/storage/index/snii/core/src/io/local_file.cpp @@ -0,0 +1,113 @@ +#include "snii/io/local_file.h" + +#include +#include +#include + +#include +#include + +namespace snii::io { +namespace { + +std::string errno_msg(const char* what) { + return std::string(what) + ": " + 
std::strerror(errno); +} + +} // namespace + +LocalFileReader::~LocalFileReader() { + if (fd_ >= 0) ::close(fd_); +} + +Status LocalFileReader::open(const std::string& path) { + fd_ = ::open(path.c_str(), O_RDONLY); + if (fd_ < 0) return Status::IoError(errno_msg("open")); + struct stat st; + if (::fstat(fd_, &st) != 0) return Status::IoError(errno_msg("fstat")); + size_ = static_cast(st.st_size); + return Status::OK(); +} + +Status LocalFileReader::read_at(uint64_t offset, size_t len, std::vector* out) { + if (fd_ < 0) return Status::IoError("read_at on unopened file"); + // Non-wrapping bounds check (offset+len could overflow uint64 on a corrupt arg). + if (offset > size_ || len > size_ - offset) { + return Status::Corruption("read_at past end of file"); + } + out->resize(len); + size_t done = 0; + while (done < len) { + ssize_t n = ::pread(fd_, out->data() + done, len - done, static_cast(offset + done)); + if (n < 0) { + if (errno == EINTR) continue; + return Status::IoError(errno_msg("pread")); + } + if (n == 0) return Status::Corruption("pread returned 0 before len"); + done += static_cast(n); + } + return Status::OK(); +} + +LocalFileWriter::~LocalFileWriter() { + if (fd_ >= 0) ::close(fd_); // best-effort: dtor cannot surface a flush error +} + +Status LocalFileWriter::open(const std::string& path) { + fd_ = ::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd_ < 0) return Status::IoError(errno_msg("open")); + buf_.reserve(kBufCapacity); + return Status::OK(); +} + +Status LocalFileWriter::write_all(const uint8_t* data, size_t len) { + size_t done = 0; + while (done < len) { + ssize_t n = ::write(fd_, data + done, len - done); + if (n < 0) { + if (errno == EINTR) continue; + return Status::IoError(errno_msg("write")); + } + done += static_cast(n); + } + return Status::OK(); +} + +Status LocalFileWriter::flush_buffer() { + if (buf_.empty()) return Status::OK(); + SNII_RETURN_IF_ERROR(write_all(buf_.data(), buf_.size())); + buf_.clear(); + return 
Status::OK(); +} + +Status LocalFileWriter::append(Slice data) { + if (fd_ < 0) return Status::IoError("append on unopened file"); + const size_t len = data.size(); + if (len == 0) return Status::OK(); + // Spans larger than the buffer go straight to the fd (after flushing pending + // bytes) to avoid a pointless copy and an oversized buffer. + if (len >= kBufCapacity) { + SNII_RETURN_IF_ERROR(flush_buffer()); + SNII_RETURN_IF_ERROR(write_all(data.data(), len)); + bytes_written_ += len; + return Status::OK(); + } + if (buf_.size() + len > kBufCapacity) SNII_RETURN_IF_ERROR(flush_buffer()); + buf_.insert(buf_.end(), data.data(), data.data() + len); + bytes_written_ += len; + return Status::OK(); +} + +Status LocalFileWriter::finalize() { + if (fd_ < 0) return Status::IoError("finalize on unopened file"); + SNII_RETURN_IF_ERROR(flush_buffer()); + if (::fsync(fd_) != 0) return Status::IoError(errno_msg("fsync")); + if (::close(fd_) != 0) { + fd_ = -1; + return Status::IoError(errno_msg("close")); + } + fd_ = -1; + return Status::OK(); +} + +} // namespace snii::io diff --git a/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp b/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp new file mode 100644 index 00000000000000..a643d8eca5aa3f --- /dev/null +++ b/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp @@ -0,0 +1,117 @@ +#include "snii/io/metered_file_reader.h" + +#include + +namespace snii::io { +namespace { + +// Inclusive [first, last] block ids touched by a validated [offset, offset+len). +// Empty len touches no block (callers guard len==0 before calling this). 
+void block_range(uint64_t offset, size_t len, size_t block_size, uint64_t* first, uint64_t* last) { + *first = offset / block_size; + *last = (offset + len - 1) / block_size; +} + +} // namespace + +MeteredFileReader::MeteredFileReader(FileReader* inner, size_t block_size) + : inner_(inner), block_size_(block_size) {} + +void MeteredFileReader::reset_metrics() { + resident_.clear(); + metrics_ = IoMetrics {}; +} + +Status MeteredFileReader::validate_range(uint64_t offset, size_t len) const { + if (inner_ == nullptr) return Status::InvalidArgument("metered: null inner reader"); + if (block_size_ == 0) return Status::InvalidArgument("metered: zero block size"); + const uint64_t total = inner_->size(); + if (offset > total || len > total - offset) { + return Status::Corruption("metered: read range past end"); + } + return Status::OK(); +} + +// Accounts the FileCache effect of touching [offset, offset+len): newly missed +// blocks become coalesced remote GETs and remote bytes. Returns true iff any +// block missed. (Single contiguous span -> at most one coalesced run.) 
+bool MeteredFileReader::account_blocks(uint64_t offset, size_t len) { + if (len == 0) return false; + uint64_t first = 0, last = 0; + block_range(offset, len, block_size_, &first, &last); + + bool any_miss = false; + bool in_run = false; // currently inside a contiguous run of missing blocks + const uint64_t total = inner_->size(); + for (uint64_t b = first; b <= last; ++b) { + if (resident_.count(b)) { + in_run = false; + continue; + } + resident_.insert(b); + any_miss = true; + const uint64_t block_start = b * block_size_; + metrics_.remote_bytes += std::min(block_size_, total - block_start); + if (!in_run) { + ++metrics_.range_gets; // start of a new coalesced GET + in_run = true; + } + } + return any_miss; +} + +Status MeteredFileReader::read_at(uint64_t offset, size_t len, std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("metered: null out"); + SNII_RETURN_IF_ERROR(validate_range(offset, len)); + ++metrics_.read_at_calls; + metrics_.total_request_bytes += len; + // A single blocking read: any miss forces one serial round (the next offset is + // not known until these bytes return). + if (account_blocks(offset, len)) ++metrics_.serial_rounds; + return inner_->read_at(offset, len, out); +} + +Status MeteredFileReader::read_batch(const std::vector& ranges, + std::vector>* outs) { + if (outs == nullptr) return Status::InvalidArgument("metered: null batch out"); + for (const Range& r : ranges) { + SNII_RETURN_IF_ERROR(validate_range(r.offset, r.len)); + } + + // Gather the union of touched blocks so coalescing spans the whole batch, and + // the entire batch counts as at most one serial round. 
+ std::vector blocks; + for (const Range& r : ranges) { + metrics_.total_request_bytes += r.len; + if (r.len == 0) continue; + uint64_t first = 0, last = 0; + block_range(r.offset, r.len, block_size_, &first, &last); + for (uint64_t b = first; b <= last; ++b) blocks.push_back(b); + } + metrics_.read_at_calls += ranges.size(); + + std::sort(blocks.begin(), blocks.end()); + blocks.erase(std::unique(blocks.begin(), blocks.end()), blocks.end()); + + bool any_miss = false; + const uint64_t total = inner_->size(); + uint64_t prev_miss = 0; + bool have_prev = false; + for (uint64_t b : blocks) { + if (resident_.count(b)) continue; + resident_.insert(b); + any_miss = true; + metrics_.remote_bytes += std::min(block_size_, total - b * block_size_); + if (!have_prev || b != prev_miss + 1) ++metrics_.range_gets; // new run + prev_miss = b; + have_prev = true; + } + if (any_miss) ++metrics_.serial_rounds; + + // Delegate the actual byte fetch to the inner reader's batch path, so a backend + // that fetches a batch concurrently (e.g. S3FileReader) realizes the planned + // round as parallel GETs (matching the single serial round accounted above). + return inner_->read_batch(ranges, outs); +} + +} // namespace snii::io diff --git a/be/src/storage/index/snii/core/src/io/s3_object_store.cpp b/be/src/storage/index/snii/core/src/io/s3_object_store.cpp new file mode 100644 index 00000000000000..6be72027ebe263 --- /dev/null +++ b/be/src/storage/index/snii/core/src/io/s3_object_store.cpp @@ -0,0 +1,217 @@ +#include "snii/io/s3_object_store.h" + +// The whole implementation is compiled only when the S3 backend is enabled. +// Without SNII_WITH_S3 this file is an empty translation unit and pulls in no +// aws-sdk headers, keeping core aws-free by default. 
+#ifdef SNII_WITH_S3 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace snii::io { +namespace { + +// Refcounted process-wide InitAPI/ShutdownAPI control, shared by AwsApiGuard. +std::mutex g_api_mu; +int g_api_refcount = 0; +Aws::SDKOptions g_api_options; + +void api_acquire() { + std::lock_guard lock(g_api_mu); + if (g_api_refcount == 0) { + Aws::InitAPI(g_api_options); + } + ++g_api_refcount; +} + +void api_release() { + std::lock_guard lock(g_api_mu); + if (g_api_refcount > 0) { + --g_api_refcount; + if (g_api_refcount == 0) { + Aws::ShutdownAPI(g_api_options); + } + } +} + +// Builds a virtual-hosted-addressing S3 client for an OSS-compatible endpoint. +// OSS rejects path-style addressing (SecondLevelDomainForbidden), so virtual +// addressing is mandatory; payload signing is disabled (Never). +std::shared_ptr make_client(const S3Config& cfg) { + Aws::Auth::AWSCredentials creds(Aws::String(cfg.ak.c_str()), Aws::String(cfg.sk.c_str())); + Aws::Client::ClientConfigurationInitValues init; + init.shouldDisableIMDS = true; + Aws::Client::ClientConfiguration client_cfg(init); + client_cfg.endpointOverride = Aws::String(cfg.endpoint.c_str()); + client_cfg.region = Aws::String(cfg.region.c_str()); + client_cfg.connectTimeoutMs = cfg.connect_timeout_ms; + client_cfg.requestTimeoutMs = cfg.request_timeout_ms; + client_cfg.httpRequestTimeoutMs = cfg.http_request_timeout_ms; + return std::make_shared( + creds, client_cfg, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + /*useVirtualAddressing=*/true); +} + +std::string join_key(const std::string& prefix, const std::string& key) { + if (prefix.empty()) return key; + return prefix + "/" + key; +} + +} // namespace + +AwsApiGuard::AwsApiGuard() { + api_acquire(); +} +AwsApiGuard::~AwsApiGuard() { + api_release(); +} + +// --------------------------------------------------------------------------- 
+// S3FileReader +// --------------------------------------------------------------------------- + +S3FileReader::~S3FileReader() = default; + +S3FileReader::S3FileReader(S3FileReader&&) noexcept = default; +S3FileReader& S3FileReader::operator=(S3FileReader&&) noexcept = default; + +Status S3FileReader::open(const S3Config& cfg, const std::string& key, S3FileReader* out) { + if (out == nullptr) return Status::InvalidArgument("S3FileReader::open: null out"); + out->client_ = make_client(cfg); + out->bucket_ = cfg.bucket; + out->object_key_ = join_key(cfg.prefix, key); + + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(Aws::String(out->bucket_.c_str())); + req.SetKey(Aws::String(out->object_key_.c_str())); + auto outcome = out->client_->HeadObject(req); + if (!outcome.IsSuccess()) { + return Status::IoError("HeadObject(" + out->object_key_ + + "): " + outcome.GetError().GetMessage().c_str()); + } + out->size_ = static_cast(outcome.GetResult().GetContentLength()); + return Status::OK(); +} + +Status S3FileReader::read_at(uint64_t offset, size_t len, std::vector* out) { + if (client_ == nullptr) return Status::IoError("read_at on unopened S3 object"); + if (out == nullptr) return Status::InvalidArgument("read_at: null out"); + // Non-wrapping bounds check (offset+len could overflow uint64 on a corrupt arg). 
+ if (offset > size_ || len > size_ - offset) { + return Status::Corruption("read_at past end of object"); + } + out->resize(len); + if (len == 0) return Status::OK(); + + Aws::S3::Model::GetObjectRequest req; + req.SetBucket(Aws::String(bucket_.c_str())); + req.SetKey(Aws::String(object_key_.c_str())); + std::ostringstream range; + range << "bytes=" << offset << "-" << (offset + len - 1); + req.SetRange(Aws::String(range.str().c_str())); + + auto outcome = client_->GetObject(req); + if (!outcome.IsSuccess()) { + return Status::IoError("GetObject(" + object_key_ + + "): " + outcome.GetError().GetMessage().c_str()); + } + auto& body = outcome.GetResult().GetBody(); + body.read(reinterpret_cast(out->data()), static_cast(len)); + const std::streamsize got = body.gcount(); + if (static_cast(got) != len) { + return Status::Corruption("GetObject returned fewer bytes than requested"); + } + return Status::OK(); +} + +Status S3FileReader::read_batch(const std::vector& ranges, + std::vector>* outs) { + if (outs == nullptr) return Status::InvalidArgument("read_batch: null outs"); + outs->resize(ranges.size()); + if (ranges.empty()) return Status::OK(); + // Issue GETs concurrently in bounded waves; aws S3Client is safe for parallel + // requests and each range writes a distinct output buffer. 
+ constexpr size_t kMaxConcurrent = 16; + Status first_err; + for (size_t base = 0; base < ranges.size(); base += kMaxConcurrent) { + const size_t end = std::min(base + kMaxConcurrent, ranges.size()); + std::vector> futs; + for (size_t i = base; i < end; ++i) { + futs.push_back(std::async(std::launch::async, [this, &ranges, outs, i]() { + return read_at(ranges[i].offset, ranges[i].len, &(*outs)[i]); + })); + } + for (auto& f : futs) { + const Status s = f.get(); + if (!s.ok() && first_err.ok()) first_err = s; + } + } + return first_err; +} + +// --------------------------------------------------------------------------- +// S3FileWriter +// --------------------------------------------------------------------------- + +S3FileWriter::~S3FileWriter() = default; + +S3FileWriter::S3FileWriter(S3FileWriter&&) noexcept = default; +S3FileWriter& S3FileWriter::operator=(S3FileWriter&&) noexcept = default; + +Status S3FileWriter::open(const S3Config& cfg, const std::string& key) { + client_ = make_client(cfg); + bucket_ = cfg.bucket; + object_key_ = join_key(cfg.prefix, key); + buffer_.clear(); + bytes_written_ = 0; + finalized_ = false; + return Status::OK(); +} + +Status S3FileWriter::append(Slice data) { + if (client_ == nullptr) return Status::IoError("append on unopened S3 writer"); + if (finalized_) return Status::IoError("append after finalize"); + buffer_.insert(buffer_.end(), data.data(), data.data() + data.size()); + bytes_written_ += data.size(); + return Status::OK(); +} + +Status S3FileWriter::finalize() { + if (client_ == nullptr) return Status::IoError("finalize on unopened S3 writer"); + if (finalized_) return Status::IoError("finalize called twice"); + + Aws::S3::Model::PutObjectRequest req; + req.SetBucket(Aws::String(bucket_.c_str())); + req.SetKey(Aws::String(object_key_.c_str())); + auto stream = Aws::MakeShared("S3FileWriter"); + stream->write(reinterpret_cast(buffer_.data()), + static_cast(buffer_.size())); + req.SetBody(stream); + 
req.SetContentLength(static_cast(buffer_.size())); + + auto outcome = client_->PutObject(req); + if (!outcome.IsSuccess()) { + return Status::IoError("PutObject(" + object_key_ + + "): " + outcome.GetError().GetMessage().c_str()); + } + finalized_ = true; + return Status::OK(); +} + +} // namespace snii::io + +#endif // SNII_WITH_S3 diff --git a/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp b/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp new file mode 100644 index 00000000000000..4987d788e6ed7d --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp @@ -0,0 +1,42 @@ +#include "snii/query/bm25_scorer.h" + +#include +#include + +namespace snii::query { + +double decode_norm(uint8_t encoded) { + return encoded == 0 ? 1.0 : static_cast(encoded); +} + +uint8_t encode_norm(uint64_t doc_length) { + const uint64_t clamped = std::clamp(doc_length, 1, 255); + return static_cast(clamped); +} + +ScorerContext ScorerContext::make(uint64_t n, uint64_t df) { + ScorerContext ctx; + ctx.df_ = df; + const double nn = static_cast(n); + const double dff = static_cast(df); + // idf = log(1 + (N - df + 0.5) / (df + 0.5)); always positive for df <= N. + ctx.idf_ = std::log(1.0 + (nn - dff + 0.5) / (dff + 0.5)); + return ctx; +} + +double ScorerContext::score(uint32_t tf, uint8_t encoded_norm, double avgdl, + const Bm25Params& params) const { + const double dl = decode_norm(encoded_norm); + const double tff = static_cast(tf); + const double denom = tff + params.k1 * (1.0 - params.b + params.b * dl / avgdl); + return idf_ * (tff * (params.k1 + 1.0)) / denom; +} + +double ScorerContext::max_score(uint32_t max_freq, uint8_t min_norm, double avgdl, + const Bm25Params& params) const { + // The score grows monotonically with tf and shrinks with dl, so the per-window + // upper bound uses the window's largest tf and smallest dl (min encoded norm). 
+ return score(max_freq, min_norm, avgdl, params); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/boolean_query.cpp b/be/src/storage/index/snii/core/src/query/boolean_query.cpp new file mode 100644 index 00000000000000..e4befe6e316b4a --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/boolean_query.cpp @@ -0,0 +1,99 @@ +#include "snii/query/boolean_query.h" + +#include +#include +#include +#include + +#include "snii/format/dict_entry.h" +#include "snii/query/docid_sink.h" +#include "snii/query/internal/docid_conjunction.h" +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/query/internal/docid_union.h" + +namespace snii::query { + +namespace { + +std::vector unique_terms(const std::vector& terms) { + std::vector out; + out.reserve(terms.size()); + for (const std::string& term : terms) out.emplace_back(term); + std::sort(out.begin(), out.end()); + out.erase(std::unique(out.begin(), out.end()), out.end()); + return out; +} + +Status resolve_or_postings(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, + std::vector* postings) { + postings->clear(); + for (std::string_view term : unique_terms(terms)) { + bool found = false; + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + SNII_RETURN_IF_ERROR(idx.lookup(term, &found, &entry, &frq_base, &prx_base)); + if (!found) continue; + + postings->push_back({std::move(entry), frq_base, prx_base}); + } + return Status::OK(); +} + +} // namespace + +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("boolean_or: null out"); + docids->clear(); + if (terms.empty()) return Status::OK(); + + std::vector postings; + SNII_RETURN_IF_ERROR(resolve_or_postings(idx, terms, &postings)); + return internal::build_docid_union(idx, postings, docids); +} + +Status boolean_or(const 
snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return boolean_or(idx, terms, docids); +} + +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("boolean_or: null sink"); + if (terms.empty()) return Status::OK(); + + std::vector postings; + SNII_RETURN_IF_ERROR(resolve_or_postings(idx, terms, &postings)); + return internal::emit_docid_union(idx, postings, sink); +} + +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("boolean_and: null out"); + docids->clear(); + if (terms.empty()) return Status::OK(); + + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + bool all_present = false; + SNII_RETURN_IF_ERROR(internal::plan_terms(idx, terms, &round1, &plans, &all_present, + /*need_positions=*/false)); + if (!all_present) return Status::OK(); + if (round1.pending() > 0) SNII_RETURN_IF_ERROR(round1.fetch()); + SNII_RETURN_IF_ERROR(internal::open_preludes(round1, &plans, + /*need_positions=*/false)); + return internal::build_docid_only_conjunction(idx, round1, plans, docids); +} + +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return boolean_and(idx, terms, docids); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp new file mode 100644 index 00000000000000..cfbafd3ca7c1bb --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -0,0 +1,823 @@ +#include "snii/query/internal/docid_conjunction.h" + 
+#include +#include +#include +#include + +#include "snii/format/frq_pod.h" +#include "snii/query/internal/docid_set_ops.h" +#include "snii/reader/windowed_posting.h" + +namespace snii::query::internal { + +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeReader; +using snii::format::WindowMeta; +using snii::reader::LogicalIndexReader; + +namespace { + +using CandidateIt = std::vector::const_iterator; + +constexpr uint32_t kBoundedSpanBitsetDocs = 16 * 1024; +constexpr size_t kBoundedSpanBitsetWords = kBoundedSpanBitsetDocs / 64; +constexpr size_t kBoundedSpanBitsetMinInput = 32; + +struct CandidateRange { + size_t begin = 0; + size_t end = 0; +}; + +Status slim_frq_docs_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) { + if (entry.frq_docs_len > win_len) { + return Status::Corruption("docid_conjunction: slim frq_docs_len exceeds frq window"); + } + *out = entry.frq_docs_len > 0 ? entry.frq_docs_len : win_len; + return Status::OK(); +} + +Status add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) { + if (rhs > std::numeric_limits::max() - lhs) { + return Status::Corruption(message); + } + *out = lhs + rhs; + return Status::OK(); +} + +Status posting_abs_offset(const LogicalIndexReader& idx, uint64_t base, uint64_t delta, + const char* message, uint64_t* out) { + uint64_t with_base = 0; + SNII_RETURN_IF_ERROR( + add_u64(idx.section_refs().posting_region.offset, base, message, &with_base)); + return add_u64(with_base, delta, message, out); +} + +Status configure_term_plan(const LogicalIndexReader& idx, bool need_positions, + snii::io::BatchRangeFetcher* fetcher, TermPlan* p) { + p->df = p->entry.df; + p->pod_ref = (p->entry.kind == DictEntryKind::kPodRef); + p->windowed = p->pod_ref && p->entry.enc == DictEntryEnc::kWindowed; + if (p->windowed) { + uint64_t prelude_abs = 0; + SNII_RETURN_IF_ERROR(posting_abs_offset(idx, p->frq_base, 
p->entry.frq_off_delta, + "docid_conjunction: prelude offset overflow", + &prelude_abs)); + p->prelude_handle = fetcher->add(prelude_abs, p->entry.prelude_len); + } else if (p->pod_ref) { + uint64_t foff = 0; + uint64_t flen = 0; + uint64_t poff = 0; + uint64_t plen = 0; + SNII_RETURN_IF_ERROR(idx.resolve_frq_window(p->entry, p->frq_base, &foff, &flen)); + uint64_t frq_fetch = flen; + SNII_RETURN_IF_ERROR(slim_frq_docs_len(p->entry, flen, &frq_fetch)); + p->frq_handle = fetcher->add(foff, frq_fetch); + if (need_positions) { + SNII_RETURN_IF_ERROR(idx.resolve_prx_window(p->entry, p->prx_base, &poff, &plen)); + p->prx_handle = fetcher->add(poff, plen); + } + } + return Status::OK(); +} + +std::vector all_windows(const FrqPreludeReader& prelude) { + std::vector ws(prelude.n_windows()); + for (uint32_t i = 0; i < prelude.n_windows(); ++i) ws[i] = i; + return ws; +} + +std::vector ascending_df_order(const std::vector& plans) { + std::vector order(plans.size()); + for (size_t i = 0; i < plans.size(); ++i) order[i] = i; + std::sort(order.begin(), order.end(), + [&](size_t a, size_t b) { return plans[a].df < plans[b].df; }); + return order; +} + +Status first_docid_in_window(const WindowMeta& meta, uint32_t window_ordinal, uint32_t* first) { + if (window_ordinal == 0) { + *first = 0; + return Status::OK(); + } + if (meta.win_base >= std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: window base exceeds docid range"); + } + *first = static_cast(meta.win_base + 1); + if (*first > meta.last_docid) { + return Status::Corruption("docid_conjunction: invalid window docid range"); + } + return Status::OK(); +} + +Status is_dense_full_window(const WindowMeta& meta, uint32_t window_ordinal, bool* full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, window_ordinal, &first)); + const uint64_t width = static_cast(meta.last_docid) - first + 1; + *full = meta.doc_count == width; + return Status::OK(); +} + +Status 
append_docid_range(uint32_t first, uint32_t last, std::vector* out) { + if (last < first) { + return Status::Corruption("docid_conjunction: invalid dense docid range"); + } + const uint64_t count64 = static_cast(last) - first + 1; + if (count64 > static_cast(std::numeric_limits::max() - out->size())) { + return Status::Corruption("docid_conjunction: dense docid range too large"); + } + out->reserve(out->size() + static_cast(count64)); + uint32_t docid = first; + while (true) { + out->push_back(docid); + if (docid == last) break; + ++docid; + } + return Status::OK(); +} + +CandidateRange find_candidate_range(const std::vector& candidates, size_t* search_begin, + uint32_t first, uint32_t last) { + const auto from = candidates.begin() + *search_begin; + const auto begin = std::lower_bound(from, candidates.end(), first); + const auto end = std::upper_bound(begin, candidates.end(), last); + *search_begin = static_cast(end - candidates.begin()); + return {.begin = static_cast(begin - candidates.begin()), + .end = static_cast(end - candidates.begin())}; +} + +void append_candidate_range(CandidateIt begin, CandidateIt end, std::vector* out) { + out->insert(out->end(), begin, end); +} + +void clear_ordinals_if_all_term_docs_selected(const std::vector& term_docids, + DocidChunk* chunk) { + if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && + chunk->docids.front() == term_docids.front() && + chunk->docids.back() == term_docids.back()) { + chunk->prx_doc_ordinals.clear(); + } +} + +bool append_term_docs_if_candidates_cover_span(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + std::vector* out, DocidChunk* chunk) { + const uint32_t first = term_docids.front(); + const uint32_t last = term_docids.back(); + const uint64_t width = static_cast(last) - first + 1; + const size_t candidate_count = static_cast(end - begin); + if (width > candidate_count) { + return false; + } + + const auto span_begin = *begin == first ? 
begin : std::lower_bound(begin, end, first); + if (span_begin == end || *span_begin != first) { + return false; + } + if (static_cast(end - span_begin) < width) { + return false; + } + + const auto span_last = span_begin + static_cast(width) - 1; + if (*span_last != last) { + return false; + } + + out->insert(out->end(), term_docids.begin(), term_docids.end()); + chunk->docids.insert(chunk->docids.end(), term_docids.begin(), term_docids.end()); + return true; +} + +Status append_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, uint32_t first, + uint32_t last, std::vector* out, + DocidChunk* chunk) { + const size_t candidate_count = static_cast(end - begin); + chunk->docids.reserve(candidate_count); + const uint64_t width = static_cast(last) - first + 1; + if (width > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: dense window exceeds doc count range"); + } + chunk->prx_doc_count = static_cast(width); + const bool full_dense_range = + candidate_count == width && begin != end && *begin == first && *(end - 1) == last; + if (full_dense_range) { + out->insert(out->end(), begin, end); + chunk->docids.insert(chunk->docids.end(), begin, end); + return Status::OK(); + } + chunk->prx_doc_ordinals.reserve(candidate_count); + for (auto it = begin; it != end; ++it) { + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(*it - first); + } + return Status::OK(); +} + +bool intersect_dense_term_span_with_ordinals(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + size_t candidate_count, std::vector* out, + DocidChunk* chunk) { + const uint32_t first = term_docids.front(); + const uint32_t last = term_docids.back(); + const uint64_t width = static_cast(last) - first + 1; + if (term_docids.size() > width) { + return false; + } + const uint64_t missing_count = width - term_docids.size(); + if (missing_count != 0 && + (missing_count * 8 > width || missing_count >= candidate_count 
|| + missing_count > static_cast(std::numeric_limits::max()))) { + return false; + } + + if (missing_count == 0) { + for (auto it = begin; it != end; ++it) { + if (*it < first) { + continue; + } + if (*it > last) { + break; + } + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(*it - first); + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return true; + } + + std::vector missing; + missing.reserve(static_cast(missing_count)); + uint32_t expect = first; + for (uint32_t docid : term_docids) { + while (expect < docid) { + missing.push_back(expect); + ++expect; + } + if (docid < std::numeric_limits::max()) { + expect = docid + 1; + } + } + while (expect <= last) { + missing.push_back(expect); + if (expect == std::numeric_limits::max()) { + break; + } + ++expect; + } + + size_t miss = 0; + for (auto it = begin; it != end; ++it) { + if (*it < first) { + continue; + } + if (*it > last) { + break; + } + while (miss < missing.size() && missing[miss] < *it) { + ++miss; + } + if (miss < missing.size() && missing[miss] == *it) { + continue; + } + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(static_cast(*it - first - miss)); + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return true; +} + +bool intersect_bounded_span_with_ordinals(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + size_t candidate_count, std::vector* out, + DocidChunk* chunk) { + if (candidate_count < kBoundedSpanBitsetMinInput || + term_docids.size() < kBoundedSpanBitsetMinInput) { + return false; + } + + const uint32_t first = std::min(*begin, term_docids.front()); + const uint32_t last = std::max(*(end - 1), term_docids.back()); + const uint64_t width = static_cast(last) - first + 1; + if (width > kBoundedSpanBitsetDocs || term_docids.size() > width) { + return false; + } + + std::array bits {}; + for (uint32_t docid : term_docids) { + const uint32_t off = 
docid - first; + bits[off >> 6] |= 1ULL << (off & 63); + } + + const auto word_count = static_cast((width + 63) >> 6); + std::array ordinal_base {}; + uint32_t ordinal = 0; + for (size_t word = 0; word < word_count; ++word) { + ordinal_base[word] = ordinal; + ordinal += static_cast(__builtin_popcountll(bits[word])); + } + + for (auto it = begin; it != end; ++it) { + const uint32_t off = *it - first; + const size_t word = off >> 6; + const uint64_t mask = 1ULL << (off & 63); + if ((bits[word] & mask) == 0) { + continue; + } + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back( + ordinal_base[word] + + static_cast(__builtin_popcountll(bits[word] & (mask - 1)))); + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return true; +} + +size_t log2_ceil(size_t n) { + if (n <= 1) return 1; + --n; + size_t bits = 0; + while (n != 0) { + ++bits; + n >>= 1; + } + return bits; +} + +void intersect_window_candidate_range(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, uint32_t first, + uint32_t last, std::vector* out) { + const size_t candidate_count = static_cast(end - begin); + if (candidate_count == 0 || term_docids.empty()) return; + + const uint64_t width = static_cast(last) - first + 1; + const uint64_t missing_count = term_docids.size() <= width ? 
width - term_docids.size() : width; + if (term_docids.size() <= width && missing_count != 0 && missing_count * 8 <= width && + missing_count < candidate_count) { + std::vector missing; + missing.reserve(static_cast(missing_count)); + uint32_t expect = first; + for (uint32_t docid : term_docids) { + while (expect < docid) { + missing.push_back(expect); + ++expect; + } + if (docid < std::numeric_limits::max()) expect = docid + 1; + } + while (expect <= last) { + missing.push_back(expect); + if (expect == std::numeric_limits::max()) break; + ++expect; + } + size_t miss = 0; + for (auto it = begin; it != end; ++it) { + while (miss < missing.size() && missing[miss] < *it) ++miss; + if (miss == missing.size() || missing[miss] != *it) out->push_back(*it); + } + return; + } + + const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1; + if (candidate_count < term_docids.size() / probes_per_candidate) { + for (auto it = begin; it != end; ++it) { + if (std::binary_search(term_docids.begin(), term_docids.end(), *it)) { + out->push_back(*it); + } + } + return; + } + std::set_intersection(begin, end, term_docids.begin(), term_docids.end(), + std::back_inserter(*out)); +} + +Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + std::vector* out, + DocidChunk* chunk) { + if (term_docids.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk->prx_doc_count = static_cast(term_docids.size()); + if (begin == end || term_docids.empty()) return Status::OK(); + + const size_t candidate_count = static_cast(end - begin); + const size_t max_matches = std::min(candidate_count, term_docids.size()); + out->reserve(out->size() + max_matches); + chunk->docids.reserve(max_matches); + if (candidate_count == term_docids.size() && *begin == term_docids.front() && + *(end - 1) == term_docids.back() && std::equal(begin, end, 
term_docids.begin())) { + out->insert(out->end(), begin, end); + chunk->docids.insert(chunk->docids.end(), begin, end); + return Status::OK(); + } + if (append_term_docs_if_candidates_cover_span(begin, end, term_docids, out, chunk)) { + return Status::OK(); + } + + chunk->prx_doc_ordinals.reserve(max_matches); + if (intersect_dense_term_span_with_ordinals(begin, end, term_docids, candidate_count, out, + chunk)) { + return Status::OK(); + } + if (intersect_bounded_span_with_ordinals(begin, end, term_docids, candidate_count, out, + chunk)) { + return Status::OK(); + } + + const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1; + if (candidate_count < term_docids.size() / probes_per_candidate) { + size_t doc_index = 0; + for (auto it = begin; it != end; ++it) { + const auto found = + std::lower_bound(term_docids.begin() + doc_index, term_docids.end(), *it); + if (found == term_docids.end()) break; + doc_index = static_cast(found - term_docids.begin()); + if (*found != *it) continue; + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); + ++doc_index; + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return Status::OK(); + } + + const size_t probes_per_term_doc = log2_ceil(candidate_count) + 1; + if (term_docids.size() < candidate_count / probes_per_term_doc) { + auto candidate_it = begin; + for (size_t doc_index = 0; doc_index < term_docids.size(); ++doc_index) { + const uint32_t docid = term_docids[doc_index]; + candidate_it = std::lower_bound(candidate_it, end, docid); + if (candidate_it == end) break; + if (*candidate_it != docid) continue; + out->push_back(docid); + chunk->docids.push_back(docid); + chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); + ++candidate_it; + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return Status::OK(); + } + + size_t doc_index = 0; + for (auto it = begin; it != end; ++it) { + while (doc_index < 
term_docids.size() && term_docids[doc_index] < *it) { + ++doc_index; + } + if (doc_index == term_docids.size()) break; + if (term_docids[doc_index] != *it) continue; + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); + ++doc_index; + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return Status::OK(); +} + +Status select_covering_windows(const FrqPreludeReader& prelude, + const std::vector& candidates, + std::vector* windows) { + std::vector sel; + uint32_t last = UINT32_MAX; + for (uint32_t d : candidates) { + bool found = false; + uint32_t w = 0; + SNII_RETURN_IF_ERROR(prelude.locate_window(d, &found, &w)); + if (!found) continue; + if (w != last) { + sel.push_back(w); + last = w; + } + } + *windows = std::move(sel); + return Status::OK(); +} + +bool should_scan_all_windows(const LogicalIndexReader& idx, const TermPlan& p, + size_t candidate_count) { + const size_t window_count = p.prelude.n_windows(); + if (candidate_count > window_count * 64) return true; + + const uint64_t doc_count = idx.stats().doc_count; + const bool near_full = doc_count != 0 && static_cast(p.df) * 10 >= doc_count * 9; + return near_full && candidate_count > window_count * 4; +} + +Status decode_flat_docids_only(const snii::io::BatchRangeFetcher& round1, const TermPlan& p, + std::vector* docids) { + Slice dd; + if (p.pod_ref) { + dd = round1.get(p.frq_handle); + } else { + SNII_RETURN_IF_ERROR(inline_dd_region(p.entry, &dd)); + } + return snii::format::decode_dd_region(dd, p.entry.dd_meta, /*win_base=*/0, docids); +} + +struct WindowWork { + uint32_t ordinal = 0; + WindowMeta meta; + CandidateRange candidates; + size_t handle = 0; + bool dense_full = false; +}; + +Status emit_dense_full_window_docids(const WindowWork& f, const std::vector* candidates, + std::vector& out, DocidSource* source) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); + if (source != 
nullptr) { + DocidChunk chunk; + chunk.windowed = true; + chunk.window = f.ordinal; + chunk.prx_doc_count = f.meta.doc_count; + if (candidates == nullptr) { + SNII_RETURN_IF_ERROR(append_docid_range(first, f.meta.last_docid, &chunk.docids)); + } else { + const auto begin = candidates->begin() + f.candidates.begin; + const auto end = candidates->begin() + f.candidates.end; + SNII_RETURN_IF_ERROR(append_candidate_range_with_ordinals( + begin, end, first, f.meta.last_docid, &out, &chunk)); + } + source->chunks.push_back(std::move(chunk)); + } + if (candidates == nullptr) { + SNII_RETURN_IF_ERROR(append_docid_range(first, f.meta.last_docid, &out)); + } else if (source == nullptr) { + append_candidate_range(candidates->begin() + f.candidates.begin, + candidates->begin() + f.candidates.end, &out); + } + return Status::OK(); +} + +Status emit_decoded_window_docids(const WindowWork& f, const snii::io::BatchRangeFetcher& fetcher, + const std::vector* candidates, + std::vector& out, DocidSource* source, + std::vector& docs, std::vector& freqs, + std::vector>& positions) { + docs.clear(); + freqs.clear(); + positions.clear(); + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + f.meta, fetcher.get(f.handle), Slice(), Slice(), + /*want_positions=*/false, /*want_freq=*/false, &docs, &freqs, &positions)); + if (source != nullptr) { + DocidChunk chunk; + chunk.windowed = true; + chunk.window = f.ordinal; + if (candidates == nullptr) { + chunk.docids = docs; + if (docs.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(docs.size()); + source->chunks.push_back(std::move(chunk)); + } else { + const auto begin = candidates->begin() + f.candidates.begin; + const auto end = candidates->begin() + f.candidates.end; + SNII_RETURN_IF_ERROR( + intersect_window_candidate_range_with_ordinals(begin, end, docs, &out, &chunk)); + if (!chunk.docids.empty()) { + 
source->chunks.push_back(std::move(chunk)); + } + } + } + if (candidates == nullptr) { + out.insert(out.end(), docs.begin(), docs.end()); + return Status::OK(); + } + if (source != nullptr) { + return Status::OK(); + } + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); + intersect_window_candidate_range(candidates->begin() + f.candidates.begin, + candidates->begin() + f.candidates.end, docs, first, + f.meta.last_docid, &out); + return Status::OK(); +} + +Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPlan& p, + const std::vector& windows, + const std::vector* candidates, + std::vector* out, DocidSource* source) { + snii::io::BatchRangeFetcher fetcher(idx.reader(), snii::reader::kSameTermCoalesceGap); + std::vector work; + work.reserve(windows.size()); + out->reserve(candidates == nullptr ? p.entry.df : candidates->size()); + size_t candidate_search_begin = 0; + for (uint32_t w : windows) { + WindowMeta meta; + SNII_RETURN_IF_ERROR(p.prelude.window(w, &meta)); + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first)); + CandidateRange candidate_range; + if (candidates != nullptr) { + candidate_range = find_candidate_range(*candidates, &candidate_search_begin, first, + meta.last_docid); + if (candidate_range.begin == candidate_range.end) { + continue; + } + } + bool dense_full = false; + SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full)); + if (dense_full) { + work.push_back(WindowWork { + .ordinal = w, .meta = meta, .candidates = candidate_range, .dense_full = true}); + continue; + } + + snii::reader::WindowAbsRange range; + SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( + idx, p.entry, p.frq_base, p.prx_base, p.prelude, w, + /*want_positions=*/false, /*want_freq=*/false, &range)); + WindowWork f; + f.ordinal = w; + f.meta = meta; + f.candidates = candidate_range; + f.handle = fetcher.add(range.dd_off, range.dd_len); + work.push_back(f); + } 
+ if (fetcher.pending() > 0) { + SNII_RETURN_IF_ERROR(fetcher.fetch()); + } + + std::vector docs; + std::vector freqs; + std::vector> positions; + for (const WindowWork& f : work) { + if (f.dense_full) { + SNII_RETURN_IF_ERROR(emit_dense_full_window_docids(f, candidates, *out, source)); + continue; + } + SNII_RETURN_IF_ERROR(emit_decoded_window_docids(f, fetcher, candidates, *out, source, docs, + freqs, positions)); + } + return Status::OK(); +} + +Status collect_docids_only(const LogicalIndexReader& idx, const snii::io::BatchRangeFetcher& round1, + const TermPlan& p, const std::vector* candidates, + std::vector* out, DocidSource* source) { + if (p.windowed) { + std::vector windows; + if (candidates == nullptr) { + windows = all_windows(p.prelude); + } else if (should_scan_all_windows(idx, p, candidates->size())) { + // Dense candidate sets cover most windows; for near-full terms this also + // avoids thousands-to-millions of locate_window probes with no byte win. + windows = all_windows(p.prelude); + } else { + SNII_RETURN_IF_ERROR(select_covering_windows(p.prelude, *candidates, &windows)); + } + return collect_windowed_docids_only(idx, p, windows, candidates, out, source); + } + + std::vector term_docids; + SNII_RETURN_IF_ERROR(decode_flat_docids_only(round1, p, &term_docids)); + if (source != nullptr) { + DocidChunk chunk; + if (term_docids.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(term_docids.size()); + if (candidates == nullptr) { + chunk.docids = term_docids; + } else if (!term_docids.empty()) { + const auto begin = std::ranges::lower_bound(*candidates, term_docids.front()); + const auto end = std::upper_bound(begin, candidates->end(), term_docids.back()); + SNII_RETURN_IF_ERROR(intersect_window_candidate_range_with_ordinals( + begin, end, term_docids, out, &chunk)); + } + if (candidates == nullptr || !chunk.docids.empty()) { + 
source->chunks.push_back(std::move(chunk)); + } + } + if (candidates == nullptr) { + *out = std::move(term_docids); + return Status::OK(); + } + if (source != nullptr) { + return Status::OK(); + } + *out = intersect_sorted(*candidates, term_docids); + return Status::OK(); +} + +Status build_docid_only_conjunction_impl(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates, + std::vector* sources) { + if (sources != nullptr) { + sources->assign(plans.size(), DocidSource {}); + } + const std::vector order = ascending_df_order(plans); + for (size_t k = 0; k < order.size(); ++k) { + const size_t ti = order[k]; + std::vector next; + DocidSource* source = sources == nullptr ? nullptr : &(*sources)[ti]; + SNII_RETURN_IF_ERROR(collect_docids_only(idx, round1, plans[ti], + k == 0 ? nullptr : candidates, &next, source)); + if (source != nullptr && k + 1 == order.size()) { + source->docids_are_final_candidates = true; + } + *candidates = std::move(next); + if (candidates->empty()) { + return Status::OK(); + } + } + return Status::OK(); +} + +} // namespace + +Status resolve_query_term(const LogicalIndexReader& idx, const std::string& term, + ResolvedQueryTerm* resolved, bool* found) { + *found = false; + SNII_RETURN_IF_ERROR( + idx.lookup(term, found, &resolved->entry, &resolved->frq_base, &resolved->prx_base)); + return Status::OK(); +} + +Status plan_terms(const LogicalIndexReader& idx, const std::vector& terms, + snii::io::BatchRangeFetcher* fetcher, std::vector* plans, + bool* all_present, bool need_positions) { + *all_present = true; + plans->resize(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + ResolvedQueryTerm resolved; + bool found = false; + SNII_RETURN_IF_ERROR(resolve_query_term(idx, terms[i], &resolved, &found)); + if (!found) { + *all_present = false; + return Status::OK(); + } + TermPlan& p = (*plans)[i]; + p.order = i; + p.entry = std::move(resolved.entry); + p.frq_base 
= resolved.frq_base; + p.prx_base = resolved.prx_base; + SNII_RETURN_IF_ERROR(configure_term_plan(idx, need_positions, fetcher, &p)); + } + return Status::OK(); +} + +Status plan_resolved_terms(const LogicalIndexReader& idx, + const std::vector& terms, + snii::io::BatchRangeFetcher* fetcher, std::vector* plans, + bool need_positions) { + plans->resize(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermPlan& p = (*plans)[i]; + p.order = i; + p.entry = terms[i].entry; + p.frq_base = terms[i].frq_base; + p.prx_base = terms[i].prx_base; + SNII_RETURN_IF_ERROR(configure_term_plan(idx, need_positions, fetcher, &p)); + } + return Status::OK(); +} + +Status open_preludes(const snii::io::BatchRangeFetcher& fetcher, std::vector* plans, + bool need_positions) { + for (TermPlan& p : *plans) { + if (!p.windowed) continue; + SNII_RETURN_IF_ERROR(FrqPreludeReader::open(fetcher.get(p.prelude_handle), &p.prelude)); + if (need_positions && !p.prelude.has_prx()) { + return Status::Corruption("docid_conjunction: windowed prelude has no positions"); + } + } + return Status::OK(); +} + +Status inline_dd_region(const DictEntry& entry, Slice* out) { + if (entry.dd_meta.disk_len > entry.frq_bytes.size()) { + return Status::Corruption("docid_conjunction: inline dd region exceeds frq bytes"); + } + *out = Slice(entry.frq_bytes.data(), static_cast(entry.dd_meta.disk_len)); + return Status::OK(); +} + +Status build_docid_only_conjunction(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates) { + return build_docid_only_conjunction_impl(idx, round1, plans, candidates, nullptr); +} + +Status build_docid_only_conjunction(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates, + std::vector* sources) { + return build_docid_only_conjunction_impl(idx, round1, plans, candidates, sources); +} + +} // namespace snii::query::internal diff 
--git a/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp new file mode 100644 index 00000000000000..206221ffc5dbbc --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp @@ -0,0 +1,296 @@ +#include "snii/query/internal/docid_posting_reader.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/format/dict_entry.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/reader/windowed_posting.h" + +namespace snii::query::internal { + +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeReader; +using snii::format::WindowMeta; +using snii::reader::LogicalIndexReader; + +namespace { + +Status decode_flat_docs(const DictEntry& entry, Slice dd_region, std::vector* docids) { + return snii::format::decode_dd_region(dd_region, entry.dd_meta, + /*win_base=*/0, docids); +} + +Status decode_inline_docs(const DictEntry& entry, std::vector* docids) { + if (entry.dd_meta.disk_len > entry.frq_bytes.size()) { + return Status::Corruption("docid_posting_reader: inline dd region exceeds frq bytes"); + } + return decode_flat_docs( + entry, Slice(entry.frq_bytes.data(), static_cast(entry.dd_meta.disk_len)), + docids); +} + +Status slim_docs_fetch_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) { + if (entry.frq_docs_len > win_len) { + return Status::Corruption("docid_posting_reader: slim frq_docs_len exceeds frq window"); + } + *out = entry.frq_docs_len > 0 ? 
entry.frq_docs_len : win_len; + return Status::OK(); +} + +Status add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) { + if (rhs > std::numeric_limits::max() - lhs) { + return Status::Corruption(message); + } + *out = lhs + rhs; + return Status::OK(); +} + +Status prelude_abs(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + uint64_t* out) { + uint64_t with_base = 0; + SNII_RETURN_IF_ERROR(add_u64(idx.section_refs().posting_region.offset, frq_base, + "docid_posting_reader: prelude offset overflow", &with_base)); + return add_u64(with_base, entry.frq_off_delta, "docid_posting_reader: prelude offset overflow", + out); +} + +Status validate_windowed_docs_prefix(const DictEntry& entry) { + if (entry.prelude_len == 0) { + return Status::Corruption("docid_posting_reader: windowed entry has no prelude"); + } + if (entry.prelude_len > entry.frq_docs_len) { + return Status::Corruption("docid_posting_reader: prelude_len exceeds docs prefix"); + } + if (entry.frq_docs_len > entry.frq_len) { + return Status::Corruption("docid_posting_reader: docs prefix exceeds frq_len"); + } + return Status::OK(); +} + +struct FlatPlan { + size_t out_index = 0; + const DictEntry* entry = nullptr; + size_t handle = 0; +}; + +struct WindowPlan { + size_t out_index = 0; + const ResolvedDocidPosting* posting = nullptr; + size_t prefix_handle = 0; +}; + +Status plan_flat_docs(const LogicalIndexReader& idx, const ResolvedDocidPosting& posting, + snii::io::BatchRangeFetcher* fetcher, FlatPlan* plan) { + uint64_t win_abs = 0; + uint64_t win_len = 0; + SNII_RETURN_IF_ERROR( + idx.resolve_frq_window(posting.entry, posting.frq_base, &win_abs, &win_len)); + uint64_t docs_len = 0; + SNII_RETURN_IF_ERROR(slim_docs_fetch_len(posting.entry, win_len, &docs_len)); + plan->handle = fetcher->add(win_abs, docs_len); + return Status::OK(); +} + +Status plan_window_prefix(const LogicalIndexReader& idx, WindowPlan* plan, + snii::io::BatchRangeFetcher* fetcher) { + const 
ResolvedDocidPosting& posting = *plan->posting; + SNII_RETURN_IF_ERROR(validate_windowed_docs_prefix(posting.entry)); + uint64_t abs = 0; + SNII_RETURN_IF_ERROR(prelude_abs(idx, posting.entry, posting.frq_base, &abs)); + plan->prefix_handle = fetcher->add(abs, posting.entry.frq_docs_len); + return Status::OK(); +} + +Status window_dd_slice(Slice dd_block, const WindowMeta& meta, Slice* out) { + if (meta.dd_off > dd_block.size() || meta.dd_disk_len > dd_block.size() - meta.dd_off) { + return Status::Corruption("docid_posting_reader: window dd range out of prefix"); + } + *out = dd_block.subslice(static_cast(meta.dd_off), + static_cast(meta.dd_disk_len)); + return Status::OK(); +} + +Status first_docid_in_window(const WindowMeta& meta, uint32_t window_ordinal, uint32_t* first) { + if (window_ordinal == 0) { + *first = 0; + return Status::OK(); + } + if (meta.win_base >= std::numeric_limits::max()) { + return Status::Corruption("docid_posting_reader: window base exceeds docid range"); + } + *first = static_cast(meta.win_base + 1); + if (*first > meta.last_docid) { + return Status::Corruption("docid_posting_reader: invalid window docid range"); + } + return Status::OK(); +} + +Status is_dense_full_window(const WindowMeta& meta, uint32_t window_ordinal, bool* full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, window_ordinal, &first)); + const uint64_t width = static_cast(meta.last_docid) - first + 1; + *full = meta.doc_count == width; + return Status::OK(); +} + +Status decode_flat_plan(const snii::io::BatchRangeFetcher& fetcher, const FlatPlan& plan, + std::vector* out) { + return decode_flat_docs(*plan.entry, fetcher.get(plan.handle), out); +} + +Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, + DocIdSink* sink); + +Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, + std::vector* out) { + VectorDocIdSink sink(*out); + return 
decode_window_prefix_plan(fetcher, plan, &sink); +} + +Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, + DocIdSink* sink) { + const DictEntry& entry = plan.posting->entry; + const Slice prefix = fetcher.get(plan.prefix_handle); + if (entry.prelude_len > prefix.size()) { + return Status::Corruption("docid_posting_reader: short docs prefix"); + } + const size_t prelude_len = static_cast(entry.prelude_len); + FrqPreludeReader prelude; + SNII_RETURN_IF_ERROR(FrqPreludeReader::open(prefix.subslice(0, prelude_len), &prelude)); + const uint64_t dd_block_len = prelude.dd_block_len(); + if (dd_block_len > static_cast(std::numeric_limits::max()) - prelude_len) { + return Status::Corruption("docid_posting_reader: docs prefix length overflow"); + } + const size_t expected_prefix_len = prelude_len + static_cast(dd_block_len); + if (prefix.size() != expected_prefix_len) { + return Status::Corruption("docid_posting_reader: docs prefix length mismatch"); + } + const Slice dd_block = prefix.subslice(prelude_len, prefix.size() - prelude_len); + std::vector docs; + std::vector freqs; + std::vector> positions; + for (uint32_t w = 0; w < prelude.n_windows(); ++w) { + WindowMeta meta; + Slice dd_region; + SNII_RETURN_IF_ERROR(prelude.window(w, &meta)); + SNII_RETURN_IF_ERROR(window_dd_slice(dd_block, meta, &dd_region)); + bool dense_full = false; + SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full)); + if (dense_full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first)); + SNII_RETURN_IF_ERROR( + sink->append_range(first, static_cast(meta.last_docid) + 1)); + continue; + } + docs.clear(); + freqs.clear(); + positions.clear(); + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + meta, dd_region, Slice(), Slice(), /*want_positions=*/false, + /*want_freq=*/false, &docs, &freqs, &positions)); + SNII_RETURN_IF_ERROR(sink->append_sorted(docs)); + } + return Status::OK(); +} + +} // 
namespace + +Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, std::vector* docids) { + if (docids == nullptr) { + return Status::InvalidArgument("docid_posting_reader: null out"); + } + docids->clear(); + VectorDocIdSink sink(*docids); + return read_docid_posting(idx, entry, frq_base, prx_base, &sink); +} + +Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, DocIdSink* sink) { + if (sink == nullptr) { + return Status::InvalidArgument("docid_posting_reader: null sink"); + } + ResolvedDocidPosting posting {entry, frq_base, prx_base}; + if (posting.entry.kind == DictEntryKind::kInline) { + std::vector docs; + SNII_RETURN_IF_ERROR(decode_inline_docs(posting.entry, &docs)); + return sink->append_sorted(docs); + } + + snii::io::BatchRangeFetcher docs_fetcher(idx.reader()); + if (posting.entry.enc == DictEntryEnc::kWindowed) { + WindowPlan plan; + plan.out_index = 0; + plan.posting = &posting; + SNII_RETURN_IF_ERROR(plan_window_prefix(idx, &plan, &docs_fetcher)); + if (docs_fetcher.pending() > 0) SNII_RETURN_IF_ERROR(docs_fetcher.fetch()); + return decode_window_prefix_plan(docs_fetcher, plan, sink); + } + + FlatPlan plan; + plan.out_index = 0; + plan.entry = &posting.entry; + SNII_RETURN_IF_ERROR(plan_flat_docs(idx, posting, &docs_fetcher, &plan)); + if (docs_fetcher.pending() > 0) SNII_RETURN_IF_ERROR(docs_fetcher.fetch()); + std::vector docs; + SNII_RETURN_IF_ERROR(decode_flat_plan(docs_fetcher, plan, &docs)); + return sink->append_sorted(docs); +} + +Status read_docid_postings_batched(const LogicalIndexReader& idx, + const std::vector& postings, + std::vector>* docids) { + if (docids == nullptr) { + return Status::InvalidArgument("docid_posting_reader: null batched out"); + } + docids->clear(); + docids->resize(postings.size()); + + std::vector flat_plans; + std::vector window_plans; + snii::io::BatchRangeFetcher 
docs_fetcher(idx.reader()); + + for (size_t i = 0; i < postings.size(); ++i) { + const ResolvedDocidPosting& posting = postings[i]; + if (posting.entry.kind == DictEntryKind::kInline) { + SNII_RETURN_IF_ERROR(decode_inline_docs(posting.entry, &(*docids)[i])); + continue; + } + if (posting.entry.enc == DictEntryEnc::kWindowed) { + WindowPlan plan; + plan.out_index = i; + plan.posting = &posting; + SNII_RETURN_IF_ERROR(plan_window_prefix(idx, &plan, &docs_fetcher)); + window_plans.push_back(std::move(plan)); + continue; + } + FlatPlan plan; + plan.out_index = i; + plan.entry = &posting.entry; + flat_plans.push_back(plan); + } + + for (FlatPlan& plan : flat_plans) { + const ResolvedDocidPosting& posting = postings[plan.out_index]; + SNII_RETURN_IF_ERROR(plan_flat_docs(idx, posting, &docs_fetcher, &plan)); + } + if (docs_fetcher.pending() > 0) SNII_RETURN_IF_ERROR(docs_fetcher.fetch()); + + for (const FlatPlan& plan : flat_plans) { + SNII_RETURN_IF_ERROR(decode_flat_plan(docs_fetcher, plan, &(*docids)[plan.out_index])); + } + for (const WindowPlan& plan : window_plans) { + SNII_RETURN_IF_ERROR( + decode_window_prefix_plan(docs_fetcher, plan, &(*docids)[plan.out_index])); + } + return Status::OK(); +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp b/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp new file mode 100644 index 00000000000000..88b748e49e80b1 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp @@ -0,0 +1,105 @@ +#include "snii/query/internal/docid_set_ops.h" + +#include +#include +#include +#include + +namespace snii::query::internal { + +std::vector intersect_sorted(const std::vector& a, + const std::vector& b) { + std::vector out; + out.reserve(std::min(a.size(), b.size())); + std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(out)); + return out; +} + +void union_sorted_into(std::vector* acc, const std::vector& next) { + 
std::vector merged; + merged.reserve(acc->size() + next.size()); + std::set_union(acc->begin(), acc->end(), next.begin(), next.end(), std::back_inserter(merged)); + *acc = std::move(merged); +} + +std::vector union_sorted_many(const std::vector>& lists) { + constexpr size_t kLinearFanInMax = 8; + struct Cursor { + uint32_t docid = 0; + size_t list = 0; + size_t offset = 0; + }; + struct GreaterDocId { + bool operator()(const Cursor& a, const Cursor& b) const { return a.docid > b.docid; } + }; + + size_t non_empty = 0; + size_t largest = 0; + std::priority_queue, GreaterDocId> heap; + for (size_t i = 0; i < lists.size(); ++i) { + if (lists[i].empty()) continue; + ++non_empty; + largest = std::max(largest, lists[i].size()); + heap.push(Cursor {lists[i][0], i, 0}); + } + if (non_empty == 0) return {}; + if (non_empty == 1) { + for (const std::vector& docs : lists) { + if (!docs.empty()) return docs; + } + } + + if (non_empty <= kLinearFanInMax) { + std::vector offsets(lists.size(), 0); + std::vector out; + out.reserve(largest); + bool has_last = false; + uint32_t last = 0; + for (;;) { + bool found = false; + uint32_t next = 0; + for (size_t i = 0; i < lists.size(); ++i) { + if (offsets[i] >= lists[i].size()) continue; + const uint32_t docid = lists[i][offsets[i]]; + if (!found || docid < next) { + found = true; + next = docid; + } + } + if (!found) break; + if (!has_last || next != last) { + out.push_back(next); + last = next; + has_last = true; + } + for (size_t i = 0; i < lists.size(); ++i) { + while (offsets[i] < lists[i].size() && lists[i][offsets[i]] == next) { + ++offsets[i]; + } + } + } + return out; + } + + std::vector out; + out.reserve(largest); + bool has_last = false; + uint32_t last = 0; + while (!heap.empty()) { + const Cursor cur = heap.top(); + heap.pop(); + if (!has_last || cur.docid != last) { + out.push_back(cur.docid); + last = cur.docid; + has_last = true; + } + const size_t next_offset = cur.offset + 1; + const std::vector& docs = 
lists[cur.list]; + if (next_offset < docs.size()) { + heap.push(Cursor {docs[next_offset], cur.list, next_offset}); + } + } + return out; +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/docid_union.cpp b/be/src/storage/index/snii/core/src/query/docid_union.cpp new file mode 100644 index 00000000000000..da4665a63d1280 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_union.cpp @@ -0,0 +1,31 @@ +#include "snii/query/internal/docid_union.h" + +#include + +#include "snii/query/internal/docid_set_ops.h" + +namespace snii::query::internal { + +Status build_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, + std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("docid_union: null out"); + out->clear(); + if (postings.empty()) return Status::OK(); + + std::vector> docs_by_posting; + SNII_RETURN_IF_ERROR(read_docid_postings_batched(idx, postings, &docs_by_posting)); + *out = union_sorted_many(docs_by_posting); + return Status::OK(); +} + +Status emit_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("docid_union: null sink"); + std::vector acc; + SNII_RETURN_IF_ERROR(build_docid_union(idx, postings, &acc)); + if (acc.empty()) return Status::OK(); + return sink->append_sorted(acc); +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp new file mode 100644 index 00000000000000..72db2d628513e0 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -0,0 +1,1194 @@ +#include "snii/query/phrase_query.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/dict_entry.h" +#include 
"snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/format/prx_pod.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/query/internal/docid_conjunction.h" +#include "snii/query/internal/docid_set_ops.h" +#include "snii/query/internal/position_math.h" +#include "snii/query/prefix_query.h" +#include "snii/query/term_query.h" +#include "snii/reader/windowed_posting.h" + +// phrase_query implements MATCH_PHRASE with WINDOW (sub-block) SKIPPING for +// high-df windowed terms (design spec section 6.2): +// 1. Resolve every term; reject if any is absent. +// 2. Batch-read each windowed term's prelude + each slim/inline term's full +// docid posting in one round; open the two-level prelude readers. +// 3. Pick the DRIVER = smallest-df term; materialize it fully -> the initial +// candidate docid set. +// 4. For every other term in ascending-df order, narrow the candidate set: +// - slim/inline: intersect with its (already decoded) full posting. +// - windowed: locate_window() the CURRENT candidates -> the SET of +// windows covering them; batch-fetch ONLY those windows' +// .frq docid regions; keep candidates present in some +// covering window. A high-df term thus reads +// O(candidates) windows instead of its whole O(df) +// posting. +// 5. Fetch PRX only for retained chunks and run the positional phrase check +// (term[0]@p, term[1]@p+1, ...) on the survivors. +// The result is identical to a full-read intersection; only the bytes read for +// high-df windowed terms shrink. 
+namespace snii::query { + +using snii::query::internal::DocidChunk; +using snii::query::internal::DocidSource; +using snii::query::internal::ResolvedQueryTerm; +using snii::query::internal::TermPlan; +using snii::reader::LogicalIndexReader; + +namespace { + +struct ExpectedTailPositions { + uint32_t docid = 0; + size_t positions_begin = 0; + size_t positions_end = 0; +}; + +struct ExpectedTailPositionSet { + std::vector docs; + std::vector positions; + + void clear() { + docs.clear(); + positions.clear(); + } + + void reserve_docs(size_t count) { + docs.reserve(count); + positions.reserve(count); + } +}; + +// One decoded chunk of a term's posting: a windowed term's covering window, or +// a slim/inline term's single posting. `docids` is decoded in the conjunction +// phase (and reused by the streaming cursor -- the dd region is decoded exactly +// once); `prx` is the on-disk positions bytes, decoded lazily by the cursor +// (once per chunk) during phrase verification. +struct PosChunk { + std::vector docids; // ascending, absolute + // Empty means the chunk keeps every PRX doc in on-disk order. Non-empty means + // `docids[i]` corresponds to on-disk local document ordinal + // `prx_doc_ordinals[i]`, allowing PRX decode to skip positions for docs that + // were removed by the docid-only conjunction. + std::vector prx_doc_ordinals; + uint32_t prx_doc_count = 0; + Slice prx; // .prx window bytes (reference fetcher/round1/entry) + bool windowed = false; + uint32_t window = 0; +}; + +// A term's retained posting as an ordered list of chunks (windowed: covering +// windows in docid order; slim/inline: one). The referenced prx bytes live in +// `round1` / the per-term fetchers kept alive in phrase_query::owners for the +// whole query, so the cursor can decode positions during verification. 
+struct PosSource { + std::vector chunks; +}; + +struct PhraseExecutionState { + std::vector srcs; + std::vector> owners; + std::vector candidates; +}; + +struct PhraseTermMapping { + std::vector unique_terms; + std::vector phrase_plan_index; +}; + +PhraseTermMapping BuildPhraseTermMapping(const std::vector& terms) { + PhraseTermMapping mapping; + mapping.phrase_plan_index.reserve(terms.size()); + for (const std::string& term : terms) { + auto it = std::find(mapping.unique_terms.begin(), mapping.unique_terms.end(), term); + if (it == mapping.unique_terms.end()) { + mapping.phrase_plan_index.push_back(mapping.unique_terms.size()); + mapping.unique_terms.push_back(term); + continue; + } + mapping.phrase_plan_index.push_back(static_cast(it - mapping.unique_terms.begin())); + } + return mapping; +} + +Status append_prx_doc_ordinal(size_t ordinal, std::vector* out) { + if (ordinal > std::numeric_limits::max()) { + return Status::Corruption("phrase_query: prx doc ordinal exceeds u32"); + } + out->push_back(static_cast(ordinal)); + return Status::OK(); +} + +Status append_selected_ordinal(size_t doc_index, const std::vector& prx_doc_ordinals, + std::vector* selected_ordinals) { + if (!prx_doc_ordinals.empty()) { + selected_ordinals->push_back(prx_doc_ordinals[doc_index]); + return Status::OK(); + } + return append_prx_doc_ordinal(doc_index, selected_ordinals); +} + +Status append_selected_doc(size_t doc_index, uint32_t docid, + const std::vector& prx_doc_ordinals, + std::vector* selected_docids, + std::vector* selected_ordinals) { + selected_docids->push_back(docid); + return append_selected_ordinal(doc_index, prx_doc_ordinals, selected_ordinals); +} + +Status materialize_selected_prefix(size_t count, size_t capacity, + const std::vector& docids, + const std::vector& prx_doc_ordinals, + std::vector* selected_docids, + std::vector* selected_ordinals) { + selected_docids->reserve(capacity); + selected_ordinals->reserve(capacity); + 
selected_docids->insert(selected_docids->end(), docids.begin(), docids.begin() + count); + for (size_t i = 0; i < count; ++i) { + SNII_RETURN_IF_ERROR(append_selected_ordinal(i, prx_doc_ordinals, selected_ordinals)); + } + return Status::OK(); +} + +Status materialize_selected_prefix_if_needed(bool* selected_all, size_t count, size_t capacity, + const std::vector& docids, + const std::vector& prx_doc_ordinals, + std::vector* selected_docids, + std::vector* selected_ordinals) { + if (!*selected_all) { + return Status::OK(); + } + *selected_all = false; + return materialize_selected_prefix(count, capacity, docids, prx_doc_ordinals, selected_docids, + selected_ordinals); +} + +Status SelectCandidateDocsForPrx(std::vector* docids, + std::vector* prx_doc_ordinals, uint32_t prx_doc_count, + const std::vector& candidates, PosChunk* chunk) { + chunk->docids.clear(); + chunk->prx_doc_ordinals.clear(); + if (prx_doc_count == 0 && docids->size() > std::numeric_limits::max()) { + return Status::Corruption("phrase_query: prx doc count exceeds u32"); + } + chunk->prx_doc_count = + prx_doc_count == 0 ? 
static_cast(docids->size()) : prx_doc_count; + if (docids->empty() || candidates.empty()) { + return Status::OK(); + } + if (!prx_doc_ordinals->empty() && prx_doc_ordinals->size() != docids->size()) { + return Status::Corruption("phrase_query: prx ordinal/docid count mismatch"); + } + + std::vector selected_docids; + std::vector selected_ordinals; + bool selected_all = true; + const size_t selected_capacity = std::min(docids->size(), candidates.size()); + + auto candidate_it = std::ranges::lower_bound(candidates, docids->front()); + size_t candidate_index = static_cast(candidate_it - candidates.begin()); + for (size_t doc_index = 0; doc_index < docids->size(); ++doc_index) { + const uint32_t docid = (*docids)[doc_index]; + while (candidate_index < candidates.size() && candidates[candidate_index] < docid) { + ++candidate_index; + } + if (candidate_index == candidates.size()) { + SNII_RETURN_IF_ERROR(materialize_selected_prefix_if_needed( + &selected_all, doc_index, selected_capacity, *docids, *prx_doc_ordinals, + &selected_docids, &selected_ordinals)); + break; + } + if (candidates[candidate_index] != docid) { + SNII_RETURN_IF_ERROR(materialize_selected_prefix_if_needed( + &selected_all, doc_index, selected_capacity, *docids, *prx_doc_ordinals, + &selected_docids, &selected_ordinals)); + continue; + } + + if (!selected_all) { + SNII_RETURN_IF_ERROR(append_selected_doc(doc_index, docid, *prx_doc_ordinals, + &selected_docids, &selected_ordinals)); + } + ++candidate_index; + } + + if (selected_all) { + chunk->docids = std::move(*docids); + chunk->prx_doc_ordinals = std::move(*prx_doc_ordinals); + docids->clear(); + prx_doc_ordinals->clear(); + return Status::OK(); + } + if (selected_docids.empty()) { + return Status::OK(); + } + chunk->docids = std::move(selected_docids); + chunk->prx_doc_ordinals = std::move(selected_ordinals); + return Status::OK(); +} + +Status BuildFlatPositionSource(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, 
DocidSource* doc_source, + const TermPlan& p, const std::vector& candidates, + std::vector>* owners, + PosSource* src) { + PosChunk chunk; + std::vector docids; + std::vector prx_doc_ordinals; + const bool docids_are_final_candidates = + doc_source->docids_are_final_candidates && !doc_source->chunks.empty(); + if (!doc_source->chunks.empty()) { + DocidChunk& doc_chunk = doc_source->chunks.front(); + docids = std::move(doc_chunk.docids); + prx_doc_ordinals = std::move(doc_chunk.prx_doc_ordinals); + chunk.prx_doc_count = doc_chunk.prx_doc_count; + } + if (p.pod_ref) { + uint64_t poff = 0; + uint64_t plen = 0; + SNII_RETURN_IF_ERROR(idx.resolve_prx_window(p.entry, p.prx_base, &poff, &plen)); + auto fetcher = std::make_unique(idx.reader()); + const size_t prx_handle = fetcher->add(poff, plen); + SNII_RETURN_IF_ERROR(fetcher->fetch()); + chunk.prx = fetcher->get(prx_handle); + owners->push_back(std::move(fetcher)); + } else { + chunk.prx = Slice(p.entry.prx_bytes); + } + if (docids.empty()) { + Slice dd; + if (p.pod_ref) { + dd = round1.get(p.frq_handle); + } else { + SNII_RETURN_IF_ERROR(internal::inline_dd_region(p.entry, &dd)); + } + SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd, p.entry.dd_meta, + /*win_base=*/0, &docids)); + if (docids.size() > std::numeric_limits::max()) { + return Status::Corruption("phrase_query: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(docids.size()); + } + if (docids_are_final_candidates) { + chunk.docids = std::move(docids); + chunk.prx_doc_ordinals = std::move(prx_doc_ordinals); + if (!chunk.docids.empty()) src->chunks.push_back(std::move(chunk)); + return Status::OK(); + } + SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx(&docids, &prx_doc_ordinals, chunk.prx_doc_count, + candidates, &chunk)); + if (!chunk.docids.empty()) src->chunks.push_back(std::move(chunk)); + return Status::OK(); +} + +bool ChunkMayContainCandidate(const DocidChunk& chunk, const std::vector& candidates) { + if (chunk.docids.empty() 
|| candidates.empty()) return false; + const auto it = std::lower_bound(candidates.begin(), candidates.end(), chunk.docids.front()); + return it != candidates.end() && *it <= chunk.docids.back(); +} + +Status DecodeWindowedPositionSource( + const LogicalIndexReader& idx, const TermPlan& p, DocidSource* doc_source, + const std::vector& candidates, + std::vector>* owners, PosSource* src) { + struct WindowFetch { + size_t chunk_index = 0; + size_t prx_handle = 0; + }; + + auto prx_fetcher = std::make_unique( + idx.reader(), snii::reader::kSameTermCoalesceGap); + std::vector fetched; + fetched.reserve(doc_source->chunks.size()); + for (size_t i = 0; i < doc_source->chunks.size(); ++i) { + DocidChunk& doc_chunk = doc_source->chunks[i]; + if (!doc_source->docids_are_final_candidates && + !ChunkMayContainCandidate(doc_chunk, candidates)) { + continue; + } + if (!doc_chunk.windowed) { + return Status::Corruption("phrase_query: expected windowed doc chunk"); + } + PosChunk chunk; + if (doc_source->docids_are_final_candidates) { + chunk.docids = std::move(doc_chunk.docids); + chunk.prx_doc_ordinals = std::move(doc_chunk.prx_doc_ordinals); + chunk.prx_doc_count = doc_chunk.prx_doc_count; + } else { + SNII_RETURN_IF_ERROR( + SelectCandidateDocsForPrx(&doc_chunk.docids, &doc_chunk.prx_doc_ordinals, + doc_chunk.prx_doc_count, candidates, &chunk)); + } + if (chunk.docids.empty()) continue; + + snii::reader::WindowAbsRange range; + SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( + idx, p.entry, p.frq_base, p.prx_base, p.prelude, doc_chunk.window, + /*want_positions=*/true, /*want_freq=*/false, &range)); + chunk.windowed = true; + chunk.window = doc_chunk.window; + WindowFetch f; + f.chunk_index = src->chunks.size(); + f.prx_handle = prx_fetcher->add(range.prx_off, range.prx_len); + fetched.push_back(f); + src->chunks.push_back(std::move(chunk)); + } + if (prx_fetcher->pending() > 0) SNII_RETURN_IF_ERROR(prx_fetcher->fetch()); + + for (const WindowFetch& f : fetched) { + 
src->chunks[f.chunk_index].prx = prx_fetcher->get(f.prx_handle); + } + if (!fetched.empty()) owners->push_back(std::move(prx_fetcher)); + return Status::OK(); +} + +Status BuildPositionSourcesForCandidates( + const LogicalIndexReader& idx, const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, std::vector* doc_sources, + const std::vector& candidates, + std::vector>* owners, + std::vector* srcs) { + srcs->assign(plans.size(), PosSource {}); + for (size_t i = 0; i < plans.size(); ++i) { + const TermPlan& p = plans[i]; + if (p.windowed) { + SNII_RETURN_IF_ERROR(DecodeWindowedPositionSource(idx, p, &(*doc_sources)[i], + candidates, owners, &(*srcs)[i])); + continue; + } + SNII_RETURN_IF_ERROR(BuildFlatPositionSource(idx, round1, &(*doc_sources)[i], p, candidates, + owners, &(*srcs)[i])); + } + return Status::OK(); +} + +class PosChunkDecoder { +public: + void reset() { + chunk_ = nullptr; + offsets_by_prx_ordinal_ = false; + } + + Status decode(const PosChunk& chunk) { + chunk_ = &chunk; + ByteSource ps(chunk.prx); + offsets_by_prx_ordinal_ = false; + if (chunk.prx_doc_ordinals.empty()) { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); + } else if (should_decode_full_prx_window(chunk)) { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); + offsets_by_prx_ordinal_ = true; + } else { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr_selective( + &ps, chunk.prx_doc_ordinals, &pflat_, &poff_)); + } + if (offsets_by_prx_ordinal_) { + if (poff_.size() != static_cast(chunk.prx_doc_count) + 1) { + return Status::Corruption("phrase_query: full prx doc-count mismatch"); + } + } else if (poff_.size() != chunk.docids.size() + 1) { + return Status::Corruption("phrase_query: selected prx/doc-count mismatch"); + } + if (poff_.back() > pflat_.size()) { + return Status::Corruption("phrase_query: prx final offset out of range"); + } + return Status::OK(); + } + + Status positions(size_t doc_index, 
std::pair* out) const { + if (chunk_ == nullptr || doc_index >= chunk_->docids.size()) { + return Status::Corruption("phrase_query: decoded chunk doc index out of range"); + } + const size_t pos_index = + offsets_by_prx_ordinal_ ? chunk_->prx_doc_ordinals[doc_index] : doc_index; + if (pos_index + 1 >= poff_.size()) { + return Status::Corruption("phrase_query: prx ordinal offset out of range"); + } + const uint32_t begin = poff_[pos_index]; + const uint32_t end = poff_[pos_index + 1]; + if (begin == end) { + *out = {nullptr, nullptr}; + return Status::OK(); + } + if (end > pflat_.size()) { + return Status::Corruption("phrase_query: prx offset out of range"); + } + *out = {pflat_.data() + begin, pflat_.data() + end}; + return Status::OK(); + } + + inline __attribute__((always_inline)) std::pair + positions_unchecked(size_t doc_index) const { + const size_t pos_index = + offsets_by_prx_ordinal_ ? chunk_->prx_doc_ordinals[doc_index] : doc_index; + const uint32_t begin = poff_[pos_index]; + const uint32_t end = poff_[pos_index + 1]; + if (begin == end) { + return {nullptr, nullptr}; + } + return {pflat_.data() + begin, pflat_.data() + end}; + } + +private: + static bool should_decode_full_prx_window(const PosChunk& chunk) { + return chunk.prx_doc_count != 0 && + static_cast(chunk.prx_doc_ordinals.size()) * 2 >= chunk.prx_doc_count; + } + + const PosChunk* chunk_ = nullptr; + bool offsets_by_prx_ordinal_ = false; + std::vector pflat_; + std::vector poff_; +}; + +// Streaming position cursor over one term's retained chunks. It advances ONLY +// forward (callers seek ascending candidate docids), decodes each chunk's +// docids once (reused from the conjunction phase) and each chunk's positions at +// most once (lazily, into a flat CSR whose capacity is retained across chunks). +// No per-doc allocation, no per-candidate docid binary search: positions are +// addressed by the doc's local index within its chunk. 
This is the read-side +// dual of the windowed posting layout -- the S3-native batch fetch already +// pulled every needed chunk into memory; the cursor is pure in-memory column +// iteration. +class PostingCursor { +public: + void init(const PosSource* src) { + src_ = src; + ci_ = 0; + li_ = 0; + decoded_pos_chunk_ = kNoChunk; + decoder_.reset(); + } + + // Positions the cursor at `target` (guaranteed present: candidates are the + // intersection of exactly these chunks' docids). Monotonic forward advance. + Status seek(uint32_t target) { + while (ci_ < src_->chunks.size() && + (src_->chunks[ci_].docids.empty() || src_->chunks[ci_].docids.back() < target)) { + ++ci_; + li_ = 0; + } + if (ci_ >= src_->chunks.size()) { + return Status::Corruption("phrase_query: cursor exhausted before target docid"); + } + const std::vector& d = src_->chunks[ci_].docids; + while (li_ < d.size() && d[li_] < target) ++li_; + if (li_ >= d.size() || d[li_] != target) { + return Status::Corruption("phrase_query: candidate missing from posting chunk"); + } + return Status::OK(); + } + + // [begin,end) of the current doc's positions, decoding the current chunk's + // .prx exactly once (cached). Must follow a seek that landed on a real doc. 
+ Status positions(std::pair* out) { + if (ci_ >= src_->chunks.size() || li_ >= src_->chunks[ci_].docids.size()) { + return Status::Corruption("phrase_query: cursor positions out of range"); + } + if (decoded_pos_chunk_ != ci_) { + SNII_RETURN_IF_ERROR(decoder_.decode(src_->chunks[ci_])); + decoded_pos_chunk_ = ci_; + } + return decoder_.positions(li_, out); + } + + Status next(uint32_t* docid, std::pair* out) { + while (ci_ < src_->chunks.size() && + (src_->chunks[ci_].docids.empty() || li_ >= src_->chunks[ci_].docids.size())) { + ++ci_; + li_ = 0; + } + if (ci_ >= src_->chunks.size()) { + return Status::Corruption("phrase_query: cursor exhausted before next docid"); + } + *docid = src_->chunks[ci_].docids[li_]; + SNII_RETURN_IF_ERROR(positions(out)); + ++li_; + return Status::OK(); + } + +private: + static constexpr size_t kNoChunk = static_cast(-1); + + const PosSource* src_ = nullptr; + size_t ci_ = 0; // current chunk + size_t li_ = 0; // current local doc index within the chunk + size_t decoded_pos_chunk_ = kNoChunk; // which chunk decoder_ currently holds + PosChunkDecoder decoder_; +}; + +class PhrasePositionLoader { +public: + PhrasePositionLoader(size_t plan_count, std::vector& srcs) + : cursors_(plan_count), plan_spans_(plan_count), loaded_epoch_(plan_count, 0) { + for (size_t i = 0; i < plan_count; ++i) { + cursors_[i].init(&srcs[i]); + } + } + + void begin_doc(uint32_t docid) { + docid_ = docid; + ++epoch_; + if (epoch_ == 0) { + std::ranges::fill(loaded_epoch_, 0); + epoch_ = 1; + } + } + + Status positions_for_phrase_pos(const std::vector& phrase_plan_index, size_t phrase_pos, + std::pair* out) { + const size_t plan_index = phrase_plan_index[phrase_pos]; + if (loaded_epoch_[plan_index] != epoch_) { + SNII_RETURN_IF_ERROR(cursors_[plan_index].seek(docid_)); + SNII_RETURN_IF_ERROR(cursors_[plan_index].positions(&plan_spans_[plan_index])); + loaded_epoch_[plan_index] = epoch_; + } + *out = plan_spans_[plan_index]; + return Status::OK(); + } + +private: 
+    std::vector cursors_;
+    std::vector> plan_spans_;
+    std::vector loaded_epoch_;
+    uint32_t docid_ = 0;
+    uint32_t epoch_ = 0;
+};
+
+// Returns true when some position p in left_span has p + right_delta present
+// in right_span. Each span is a [first, second) pointer range. The merge only
+// ever moves `left` and `right` forward, so it assumes both spans are sorted
+// ascending -- TODO confirm against the position decoder.
+bool ContainsTwoTermPhrase(std::pair left_span,
+                           std::pair right_span,
+                           uint32_t right_delta) {
+    const uint32_t* left = left_span.first;
+    const uint32_t* right = right_span.first;
+    // Largest left position whose partner (left + right_delta) still fits in uint32_t.
+    const uint32_t max_start = std::numeric_limits::max() - right_delta;
+    while (left != left_span.second && right != right_span.second) {
+        if (*left > max_start) {
+            // The partner position would wrap around; stop rather than overflow.
+            return false;
+        }
+        const uint32_t want = *left + right_delta;
+        // Skip right positions that lie before the wanted partner position.
+        while (right != right_span.second && *right < want) {
+            ++right;
+        }
+        if (right == right_span.second) {
+            return false;
+        }
+        if (*right == want) {
+            return true;
+        }
+        ++left;
+    }
+    return false;
+}
+
+// Chooses which adjacent pair of phrase slots (left, left + 1) to verify first:
+// the pair whose two plans have the smallest combined `df`. Returns the left
+// slot index. Ties keep the earliest pair (strict `<`). Returns 0 when the
+// phrase has fewer than two slots, because the loop body never runs.
+size_t SelectPhraseVerificationPair(const std::vector& plans,
+                                    const std::vector& phrase_plan_index) {
+    size_t best_left = 0;
+    uint64_t best_score = std::numeric_limits::max();
+    for (size_t left = 0; left + 1 < phrase_plan_index.size(); ++left) {
+        // Sum of the two document frequencies, accumulated in a uint64_t.
+        const uint64_t score = static_cast(plans[phrase_plan_index[left]].df) +
+                               plans[phrase_plan_index[left + 1]].df;
+        if (score < best_score) {
+            best_score = score;
+            best_left = left;
+        }
+    }
+    return best_left;
+}
+
+// Rebuilds `starts` (cleared first) with one entry per left position whose
+// partner *left + right_delta occurs in right_span. The stored value is
+// *left - left_offset; left positions smaller than left_offset are skipped
+// because that subtraction would underflow. Uses the same forward-only merge as
+// ContainsTwoTermPhrase, but records every match instead of stopping at the
+// first one.
+void CollectTwoTermPhraseStarts(std::pair left_span,
+                                std::pair right_span,
+                                uint32_t right_delta, uint32_t left_offset,
+                                std::vector& starts) {
+    starts.clear();
+    const uint32_t* left = left_span.first;
+    const uint32_t* right = right_span.first;
+    // Largest left position whose partner still fits in uint32_t.
+    const uint32_t max_left = std::numeric_limits::max() - right_delta;
+    while (left != left_span.second && right != right_span.second) {
+        if (*left > max_left) {
+            return;
+        }
+        const uint32_t want = *left + right_delta;
+        while (right != right_span.second && *right < want) {
+            ++right;
+        }
+        if (right == right_span.second) {
+            return;
+        }
+        if (*right == want && *left >= left_offset) {
+            starts.push_back(*left - left_offset);
+        }
+        ++left;
+    }
+}
+
+Status EmitTwoTermPhraseStreaming(const std::vector&
phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, + const std::vector& candidates, + std::vector* docids) { + const size_t left_plan = phrase_plan_index[0]; + const size_t right_plan = phrase_plan_index[1]; + const uint32_t right_delta = position_offsets[1] - position_offsets[0]; + + if (left_plan == right_plan) { + PostingCursor cursor; + cursor.init(&srcs[left_plan]); + for (uint32_t expected_docid : candidates) { + uint32_t docid = 0; + std::pair span; + SNII_RETURN_IF_ERROR(cursor.next(&docid, &span)); + if (docid != expected_docid) { + return Status::Corruption("phrase_query: repeated-term cursor/docid mismatch"); + } + if (ContainsTwoTermPhrase(span, span, right_delta)) { + docids->push_back(docid); + } + } + return Status::OK(); + } + + PostingCursor left_cursor; + PostingCursor right_cursor; + left_cursor.init(&srcs[left_plan]); + right_cursor.init(&srcs[right_plan]); + for (uint32_t expected_docid : candidates) { + uint32_t left_docid = 0; + uint32_t right_docid = 0; + std::pair left_span; + std::pair right_span; + SNII_RETURN_IF_ERROR(left_cursor.next(&left_docid, &left_span)); + SNII_RETURN_IF_ERROR(right_cursor.next(&right_docid, &right_span)); + if (left_docid != expected_docid || right_docid != expected_docid) { + return Status::Corruption("phrase_query: two-term cursor/docid mismatch"); + } + if (ContainsTwoTermPhrase(left_span, right_span, right_delta)) { + docids->push_back(expected_docid); + } + } + return Status::OK(); +} + +void EmitTwoTermPhraseChunkPair(const PosChunk& left, const PosChunk& right, + const PosChunkDecoder& left_decoder, + const PosChunkDecoder& right_decoder, uint32_t right_delta, + std::vector& docids) { + size_t li = static_cast( + std::lower_bound(left.docids.begin(), left.docids.end(), right.docids.front()) - + left.docids.begin()); + size_t ri = static_cast( + std::lower_bound(right.docids.begin(), right.docids.end(), left.docids.front()) - + right.docids.begin()); + while (li < 
left.docids.size() && ri < right.docids.size()) { + const uint32_t left_docid = left.docids[li]; + const uint32_t right_docid = right.docids[ri]; + if (left_docid < right_docid) { + ++li; + continue; + } + if (right_docid < left_docid) { + ++ri; + continue; + } + + const std::pair left_span = + left_decoder.positions_unchecked(li); + const std::pair right_span = + right_decoder.positions_unchecked(ri); + if (ContainsTwoTermPhrase(left_span, right_span, right_delta)) { + docids.push_back(left_docid); + } + ++li; + ++ri; + } +} + +Status EmitTwoTermPhraseChunkMerge(const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, + std::vector* const docids) { + const size_t left_plan = phrase_plan_index[0]; + const size_t right_plan = phrase_plan_index[1]; + const uint32_t right_delta = position_offsets[1] - position_offsets[0]; + const PosSource& left_src = srcs[left_plan]; + const PosSource& right_src = srcs[right_plan]; + + PosChunkDecoder left_decoder; + PosChunkDecoder right_decoder; + size_t decoded_left_chunk = static_cast(-1); + size_t decoded_right_chunk = static_cast(-1); + size_t left_chunk = 0; + size_t right_chunk = 0; + while (left_chunk < left_src.chunks.size() && right_chunk < right_src.chunks.size()) { + const PosChunk& left = left_src.chunks[left_chunk]; + const PosChunk& right = right_src.chunks[right_chunk]; + if (left.docids.empty()) { + ++left_chunk; + continue; + } + if (right.docids.empty()) { + ++right_chunk; + continue; + } + if (left.docids.back() < right.docids.front()) { + ++left_chunk; + continue; + } + if (right.docids.back() < left.docids.front()) { + ++right_chunk; + continue; + } + + if (decoded_left_chunk != left_chunk) { + SNII_RETURN_IF_ERROR(left_decoder.decode(left)); + decoded_left_chunk = left_chunk; + } + if (decoded_right_chunk != right_chunk) { + SNII_RETURN_IF_ERROR(right_decoder.decode(right)); + decoded_right_chunk = right_chunk; + } + + EmitTwoTermPhraseChunkPair(left, right, 
left_decoder, right_decoder, right_delta, *docids); + + const uint32_t left_last = left.docids.back(); + const uint32_t right_last = right.docids.back(); + if (left_last <= right_last) { + ++left_chunk; + } + if (right_last <= left_last) { + ++right_chunk; + } + } + return Status::OK(); +} + +bool PhraseStartMatchesAllTerms( + uint32_t start, size_t phrase_len, size_t pair_left, size_t pair_right, + const std::vector& position_offsets, + const std::vector>& span) { + for (size_t t = 0; t < phrase_len; ++t) { + if (t == pair_left || t == pair_right) { + continue; + } + uint32_t want = 0; + if (!internal::add_position_offset(start, position_offsets[t], &want)) { + return false; + } + if (!std::binary_search(span[t].first, span[t].second, want)) { + return false; + } + } + return true; +} + +Status EmitSingleTermPhraseStreaming(const std::vector& phrase_plan_index, + std::vector& srcs, + const std::vector& candidates, + std::vector* docids) { + PhrasePositionLoader loader(srcs.size(), srcs); + for (uint32_t d : candidates) { + loader.begin_doc(d); + std::pair single_span; + SNII_RETURN_IF_ERROR(loader.positions_for_phrase_pos(phrase_plan_index, 0, &single_span)); + if (single_span.first != single_span.second) { + docids->push_back(d); + } + } + return Status::OK(); +} + +Status EmitMultiTermPhraseStreaming(const std::vector& plans, + const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, + const std::vector& candidates, + std::vector* docids) { + const size_t phrase_len = phrase_plan_index.size(); + PhrasePositionLoader loader(plans.size(), srcs); + std::vector> span(phrase_len); + std::vector starts; + const size_t pair_left = SelectPhraseVerificationPair(plans, phrase_plan_index); + const size_t pair_right = pair_left + 1; + for (uint32_t d : candidates) { + loader.begin_doc(d); + std::pair left_span; + std::pair right_span; + SNII_RETURN_IF_ERROR( + loader.positions_for_phrase_pos(phrase_plan_index, pair_left, 
&left_span)); + SNII_RETURN_IF_ERROR( + loader.positions_for_phrase_pos(phrase_plan_index, pair_right, &right_span)); + + CollectTwoTermPhraseStarts(left_span, right_span, + position_offsets[pair_right] - position_offsets[pair_left], + position_offsets[pair_left], starts); + if (starts.empty()) { + continue; + } + + span[pair_left] = left_span; + span[pair_right] = right_span; + for (size_t pp = 0; pp < phrase_len; ++pp) { + if (pp == pair_left || pp == pair_right) { + continue; + } + SNII_RETURN_IF_ERROR(loader.positions_for_phrase_pos(phrase_plan_index, pp, &span[pp])); + } + + for (uint32_t start : starts) { + if (PhraseStartMatchesAllTerms(start, phrase_len, pair_left, pair_right, + position_offsets, span)) { + docids->push_back(d); + break; + } + } + } + return Status::OK(); +} + +// Single streaming pass over the candidates: for each (ascending) candidate, +// gather positions lazily, and test the consecutive-phrase predicate +// (term[0]@p, term[1]@p+1, ...). Multi-term phrases first test the cheapest +// adjacent pair by df before decoding the remaining terms for that document. +// Cursors decode each retained chunk at most once and address positions by +// local index -- no per-candidate docid binary search, no full-candidate +// position materialization. Candidates are ascending so the emitted docids are +// already sorted. 
+Status EmitPhraseStreaming(const std::vector& plans,
+                           const std::vector& phrase_plan_index,
+                           const std::vector& position_offsets,
+                           std::vector& srcs, const std::vector& candidates,
+                           std::vector* docids) {
+    // Dispatch on the number of phrase slots.
+    const size_t phrase_len = phrase_plan_index.size();
+    if (phrase_len == 1) {
+        return EmitSingleTermPhraseStreaming(phrase_plan_index, srcs, candidates, docids);
+    }
+    if (phrase_len == 2) {
+        if (phrase_plan_index[0] != phrase_plan_index[1]) {
+            // Two distinct plans: merge their chunk lists directly. Note that
+            // this path is not handed `candidates`.
+            return EmitTwoTermPhraseChunkMerge(phrase_plan_index, position_offsets, srcs, docids);
+        }
+        // Both slots map to the same plan (a repeated term).
+        return EmitTwoTermPhraseStreaming(phrase_plan_index, position_offsets, srcs, candidates,
+                                          docids);
+    }
+    return EmitMultiTermPhraseStreaming(plans, phrase_plan_index, position_offsets, srcs,
+                                        candidates, docids);
+}
+
+// Prepares everything phrase verification needs, in this order: fetch any
+// ranges still pending on `round1`, open the preludes for `plans`, run the
+// docid-only conjunction into state->candidates, and -- only when candidates
+// remain -- build the position sources (state->owners / state->srcs) for them.
+// state->owners and state->candidates are cleared up front; state->srcs is only
+// written by BuildPositionSourcesForCandidates, so it is left untouched when
+// the conjunction comes back empty.
+Status BuildPhraseExecutionState(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1,
+                                 std::vector* plans, PhraseExecutionState* state) {
+    if (round1->pending() > 0) SNII_RETURN_IF_ERROR(round1->fetch());
+    SNII_RETURN_IF_ERROR(internal::open_preludes(*round1, plans,
+                                                 /*need_positions=*/true));
+
+    state->owners.clear();
+    state->candidates.clear();
+    std::vector doc_sources;
+    SNII_RETURN_IF_ERROR(internal::build_docid_only_conjunction(idx, *round1, *plans,
+                                                                &state->candidates, &doc_sources));
+    // Nothing survived the conjunction: skip building position sources.
+    if (state->candidates.empty()) return Status::OK();
+    SNII_RETURN_IF_ERROR(BuildPositionSourcesForCandidates(
+            idx, *round1, *plans, &doc_sources, state->candidates, &state->owners, &state->srcs));
+    return Status::OK();
+}
+
+// Builds the execution state for `plans`; returns OK without touching `docids`
+// when the docid conjunction is empty.
+Status ExecutePhrasePlans(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1,
+                          std::vector* plans,
+                          const std::vector& phrase_plan_index,
+                          std::vector* docids) {
+    PhraseExecutionState state;
+    SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, round1, plans, &state));
+    if (state.candidates.empty()) return Status::OK();
+
+    std::vector position_offsets;
+    if (!internal::build_position_offsets(phrase_plan_index.size(), &position_offsets)) {
+        return
Status::InvalidArgument("phrase_query: phrase length exceeds doc position range"); + } + return EmitPhraseStreaming(*plans, phrase_plan_index, position_offsets, state.srcs, + state.candidates, docids); +} + +Status ExecuteResolvedPhraseTerms(const LogicalIndexReader& idx, + const std::vector& terms, + std::vector* docids) { + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, terms, &round1, &plans, + /*need_positions=*/false)); + std::vector phrase_plan_index(terms.size()); + std::iota(phrase_plan_index.begin(), phrase_plan_index.end(), 0); + return ExecutePhrasePlans(idx, &round1, &plans, phrase_plan_index, docids); +} + +Status CollectExpectedTailPositions(const std::vector& plans, + const std::vector& position_offsets, + std::vector& srcs, + const std::vector& candidates, + ExpectedTailPositionSet* out) { + const size_t n = plans.size(); + std::vector cur(n); + for (size_t i = 0; i < n; ++i) cur[i].init(&srcs[i]); + + std::vector ordered(n); + for (size_t i = 0; i < n; ++i) ordered[plans[i].order] = &cur[i]; + + std::vector> span(n); + for (uint32_t d : candidates) { + for (size_t i = 0; i < n; ++i) SNII_RETURN_IF_ERROR(cur[i].seek(d)); + for (size_t pp = 0; pp < n; ++pp) { + SNII_RETURN_IF_ERROR(ordered[pp]->positions(&span[pp])); + } + + const size_t expected_begin = out->positions.size(); + for (const uint32_t* p = span[0].first; p != span[0].second; ++p) { + const uint32_t start = *p; + bool ok = true; + for (size_t t = 1; t < n; ++t) { + uint32_t want = 0; + if (!internal::add_position_offset(start, position_offsets[t], &want)) { + ok = false; + break; + } + if (!std::binary_search(span[t].first, span[t].second, want)) { + ok = false; + break; + } + } + uint32_t tail_pos = 0; + if (ok && internal::add_position_offset(start, position_offsets[n], &tail_pos)) { + out->positions.push_back(tail_pos); + } + } + const size_t expected_end = out->positions.size(); + if (expected_end != 
expected_begin) { + out->docs.push_back({d, expected_begin, expected_end}); + } + } + return Status::OK(); +} + +Status CollectSingleTermExpectedTailPositions(std::vector& srcs, + const std::vector& candidates, + uint32_t tail_offset, ExpectedTailPositionSet* out) { + PostingCursor cursor; + cursor.init(srcs.data()); + out->reserve_docs(out->docs.size() + candidates.size()); + + for (uint32_t d : candidates) { + SNII_RETURN_IF_ERROR(cursor.seek(d)); + std::pair span; + SNII_RETURN_IF_ERROR(cursor.positions(&span)); + + const size_t expected_begin = out->positions.size(); + for (const uint32_t* p = span.first; p != span.second; ++p) { + uint32_t tail_pos = 0; + if (internal::add_position_offset(*p, tail_offset, &tail_pos)) { + out->positions.push_back(tail_pos); + } + } + const size_t expected_end = out->positions.size(); + if (expected_end != expected_begin) { + out->docs.push_back({d, expected_begin, expected_end}); + } + } + return Status::OK(); +} + +Status CollectExpectedTailPositions(const LogicalIndexReader& idx, + const std::vector& exact_terms, + ExpectedTailPositionSet* out) { + out->clear(); + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, exact_terms, &round1, &plans, + /*need_positions=*/false)); + + PhraseExecutionState state; + SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state)); + if (state.candidates.empty()) return Status::OK(); + out->reserve_docs(state.candidates.size()); + std::vector position_offsets; + if (!internal::build_position_offsets(plans.size() + 1, &position_offsets)) { + return Status::InvalidArgument( + "phrase_prefix_query: phrase length exceeds doc position range"); + } + if (plans.size() == 1) { + return CollectSingleTermExpectedTailPositions(state.srcs, state.candidates, + position_offsets[1], out); + } + return CollectExpectedTailPositions(plans, position_offsets, state.srcs, state.candidates, out); +} + +bool 
contains_any_position(const ExpectedTailPositionSet& expected, + const ExpectedTailPositions& wanted, + std::pair actual) { + for (size_t i = wanted.positions_begin; i < wanted.positions_end; ++i) { + if (std::binary_search(actual.first, actual.second, expected.positions[i])) { + return true; + } + } + return false; +} + +Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx, + const ResolvedQueryTerm& tail, + const ExpectedTailPositionSet& expected, + std::vector* out) { + if (expected.docs.empty()) { + return Status::OK(); + } + + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, {tail}, &round1, &plans, + /*need_positions=*/false)); + + PhraseExecutionState state; + SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state)); + if (state.candidates.empty()) return Status::OK(); + + PostingCursor cursor; + cursor.init(&state.srcs[0]); + size_t ei = 0; + size_t ti = 0; + while (ei < expected.docs.size() && ti < state.candidates.size()) { + const uint32_t want_doc = expected.docs[ei].docid; + const uint32_t tail_doc = state.candidates[ti]; + if (want_doc < tail_doc) { + ++ei; + continue; + } + if (tail_doc < want_doc) { + ++ti; + continue; + } + + SNII_RETURN_IF_ERROR(cursor.seek(want_doc)); + std::pair actual; + SNII_RETURN_IF_ERROR(cursor.positions(&actual)); + if (contains_any_position(expected, expected.docs[ei], actual)) { + out->push_back(want_doc); + } + ++ei; + ++ti; + } + return Status::OK(); +} + +} // namespace + +Status phrase_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* const docids) { + if (docids == nullptr) { + return Status::InvalidArgument("phrase_query: null out"); + } + docids->clear(); + if (terms.empty()) { + return Status::OK(); + } + if (terms.size() == 1) { + return term_query(idx, terms.front(), docids); + } + if (!idx.has_positions()) { + return Status::Unsupported("phrase_query: index has 
no positions"); + } + + // Round 1: preludes (windowed) + docid postings (slim/inline) batched + // together. Positions are fetched after the docid-only conjunction has + // produced final candidates, so phrase verification does not read PRX for + // windows later removed by the docid intersection. + snii::io::BatchRangeFetcher round1(idx.reader()); + const PhraseTermMapping mapping = BuildPhraseTermMapping(terms); + std::vector plans; + bool all_present = false; + SNII_RETURN_IF_ERROR(internal::plan_terms(idx, mapping.unique_terms, &round1, &plans, + &all_present, + /*need_positions=*/false)); + if (!all_present) return Status::OK(); + return ExecutePhrasePlans(idx, &round1, &plans, mapping.phrase_plan_index, docids); +} + +Status phrase_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* const docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return phrase_query(idx, terms, docids); +} + +Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("phrase_prefix_query: null out"); + } + docids->clear(); + if (terms.empty()) { + return Status::OK(); + } + if (terms.size() == 1) { + return prefix_query(idx, terms.front(), docids, max_expansions); + } + if (!idx.has_positions()) { + return Status::Unsupported("phrase_prefix_query: index has no positions"); + } + + std::vector exact_terms; + exact_terms.reserve(terms.size() - 1); + for (size_t i = 0; i + 1 < terms.size(); ++i) { + ResolvedQueryTerm resolved; + bool found = false; + SNII_RETURN_IF_ERROR(internal::resolve_query_term(idx, terms[i], &resolved, &found)); + if (!found) { + return Status::OK(); + } + exact_terms.push_back(std::move(resolved)); + } + + std::vector tail_hits; + SNII_RETURN_IF_ERROR(idx.prefix_terms(terms.back(), &tail_hits, max_expansions)); + if (tail_hits.empty()) { + return 
Status::OK(); + } + if (tail_hits.size() == 1) { + std::vector resolved_terms = exact_terms; + resolved_terms.push_back(ResolvedQueryTerm {std::move(tail_hits.front().entry), + tail_hits.front().frq_base, + tail_hits.front().prx_base}); + return ExecuteResolvedPhraseTerms(idx, resolved_terms, docids); + } + + ExpectedTailPositionSet expected; + SNII_RETURN_IF_ERROR(CollectExpectedTailPositions(idx, exact_terms, &expected)); + if (expected.docs.empty()) { + return Status::OK(); + } + + std::vector acc; + for (LogicalIndexReader::PrefixHit& hit : tail_hits) { + ResolvedQueryTerm tail {std::move(hit.entry), hit.frq_base, hit.prx_base}; + std::vector tail_docs; + SNII_RETURN_IF_ERROR( + CollectTailMatchesAtExpectedPositions(idx, tail, expected, &tail_docs)); + internal::union_sorted_into(&acc, tail_docs); + } + *docids = std::move(acc); + return Status::OK(); +} + +Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { + QueryProfileScope profile_scope(idx.reader(), profile); + return phrase_prefix_query(idx, terms, docids, max_expansions); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/prefix_query.cpp b/be/src/storage/index/snii/core/src/query/prefix_query.cpp new file mode 100644 index 00000000000000..4ad9b6629bdf77 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/prefix_query.cpp @@ -0,0 +1,47 @@ +#include "snii/query/prefix_query.h" + +#include +#include + +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/query/internal/docid_union.h" + +namespace snii::query { + +using snii::reader::LogicalIndexReader; + +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("prefix_query: null out"); + } + docids->clear(); + VectorDocIdSink sink(*docids); + return 
prefix_query(idx, prefix, &sink, max_expansions); +} + +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { + QueryProfileScope profile_scope(idx.reader(), profile); + return prefix_query(idx, prefix, docids, max_expansions); +} + +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, DocIdSink* const sink, + int32_t max_expansions) { + if (sink == nullptr) { + return Status::InvalidArgument("prefix_query: null sink"); + } + + std::vector hits; + SNII_RETURN_IF_ERROR(idx.prefix_terms(prefix, &hits, max_expansions)); + + std::vector postings; + postings.reserve(hits.size()); + for (LogicalIndexReader::PrefixHit& hit : hits) { + postings.push_back({std::move(hit.entry), hit.frq_base, hit.prx_base}); + } + return internal::emit_docid_union(idx, postings, sink); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/query_profile.cpp b/be/src/storage/index/snii/core/src/query/query_profile.cpp new file mode 100644 index 00000000000000..9ecd333cb231ed --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/query_profile.cpp @@ -0,0 +1,46 @@ +#include "snii/query/query_profile.h" + +#include +#include + +#include "snii/io/file_reader.h" + +namespace snii::query { + +QueryProfileScope::QueryProfileScope(snii::io::FileReader* reader, QueryProfile* profile) + : reader_(reader), profile_(profile), start_(std::chrono::steady_clock::now()) { + if (profile_ == nullptr) return; + + *profile_ = QueryProfile {}; + if (reader_ == nullptr) return; + + const snii::io::IoMetrics* metrics = reader_->io_metrics(); + if (metrics == nullptr) return; + + profile_->has_io_metrics = true; + profile_->io_before = *metrics; +} + +QueryProfileScope::~QueryProfileScope() { + finish(); +} + +void QueryProfileScope::finish() { + if (profile_ == nullptr || finished_) return; + finished_ = true; + + const auto end = 
std::chrono::steady_clock::now();
+    const auto elapsed = std::chrono::duration_cast(end - start_).count();
+    profile_->elapsed_ns = std::max(1, static_cast(elapsed));
+
+    if (!profile_->has_io_metrics || reader_ == nullptr) return;
+    const snii::io::IoMetrics* metrics = reader_->io_metrics();
+    if (metrics == nullptr) {
+        profile_->has_io_metrics = false;
+        return;
+    }
+    profile_->io_after = *metrics;
+    profile_->io_delta = snii::io::delta(profile_->io_after, profile_->io_before);
+}
+
+} // namespace snii::query
diff --git a/be/src/storage/index/snii/core/src/query/regexp_query.cpp b/be/src/storage/index/snii/core/src/query/regexp_query.cpp
new file mode 100644
index 00000000000000..13377732b17201
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/query/regexp_query.cpp
@@ -0,0 +1,91 @@
+#include "snii/query/regexp_query.h"
+
+#include
+#include
+#include
+#include
+
+#include "snii/query/internal/term_expansion.h"
+
+namespace snii::query {
+
+namespace {
+
+bool is_regex_metachar(char c) {
+    switch (c) {
+    case '.':
+    case '^':
+    case '$':
+    case '|':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '*':
+    case '+':
+    case '?':
+    case '{':
+    case '}':
+    case '\\':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Returns true when `pattern` contains an alternation bar at nesting depth
+// zero: a '|' that is not escaped, not inside a [...] class and not inside a
+// (...) group. Such a pattern can match terms that share no prefix with its
+// first branch ("foo|bar" matches "bar").
+bool has_top_level_alternation(std::string_view pattern) {
+    size_t depth = 0;
+    bool in_class = false;
+    for (size_t i = 0; i < pattern.size(); ++i) {
+        const char c = pattern[i];
+        if (c == '\\') {
+            ++i; // the escaped character is never structural
+            continue;
+        }
+        if (in_class) {
+            // Inside [...] every character is literal until the closing ']'.
+            in_class = (c != ']');
+            continue;
+        }
+        if (c == '[') {
+            in_class = true;
+        } else if (c == '(') {
+            ++depth;
+        } else if (c == ')') {
+            if (depth > 0) --depth;
+        } else if (c == '|' && depth == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Returns a literal string that every term fully matched by `pattern` must
+// start with; the caller passes it as the enumeration prefix for candidate
+// terms. An empty result means "no usable prefix". A leading '^' is skipped,
+// and literals are collected up to the first regex metacharacter.
+//
+// Two cases must shrink the prefix, otherwise matching terms are never
+// enumerated and the query silently loses results:
+//   * top-level alternation ("foo|bar"): no prefix covers both branches;
+//   * a '*', '?' or '{' right after the last literal ("ab*", "colou?r",
+//     "ab{0,2}"): that literal may occur zero times, so it is dropped.
+// '+' keeps the literal because it requires at least one occurrence.
+std::string literal_prefix_for_regex(std::string_view pattern) {
+    if (has_top_level_alternation(pattern)) {
+        return {};
+    }
+    std::string out;
+    size_t i = 0;
+    if (!pattern.empty() && pattern.front() == '^') {
+        i = 1;
+    }
+    for (; i < pattern.size(); ++i) {
+        const char c = pattern[i];
+        if (is_regex_metachar(c)) {
+            // The quantifier applies to the preceding literal only; "{m,n}" is
+            // treated conservatively as possibly allowing zero repetitions.
+            if ((c == '*' || c == '?' || c == '{') && !out.empty()) {
+                out.pop_back();
+            }
+            break;
+        }
+        out.push_back(c);
+    }
+    return out;
+}
+
+} // namespace
+
+Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern,
+                    std::vector* const docids, int32_t max_expansions) {
+    if (docids == nullptr) {
+        return Status::InvalidArgument("regexp_query: null out");
+    }
+    docids->clear();
+    VectorDocIdSink sink(*docids);
+    return regexp_query(idx, pattern, &sink, max_expansions);
+}
+
+Status regexp_query(const snii::reader::LogicalIndexReader&
idx, std::string_view pattern, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { + QueryProfileScope profile_scope(idx.reader(), profile); + return regexp_query(idx, pattern, docids, max_expansions); +} + +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* const sink, int32_t max_expansions) { + if (sink == nullptr) { + return Status::InvalidArgument("regexp_query: null sink"); + } + + std::regex re; + try { + re = std::regex(std::string(pattern)); + } catch (const std::regex_error& e) { + return Status::InvalidArgument(std::string("regexp_query: invalid regex: ") + e.what()); + } + + const std::string enum_prefix = literal_prefix_for_regex(pattern); + return internal::emit_expanded_docid_union( + idx, enum_prefix, + [&re](std::string_view term) { return std::regex_match(term.begin(), term.end(), re); }, + sink, max_expansions); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/scoring_query.cpp b/be/src/storage/index/snii/core/src/query/scoring_query.cpp new file mode 100644 index 00000000000000..4813b3560ca7d7 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/scoring_query.cpp @@ -0,0 +1,684 @@ +#include "snii/query/scoring_query.h" + +#include +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/reader/windowed_posting.h" + +namespace snii::query { + +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeReader; +using snii::format::WindowMeta; +using snii::reader::LogicalIndexReader; + +namespace { + +// One scored posting for one term in one doc. 
+struct TermPosting {
+    uint32_t docid = 0;
+    double score = 0.0;
+};
+
+// One window's block-max upper bound and the docid range it covers. block_max is
+// true when max_score came from the frq_prelude columns (vs the exact-score
+// fallback); both are valid upper bounds, so it is informational only.
+struct WindowBound {
+    uint32_t first_docid = 0; // inclusive
+    uint32_t last_docid = 0;  // inclusive
+    double max_score = 0.0;   // block-max upper bound for any doc in this window
+    bool block_max = false;
+};
+
+// All scored postings of one query term plus its block-max metadata.
+struct TermCursor {
+    std::vector postings; // ascending docid, exact per-doc scores
+    std::vector windows;  // ascending, covering all postings
+    size_t pos = 0;       // DAAT cursor into postings
+};
+
+// Docid the cursor currently points at, or the maximum representable value as
+// an "exhausted" sentinel once `pos` has run past the last posting.
+uint32_t CurrentDoc(const TermCursor& c) {
+    return c.pos < c.postings.size() ? c.postings[c.pos].docid
+                                     : std::numeric_limits::max();
+}
+
+// Reads one slim .frq window's bytes for a slim pod_ref/inline entry (prelude
+// stripped). Windowed entries are handled separately via the prelude decode.
+//
+// On success `*window` refers to one of two backing stores:
+//   * inline entries: entry.frq_bytes (no I/O; `window_owned` is not touched);
+//   * otherwise: `*window_owned`, which receives a copy of the fetched range.
+// NOTE(review): presumably Slice is a non-owning view, so the caller must keep
+// `entry` / `*window_owned` alive while it uses `*window` -- confirm.
+Status FetchSlimWindowBytes(const LogicalIndexReader& idx, const DictEntry& entry,
+                            uint64_t frq_base, std::vector* window_owned, Slice* window) {
+    if (entry.kind == DictEntryKind::kInline) {
+        *window = Slice(entry.frq_bytes);
+        return Status::OK();
+    }
+    uint64_t win_abs = 0;
+    uint64_t win_len = 0;
+    SNII_RETURN_IF_ERROR(idx.resolve_frq_window(entry, frq_base, &win_abs, &win_len));
+    snii::io::BatchRangeFetcher fetcher(idx.reader());
+    const size_t h = fetcher.add(win_abs, win_len);
+    SNII_RETURN_IF_ERROR(fetcher.fetch());
+    Slice got = fetcher.get(h);
+    // Copy the bytes out before `fetcher` (a local) goes out of scope.
+    window_owned->assign(got.data(), got.data() + got.size());
+    *window = Slice(*window_owned);
+    return Status::OK();
+}
+
+// Reads a windowed entry's frq_prelude (block-max columns live here).
+Status FetchPrelude(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                    FrqPreludeReader* out) {
+    const auto& region = idx.section_refs().posting_region;
+    // Prelude offset = posting region offset + frq_base + the entry's own
+    // delta; the result is handed straight to the fetcher together with
+    // entry.prelude_len.
+    const uint64_t prelude_abs = region.offset + frq_base + entry.frq_off_delta;
+    snii::io::BatchRangeFetcher fetcher(idx.reader());
+    const size_t h = fetcher.add(prelude_abs, entry.prelude_len);
+    SNII_RETURN_IF_ERROR(fetcher.fetch());
+    // NOTE(review): `fetcher` is destroyed on return. This is only safe if
+    // FrqPreludeReader::open() copies what it needs out of the slice instead
+    // of keeping a view into the fetcher's buffer -- confirm.
+    return FrqPreludeReader::open(fetcher.get(h), out);
+}
+
+// Builds per-window block-max bounds from a windowed entry's prelude. Each
+// WindowMeta carries the window's max_freq / max_norm and its covered docid
+// range (win_base+1 .. last_docid), so bounds come straight from the directory.
+// Bounds are appended to `windows` (it is not cleared here); the first failing
+// prelude.window() call aborts with that status.
+Status BuildWindowBounds(const FrqPreludeReader& prelude, const ScorerContext& ctx, double avgdl,
+                         const Bm25Params& params, std::vector* windows) {
+    const uint32_t n = prelude.n_windows();
+    for (uint32_t w = 0; w < n; ++w) {
+        WindowMeta m;
+        SNII_RETURN_IF_ERROR(prelude.window(w, &m));
+        // Windows without documents contribute no bound.
+        if (m.doc_count == 0) continue;
+        WindowBound wb;
+        // Window 0 starts at win_base itself; later windows start at
+        // win_base + 1. NOTE(review): the header comment states win_base+1 for
+        // every window -- confirm the intended range of window 0.
+        wb.first_docid = static_cast(m.win_base) + (w == 0 ? 0u : 1u);
+        wb.last_docid = m.last_docid;
+        wb.max_score = ctx.max_score(m.max_freq, m.max_norm, avgdl, params);
+        wb.block_max = true;
+        windows->push_back(wb);
+    }
+    return Status::OK();
+}
+
+// Fallback single window covering all postings, bounded by the exact max score
+// (always a valid upper bound, so pruning stays correct).
+// Appends nothing when `postings` is empty. max_score starts from
+// WindowBound's default of 0.0, so the bound is never below zero.
+void SingleWindowFallback(const std::vector& postings,
+                          std::vector* windows) {
+    if (postings.empty()) return;
+    WindowBound wb;
+    // The docid range is taken from the first and last posting.
+    wb.first_docid = postings.front().docid;
+    wb.last_docid = postings.back().docid;
+    wb.block_max = false;
+    for (const auto& p : postings) wb.max_score = std::max(wb.max_score, p.score);
+    windows->push_back(wb);
+}
+
+// Computes exact per-doc BM25 scores from decoded (docid, freq) vectors.
+Status ScoreDecoded(const snii::stats::SniiStatsProvider& stats, const ScorerContext& ctx, + const Bm25Params& params, const std::vector& docids, + const std::vector& freqs, std::vector* out) { + const double avgdl = stats.avgdl(); + out->reserve(docids.size()); + for (size_t i = 0; i < docids.size(); ++i) { + uint8_t norm = 0; + SNII_RETURN_IF_ERROR(stats.encoded_norm(docids[i], &norm)); + const uint32_t tf = i < freqs.size() ? freqs[i] : 1; + out->push_back({docids[i], ctx.score(tf, norm, avgdl, params)}); + } + return Status::OK(); +} + +// Decodes a slim/inline term's single .frq window ([dd_region][freq_region]) into +// docids/freqs using the entry's region metadata. +Status DecodeSlim(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + std::vector* docids, std::vector* freqs) { + std::vector owned; + Slice window; + SNII_RETURN_IF_ERROR(FetchSlimWindowBytes(idx, entry, frq_base, &owned, &window)); + const uint64_t dd_len = entry.dd_meta.disk_len; + if (dd_len > window.size()) { + return Status::Corruption("scoring_query: slim dd region exceeds window"); + } + Slice dd_region = window.subslice(0, static_cast(dd_len)); + SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd_region, entry.dd_meta, + /*win_base=*/0, docids)); + Slice freq_region = window.subslice(static_cast(dd_len), + window.size() - static_cast(dd_len)); + return snii::format::decode_freq_region(freq_region, entry.freq_meta, docids->size(), freqs); +} + +// Builds the cursor for a windowed term: tiles all windows for exact scores and +// reads the prelude once for true per-window block-max bounds. +Status BuildWindowedCursor(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, const ScorerContext& ctx, + const DictEntry& entry, uint64_t frq_base, uint64_t prx_base, + const Bm25Params& params, TermCursor* cursor) { + snii::reader::DecodedPosting posting; + // Scoring needs freqs for BM25: fetch the FULL windows (want_freq=true). 
+ SNII_RETURN_IF_ERROR(snii::reader::read_windowed_posting(idx, entry, frq_base, prx_base, + /*want_positions=*/false, + /*want_freq=*/true, &posting)); + SNII_RETURN_IF_ERROR( + ScoreDecoded(stats, ctx, params, posting.docids, posting.freqs, &cursor->postings)); + FrqPreludeReader prelude; + if (FetchPrelude(idx, entry, frq_base, &prelude).ok()) { + SNII_RETURN_IF_ERROR( + BuildWindowBounds(prelude, ctx, stats.avgdl(), params, &cursor->windows)); + } + return Status::OK(); +} + +// Builds the cursor for one term: postings with exact scores + window bounds. +Status BuildCursor(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats, + const std::string& term, const Bm25Params& params, bool* found, + TermCursor* cursor) { + DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + SNII_RETURN_IF_ERROR(idx.lookup(term, found, &entry, &frq_base, &prx_base)); + if (!*found) return Status::OK(); + + const ScorerContext ctx = ScorerContext::make(stats.indexed_doc_count(), entry.df); + + const bool windowed = + entry.kind == DictEntryKind::kPodRef && entry.enc == DictEntryEnc::kWindowed; + if (windowed) { + SNII_RETURN_IF_ERROR( + BuildWindowedCursor(idx, stats, ctx, entry, frq_base, prx_base, params, cursor)); + } else { + std::vector docids; + std::vector freqs; + SNII_RETURN_IF_ERROR(DecodeSlim(idx, entry, frq_base, &docids, &freqs)); + SNII_RETURN_IF_ERROR(ScoreDecoded(stats, ctx, params, docids, freqs, &cursor->postings)); + } + if (cursor->windows.empty()) { + SingleWindowFallback(cursor->postings, &cursor->windows); + } + return Status::OK(); +} + +// Block-max upper bound for a term at a given docid: the max_score of the window +// covering docid (windows are ascending and contiguous). Beyond the last window +// the bound is 0 (the term cannot contribute). +double TermBoundAt(const TermCursor& c, uint32_t docid) { + // Windows are ascending and contiguous; the first window whose last_docid is + // >= docid covers it. 
Its block-max is a valid upper bound for any contained + // doc, so it also bounds gaps between windows. + for (const auto& w : c.windows) { + if (docid <= w.last_docid) return w.max_score; + } + return 0.0; +} + +// Min-heap keyed on score (smallest at top) maintaining the top-K. +struct TopK { + explicit TopK(uint32_t k) : k_(k) {} + void offer(uint32_t docid, double score) { + if (heap_.size() < k_) { + heap_.push({score, docid}); + return; + } + if (heap_.empty()) return; + const Entry& worst = heap_.top(); // lowest score; ties: largest docid + const bool better = score > worst.first || (score == worst.first && docid < worst.second); + if (better) { + heap_.pop(); + heap_.push({score, docid}); + } + } + double threshold() const { return heap_.size() < k_ ? -1.0 : heap_.top().first; } + + using Entry = std::pair; + struct Cmp { + bool operator()(const Entry& a, const Entry& b) const { + if (a.first != b.first) return a.first > b.first; // min-score at top + return a.second < b.second; // for ties, largest docid at top (evictable) + } + }; + uint32_t k_; + std::priority_queue, Cmp> heap_; +}; + +void DrainSorted(TopK* topk, std::vector* out) { + std::vector all; + while (!topk->heap_.empty()) { + all.push_back({topk->heap_.top().second, topk->heap_.top().first}); + topk->heap_.pop(); + } + std::sort(all.begin(), all.end(), [](const ScoredDoc& a, const ScoredDoc& b) { + if (a.score != b.score) return a.score > b.score; + return a.docid < b.docid; + }); + *out = std::move(all); +} + +Status BuildCursors(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, const Bm25Params& params, + std::vector* cursors) { + for (const auto& term : terms) { + bool found = false; + TermCursor c; + SNII_RETURN_IF_ERROR(BuildCursor(idx, stats, term, params, &found, &c)); + if (found && !c.postings.empty()) cursors->push_back(std::move(c)); + } + return Status::OK(); +} + +} // namespace + +Status scoring_query_exhaustive(const 
LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("scoring_query: null out"); + out->clear(); + if (k == 0) return Status::OK(); + + std::vector cursors; + SNII_RETURN_IF_ERROR(BuildCursors(idx, stats, terms, params, &cursors)); + + std::unordered_map scores; + for (const auto& c : cursors) + for (const auto& p : c.postings) scores[p.docid] += p.score; + + std::vector all; + all.reserve(scores.size()); + for (const auto& [docid, score] : scores) all.push_back({docid, score}); + std::sort(all.begin(), all.end(), [](const ScoredDoc& a, const ScoredDoc& b) { + if (a.score != b.score) return a.score > b.score; + return a.docid < b.docid; + }); + if (all.size() > k) all.resize(k); + *out = std::move(all); + return Status::OK(); +} + +namespace { + +// --- Phase C: selective-fetch (lazy window) WAND ----------------------------- +// +// A LazyTermCursor knows its per-window block-max bounds + docid ranges from the +// frq_prelude WITHOUT fetching any .frq window. Each window's exact (docid,score) +// postings are decoded on first access and cached, so a window is fetched at most +// once and ONLY when the WAND control flow touches a posting in it. Combined with +// window-level SkipTo (advance past whole windows whose last_docid < target via +// the prelude, never fetching them), the offer sequence is byte-identical to the +// eager scoring_query_wand path -- only the bytes read differ. +// +// Soundness: a window is fetched only when LazyCurrentDoc/LazySkipTo land the +// cursor inside it, i.e. it covers a candidate the WAND pivot already proved can +// reach the running theta (bound >= theta). 
LazySkipTo jumps the cursor to the +// SAME posting (first docid >= target) the eager per-doc walk would, so pivots, +// alignments and offers are identical to the eager path; only windows the eager +// path read-through-but-never-offered-from are skipped. Windows whose block-max +// bound never reaches theta are never the pivot, so never fetched. + +// One query term's lazily-fetched scoring state. +struct LazyTermCursor { + const LogicalIndexReader* idx = nullptr; + const snii::stats::SniiStatsProvider* stats = nullptr; + ScorerContext ctx = ScorerContext::make(1, 1); + Bm25Params params; + DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + FrqPreludeReader prelude; + bool windowed = false; // false => slim/inline single block already materialized + + std::vector windows; // ascending; from prelude (or slim fallback) + std::vector postings; // sparse: only fetched windows are filled + std::vector win_start; // prefix offsets, size = windows.size()+1 + std::vector fetched; // size = windows.size() + size_t pos = 0; // virtual cursor over all windows' postings +}; + +// Total posting count across all windows (the virtual stream length). +uint32_t TotalPostings(const LazyTermCursor& c) { + return c.win_start.empty() ? 0 : c.win_start.back(); +} + +// Index of the window whose virtual range contains posting index p (p < total). +uint32_t WindowOf(const LazyTermCursor& c, uint32_t p) { + const auto it = std::upper_bound(c.win_start.begin(), c.win_start.end(), p); + return static_cast((it - c.win_start.begin()) - 1); +} + +// Fetches + decodes window w into the cursor's posting cache (idempotent). Only +// reached when the WAND proves window w can still contribute to the top-K. 
+Status MaterializeWindow(LazyTermCursor* c, uint32_t w) { + if (c->fetched[w]) return Status::OK(); + WindowMeta meta; + SNII_RETURN_IF_ERROR(c->prelude.window(w, &meta)); + snii::reader::WindowAbsRange r; + SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( + *c->idx, c->entry, c->frq_base, c->prx_base, c->prelude, w, + /*want_positions=*/false, /*want_freq=*/true, &r)); + // Scoring needs docids + freqs: fetch the window's dd sub-range AND freq sub-range. + snii::io::BatchRangeFetcher fetcher(c->idx->reader(), snii::reader::kSameTermCoalesceGap); + const size_t dh = fetcher.add(r.dd_off, r.dd_len); + const size_t fh = fetcher.add(r.freq_off, r.freq_len); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + std::vector docids; + std::vector freqs; + std::vector> pos; + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + meta, fetcher.get(dh), fetcher.get(fh), Slice(), /*want_positions=*/false, + /*want_freq=*/true, &docids, &freqs, &pos)); + if (docids.size() != c->win_start[w + 1] - c->win_start[w]) { + return Status::Corruption("scoring_query: selective window doc-count drift"); + } + std::vector scored; + SNII_RETURN_IF_ERROR(ScoreDecoded(*c->stats, c->ctx, c->params, docids, freqs, &scored)); + std::copy(scored.begin(), scored.end(), c->postings.begin() + c->win_start[w]); + c->fetched[w] = 1; + return Status::OK(); +} + +// Current docid at the cursor, fetching the covering window if needed. Exhausted +// cursor -> UINT32_MAX. +Status LazyCurrentDoc(LazyTermCursor* c, uint32_t* docid) { + if (c->pos >= TotalPostings(*c)) { + *docid = std::numeric_limits::max(); + return Status::OK(); + } + const uint32_t w = WindowOf(*c, static_cast(c->pos)); + SNII_RETURN_IF_ERROR(MaterializeWindow(c, w)); + *docid = c->postings[c->pos].docid; + return Status::OK(); +} + +// Advances pos to the first posting with docid >= target, skipping ENTIRE windows +// whose last_docid < target WITHOUT fetching them (prelude-only), then fetching +// just the landing window. 
Lands on the same posting the eager per-doc walk would. +Status LazySkipTo(LazyTermCursor* c, uint32_t target) { + const uint32_t total = TotalPostings(*c); + while (c->pos < total) { + const uint32_t w = WindowOf(*c, static_cast(c->pos)); + if (c->windows[w].last_docid >= target) break; + c->pos = c->win_start[w + 1]; // skip this window entirely (no fetch) + } + if (c->pos >= total) return Status::OK(); + const uint32_t w = WindowOf(*c, static_cast(c->pos)); + SNII_RETURN_IF_ERROR(MaterializeWindow(c, w)); + while (c->pos < total && c->postings[c->pos].docid < target) ++c->pos; + return Status::OK(); +} + +// Initializes a lazy windowed cursor from the prelude alone: per-window block-max +// bounds + ranges + cache slots, with NO .frq window fetched. +Status BuildLazyWindowed(LazyTermCursor* c) { + SNII_RETURN_IF_ERROR( + snii::reader::fetch_windowed_prelude(*c->idx, c->entry, c->frq_base, &c->prelude)); + SNII_RETURN_IF_ERROR( + BuildWindowBounds(c->prelude, c->ctx, c->stats->avgdl(), c->params, &c->windows)); + // BuildWindowBounds keeps only non-empty windows, in window order. Build the + // matching prefix-sum of doc_counts over those same non-empty windows so the + // bound list, win_start and fetched stay 1:1. + const uint32_t nb = static_cast(c->windows.size()); + c->win_start.assign(nb + 1, 0); + c->fetched.assign(nb, 0); + uint32_t bi = 0; + uint32_t acc = 0; + for (uint32_t w = 0; w < c->prelude.n_windows() && bi < nb; ++w) { + WindowMeta meta; + SNII_RETURN_IF_ERROR(c->prelude.window(w, &meta)); + if (meta.doc_count == 0) continue; + acc += meta.doc_count; + c->win_start[++bi] = acc; + } + c->postings.assign(acc, TermPosting {}); + return Status::OK(); +} + +// Initializes a slim/inline cursor: its single window is small, so fetch + score +// it eagerly (exactly as the existing path). One bound covers all its postings. 
+Status BuildLazySlim(LazyTermCursor* c) { + std::vector docids; + std::vector freqs; + SNII_RETURN_IF_ERROR(DecodeSlim(*c->idx, c->entry, c->frq_base, &docids, &freqs)); + SNII_RETURN_IF_ERROR(ScoreDecoded(*c->stats, c->ctx, c->params, docids, freqs, &c->postings)); + SingleWindowFallback(c->postings, &c->windows); + c->win_start = {0, static_cast(c->postings.size())}; + c->fetched.assign(1, 1); // already materialized + return Status::OK(); +} + +// Builds a LazyTermCursor for one term: prelude-only for windowed terms (no .frq +// fetched), fully-materialized single window for slim/inline (small). +Status BuildLazyCursor(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats, + const std::string& term, const Bm25Params& params, bool* found, + LazyTermCursor* c) { + uint64_t prx_base = 0; + SNII_RETURN_IF_ERROR(idx.lookup(term, found, &c->entry, &c->frq_base, &prx_base)); + if (!*found) return Status::OK(); + c->idx = &idx; + c->stats = &stats; + c->params = params; + c->prx_base = prx_base; + c->ctx = ScorerContext::make(stats.indexed_doc_count(), c->entry.df); + c->windowed = + c->entry.kind == DictEntryKind::kPodRef && c->entry.enc == DictEntryEnc::kWindowed; + return c->windowed ? BuildLazyWindowed(c) : BuildLazySlim(c); +} + +Status SelectiveBuildCursors(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, const Bm25Params& params, + std::vector* cursors) { + for (const auto& term : terms) { + bool found = false; + LazyTermCursor c; + SNII_RETURN_IF_ERROR(BuildLazyCursor(idx, stats, term, params, &found, &c)); + if (found && TotalPostings(c) > 0) cursors->push_back(std::move(c)); + } + return Status::OK(); +} + +// Block-max upper bound for a lazy cursor at docid: block_max of the window +// covering docid (ascending, contiguous). Beyond the last window -> 0. Same +// semantics as TermBoundAt over the eager cursor's window list. 
+double LazyTermBoundAt(const LazyTermCursor& c, uint32_t docid) { + for (const auto& w : c.windows) { + if (docid <= w.last_docid) return w.max_score; + } + return 0.0; +} + +// Sorts cursors ascending by current docid (materializing each cursor's current +// covering window), returning the smallest current docid via *front. +Status SelectiveSortByDoc(std::vector* cursors, uint32_t* front) { + std::vector cur(cursors->size()); + for (size_t i = 0; i < cursors->size(); ++i) { + SNII_RETURN_IF_ERROR(LazyCurrentDoc(&(*cursors)[i], &cur[i])); + } + std::vector order(cursors->size()); + for (size_t i = 0; i < order.size(); ++i) order[i] = i; + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return cur[a] < cur[b]; }); + std::vector sorted; + sorted.reserve(cursors->size()); + for (size_t i : order) sorted.push_back(std::move((*cursors)[i])); + *cursors = std::move(sorted); + *front = order.empty() ? std::numeric_limits::max() : cur[order.front()]; + return Status::OK(); +} + +// Finds the pivot term: the first cursor (current-docid order) at which the +// accumulated block-max bound reaches theta. >= keeps boundary ties (matching the +// exhaustive total order). *found=false when no remaining doc can beat theta. +Status SelectivePivot(std::vector* cursors, double theta, size_t* pivot, + uint32_t* pivot_doc, bool* found) { + double bound = 0.0; + *found = false; + for (size_t i = 0; i < cursors->size(); ++i) { + uint32_t d = 0; + SNII_RETURN_IF_ERROR(LazyCurrentDoc(&(*cursors)[i], &d)); + if (d == std::numeric_limits::max()) break; + bound += LazyTermBoundAt((*cursors)[i], d); + if (bound >= theta) { + *pivot = i; + *pivot_doc = d; + *found = true; + return Status::OK(); + } + } + return Status::OK(); +} + +// Scores the aligned pivot doc exactly (summing all cursors AT pivot_doc) and +// advances those cursors by one posting. 
+Status SelectiveScorePivot(std::vector* cursors, uint32_t pivot_doc, TopK* topk) { + double doc_score = 0.0; + for (auto& c : *cursors) { + uint32_t d = 0; + SNII_RETURN_IF_ERROR(LazyCurrentDoc(&c, &d)); + if (d == pivot_doc) { + doc_score += c.postings[c.pos].score; // window already materialized + ++c.pos; + } + } + topk->offer(pivot_doc, doc_score); + return Status::OK(); +} + +// Advances the first lagging cursor (current doc < pivot_doc) up to pivot_doc. +Status SelectiveAdvanceLagging(std::vector* cursors, uint32_t pivot_doc) { + for (auto& c : *cursors) { + uint32_t d = 0; + SNII_RETURN_IF_ERROR(LazyCurrentDoc(&c, &d)); + if (d < pivot_doc) { + SNII_RETURN_IF_ERROR(LazySkipTo(&c, pivot_doc)); + return Status::OK(); + } + } + return Status::OK(); +} + +// One WAND iteration body: sort, pick pivot, then either score (aligned) or skip +// a lagging cursor forward. *done=true ends the loop. +Status SelectiveStep(std::vector* cursors, TopK* topk, bool* done) { + uint32_t front = 0; + SNII_RETURN_IF_ERROR(SelectiveSortByDoc(cursors, &front)); + if (cursors->empty() || front == std::numeric_limits::max()) { + *done = true; + return Status::OK(); + } + size_t pivot = 0; + uint32_t pivot_doc = 0; + bool found_pivot = false; + SNII_RETURN_IF_ERROR( + SelectivePivot(cursors, topk->threshold(), &pivot, &pivot_doc, &found_pivot)); + if (!found_pivot) { + *done = true; + return Status::OK(); + } + if (front == pivot_doc) { + return SelectiveScorePivot(cursors, pivot_doc, topk); + } + return SelectiveAdvanceLagging(cursors, pivot_doc); +} + +Status SelectiveWandLoop(std::vector* cursors, TopK* topk) { + bool done = false; + while (!done) { + SNII_RETURN_IF_ERROR(SelectiveStep(cursors, topk, &done)); + } + return Status::OK(); +} + +} // namespace + +Status scoring_query_wand_selective(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out) { + if (out == nullptr) 
return Status::InvalidArgument("scoring_query: null out"); + out->clear(); + if (k == 0) return Status::OK(); + + std::vector cursors; + SNII_RETURN_IF_ERROR(SelectiveBuildCursors(idx, stats, terms, params, &cursors)); + + TopK topk(k); + SNII_RETURN_IF_ERROR(SelectiveWandLoop(&cursors, &topk)); + DrainSorted(&topk, out); + return Status::OK(); +} + +Status scoring_query_wand(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("scoring_query: null out"); + out->clear(); + if (k == 0) return Status::OK(); + + std::vector cursors; + SNII_RETURN_IF_ERROR(BuildCursors(idx, stats, terms, params, &cursors)); + + TopK topk(k); + // Document-at-a-time WAND with block-max bounds. + while (true) { + // Sort cursors by current docid (ascending; exhausted cursors sink). + std::sort(cursors.begin(), cursors.end(), [](const TermCursor& a, const TermCursor& b) { + return CurrentDoc(a) < CurrentDoc(b); + }); + if (cursors.empty() || + CurrentDoc(cursors.front()) == std::numeric_limits::max()) { + break; + } + + const double theta = topk.threshold(); + // Accumulate block-max upper bounds in docid order to find the pivot term. + double bound = 0.0; + size_t pivot = 0; + bool found_pivot = false; + for (size_t i = 0; i < cursors.size(); ++i) { + const uint32_t d = CurrentDoc(cursors[i]); + if (d == std::numeric_limits::max()) break; + bound += TermBoundAt(cursors[i], d); + // Use >= (not >) so a doc whose upper bound only TIES the K-th threshold is + // still explored and exact-scored: under the (score desc, docid asc) total + // order a tie can still evict the current K-th entry (smaller docid wins), + // exactly as the exhaustive path would. Strict > would wrongly prune ties. 
+ if (bound >= theta) { + pivot = i; + found_pivot = true; + break; + } + } + if (!found_pivot) break; // no doc can beat the threshold anymore. + + const uint32_t pivot_doc = CurrentDoc(cursors[pivot]); + if (CurrentDoc(cursors.front()) == pivot_doc) { + // All cursors at the pivot doc are aligned: score it exactly. + double doc_score = 0.0; + for (auto& c : cursors) { + if (CurrentDoc(c) == pivot_doc) { + doc_score += c.postings[c.pos].score; + ++c.pos; + } + } + topk.offer(pivot_doc, doc_score); + } else { + // Advance a lagging cursor toward pivot_doc (skip docs it cannot win on). + for (auto& c : cursors) { + if (CurrentDoc(c) < pivot_doc) { + while (c.pos < c.postings.size() && c.postings[c.pos].docid < pivot_doc) { + ++c.pos; + } + break; + } + } + } + } + DrainSorted(&topk, out); + return Status::OK(); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/term_expansion.cpp b/be/src/storage/index/snii/core/src/query/term_expansion.cpp new file mode 100644 index 00000000000000..ce1cffb0f141f1 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/term_expansion.cpp @@ -0,0 +1,33 @@ +#include "snii/query/internal/term_expansion.h" + +#include +#include + +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/query/internal/docid_union.h" + +namespace snii::query::internal { + +Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx, + std::string_view enum_prefix, const TermMatcher& matches, + DocIdSink* const sink, int32_t max_expansions) { + if (sink == nullptr) { + return Status::InvalidArgument("term_expansion: null sink"); + } + + std::vector postings; + int32_t count = 0; + SNII_RETURN_IF_ERROR(idx.visit_prefix_terms( + enum_prefix, [&](snii::reader::LogicalIndexReader::PrefixHit&& hit, bool* stop) { + if (!matches(hit.term)) { + return Status::OK(); + } + postings.push_back({std::move(hit.entry), hit.frq_base, hit.prx_base}); + ++count; + *stop = max_expansions > 0 && count >= 
max_expansions; + return Status::OK(); + })); + return emit_docid_union(idx, postings, sink); +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/term_query.cpp b/be/src/storage/index/snii/core/src/query/term_query.cpp new file mode 100644 index 00000000000000..4cf6e97bc2471b --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/term_query.cpp @@ -0,0 +1,39 @@ +#include "snii/query/term_query.h" + +#include + +#include "snii/format/dict_entry.h" +#include "snii/query/internal/docid_posting_reader.h" + +namespace snii::query { + +using snii::format::DictEntry; +using snii::reader::LogicalIndexReader; + +Status term_query(const LogicalIndexReader& idx, std::string_view term, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("term_query: null out"); + docids->clear(); + VectorDocIdSink sink(*docids); + return term_query(idx, term, &sink); +} + +Status term_query(const LogicalIndexReader& idx, std::string_view term, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("term_query: null sink"); + + bool found = false; + DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + SNII_RETURN_IF_ERROR(idx.lookup(term, &found, &entry, &frq_base, &prx_base)); + if (!found) return Status::OK(); + return internal::read_docid_posting(idx, entry, frq_base, prx_base, sink); +} + +Status term_query(const LogicalIndexReader& idx, std::string_view term, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return term_query(idx, term, docids); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/wildcard_query.cpp b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp new file mode 100644 index 00000000000000..a3d5fd72bfbb71 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp @@ -0,0 +1,79 @@ +#include "snii/query/wildcard_query.h" + +#include 
+#include +#include +#include +#include + +#include "snii/query/internal/term_expansion.h" + +namespace snii::query { + +namespace { + +std::string literal_prefix_for_wildcard(std::string_view pattern) { + std::string out; + for (char c : pattern) { + if (c == '*' || c == '?') { + break; + } + out.push_back(c); + } + return out; +} + +bool wildcard_match(std::string_view pattern, std::string_view text) { + std::vector prev(text.size() + 1, 0); + std::vector curr(text.size() + 1, 0); + prev[0] = 1; + + for (char p : pattern) { + std::fill(curr.begin(), curr.end(), 0); + if (p == '*') { + curr[0] = prev[0]; + for (size_t i = 1; i <= text.size(); ++i) { + curr[i] = prev[i] || curr[i - 1]; + } + } else { + for (size_t i = 1; i <= text.size(); ++i) { + curr[i] = prev[i - 1] && (p == '?' || p == text[i - 1]); + } + } + prev.swap(curr); + } + return prev[text.size()] != 0; +} + +} // namespace + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("wildcard_query: null out"); + } + docids->clear(); + VectorDocIdSink sink(*docids); + return wildcard_query(idx, pattern, &sink, max_expansions); +} + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { + QueryProfileScope profile_scope(idx.reader(), profile); + return wildcard_query(idx, pattern, docids, max_expansions); +} + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* const sink, int32_t max_expansions) { + if (sink == nullptr) { + return Status::InvalidArgument("wildcard_query: null sink"); + } + const std::string enum_prefix = literal_prefix_for_wildcard(pattern); + return internal::emit_expanded_docid_union( + idx, enum_prefix, + [pattern](std::string_view term) { return 
wildcard_match(pattern, term); }, sink, + max_expansions); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp new file mode 100644 index 00000000000000..be6c01b2cb97d6 --- /dev/null +++ b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp @@ -0,0 +1,390 @@ +#include "snii/reader/logical_index_reader.h" + +#include +#include +#include +#include + +#include "snii/encoding/crc32c.h" +#include "snii/encoding/zstd_codec.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" + +namespace snii::reader { + +using snii::format::BlockRef; +using snii::format::bsbf_hash; +using snii::format::bsbf_probe; +using snii::format::DictBlockDirectoryReader; +using snii::format::DictBlockReader; +using snii::format::DictEntry; +using snii::format::IndexTier; +using snii::format::kBsbfBytesPerBlock; +using snii::format::kBsbfHeaderSize; +using snii::format::PerIndexMetaReader; +using snii::format::RegionRef; +using snii::format::SampledTermIndexReader; + +namespace { +constexpr uint64_t kMaxDictBlockUncompBytes = 256ULL * 1024 * 1024; +constexpr uint64_t kDefaultDictResidentMaxBytes = 256ULL * 1024; + +// L0/L1 tiering threshold (bytes). Defaults to kBsbfResidentMaxBytes; the env +// SNII_BSBF_RESIDENT_MAX overrides it for tuning and for exercising the +// on-demand L1 path in tests without a 250K-term corpus. Read fresh each open. 
+uint64_t bsbf_resident_max_bytes() { + const char* s = std::getenv("SNII_BSBF_RESIDENT_MAX"); + if (s != nullptr) { + char* end = nullptr; + const unsigned long long v = std::strtoull(s, &end, 10); + if (end != s) { + return v; + } + } + return snii::format::kBsbfResidentMaxBytes; +} + +uint64_t dict_resident_max_bytes() { + const char* s = std::getenv("SNII_DICT_RESIDENT_MAX"); + if (s != nullptr) { + char* end = nullptr; + const unsigned long long v = std::strtoull(s, &end, 10); + if (end != s) { + return v; + } + } + return kDefaultDictResidentMaxBytes; +} + +Status checked_size(uint64_t value, const char* error, size_t* out) { + if (value > std::numeric_limits::max()) { + return Status::Corruption(error); + } + *out = static_cast(value); + return Status::OK(); +} + +Status dict_block_memory_bytes(const BlockRef& ref, uint64_t* out) { + if ((ref.flags & snii::format::block_ref_flags::kZstd) == 0) { + *out = ref.length; + return Status::OK(); + } + if (ref.uncomp_len == 0 || ref.uncomp_len > kMaxDictBlockUncompBytes) { + return Status::Corruption("dict block: zstd uncomp_len out of range"); + } + *out = ref.uncomp_len; + return Status::OK(); +} + +Status read_dict_block_bytes(snii::io::FileReader* reader, const BlockRef& ref, + std::vector* out) { + size_t read_len = 0; + SNII_RETURN_IF_ERROR( + checked_size(ref.length, "dict block: on-disk length out of range", &read_len)); + + std::vector block_bytes; + SNII_RETURN_IF_ERROR(reader->read_at(ref.offset, read_len, &block_bytes)); + if (block_bytes.size() != read_len) { + return Status::Corruption("dict block: short read"); + } + + if ((ref.flags & snii::format::block_ref_flags::kZstd) == 0) { + *out = std::move(block_bytes); + return Status::OK(); + } + + uint64_t memory_bytes = 0; + SNII_RETURN_IF_ERROR(dict_block_memory_bytes(ref, &memory_bytes)); + size_t uncomp_len = 0; + SNII_RETURN_IF_ERROR( + checked_size(memory_bytes, "dict block: zstd length out of range", &uncomp_len)); + return 
snii::zstd_decompress(Slice(block_bytes), uncomp_len, out); +} + +Status open_dict_block(snii::io::FileReader* reader, const BlockRef& ref, IndexTier tier, + bool has_positions, std::vector* bytes, DictBlockReader* out) { + SNII_RETURN_IF_ERROR(read_dict_block_bytes(reader, ref, bytes)); + return DictBlockReader::open(Slice(*bytes), tier, has_positions, out); +} +} // namespace + +Status LogicalIndexReader::load_resident_dict_blocks() { + resident_dict_blocks_.clear(); + + const uint64_t max_bytes = dict_resident_max_bytes(); + if (max_bytes == 0 || dbd_.n_blocks() == 0) { + return Status::OK(); + } + + uint64_t total_bytes = 0; + for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) { + BlockRef ref {}; + SNII_RETURN_IF_ERROR(dbd_.get(ord, &ref)); + uint64_t block_bytes = 0; + SNII_RETURN_IF_ERROR(dict_block_memory_bytes(ref, &block_bytes)); + if (block_bytes > max_bytes - total_bytes) { + return Status::OK(); + } + total_bytes += block_bytes; + } + + resident_dict_blocks_.reserve(dbd_.n_blocks()); + for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) { + BlockRef ref {}; + SNII_RETURN_IF_ERROR(dbd_.get(ord, &ref)); + ResidentDictBlock block; + SNII_RETURN_IF_ERROR( + open_dict_block(reader_, ref, tier_, has_positions_, &block.bytes, &block.reader)); + resident_dict_blocks_.push_back(std::move(block)); + } + return Status::OK(); +} + +Status LogicalIndexReader::dict_block_reader_for_ordinal(uint32_t ordinal, + OnDemandDictBlock* on_demand, + const DictBlockReader** out) const { + if (!resident_dict_blocks_.empty()) { + if (resident_dict_blocks_.size() != dbd_.n_blocks() || + ordinal >= resident_dict_blocks_.size()) { + return Status::Corruption("logical_index: incomplete resident dict"); + } + *out = &resident_dict_blocks_[ordinal].reader; + return Status::OK(); + } + + BlockRef ref {}; + SNII_RETURN_IF_ERROR(dbd_.get(ordinal, &ref)); + SNII_RETURN_IF_ERROR(open_dict_block(reader_, ref, tier_, has_positions_, &on_demand->bytes, + &on_demand->reader)); + *out = 
&on_demand->reader; + return Status::OK(); +} + +Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tier, + bool has_positions, Slice meta_block, LogicalIndexReader* out) { + if (file_reader == nullptr) { + return Status::InvalidArgument("logical_index: null file reader"); + } + if (out == nullptr) { + return Status::InvalidArgument("logical_index: null out"); + } + *out = LogicalIndexReader {}; + + out->reader_ = file_reader; + out->tier_ = tier; + out->has_positions_ = has_positions; + + SNII_RETURN_IF_ERROR(PerIndexMetaReader::open(meta_block, &out->meta_)); + SNII_RETURN_IF_ERROR( + SampledTermIndexReader::open(out->meta_.sampled_term_index_bytes(), &out->sti_)); + SNII_RETURN_IF_ERROR( + DictBlockDirectoryReader::open(out->meta_.dict_block_directory_bytes(), &out->dbd_)); + SNII_RETURN_IF_ERROR(out->load_resident_dict_blocks()); + + // Block-split bloom XFilter: derive the resident header from the section ref + // (offset+length) -- ZERO open-time I/O, the whole point of the on-demand + // design. The bitset starts at the constant offset section.offset + 28; one + // 32-byte block is read on demand per probe in lookup(). + const RegionRef& bsbf = out->meta_.section_refs().bsbf; + if (bsbf.length > 0) { + if (bsbf.length <= kBsbfHeaderSize) { + return Status::Corruption("logical_index: bsbf section too small"); + } + const uint64_t num_bytes = bsbf.length - kBsbfHeaderSize; + const bool resident = bsbf.length <= bsbf_resident_max_bytes(); + // L0: read the WHOLE section (header + bitset) so probes are in-memory AND + // the bitset crc can be verified once. L1: read only the 28-byte header so + // open stays near-zero I/O; the on-demand single-block probe cannot verify + // a whole-bitset crc, so L1 relies on the storage layer's own integrity for + // the bitset body. Either way the header (magic/version/strategy/geometry + + // header crc) is parsed and verified -- BsbfHeader::parse rejects a corrupt + // header. 
+ std::vector head; + SNII_RETURN_IF_ERROR( + file_reader->read_at(bsbf.offset, resident ? bsbf.length : kBsbfHeaderSize, &head)); + if (head.size() < kBsbfHeaderSize) { + return Status::Corruption("logical_index: short bsbf header read"); + } + SNII_RETURN_IF_ERROR(snii::format::BsbfHeader::parse(Slice(head.data(), kBsbfHeaderSize), + bsbf.offset, &out->bsbf_header_)); + // Cross-check the header geometry against the section ref. + if (out->bsbf_header_.num_bytes != num_bytes) { + return Status::Corruption("logical_index: bsbf header/section size mismatch"); + } + out->has_bsbf_ = true; + if (resident) { + if (head.size() < bsbf.length) { + return Status::Corruption("logical_index: short bsbf resident read"); + } + const Slice bitset(head.data() + kBsbfHeaderSize, out->bsbf_header_.num_bytes); + if (snii::crc32c(bitset) != out->bsbf_header_.bitset_crc) { + return Status::Corruption("logical_index: bsbf bitset crc mismatch"); + } + out->bsbf_resident_bitset_.assign(bitset.data(), bitset.data() + bitset.size()); + out->bsbf_resident_ = true; + } + } + return Status::OK(); +} + +Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* entry, + uint64_t* frq_base, uint64_t* prx_base) const { + *found = false; + if (reader_ == nullptr) { + return Status::InvalidArgument("logical_index: not opened"); + } + + // 1. XFilter fast rejection. DEFINITELY-ABSENT returns empty without the + // DICT read. L0 probes the resident bitset; L1 reads one 32-byte block. + if (has_bsbf_) { + const uint64_t h = bsbf_hash(term); + bool maybe = false; + if (bsbf_resident_) { + // L0: in-memory probe of the resident bitset (no round). + const uint32_t blk = snii::format::bsbf_block_index(h, bsbf_header_.num_blocks); + maybe = snii::format::bsbf_block_contains( + h, + bsbf_resident_bitset_.data() + static_cast(blk) * kBsbfBytesPerBlock); + } else { + // L1: on-demand single-block probe. 
+ SNII_RETURN_IF_ERROR(bsbf_probe(reader_, bsbf_header_, h, &maybe)); + } + if (!maybe) { + return Status::OK(); + } + } + + // 2. SampledTermIndex -> candidate block ordinal. + bool maybe = false; + uint32_t ordinal = 0; + SNII_RETURN_IF_ERROR(sti_.locate(term, &maybe, &ordinal)); + if (!maybe) { + return Status::OK(); + } + + // 3. Use a resident small-DICT block when present; otherwise read the DICT + // block on demand and parse it with the same validation path used at open. + const DictBlockReader* br = nullptr; + OnDemandDictBlock on_demand; + SNII_RETURN_IF_ERROR(dict_block_reader_for_ordinal(ordinal, &on_demand, &br)); + + bool hit = false; + SNII_RETURN_IF_ERROR(br->find_term(term, &hit, entry)); + if (!hit) { + return Status::OK(); + } + + *found = true; + *frq_base = br->frq_base(); + *prx_base = br->prx_base(); + return Status::OK(); +} + +Status LogicalIndexReader::visit_prefix_terms(std::string_view prefix, + const PrefixHitVisitor& visitor) const { + if (!visitor) { + return Status::InvalidArgument("logical_index: null prefix visitor"); + } + if (reader_ == nullptr) { + return Status::InvalidArgument("logical_index: not opened"); + } + + // Seek the start block: the SampledTermIndex block whose first term <= prefix + // (terms with `prefix` are >= prefix, so they begin in that block or later). + // If the prefix sorts before every sample (or is empty), start at block 0. 
+ uint32_t start = 0; + if (!prefix.empty()) { + bool maybe = false; + uint32_t ordinal = 0; + SNII_RETURN_IF_ERROR(sti_.locate(prefix, &maybe, &ordinal)); + if (maybe) { + start = ordinal; + } + } + + for (uint32_t ord = start; ord < dbd_.n_blocks(); ++ord) { + const DictBlockReader* br = nullptr; + OnDemandDictBlock on_demand; + SNII_RETURN_IF_ERROR(dict_block_reader_for_ordinal(ord, &on_demand, &br)); + std::vector entries; + SNII_RETURN_IF_ERROR(br->decode_all(&entries)); + + for (DictEntry& e : entries) { + const std::string_view t(e.term); + if (t < prefix) { + continue; // not yet at the prefix range + } + const bool has_prefix = + t.size() >= prefix.size() && t.compare(0, prefix.size(), prefix) == 0; + if (!has_prefix) { + return Status::OK(); // past the prefix range; sorted -> done + } + PrefixHit hit; + hit.term = e.term; + hit.entry = std::move(e); + hit.frq_base = br->frq_base(); + hit.prx_base = br->prx_base(); + bool stop = false; + SNII_RETURN_IF_ERROR(visitor(std::move(hit), &stop)); + if (stop) { + return Status::OK(); + } + } + } + return Status::OK(); +} + +Status LogicalIndexReader::prefix_terms(std::string_view prefix, std::vector* const out, + int32_t max_terms) const { + if (out == nullptr) { + return Status::InvalidArgument("logical_index: null out"); + } + out->clear(); + return visit_prefix_terms(prefix, [&](PrefixHit&& hit, bool* stop) { + out->push_back(std::move(hit)); + *stop = max_terms > 0 && out->size() >= static_cast(max_terms); + return Status::OK(); + }); +} + +namespace { + +// Validates a pod_ref window locator against the posting region and returns the +// absolute window range (after the prelude). Rejects corrupt locators rather +// than letting size_t underflow / uint64 overflow reach read_at. 
+Status resolve_window(const snii::format::RegionRef& section, uint64_t base, uint64_t off_delta, + uint64_t total_len, uint64_t prelude_len, uint64_t* abs_off, uint64_t* len) { + if (prelude_len > total_len) { + return Status::Corruption("logical_index: prelude_len exceeds window len"); + } + const uint64_t in_region = base + off_delta; + if (in_region < base) { + return Status::Corruption("logical_index: locator overflow"); + } + if (in_region > section.length || total_len > section.length - in_region) { + return Status::Corruption("logical_index: window past posting region"); + } + *abs_off = section.offset + in_region + prelude_len; + *len = total_len - prelude_len; + return Status::OK(); +} + +} // namespace + +Status LogicalIndexReader::resolve_frq_window(const snii::format::DictEntry& entry, + uint64_t frq_base, uint64_t* abs_off, + uint64_t* len) const { + return resolve_window(section_refs().posting_region, frq_base, entry.frq_off_delta, + entry.frq_len, entry.prelude_len, abs_off, len); +} + +Status LogicalIndexReader::resolve_prx_window(const snii::format::DictEntry& entry, + uint64_t prx_base, uint64_t* abs_off, + uint64_t* len) const { + // .prx windows carry no prelude (prelude_len = 0); both spans live in the + // same posting region (prx span precedes frq span for the same term). 
+ return resolve_window(section_refs().posting_region, prx_base, entry.prx_off_delta, + entry.prx_len, 0, abs_off, len); +} + +} // namespace snii::reader diff --git a/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp b/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp new file mode 100644 index 00000000000000..41e6ba06800152 --- /dev/null +++ b/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp @@ -0,0 +1,97 @@ +#include "snii/reader/snii_segment_reader.h" + +#include + +#include "snii/encoding/crc32c.h" +#include "snii/format/bootstrap_header.h" +#include "snii/format/format_constants.h" +#include "snii/format/per_index_meta.h" +#include "snii/format/stats_block.h" +#include "snii/format/tail_pointer.h" + +namespace snii::reader { + +using snii::format::BootstrapHeader; +using snii::format::IndexTier; +using snii::format::PerIndexMetaReader; +using snii::format::StatsBlock; +using snii::format::TailMetaRegionReader; +using snii::format::TailPointer; + +namespace { + +// Reads the bootstrap header from the front of the file and validates it. +Status ReadBootstrap(snii::io::FileReader* reader, BootstrapHeader* bh) { + std::vector buf; + SNII_RETURN_IF_ERROR(reader->read_at(0, snii::format::kBootstrapHeaderSize, &buf)); + return snii::format::decode_bootstrap_header(Slice(buf), bh); +} + +// Reads the fixed tail pointer (last tail_pointer_size() bytes) of the file. 
+Status ReadTailPointer(snii::io::FileReader* reader, TailPointer* tp) { + const size_t tp_size = snii::format::tail_pointer_size(); + const uint64_t total = reader->size(); + if (total < tp_size) { + return Status::Corruption("segment: file smaller than tail pointer"); + } + std::vector buf; + SNII_RETURN_IF_ERROR(reader->read_at(total - tp_size, tp_size, &buf)); + return snii::format::decode_tail_pointer(Slice(buf), tp); +} + +} // namespace + +Status SniiSegmentReader::open(snii::io::FileReader* reader, SniiSegmentReader* out) { + if (reader == nullptr) return Status::InvalidArgument("segment: null reader"); + if (out == nullptr) return Status::InvalidArgument("segment: null out"); + + BootstrapHeader bh; + SNII_RETURN_IF_ERROR(ReadBootstrap(reader, &bh)); + + TailPointer tp; + SNII_RETURN_IF_ERROR(ReadTailPointer(reader, &tp)); + if (tp.meta_region_length == 0) { + return Status::Corruption("segment: empty tail meta region"); + } + + out->reader_ = reader; + SNII_RETURN_IF_ERROR( + reader->read_at(tp.meta_region_offset, tp.meta_region_length, &out->meta_region_)); + // Verify the whole meta region against the tail pointer's checksum BEFORE parsing + // it. (TailMetaRegionReader::open also checks the region's own internal checksum; + // this is the read-boundary check that makes tp.meta_region_checksum meaningful and + // catches corruption before any framed sub-section is touched.) 
+ if (snii::crc32c(Slice(out->meta_region_)) != tp.meta_region_checksum) { + return Status::Corruption("segment: meta region checksum mismatch"); + } + return TailMetaRegionReader::open(Slice(out->meta_region_), &out->region_reader_); +} + +Status SniiSegmentReader::open_index(uint64_t index_id, std::string_view suffix, + LogicalIndexReader* out) const { + if (out == nullptr) return Status::InvalidArgument("segment: null index out"); + if (reader_ == nullptr) return Status::InvalidArgument("segment: not opened"); + + bool found = false; + Slice meta_bytes; + SNII_RETURN_IF_ERROR(region_reader_.find(index_id, suffix, &found, &meta_bytes)); + if (!found) return Status::NotFound("segment: logical index not found"); + + // Determine tier / positions capability from the per-index meta. Positions + // capability is read from the PERSISTED header flag (kHasPositions), NOT from + // any region length: after the frq/prx merge, posting_region.length is non-zero + // for ANY index with a pod_ref term -- docs-only included -- so a region-length + // heuristic would mis-classify a docs-only index as positional and make + // DictBlockReader::check_flags hard-fail. The "|| has_norms" is kept only as a + // defensive belt-and-suspenders (a scoring index always has positions). + PerIndexMetaReader meta; + SNII_RETURN_IF_ERROR(PerIndexMetaReader::open(meta_bytes, &meta)); + const bool has_norms = meta.section_refs().norms.length > 0; + const bool has_positions = meta.has_positions() || has_norms; + const IndexTier tier = + has_norms ? IndexTier::kT3 : (has_positions ? 
IndexTier::kT2 : IndexTier::kT1); + + return LogicalIndexReader::open(reader_, tier, has_positions, meta_bytes, out); +} + +} // namespace snii::reader diff --git a/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp b/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp new file mode 100644 index 00000000000000..1299660f0658a8 --- /dev/null +++ b/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp @@ -0,0 +1,253 @@ +#include "snii/reader/windowed_posting.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/format/prx_pod.h" +#include "snii/io/batch_range_fetcher.h" + +namespace snii::reader { + +using snii::format::DictEntry; +using snii::format::FrqPreludeReader; +using snii::format::FrqRegionMeta; +using snii::format::WindowMeta; + +namespace { + +// Resolves the absolute file offset of the prelude bytes for a windowed entry. +// The frq span lives in the interleaved posting region (after the term's prx span). +uint64_t PreludeAbs(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base) { + const auto& region = idx.section_refs().posting_region; + return region.offset + frq_base + entry.frq_off_delta; +} + +// Validates that [off, off+len) fits within [0, total). +Status InBounds(uint64_t off, uint64_t len, uint64_t total) { + if (off > total || len > total - off) { + return Status::Corruption("windowed_posting: range out of section"); + } + return Status::OK(); +} + +// Block geometry of a windowed entry's grouped .frq payload (all offsets absolute). 
+struct BlockGeometry { + uint64_t dd_block_off = 0; // absolute start of the dd-block + uint64_t dd_block_len = 0; + uint64_t freq_block_off = 0; // absolute start of the freq-block + uint64_t freq_block_len = 0; + uint64_t frq_region_len = 0; // entry.frq_len - prelude_len (dd-block + freq-block) +}; + +// Derives the dd-block / freq-block absolute ranges from the entry + prelude, +// validating they tile the post-prelude .frq region exactly. +Status ResolveBlocks(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + const FrqPreludeReader& prelude, BlockGeometry* g) { + if (entry.prelude_len > entry.frq_len) { + return Status::Corruption("windowed_posting: prelude_len exceeds frq_len"); + } + const uint64_t frq_window_start = PreludeAbs(idx, entry, frq_base) + entry.prelude_len; + g->frq_region_len = entry.frq_len - entry.prelude_len; + g->dd_block_len = prelude.dd_block_len(); + g->freq_block_len = prelude.freq_block_len(); + // dd-block + freq-block must fit exactly within the post-prelude region. + if (g->dd_block_len > g->frq_region_len || + g->freq_block_len > g->frq_region_len - g->dd_block_len) { + return Status::Corruption("windowed_posting: blocks exceed frq region"); + } + g->dd_block_off = frq_window_start; + g->freq_block_off = frq_window_start + g->dd_block_len; + return Status::OK(); +} + +// Per-window decode state for the full-posting path. +struct WindowSlices { + WindowMeta meta; + Slice dd_region; + Slice freq_region; + Slice prx_window; +}; + +// Carves window w's dd (and freq when want_freq) sub-slices out of the fetched +// blocks, validating each locator against its block length. 
+Status CarveRegionSlices(const WindowMeta& m, Slice dd_block, Slice freq_block, bool want_freq, + WindowSlices* out) { + SNII_RETURN_IF_ERROR(InBounds(m.dd_off, m.dd_disk_len, dd_block.size())); + out->dd_region = + dd_block.subslice(static_cast(m.dd_off), static_cast(m.dd_disk_len)); + if (!want_freq) return Status::OK(); + SNII_RETURN_IF_ERROR(InBounds(m.freq_off, m.freq_disk_len, freq_block.size())); + out->freq_region = freq_block.subslice(static_cast(m.freq_off), + static_cast(m.freq_disk_len)); + return Status::OK(); +} + +// Decodes window w from the fetched blocks (+ optional prx slice) and appends to out. +Status AppendWindow(const WindowSlices& ws, bool want_positions, bool want_freq, + DecodedPosting* out) { + std::vector docids, freqs; + std::vector> pos; + SNII_RETURN_IF_ERROR(decode_window_slices(ws.meta, ws.dd_region, ws.freq_region, ws.prx_window, + want_positions, want_freq, &docids, &freqs, &pos)); + out->docids.insert(out->docids.end(), docids.begin(), docids.end()); + out->freqs.insert(out->freqs.end(), freqs.begin(), freqs.end()); + if (want_positions) { + for (auto& v : pos) out->positions.push_back(std::move(v)); + } + return Status::OK(); +} + +} // namespace + +Status fetch_windowed_prelude(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, FrqPreludeReader* prelude) { + if (entry.prelude_len == 0) { + return Status::Corruption("windowed_posting: windowed entry has no prelude"); + } + if (entry.prelude_len > entry.frq_len) { + return Status::Corruption("windowed_posting: prelude_len exceeds frq_len"); + } + const uint64_t prelude_abs = PreludeAbs(idx, entry, frq_base); + snii::io::BatchRangeFetcher fetcher(idx.reader()); + const size_t h = fetcher.add(prelude_abs, entry.prelude_len); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + return FrqPreludeReader::open(fetcher.get(h), prelude); +} + +Status windowed_window_range(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, const 
FrqPreludeReader& prelude, + uint32_t w, bool want_positions, bool want_freq, WindowAbsRange* out) { + if (out == nullptr) return Status::InvalidArgument("windowed_posting: null range"); + *out = WindowAbsRange {}; + BlockGeometry g; + SNII_RETURN_IF_ERROR(ResolveBlocks(idx, entry, frq_base, prelude, &g)); + WindowMeta meta; + SNII_RETURN_IF_ERROR(prelude.window(w, &meta)); + + // dd sub-range within the dd-block. + SNII_RETURN_IF_ERROR(InBounds(meta.dd_off, meta.dd_disk_len, g.dd_block_len)); + out->dd_off = g.dd_block_off + meta.dd_off; + out->dd_len = meta.dd_disk_len; + + if (want_freq) { + SNII_RETURN_IF_ERROR(InBounds(meta.freq_off, meta.freq_disk_len, g.freq_block_len)); + out->freq_off = g.freq_block_off + meta.freq_off; + out->freq_len = meta.freq_disk_len; + } + + if (!want_positions) return Status::OK(); + if (!prelude.has_prx()) { + return Status::Corruption("windowed_posting: positions requested but prelude has none"); + } + const uint64_t prx_region_start = + idx.section_refs().posting_region.offset + prx_base + entry.prx_off_delta; + SNII_RETURN_IF_ERROR(InBounds(meta.prx_off, meta.prx_len, entry.prx_len)); + out->prx_off = prx_region_start + meta.prx_off; + out->prx_len = meta.prx_len; + return Status::OK(); +} + +Status decode_window_slices(const WindowMeta& meta, Slice dd_region, Slice freq_region, + Slice prx_window, bool want_positions, bool want_freq, + std::vector* docids, std::vector* freqs, + std::vector>* positions) { + FrqRegionMeta dd_meta; + dd_meta.zstd = meta.dd_zstd; + dd_meta.uncomp_len = meta.dd_uncomp_len; + dd_meta.disk_len = meta.dd_disk_len; + dd_meta.crc = meta.crc_dd; + dd_meta.verify_crc = meta.verify_crc; + SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd_region, dd_meta, meta.win_base, docids)); + if (docids->size() != meta.doc_count) { + return Status::Corruption("windowed_posting: frq doc_count mismatch"); + } + if (want_freq) { + FrqRegionMeta freq_meta; + freq_meta.zstd = meta.freq_zstd; + freq_meta.uncomp_len = 
meta.freq_uncomp_len; + freq_meta.disk_len = meta.freq_disk_len; + freq_meta.crc = meta.crc_freq; + freq_meta.verify_crc = meta.verify_crc; + SNII_RETURN_IF_ERROR( + snii::format::decode_freq_region(freq_region, freq_meta, meta.doc_count, freqs)); + } else { + freqs->clear(); + } + if (!want_positions) return Status::OK(); + + ByteSource psrc(prx_window); + SNII_RETURN_IF_ERROR(snii::format::read_prx_window(&psrc, positions)); + if (positions->size() != docids->size()) { + return Status::Corruption("windowed_posting: prx/frq doc-count mismatch"); + } + return Status::OK(); +} + +namespace { + +// Fetches the dd-block (always), the freq-block (when want_freq) and the whole .prx +// region (when want_positions) of a windowed entry in ONE batch and returns the +// in-memory block slices. The dd-block is a single contiguous range -> the +// docid-only / phrase path reads it as one Range GET (the byte-saving core). +Status FetchBlocks(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t prx_base, + const BlockGeometry& g, bool want_positions, bool want_freq, + snii::io::BatchRangeFetcher* fetcher, size_t* dd_h, size_t* freq_h, + size_t* prx_h) { + *dd_h = fetcher->add(g.dd_block_off, g.dd_block_len); + if (want_freq) { + *freq_h = fetcher->add(g.freq_block_off, g.freq_block_len); + } + if (want_positions) { + const uint64_t prx_region_start = + idx.section_refs().posting_region.offset + prx_base + entry.prx_off_delta; + *prx_h = fetcher->add(prx_region_start, entry.prx_len); + } + return fetcher->fetch(); +} + +} // namespace + +Status read_windowed_posting(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, bool want_positions, + bool want_freq, DecodedPosting* out) { + if (out == nullptr) { + return Status::InvalidArgument("windowed_posting: null out"); + } + *out = DecodedPosting {}; + + FrqPreludeReader prelude; + SNII_RETURN_IF_ERROR(fetch_windowed_prelude(idx, entry, frq_base, &prelude)); + if (want_positions 
&& !prelude.has_prx()) { + return Status::Corruption("windowed_posting: positions requested but prelude has none"); + } + BlockGeometry g; + SNII_RETURN_IF_ERROR(ResolveBlocks(idx, entry, frq_base, prelude, &g)); + + snii::io::BatchRangeFetcher fetcher(idx.reader()); + size_t dd_h = 0, freq_h = 0, prx_h = 0; + SNII_RETURN_IF_ERROR(FetchBlocks(idx, entry, prx_base, g, want_positions, want_freq, &fetcher, + &dd_h, &freq_h, &prx_h)); + const Slice dd_block = fetcher.get(dd_h); + const Slice freq_block = want_freq ? fetcher.get(freq_h) : Slice(); + const Slice prx_region = want_positions ? fetcher.get(prx_h) : Slice(); + + const uint32_t n = prelude.n_windows(); + for (uint32_t w = 0; w < n; ++w) { + WindowSlices ws; + SNII_RETURN_IF_ERROR(prelude.window(w, &ws.meta)); + SNII_RETURN_IF_ERROR(CarveRegionSlices(ws.meta, dd_block, freq_block, want_freq, &ws)); + if (want_positions) { + SNII_RETURN_IF_ERROR(InBounds(ws.meta.prx_off, ws.meta.prx_len, prx_region.size())); + ws.prx_window = prx_region.subslice(static_cast(ws.meta.prx_off), + static_cast(ws.meta.prx_len)); + } + SNII_RETURN_IF_ERROR(AppendWindow(ws, want_positions, want_freq, out)); + } + return Status::OK(); +} + +} // namespace snii::reader diff --git a/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp b/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp new file mode 100644 index 00000000000000..f4457c96273f40 --- /dev/null +++ b/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp @@ -0,0 +1,93 @@ +#include "snii/stats/snii_stats_provider.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/stats_block.h" +#include "snii/io/batch_range_fetcher.h" + +namespace snii::stats { + +using snii::format::DictEntry; +using snii::format::NormsPodReader; +using snii::format::RegionRef; + +namespace { + +// Resolves a term's DictEntry. 
*found=false for an absent term (OK status). +Status LookupEntry(const snii::reader::LogicalIndexReader& idx, std::string_view term, bool* found, + DictEntry* entry) { + uint64_t frq_base = 0; + uint64_t prx_base = 0; + return idx.lookup(term, found, entry, &frq_base, &prx_base); +} + +} // namespace + +Status SniiStatsProvider::open(const snii::reader::LogicalIndexReader* idx, + SniiStatsProvider* out) { + if (idx == nullptr || out == nullptr) { + return Status::InvalidArgument("stats_provider: null argument"); + } + out->idx_ = idx; + const auto& sb = idx->stats(); + out->doc_count_ = sb.doc_count; + out->indexed_doc_count_ = sb.indexed_doc_count; + out->sum_total_term_freq_ = sb.sum_total_term_freq; + + const RegionRef& norms = idx->section_refs().norms; + if (norms.length == 0) { + out->has_norms_ = false; + return Status::OK(); + } + + snii::io::BatchRangeFetcher fetcher(idx->reader()); + const size_t h = fetcher.add(norms.offset, norms.length); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + Slice framed = fetcher.get(h); + out->norms_bytes_.assign(framed.data(), framed.data() + framed.size()); + SNII_RETURN_IF_ERROR(NormsPodReader::open(Slice(out->norms_bytes_), &out->norms_reader_)); + out->has_norms_ = true; + return Status::OK(); +} + +double SniiStatsProvider::avgdl() const { + const uint64_t denom = std::max(1, indexed_doc_count_); + return static_cast(sum_total_term_freq_) / static_cast(denom); +} + +Status SniiStatsProvider::doc_freq(std::string_view term, uint64_t* df) const { + if (df == nullptr) return Status::InvalidArgument("stats_provider: null df"); + *df = 0; + bool found = false; + DictEntry entry; + SNII_RETURN_IF_ERROR(LookupEntry(*idx_, term, &found, &entry)); + if (found) *df = entry.df; + return Status::OK(); +} + +Status SniiStatsProvider::total_term_freq(std::string_view term, uint64_t* ttf) const { + if (ttf == nullptr) return Status::InvalidArgument("stats_provider: null ttf"); + *ttf = 0; + bool found = false; + DictEntry entry; + 
SNII_RETURN_IF_ERROR(LookupEntry(*idx_, term, &found, &entry)); + if (!found) return Status::OK(); + // tier>=T2 entries carry the total term frequency directly in ttf_delta (the + // LogicalIndexWriter stores ttf there, not a delta from df). + *ttf = entry.ttf_delta; + return Status::OK(); +} + +Status SniiStatsProvider::encoded_norm(uint32_t docid, uint8_t* out) const { + if (out == nullptr) return Status::InvalidArgument("stats_provider: null out"); + if (!has_norms_) { + return Status::InvalidArgument("stats_provider: index has no norms"); + } + return norms_reader_.try_encoded_norm(docid, out); +} + +} // namespace snii::stats diff --git a/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp b/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp new file mode 100644 index 00000000000000..a6ce29aee03557 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp @@ -0,0 +1,155 @@ +#include "snii/writer/compact_posting_pool.h" + +#include +#include +#include + +namespace snii::writer { + +// Gentle (~1.5x) many-level payload-capacity schedule. Starting at 5 bytes with a +// slow ramp keeps the over-allocated FINAL slice small for the millions of low-df +// terms (the dominant arena-overhead source) while still reaching multi-KiB slices +// for high-df chains in a bounded number of hops (so the per-slice 4-byte forward +// pointer stays a small fraction of a large chain's bytes). 
+const uint32_t CompactPostingPool::kSliceSizes[kLevelCount] = { + 5, 8, 12, 18, 27, 40, 60, 90, 135, 202, 303, 455, 683, 1024, 1536, 2304}; +const uint8_t CompactPostingPool::kNextLevel[kLevelCount] = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 15}; + +CompactPostingPool::CompactPostingPool() = default; + +uint32_t CompactPostingPool::kSliceSizes_level0() { + return kSliceSizes[0]; +} + +uint32_t CompactPostingPool::kSliceSize_at(int level) { + return kSliceSizes[level]; +} + +uint8_t CompactPostingPool::kNextLevel_at(int level) { + return kNextLevel[level]; +} + +void CompactPostingPool::reset() { + std::vector>().swap(blocks_); + next_offset_ = 0; + payload_bytes_ = 0; +} + +uint32_t CompactPostingPool::alloc_run(uint32_t bytes) { + const uint32_t in_block = next_offset_ & kBlockMask; + // A fresh block is needed when (a) there is no tail block yet, (b) the run does + // not fit in the current tail block's remaining space, or (c) next_offset_ sits + // exactly on a block boundary whose block has not been allocated (a previous run + // that exactly filled the tail leaves next_offset_ == blocks_.size()*kBlockSize, + // so in_block == 0 must NOT be mistaken for an empty fresh block). + const bool tail_exists = (next_offset_ >> kBlockShift) < blocks_.size(); + const bool need_block = !tail_exists || in_block + bytes > kBlockSize; + // Hard invariant (see arena_bytes()): the uint32 offset must never wrap. The spimi + // accumulator force-spills below 4 GiB, but enforce it here too -- in release as + // well as debug -- so any direct user of the pool fails loudly instead of silently + // aliasing block 0. We are a library: throw and let the caller decide how to + // handle it, rather than aborting the process. The run starts either in the + // current tail or at a new block's base; compute that start in 64 bits before the + // uint32 arithmetic can wrap. + const uint64_t run_start = + need_block ? 
static_cast(blocks_.size()) * kBlockSize : next_offset_; + if (run_start + bytes > UINT32_MAX) { + throw std::overflow_error( + "snii: CompactPostingPool arena exceeded the 4 GiB uint32 offset limit; " + "the caller must spill before this point"); + } + if (need_block) { + blocks_.emplace_back(kBlockSize, 0); + next_offset_ = static_cast((blocks_.size() - 1) * kBlockSize); + } + const uint32_t off = next_offset_; + next_offset_ += bytes; + return off; +} + +uint32_t CompactPostingPool::alloc_slice(int level, uint32_t* slice_end) { + const uint32_t cap = kSliceSizes[level]; + const uint32_t first = alloc_run(cap + kPtrBytes); + *slice_end = first + cap; + // Zero the forward pointer so a not-yet-extended tail slice reads next_head == 0. + std::memset(at(*slice_end), 0, kPtrBytes); + return first; +} + +uint32_t CompactPostingPool::read_ptr(uint32_t slice_end) const { + uint32_t v; + std::memcpy(&v, at(slice_end), sizeof(v)); + return v; +} + +void CompactPostingPool::write_ptr(uint32_t slice_end, uint32_t next_head) { + std::memcpy(at(slice_end), &next_head, sizeof(next_head)); +} + +uint32_t CompactPostingPool::start_chain(SliceWriter* w, uint8_t* level) { + *level = 0; + const uint32_t head = alloc_slice(0, &w->slice_end); + w->cur = head; + return head; +} + +void CompactPostingPool::append_byte(SliceWriter* w, uint8_t* level, uint8_t value) { + if (w->cur == w->slice_end) { + // Current slice payload region is full: grow the chain with a larger slice and + // record the link in the old slice's trailing pointer bytes. 
+ const uint8_t next_level = kNextLevel[*level]; + uint32_t new_end = 0; + const uint32_t new_head = alloc_slice(next_level, &new_end); + write_ptr(w->slice_end, new_head); + *level = next_level; + w->cur = new_head; + w->slice_end = new_end; + } + *at(w->cur) = value; + ++w->cur; + ++payload_bytes_; +} + +CompactPostingPool::Cursor::Cursor(const CompactPostingPool* pool, uint32_t head, uint64_t budget) + : pool_(pool), cur_(head), level_(0), budget_(budget) { + // The first slice is level 0; its payload region ends kSliceSizes[0] bytes in. + slice_end_ = head + CompactPostingPool::kSliceSizes[0]; +} + +bool CompactPostingPool::Cursor::has_next() const { + if (budget_ == 0) return false; + // At a slice boundary, the chain continues only if the forward pointer is non-zero; + // a zero pointer is the tail marker (offset 0 is never a valid next-slice head). Peek + // it so has_next() never reports a phantom byte that next() would have to fabricate. + if (cur_ == slice_end_) return pool_->read_ptr(slice_end_) != 0; + return true; +} + +uint8_t CompactPostingPool::Cursor::next() { + // Budget guard: the caller's stated upper bound is spent -- yield nothing more. + if (budget_ == 0) return 0; + if (cur_ == slice_end_) { + // Reached this slice's payload boundary. Follow the forward pointer to the next + // slice -- UNLESS it is zero, which marks the CHAIN TAIL (offset 0 is always the + // pool's very first slice, never a valid *next*-slice head, so a zero pointer is + // unambiguously "no more slices"). Without this tail check, an over-reading caller + // would follow the zero pointer to offset 0 and alias block 0's bytes (or read an + // unallocated block) -- UB. Stopping here makes the cursor self-terminating and + // safe regardless of how large a budget the caller passed. 
+ const uint32_t next_head = pool_->read_ptr(slice_end_); + if (next_head == 0) { + budget_ = 0; // chain exhausted: no further bytes exist + return 0; + } + level_ = CompactPostingPool::kNextLevel[level_]; + cur_ = next_head; + slice_end_ = next_head + CompactPostingPool::kSliceSizes[level_]; + } + const uint8_t v = *pool_->at(cur_); + ++cur_; + --budget_; + return v; +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp b/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp new file mode 100644 index 00000000000000..8cbf1de2eee0d3 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp @@ -0,0 +1,686 @@ +#include "snii/writer/logical_index_writer.h" + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/zstd_codec.h" +#include "snii/format/bsbf.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/format/norms_pod.h" +#include "snii/format/null_bitmap.h" +#include "snii/format/prx_pod.h" + +namespace snii::writer { + +using snii::format::BlockRef; +using snii::format::DictBlockBuilder; +using snii::format::DictBlockDirectoryBuilder; +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeColumns; +using snii::format::PerIndexMetaBuilder; +using snii::format::SampledTermIndexBuilder; +using snii::format::SectionRefs; +using snii::format::WindowMeta; + +namespace { + +// Target false-positive probability for the block-split bloom XFilter. Sizes +// the filter via Parquet OptimalNumOfBytes; L0 keeps the probe in memory and L1 +// keeps the per-query cost at one 32-byte block. +constexpr double kBsbfFpp = 0.01; +// Zstd "auto" sentinel for window builders (raw for tiny payloads). 
+constexpr int kAutoZstd = -1; +// Force-raw level for .frq dd/freq regions. Their plaintext is PFOR-bit-packed +// doc-deltas/freqs -- already high-entropy, so zstd shrinks ~30 MB of input by +// <0.1 MiB while burning ~0.4s CPU (and an extra crc pass over the compressed +// bytes) at 5M. We force raw here and keep zstd only on .prx (which compresses +// ~77%). Output stays self-describing: the region meta records zstd=false. +constexpr int kRawFrqRegion = 0; +// Windows per super-block in the two-level prelude directory (design section +// 5). +constexpr uint32_t kPreludeGroupSize = 64; +// zstd level for whole-DICT-block compression. Level 3 (zstd default) +// compresses the 64KiB front-coded term-key + entry-meta + inline-posting +// blocks ~40% at ~120 MiB/s encode / ~600 MiB/s decode -- a large size win for +// a small build-CPU cost, and a per-lookup decode (~0.1ms/64KiB) that is +// dominated by the S3 round trip it shrinks. Higher levels gain <1% here for +// materially more CPU. +constexpr int kDictBlockZstdLevel = 3; + +using snii::format::FrqRegionMeta; + +// Encodes one window's dd region (and freq region when has_freq) into separate +// buffers, returning their codec metadata. The dd region is the docs-only data; +// the freq region is the skippable suffix. Used for both the grouped windowed +// layout (regions concatenated into posting-level blocks) and the single-window +// slim/inline layout ([dd_region][freq_region]). 
+Status EncodeRegions(std::span docids, std::span freqs, + uint64_t win_base, bool has_freq, std::vector* dd_out, + FrqRegionMeta* dd_meta, std::vector* freq_out, + FrqRegionMeta* freq_meta) { + ByteSink dd_sink; + SNII_RETURN_IF_ERROR( + snii::format::build_dd_region(docids, win_base, kRawFrqRegion, &dd_sink, dd_meta)); + *dd_out = dd_sink.take(); + if (!has_freq) { + *freq_out = std::vector(); + *freq_meta = FrqRegionMeta {}; + return Status::OK(); + } + ByteSink freq_sink; + SNII_RETURN_IF_ERROR( + snii::format::build_freq_region(freqs, kRawFrqRegion, &freq_sink, freq_meta)); + *freq_out = freq_sink.take(); + return Status::OK(); +} + +// Reusable per-window scratch for the windowed builder. Each ByteSink RETAINS +// its capacity across windows (clear(), not re-construct), so encoding a +// high-df term split into thousands of windows allocates the scratch ONCE +// instead of churning thousands of small buffers (which fragment the heap arena +// and raise peak RSS). +struct WindowScratch { + ByteSink dd_sink; + ByteSink freq_sink; + ByteSink prx_sink; +}; + +// Encodes one window's dd (and freq) region into the scratch sinks and appends +// the bytes directly to the grouped blocks via LayoutWindowRegions. Reuses the +// sinks. +Status EncodeRegionsInto(WindowScratch* sc, std::span docids, + std::span freqs, uint64_t win_base, bool has_freq, + FrqRegionMeta* dd_meta, FrqRegionMeta* freq_meta) { + sc->dd_sink.clear(); + SNII_RETURN_IF_ERROR( + snii::format::build_dd_region(docids, win_base, kRawFrqRegion, &sc->dd_sink, dd_meta)); + if (has_freq) { + sc->freq_sink.clear(); + SNII_RETURN_IF_ERROR( + snii::format::build_freq_region(freqs, kRawFrqRegion, &sc->freq_sink, freq_meta)); + } else { + *freq_meta = FrqRegionMeta {}; + } + return Status::OK(); +} + +// Builds a single .prx window directly from a FLAT positions slice + its +// parallel freqs slice (doc d owns the next freqs[d] entries). 
Byte-identical +// to building from per-doc vectors, but with NO vector-of-vectors +// materialization: the writer indexes straight into the term's flat positions +// buffer. +Status MakePrxWindow(std::span positions_flat, std::span freqs, + std::vector* out) { + ByteSink sink; + SNII_RETURN_IF_ERROR( + snii::format::build_prx_window_flat(positions_flat, freqs, kAutoZstd, &sink)); + *out = sink.take(); + return Status::OK(); +} + +uint32_t MaxOf(std::span v) { + uint32_t m = 0; + for (uint32_t x : v) { + if (x > m) m = x; + } + return m; +} + +uint64_t SumOf(const std::vector& v) { + uint64_t s = 0; + for (uint32_t x : v) s += x; + return s; +} + +// Computes a window's WAND max_norm: the encoded norm yielding the LARGEST BM25 +// length contribution (smallest length penalty), i.e. the SMALLEST encoded norm +// among the window's docs (smaller dl => higher score). When norms are +// unavailable (no scoring), returns 0 -- decode_norm(0)=1.0 is the smallest +// possible dl, giving a correct (loosest) upper bound. +uint8_t WindowMaxNorm(const std::vector& norms, std::span docs) { + if (norms.empty() || docs.empty()) return 0; + uint8_t best = 0xFF; // decode_norm uses the byte directly; min byte => max score + for (uint32_t docid : docs) { + if (docid >= norms.size()) continue; // defensive: out-of-range doc has no norm + if (norms[docid] < best) best = norms[docid]; + } + return best == 0xFF ? 0 : best; +} + +// Window doc count by df: high-df windowed terms combine kFrqBaseUnit units +// into larger (kAdaptiveWindowDocs) windows; both are whole multiples of the +// base unit so .prx alignment and win_base/last_docid semantics are preserved. +uint32_t AdaptiveWindowDocs(uint32_t df) { + return df >= snii::format::kAdaptiveWindowDfThreshold ? snii::format::kAdaptiveWindowDocs + : snii::format::kFrqBaseUnit; +} + +// Builds the two-level .frq prelude for a windowed term and returns its bytes. 
+Status BuildPrelude(const std::vector& windows, bool has_freq, bool has_prx, + std::vector* out) { + FrqPreludeColumns cols; + cols.has_freq = has_freq; + cols.has_prx = has_prx; + cols.group_size = kPreludeGroupSize; + cols.windows = windows; + ByteSink sink; + SNII_RETURN_IF_ERROR(snii::format::build_frq_prelude(cols, &sink)); + *out = sink.take(); + return Status::OK(); +} + +void AppendBytes(std::vector* dst, const std::vector& src) { + dst->insert(dst->end(), src.begin(), src.end()); +} + +// One windowed term's grouped .frq layout (design 1.6): all dd regions form the +// dd-block, all freq regions form the freq-block. The final frq span is +// [prelude][dd-block][freq-block]. The .prx windows are STREAMED straight to +// the posting sink (the container output) during pass 1 (not buffered here) -- +// so the widest term's ~tens-of-MiB prx bytes never co-exist with the dd/freq +// blocks at peak RSS; only prx_total_len (the entry's prx byte span) is +// tracked. Per-window metadata (region offsets/lens/modes/crcs, prx_off within +// the entry) is recorded for the prelude. +struct WindowedPosting { + std::vector dd_block; // dd_region_0 ++ dd_region_1 ++ ... + std::vector freq_block; // freq_region_0 ++ ... (empty if !has_freq) + uint64_t prx_total_len = 0; // total .prx bytes streamed for this entry + std::vector windows; +}; + +// Fills a window's region locator fields in m from its dd/freq region metas and +// the running dd-block / freq-block offsets, then appends the region bytes to +// the blocks. has_freq controls whether the freq region is laid out. 
+void LayoutWindowRegions(const FrqRegionMeta& dd_meta, const std::vector& dd_bytes, + const FrqRegionMeta& freq_meta, const std::vector& freq_bytes, + bool has_freq, WindowedPosting* out, WindowMeta* m) { + m->dd_zstd = dd_meta.zstd; + m->dd_off = static_cast(out->dd_block.size()); + m->dd_disk_len = dd_meta.disk_len; + m->dd_uncomp_len = dd_meta.uncomp_len; + m->crc_dd = dd_meta.crc; + AppendBytes(&out->dd_block, dd_bytes); + if (!has_freq) return; + m->freq_zstd = freq_meta.zstd; + m->freq_off = static_cast(out->freq_block.size()); + m->freq_disk_len = freq_meta.disk_len; + m->freq_uncomp_len = freq_meta.uncomp_len; + m->crc_freq = freq_meta.crc; + AppendBytes(&out->freq_block, freq_bytes); +} + +// Splits a windowed term's postings into base-unit-aligned windows (size chosen +// by df via AdaptiveWindowDocs). Each window's dd/freq regions are encoded +// separately and grouped: all dd regions into the dd-block, all freq regions +// into the freq-block. Records per-window region metadata + WAND max_norm. +// +// TWO-PASS, MEMORY-AWARE: the widest term (df in the millions) is the dominant +// merge-phase peak-RSS source -- its flat positions_flat alone is tens of MiB +// and would otherwise co-exist with the encoded output blocks at the peak +// moment. +// pass 1 (prx): builds every window's .prx bytes, then FREES positions_flat +// (the single largest source array) before any dd/freq block +// grows. +// pass 2 (dd/freq): encodes the dd/freq regions from docids/freqs only. +// `tp` is taken by mutable reference; positions_flat is freed after pass 1 and +// docids/freqs are freed by the caller after this returns. Output bytes are +// byte-identical to the single-pass build (regions/prelude/prx are +// independent). 
+Status BuildWindowedPosting(TermPostings& tp, bool has_freq, bool has_prx, + const std::vector& norms, snii::io::FileWriter* posting_out, + WindowedPosting* out) { + const uint32_t unit = AdaptiveWindowDocs(static_cast(tp.docids.size())); + const size_t n = tp.docids.size(); + const std::span all_docs(tp.docids); + const std::span all_freqs(tp.freqs); + + // Size the per-term transient blocks up front so a very-high-df term (split + // into thousands of windows, dd/freq blocks of MiB) does not grow them by + // geometric doubling -- which would briefly hold the old+new buffer + // co-resident at the build peak. windows count is exact; dd/freq use a + // conservative byte/doc upper estimate (PFOR-packed deltas are typically <= 2 + // B/doc). Slack is freed when the term ends. + out->windows.reserve((n + unit - 1) / unit); + out->dd_block.reserve(n * 2); + if (has_freq) out->freq_block.reserve(n); + + WindowScratch sc; // reused across all windows (no per-window allocation churn) + + // ---- pass 1: prx (STREAMED to the output) + window skeleton ---- + // Each window's .prx bytes are appended straight to the posting sink + // (container output) as they are built, so the entry's full prx payload (tens + // of MiB for the widest term) is never buffered in RAM alongside the dd/freq + // blocks that pass 2 grows. m.prx_off is the byte offset WITHIN this entry's + // prx span (running prx_total_len), matching the reader's prx_off_delta + + // meta.prx_off contract. + { + // Positions come either from the flat buffer or, for very-high-df terms, + // from a sequential pump (so the term's full positions are never + // materialized). Both yield the SAME positions in the SAME order, so the + // prx bytes are identical. 
+ const bool streamed = static_cast(tp.pos_pump); + const std::span all_pos(tp.positions_flat); + std::vector win_pos_buf; // reused per window when streaming + uint64_t win_base = 0; + size_t pos_off = 0; + for (size_t start = 0; start < n; start += unit) { + const size_t len = std::min(unit, n - start); + const auto docs = all_docs.subspan(start, len); + const auto freqs = all_freqs.subspan(start, len); + WindowMeta m; + m.last_docid = docs.back(); + m.win_base = win_base; + m.doc_count = static_cast(docs.size()); + m.max_freq = MaxOf(freqs); + m.max_norm = WindowMaxNorm(norms, docs); + size_t win_pos = 0; + for (uint32_t f : freqs) win_pos += f; + if (has_prx) { + std::span pos_span; + if (streamed) { + win_pos_buf.resize(win_pos); + if (win_pos != 0) tp.pos_pump(win_pos_buf.data(), win_pos); + pos_span = std::span(win_pos_buf); + } else { + pos_span = all_pos.subspan(pos_off, win_pos); + } + sc.prx_sink.clear(); + SNII_RETURN_IF_ERROR(snii::format::build_prx_window_flat(pos_span, freqs, kAutoZstd, + &sc.prx_sink)); + m.prx_off = out->prx_total_len; + m.prx_len = static_cast(sc.prx_sink.size()); + SNII_RETURN_IF_ERROR(posting_out->append(sc.prx_sink.view())); + out->prx_total_len += m.prx_len; + } + pos_off += win_pos; + out->windows.push_back(m); + win_base = m.last_docid; + } + } + // Positions are fully consumed; free the largest source array before pass 2 + // grows the dd/freq blocks, so the source positions never co-exist with them. 
+ std::vector().swap(tp.positions_flat); + + // ---- pass 2: dd (and freq) regions from docids/freqs only ---- + uint64_t win_base = 0; + size_t wi = 0; + for (size_t start = 0; start < n; start += unit, ++wi) { + const size_t len = std::min(unit, n - start); + const auto docs = all_docs.subspan(start, len); + const auto freqs = all_freqs.subspan(start, len); + FrqRegionMeta dd_meta, freq_meta; + SNII_RETURN_IF_ERROR( + EncodeRegionsInto(&sc, docs, freqs, win_base, has_freq, &dd_meta, &freq_meta)); + LayoutWindowRegions(dd_meta, sc.dd_sink.buffer(), freq_meta, sc.freq_sink.buffer(), + has_freq, out, &out->windows[wi]); + win_base = out->windows[wi].last_docid; + } + return Status::OK(); +} + +} // namespace + +LogicalIndexWriter::LogicalIndexWriter(const SniiIndexInput& in) + : index_id_(in.index_id), + index_suffix_(in.index_suffix), + tier_(snii::format::tier_of(in.config)), + has_prx_(snii::format::has_positions(in.config)), + has_freq_(snii::format::tier_of(in.config) >= snii::format::IndexTier::kT2), + has_norms_(snii::format::has_scoring(in.config)), + doc_count_(in.doc_count), + null_docids_(in.null_docids), + terms_(in.terms), + term_source_(in.term_source), + encoded_norms_(in.encoded_norms), + target_dict_block_bytes_(in.target_dict_block_bytes != 0 + ? in.target_dict_block_bytes + : snii::format::kDefaultTargetDictBlockBytes), + // No independent dict cap: the dict spills via the writer's UNIFIED + // gate-2 cap (in.mem_reporter->over_cap()); UINT64_MAX disables the local + // per-buffer cap. 
+ dict_buf_(UINT64_MAX, "dict", in.mem_reporter) {} + +Status LogicalIndexWriter::validate_term(const TermPostings& tp) const { + if (tp.freqs.size() != tp.docids.size()) { + return Status::InvalidArgument("logical_index: freqs length must equal docids"); + } + if (has_prx_) { + uint64_t total_pos = 0; + for (uint32_t f : tp.freqs) total_pos += f; + // Streamed positions (pos_pump set): validate against the declared + // pos_total (positions_flat is intentionally empty). Otherwise validate the + // flat buffer. + const uint64_t have = tp.pos_pump ? tp.pos_total : tp.positions_flat.size(); + if (total_pos != have) { + return Status::InvalidArgument("logical_index: positions count must equal sum(freqs)"); + } + } + for (size_t i = 1; i < tp.docids.size(); ++i) { + if (tp.docids[i] <= tp.docids[i - 1]) { + return Status::InvalidArgument("logical_index: docids must be strictly ascending"); + } + } + return Status::OK(); +} + +// Emits a windowed term: splits into base-unit windows, encodes each window's +// dd/freq regions separately, groups them at posting level, builds a two-level +// prelude, and lays out [prx span][prelude][dd-block][freq-block] CONTIGUOUSLY +// in the single posting region (prx span first, then the frq span). Sets +// enc=windowed + has_sb. frq_docs_len = prelude_len + dd_block_len is the +// contiguous docs-only prefix, which stays INSIDE the frq span. +Status LogicalIndexWriter::build_windowed_entry(TermPostings& tp, uint64_t frq_base, + uint64_t prx_base, DictEntry* e) { + // The prx span starts here: pass 1 streams each .prx window straight into + // the posting sink, so prx_off_delta is measured against the live + // posting-sink size. + const uint64_t prx_off = posting_size(); + WindowedPosting wp; + SNII_RETURN_IF_ERROR( + BuildWindowedPosting(tp, has_freq_, has_prx_, encoded_norms_, posting_out_, &wp)); + // wp.prx_total_len bytes were just streamed straight to the posting sink (0 + // when !has_prx). 
docids/freqs are now fully encoded into wp; release the + // source arrays before the (potentially large) wp blocks are appended to + // disk. + std::vector().swap(tp.docids); + std::vector().swap(tp.freqs); + std::vector prelude; + SNII_RETURN_IF_ERROR(BuildPrelude(wp.windows, has_freq_, has_prx_, &prelude)); + + e->kind = DictEntryKind::kPodRef; + e->enc = DictEntryEnc::kWindowed; + e->has_sb = true; // prelude is always a two-level skip directory. + e->prelude_len = static_cast(prelude.size()); + e->frq_docs_len = + e->prelude_len + static_cast(wp.dd_block.size()); // [prelude][dd-block] + + // The frq span starts immediately AFTER the prx span, in the SAME sink. The + // writer-side property frq_off == prx_off + wp.prx_total_len holds because + // nothing is appended to the posting sink between the prx pass and here -- + // but the delta is measured from the live size, not assumed. + const uint64_t frq_off = posting_size(); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(prelude))); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(wp.dd_block))); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(wp.freq_block))); + e->frq_off_delta = frq_off - frq_base; + e->frq_len = posting_size() - frq_off; + if (has_prx_) { + e->prx_off_delta = prx_off - prx_base; + e->prx_len = wp.prx_total_len; // == frq_off - prx_off + } + return Status::OK(); +} + +// Emits a slim term as a single .frq window (win_base=0) laid out [dd][freq]: +// inline when the encoded bytes are tiny, otherwise a slim pod_ref (no +// prelude). The dd region is the docs-only prefix; the freq region (when +// has_freq) is the skippable suffix. Region codecs are recorded in the +// DictEntry. For a pod_ref, the term's [prx][frq] spans are appended to the +// single posting region with the prx span FIRST (consistent with the windowed +// path); the reader resolves each delta independently so the relative order is +// not load-bearing. 
+Status LogicalIndexWriter::build_slim_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + DictEntry* e) { + std::vector dd_bytes, freq_bytes; + FrqRegionMeta dd_meta, freq_meta; + SNII_RETURN_IF_ERROR(EncodeRegions(tp.docids, tp.freqs, /*win_base=*/0, has_freq_, &dd_bytes, + &dd_meta, &freq_bytes, &freq_meta)); + std::vector frq_win = dd_bytes; // [dd_region][freq_region] + AppendBytes(&frq_win, freq_bytes); + std::vector prx_win; + if (has_prx_) { + SNII_RETURN_IF_ERROR(MakePrxWindow(tp.positions_flat, tp.freqs, &prx_win)); + } + + e->enc = DictEntryEnc::kSlim; + e->dd_meta = dd_meta; + e->freq_meta = freq_meta; + + if (frq_win.size() <= snii::format::kDefaultInlineThreshold) { + e->kind = DictEntryKind::kInline; + e->inline_dd_disk_len = dd_meta.disk_len; + e->frq_bytes = std::move(frq_win); + if (has_prx_) e->prx_bytes = std::move(prx_win); + return Status::OK(); + } + + // POD_REF: write [prx][frq] into the single posting sink, prx span first. + e->kind = DictEntryKind::kPodRef; + e->frq_docs_len = dd_meta.disk_len; // docs-only prefix = the single dd region + if (has_prx_) { + const uint64_t prx_off = posting_size(); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(prx_win))); + e->prx_off_delta = prx_off - prx_base; + e->prx_len = posting_size() - prx_off; + } + const uint64_t frq_off = posting_size(); // immediately after the prx span + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(frq_win))); + e->frq_off_delta = frq_off - frq_base; + e->frq_len = posting_size() - frq_off; + return Status::OK(); +} + +// Builds the DictEntry for one term. Inline entries embed their .frq/.prx +// bytes; pod_ref entries append [prx][frq] bytes to the single posting region +// and record off_delta relative to frq_base/prx_base (the posting-region size +// captured when the block opened; both bases hold that same value). 
+Status LogicalIndexWriter::build_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + DictEntry* e) { + e->term = tp.term; + e->df = static_cast(tp.docids.size()); + e->ttf_delta = SumOf(tp.freqs); // simple: ttf stored directly as ttf_delta + e->max_freq = MaxOf(tp.freqs); + + if (e->df >= snii::format::kSlimDfThreshold) { + return build_windowed_entry(tp, frq_base, prx_base, e); + } + return build_slim_entry(tp, frq_base, prx_base, e); +} + +// Serializes the current open block, zstd-compresses it (the dict region is the +// single largest section -- term keys + entry meta + inline postings -- and the +// 64KiB blocks compress ~40%), streams the compressed bytes into the dict +// scratch file, and records a directory entry. The block-level crc32c +// (rec.checksum) covers the UNCOMPRESSED bytes, so DictBlockReader::open +// verifies integrity after the reader decompresses. A compressed block also +// shrinks the bytes a term lookup fetches from S3 -- aligning with the +// read-byte thesis. If zstd does not shrink a (tiny) block, it is stored raw so +// a lookup never pays a pointless decompress. 
+Status LogicalIndexWriter::flush_block(DictBlockBuilder* block, std::string first_term) { + ByteSink bsink; + block->finish(&bsink); + const Slice plain = bsink.view(); + BlockRecord rec; + rec.rel_offset = dict_buf_.size(); + rec.n_entries = block->n_entries(); + rec.checksum = snii::crc32c(plain); // crc over UNCOMPRESSED block bytes + rec.first_term = std::move(first_term); + + std::vector comp; + Status zs = snii::zstd_compress(plain, kDictBlockZstdLevel, &comp); + if (zs.ok() && comp.size() < plain.size()) { + rec.flags = snii::format::block_ref_flags::kZstd; + rec.uncomp_len = static_cast(plain.size()); + rec.length = static_cast(comp.size()); + SNII_RETURN_IF_ERROR(dict_buf_.append_move(std::move(comp))); + } else { + rec.flags = 0; + rec.uncomp_len = 0; + rec.length = static_cast(plain.size()); + SNII_RETURN_IF_ERROR(dict_buf_.append_move(bsink.take())); + } + blocks_.push_back(std::move(rec)); + return Status::OK(); +} + +// Running state for the in-flight DICT block while terms stream past. +struct LogicalIndexWriter::BlockState { + std::unique_ptr block; + std::string block_first_term; + uint64_t frq_base = 0; + uint64_t prx_base = 0; +}; + +Status LogicalIndexWriter::process_term(TermPostings& tp, BlockState* st) { + SNII_RETURN_IF_ERROR(validate_term(tp)); + // Collect only the 8-byte filter key per term (no whole-vocabulary string + // copy). BSBF key = XXH64 seed 0 (Parquet-canonical). + term_hashes_.push_back(snii::format::bsbf_hash(tp.term)); + ++term_count_; + stats_.sum_total_term_freq += SumOf(tp.freqs); + + if (!st->block) { + // Both bases come from the SAME posting sink, snapshotted at block open. 
+ const uint64_t base = posting_size(); + st->frq_base = base; + st->prx_base = base; + st->block = std::make_unique(tier_, has_prx_, st->frq_base, st->prx_base); + st->block_first_term = tp.term; + } + + DictEntry e; + SNII_RETURN_IF_ERROR(build_entry(tp, st->frq_base, st->prx_base, &e)); + st->block->add_entry(e); + + if (st->block->estimated_bytes() >= target_dict_block_bytes_) { + SNII_RETURN_IF_ERROR(flush_block(st->block.get(), st->block_first_term)); + st->block.reset(); + } + return Status::OK(); +} + +Status LogicalIndexWriter::build_blocks() { + BlockState st; + if (term_source_ != nullptr) { + Status streamed = Status::OK(); + // Drain the SPIMI buffer term-by-term; only one TermPostings is alive at a + // time, so the input+output never fully coexist. The returned Status covers + // both spill/merge I/O errors and add_token validation errors (the latter + // flow through merge_runs -> spill_status_), so a separate status() check + // is no longer needed. + SNII_RETURN_IF_ERROR(term_source_->for_each_term_sorted([&](TermPostings&& tp) { + if (streamed.ok()) streamed = process_term(tp, &st); + })); + SNII_RETURN_IF_ERROR(streamed); + } else { + // Materialized fallback (tests / callers holding a vector): process_term + // frees the term's arrays, so feed a per-term COPY to keep terms_ intact + // for the caller. This path is not the large out-of-core build, so the copy + // is cheap. 
+ for (const auto& tp : terms_) { + TermPostings copy = tp; + SNII_RETURN_IF_ERROR(process_term(copy, &st)); + } + } + if (st.block) SNII_RETURN_IF_ERROR(flush_block(st.block.get(), st.block_first_term)); + return Status::OK(); +} + +Status LogicalIndexWriter::build(snii::io::FileWriter* posting_out) { + if (posting_out == nullptr) { + return Status::InvalidArgument("logical_index: null posting sink"); + } + if (has_norms_ && encoded_norms_.size() != doc_count_) { + return Status::InvalidArgument("logical_index: norms length must equal doc_count"); + } + // The interleaved posting region streams STRAIGHT into the container output + // (no temp round-trip): posting_size() is the region-relative byte count, + // derived from the output offset advanced since this index's region began. + // The DICT region is staged in dict_buf_ (tiered: RAM under the cap = + // spill-only; spills above it) since it must land contiguously after the + // concurrently-streamed posting region. + posting_out_ = posting_out; + posting_off0_ = posting_out->bytes_written(); + + SNII_RETURN_IF_ERROR(build_blocks()); + // Seal the dict buffer so a spilled temp is flushed before + // stream_dict_region_into reads it back. A no-op for a RAM-resident dict. 
+ SNII_RETURN_IF_ERROR(dict_buf_.seal()); + + stats_.doc_count = doc_count_; + stats_.indexed_doc_count = doc_count_ - static_cast(null_docids_.size()); + stats_.term_count = term_count_; + stats_.null_count = static_cast(null_docids_.size()); + + if (has_norms_) { + snii::format::NormsPodWriter nw; + for (uint8_t n : encoded_norms_) nw.add(n); + ByteSink nsink; + nw.finish(&nsink); + norms_section_ = nsink.take(); + } + + if (!null_docids_.empty()) { + snii::format::NullBitmapWriter null_writer; + for (uint32_t docid : null_docids_) null_writer.add_null(docid); + ByteSink null_sink; + null_writer.finish(doc_count_, &null_sink); + null_bitmap_section_ = null_sink.take(); + } + + // Build the absent-term filter (block-split bloom, Parquet-canonical) from + // the per-term keys (no retained strings) as a [28B header][bitset] blob; the + // compound writer places it as a PHYSICAL section probed one 32-byte block on + // demand. + bsbf_bytes_.clear(); + if (!term_hashes_.empty()) { + snii::format::BsbfBuilder bf; + SNII_RETURN_IF_ERROR(snii::format::BsbfBuilder::create( + static_cast(term_hashes_.size()), kBsbfFpp, &bf)); + for (uint64_t k : term_hashes_) bf.insert(k); + ByteSink bsink; + SNII_RETURN_IF_ERROR(bf.serialize(&bsink)); + bsbf_bytes_ = bsink.take(); + } + std::vector().swap(term_hashes_); // release + return Status::OK(); +} + +Status LogicalIndexWriter::finish_meta(const SectionRefs& abs_refs, uint64_t dict_region_offset, + ByteSink* out) const { + if (out == nullptr) return Status::InvalidArgument("logical_index: null meta sink"); + + SampledTermIndexBuilder sti; + for (const auto& b : blocks_) sti.add_block_first_term(b.first_term); + ByteSink sti_sink; + sti.finish(&sti_sink); + + DictBlockDirectoryBuilder dir; + for (const auto& b : blocks_) { + BlockRef ref; + ref.offset = dict_region_offset + b.rel_offset; + ref.length = b.length; + ref.n_entries = b.n_entries; + ref.flags = b.flags; + ref.checksum = b.checksum; + ref.uncomp_len = b.uncomp_len; + 
dir.add(ref); + } + ByteSink dir_sink; + dir.finish(&dir_sink); + + uint32_t flags = bsbf_bytes_.empty() ? 0u : PerIndexMetaBuilder::kHasBsbf; + // Persist positions capability explicitly (the R1 fix): the reader must NOT + // infer it from posting_region.length, which is non-zero for any docs-only + // pod_ref index. + if (has_prx_) flags |= PerIndexMetaBuilder::kHasPositions; + PerIndexMetaBuilder builder(index_id_, index_suffix_, flags); + builder.set_stats(stats_); + builder.set_sampled_term_index(sti_sink.view()); + builder.set_dict_block_directory(dir_sink.view()); + // The BSBF is a physical section (abs_refs.bsbf), not embedded in the meta. + builder.set_section_refs(abs_refs); + return builder.finish(out); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp b/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp new file mode 100644 index 00000000000000..8e6f9b9adc61b3 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp @@ -0,0 +1,146 @@ +#include "snii/writer/snii_compound_writer.h" + +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/crc32c.h" +#include "snii/format/bootstrap_header.h" +#include "snii/format/per_index_meta.h" // SectionRefs +#include "snii/format/tail_meta_region.h" +#include "snii/format/tail_pointer.h" + +namespace snii::writer { + +using snii::format::BootstrapHeader; +using snii::format::SectionRefs; +using snii::format::TailMetaRegionBuilder; +using snii::format::TailPointer; + +SniiCompoundWriter::SniiCompoundWriter(snii::io::FileWriter* out) : out_(out) {} + +Status SniiCompoundWriter::append(const std::vector& bytes) { + if (bytes.empty()) return Status::OK(); + return out_->append(Slice(bytes)); +} + +// The bootstrap header occupies offset 0 and must precede the first posting region, +// which streams straight into the output during build(). 
Written lazily exactly once +// (on the first add, or in finish() for an empty container). +Status SniiCompoundWriter::ensure_bootstrap() { + if (bootstrap_written_) return Status::OK(); + bootstrap_written_ = true; + return write_bootstrap(); +} + +Status SniiCompoundWriter::add_logical_index(const SniiIndexInput& in) { + if (out_ == nullptr) return Status::InvalidArgument("compound: null file writer"); + if (finished_) return Status::Internal("compound: add after finish"); + SNII_RETURN_IF_ERROR(ensure_bootstrap()); + auto liw = std::make_unique(in); + Placement p; + // The posting region streams DIRECTLY into the container during build() -- no temp + // round-trip for the bulk -- followed immediately by this index's compact DICT + // trailer (produced interleaved into a temp, but laid out right after its posting + // region, preserving the per-index [posting][dict] layout). Offsets are read off + // the output writer (the single source of truth -- no separate cursor). + p.post_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(liw->build(out_)); + p.post_len = out_->bytes_written() - p.post_off; + p.dict_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(liw->stream_dict_region_into(out_)); + p.dict_len = out_->bytes_written() - p.dict_off; + indexes_.push_back(std::move(liw)); + placements_.push_back(p); + return Status::OK(); +} + +Status SniiCompoundWriter::write_bootstrap() { + BootstrapHeader bh; + bh.tail_pointer_size = static_cast(snii::format::tail_pointer_size()); + ByteSink sink; + SNII_RETURN_IF_ERROR(snii::format::encode_bootstrap_header(bh, &sink)); + return append(sink.buffer()); +} + +// Writes each index's norms POD then bsbf section (in add order), after all the +// per-index [posting][dict] regions. 
+Status SniiCompoundWriter::write_norms() { + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + if (!w.has_norms() || w.norms_bytes().empty()) continue; + Placement& p = placements_[i]; + p.norms_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(w.norms_bytes())); + p.norms_len = out_->bytes_written() - p.norms_off; + } + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + if (!w.has_null_bitmap()) continue; + Placement& p = placements_[i]; + p.null_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(w.null_bitmap_bytes())); + p.null_len = out_->bytes_written() - p.null_off; + } + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + if (!w.has_bsbf()) continue; + Placement& p = placements_[i]; + p.bsbf_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(w.bsbf_bytes())); + p.bsbf_len = out_->bytes_written() - p.bsbf_off; + } + return Status::OK(); +} + +Status SniiCompoundWriter::write_tail() { + TailMetaRegionBuilder region; + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + const Placement& p = placements_[i]; + + SectionRefs refs; + refs.dict_region = {p.dict_off, p.dict_len}; + refs.posting_region = {p.post_off, p.post_len}; + refs.norms = {p.norms_off, p.norms_len}; + refs.null_bitmap = {p.null_off, p.null_len}; + refs.bsbf = {p.bsbf_off, p.bsbf_len}; + + ByteSink meta; + SNII_RETURN_IF_ERROR(w.finish_meta(refs, p.dict_off, &meta)); + region.add_index(w.index_id(), w.index_suffix(), meta.view()); + } + + ByteSink region_sink; + region.finish(®ion_sink); + const uint64_t region_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(region_sink.buffer())); + const uint64_t region_len = out_->bytes_written() - region_off; + + TailPointer tp; + tp.meta_region_offset = region_off; + tp.meta_region_length = region_len; + tp.hot_off = 0; + tp.meta_region_checksum = 
snii::crc32c(region_sink.view()); + // Reserved: the bootstrap header carries (and decode_bootstrap_header verifies) its + // OWN internal crc32c, so a tail-pointer copy is redundant. Left 0 until a cross- + // region check needs it; the tail pointer's own tail_checksum still covers this + // field's bytes. + tp.bootstrap_header_checksum = 0; + ByteSink tail_sink; + SNII_RETURN_IF_ERROR(snii::format::encode_tail_pointer(tp, &tail_sink)); + return append(tail_sink.buffer()); +} + +Status SniiCompoundWriter::finish() { + if (out_ == nullptr) return Status::InvalidArgument("compound: null file writer"); + if (finished_) return Status::Internal("compound: finish called twice"); + finished_ = true; + + SNII_RETURN_IF_ERROR(ensure_bootstrap()); // empty container still gets a header + SNII_RETURN_IF_ERROR(write_norms()); + SNII_RETURN_IF_ERROR(write_tail()); + return out_->finalize(); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp b/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp new file mode 100644 index 00000000000000..e68ba24b9a4164 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp @@ -0,0 +1,597 @@ +#include "snii/writer/spill_run_codec.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "snii/encoding/varint.h" +#include "snii/format/format_constants.h" + +namespace snii::writer { + +namespace { + +// Flush staging once it grows past this. A LARGE write buffer (4 MiB) collapses +// the per-flush write() syscall count by ~64x: at 64 KiB the 5M build issued +// ~8800 write()s to ext4 (~9s of syscall overhead) for ~553 MiB of runs, versus +// a raw dd of the same bytes taking ~1.2s. Runs are PRIVATE temp files, so the +// on-disk index is unaffected; the only cost is a slightly larger transient +// RunWriter staging buffer (4 MiB, bounded, freed at close()). 
+constexpr size_t kWriteFlushBytes = 1u << 22; // 4 MiB +// RunReader reads this much per disk fill; the window slides so a single record +// never needs the whole run in RAM (only the current term's encoded span). KEEP +// this small (64 KiB): a large read chunk x many open runs would inflate the +// merge-phase peak RSS at low spill thresholds (each reader holds a window). +constexpr size_t kReadChunkBytes = 1u << 16; // 64 KiB + +void AppendVarint(std::vector* buf, uint64_t v) { + uint8_t tmp[10]; + const size_t n = encode_varint64(v, tmp); + buf->insert(buf->end(), tmp, tmp + n); +} + +// Appends a block of `count` uint32 values as RAW little-endian fixed-width bytes +// (memcpy from contiguous source). Runs are private temp files; the on-disk index +// is unaffected. Raw blocks make encode/decode ~10x cheaper than per-value varint +// for the freqs/positions streams (which compress poorly as varints anyway), at +// the cost of a modestly larger temp run. Empty source is a no-op. +void AppendRawU32(std::vector* buf, const uint32_t* src, size_t count) { + if (count == 0) return; + const auto* bytes = reinterpret_cast(src); + buf->insert(buf->end(), bytes, bytes + count * sizeof(uint32_t)); +} + +// Writes the full byte range [data, data+len) to fd, looping over short writes. 
+Status WriteAll(int fd, const uint8_t* data, size_t len) { + size_t off = 0; + while (off < len) { + const ssize_t n = ::write(fd, data + off, len - off); + if (n < 0) { + if (errno == EINTR) continue; + return Status::IoError(std::string("run write failed: ") + std::strerror(errno)); + } + off += static_cast(n); + } + return Status::OK(); +} + +} // namespace + +// --------------------------------------------------------------------------- +// RunWriter +// --------------------------------------------------------------------------- + +RunWriter::~RunWriter() { + if (fd_ >= 0) ::close(fd_); +} + +Status RunWriter::open(const std::string& path) { + fd_ = ::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd_ < 0) { + return Status::IoError("run open(" + path + "): " + std::strerror(errno)); + } + buf_.clear(); + return Status::OK(); +} + +Status RunWriter::flush() { + if (buf_.empty()) return Status::OK(); + SNII_RETURN_IF_ERROR(WriteAll(fd_, buf_.data(), buf_.size())); + buf_.clear(); + return Status::OK(); +} + +Status RunWriter::write_term(uint32_t term_id, const TermPostings& tp) { + AppendVarint(&buf_, term_id); + AppendVarint(&buf_, tp.docids.size()); + // Docids are a RAW fixed-width u32 block (bulk memcpy), NOT per-value VInt. + // Per-value varint over ~60M docids cost ~1.5s of encode CPU on the spill feed + // side; raw is a single memcpy and the decode side becomes a memcpy too. Runs + // are PRIVATE temp files written then read back from page cache, so the modestly + // larger run (no delta packing) costs ~0 extra real I/O. Absolute docids are + // stored (the merge concatenates per-term across runs and re-deltas at encode). + AppendRawU32(&buf_, tp.docids.data(), tp.docids.size()); + // Freqs + positions are RAW fixed-width u32 blocks (bulk memcpy). The decoder + // reads them back the same way; n_pos == positions_flat.size() is recoverable + // from sum(freqs), but is written explicitly so a reader can size the block. 
+ AppendRawU32(&buf_, tp.freqs.data(), tp.freqs.size()); + const uint64_t n_pos = tp.positions_flat.size(); + AppendVarint(&buf_, n_pos); + AppendRawU32(&buf_, tp.positions_flat.data(), tp.positions_flat.size()); + if (buf_.size() >= kWriteFlushBytes) SNII_RETURN_IF_ERROR(flush()); + return Status::OK(); +} + +Status RunWriter::close() { + if (fd_ < 0) return Status::OK(); + SNII_RETURN_IF_ERROR(flush()); + const int fd = fd_; + fd_ = -1; + if (::close(fd) != 0) { + return Status::IoError(std::string("run close: ") + std::strerror(errno)); + } + return Status::OK(); +} + +// --------------------------------------------------------------------------- +// RunReader +// --------------------------------------------------------------------------- + +RunReader::~RunReader() { + if (fd_ >= 0) ::close(fd_); +} + +Status RunReader::open(const std::string& path, bool has_positions) { + fd_ = ::open(path.c_str(), O_RDONLY); + if (fd_ < 0) { + return Status::IoError("run reopen(" + path + "): " + std::strerror(errno)); + } + // Record the run's byte size so every length decoded from the stream can be + // bounded against it before allocating (no record holds more u32s than the whole + // file). Honors the header's "lengths validated against the file size" contract, + // turning a corrupt/truncated length into Status::Corruption rather than an + // uncaught std::bad_alloc from a giant resize(). + struct stat st {}; + if (::fstat(fd_, &st) != 0) { + return Status::IoError(std::string("run fstat: ") + std::strerror(errno)); + } + file_size_ = static_cast(st.st_size); + has_positions_ = has_positions; + exhausted_ = false; + eof_ = false; + pos_ = 0; + pos_count_ = 0; + pos_remaining_ = 0; + window_.clear(); + return advance(); +} + +// Slides consumed bytes out of the window, then appends one disk chunk. 
+Status RunReader::fill() { + if (pos_ > 0) { + window_.erase(window_.begin(), window_.begin() + pos_); + pos_ = 0; + } + if (eof_) return Status::OK(); + const size_t base = window_.size(); + window_.resize(base + kReadChunkBytes); + ssize_t n; + do { + n = ::read(fd_, window_.data() + base, kReadChunkBytes); + } while (n < 0 && errno == EINTR); + if (n < 0) return Status::IoError(std::string("run read: ") + std::strerror(errno)); + window_.resize(base + static_cast(n)); + if (n == 0) eof_ = true; + return Status::OK(); +} + +// Buffered bytes available to the decoder right now (from pos_ to window end). +// fill() may slide the window (erasing consumed bytes), so callers must compare +// THIS quantity -- not window_.size() -- to decide whether more data arrived. +size_t RunReader::available() const { + return window_.size() - pos_; +} + +Status RunReader::ensure(size_t n) { + while (available() < n) { + const size_t had = available(); + SNII_RETURN_IF_ERROR(fill()); + if (available() == had && eof_) { + return Status::Corruption("run truncated: needed more bytes than available"); + } + } + return Status::OK(); +} + +// Streamed varint: decode from the current window; if it straddles the buffered +// boundary, top up from disk and retry. A varint is at most 10 bytes, so this +// loops at most a couple of times. Bounds-safe: decode_varint64 never reads past +// `end`, and a partial varint at true eof is reported as corruption. 
+Status RunReader::read_varint(uint64_t* v) { + while (true) { + const uint8_t* p = window_.data() + pos_; + const uint8_t* end = window_.data() + window_.size(); + const uint8_t* next = nullptr; + Status s = decode_varint64(p, end, v, &next); + if (s.ok()) { + pos_ += static_cast(next - p); + return Status::OK(); + } + if (eof_) return Status::Corruption("run truncated: incomplete varint"); + const size_t had = available(); + SNII_RETURN_IF_ERROR(fill()); + if (available() == had && eof_) { + return Status::Corruption("run truncated: incomplete varint at eof"); + } + } +} + +// Streams `count` raw little-endian u32s from the window into `dst` (caller-owned +// storage of at least count*4 bytes), topping up the window from disk as needed. +// Copies whatever is buffered each pass (the window may hold only part of a large +// block), so a high-df term's freqs/positions stream through in 64 KiB chunks +// without ever needing the whole block resident at once. +Status RunReader::pull_raw_u32(uint8_t* dst, size_t count) { + if (count == 0) return Status::OK(); + size_t need = count * sizeof(uint32_t); + size_t written = 0; + while (need > 0) { + if (available() == 0) { + const size_t had = available(); + SNII_RETURN_IF_ERROR(fill()); + if (available() == had && eof_) { + return Status::Corruption("run truncated: needed more raw bytes than available"); + } + } + const size_t take = std::min(need, available()); + std::memcpy(dst + written, window_.data() + pos_, take); + pos_ += take; + written += take; + need -= take; + } + return Status::OK(); +} + +// Bulk-decodes `count` raw u32s into `out` (resized to count). +Status RunReader::read_raw_u32(size_t count, std::vector* out) { + // Bound `count` against the run's byte size BEFORE resize(): a record can never + // hold more u32s than the whole file. Rejects a corrupt/truncated length varint + // (which is otherwise an unbounded resize -> uncaught std::bad_alloc). 
+ if (count > file_size_ / sizeof(uint32_t)) { + return Status::Corruption("run: raw u32 count exceeds file size"); + } + out->resize(count); + if (count == 0) return Status::OK(); + return pull_raw_u32(reinterpret_cast(out->data()), count); +} + +// Materializes the current term's deferred position block into positions_flat. +// A no-op once the positions are already drained (idempotent within a term). +Status RunReader::materialize_positions() { + if (pos_remaining_ == 0) { + current_.positions_flat.clear(); + return Status::OK(); + } + const size_t n = static_cast(pos_remaining_); + if (has_positions_) { + SNII_RETURN_IF_ERROR(read_raw_u32(n, ¤t_.positions_flat)); + } else { + // No-positions runs should carry n_pos == 0; tolerate (skip) a stray block. + std::vector skip; + SNII_RETURN_IF_ERROR(read_raw_u32(n, &skip)); + current_.positions_flat.clear(); + } + pos_remaining_ = 0; + return Status::OK(); +} + +// Streams the next `n` positions of the current term straight from the window. +Status RunReader::stream_positions(uint32_t* dst, size_t n) { + if (n == 0) return Status::OK(); + if (n > pos_remaining_) { + return Status::Corruption("run: stream_positions past block end"); + } + SNII_RETURN_IF_ERROR(pull_raw_u32(reinterpret_cast(dst), n)); + pos_remaining_ -= n; + return Status::OK(); +} + +// Discards any positions of the current term left unread, so the window cursor +// lands at the next record boundary before advance() reads the next term. +Status RunReader::skip_remaining_positions() { + if (pos_remaining_ == 0) return Status::OK(); + const size_t n = static_cast(pos_remaining_); + std::vector skip; + SNII_RETURN_IF_ERROR(read_raw_u32(n, &skip)); + pos_remaining_ = 0; + return Status::OK(); +} + +Status RunReader::advance() { + // Drain any positions the owner left unread for the previous term so the window + // cursor lands at the next record boundary. 
+ SNII_RETURN_IF_ERROR(skip_remaining_positions()); + // End-of-run detection: at a record boundary, if no bytes remain we are done. + if (available() == 0) { + SNII_RETURN_IF_ERROR(fill()); + if (available() == 0 && eof_) { + exhausted_ = true; + return Status::OK(); + } + } + uint64_t term_id = 0; + SNII_RETURN_IF_ERROR(read_varint(&term_id)); + if (term_id > UINT32_MAX) return Status::Corruption("run term_id exceeds uint32"); + current_id_ = static_cast(term_id); + current_.term.clear(); // runs store only the id; owner resolves the string + + uint64_t n_docs = 0; + SNII_RETURN_IF_ERROR(read_varint(&n_docs)); + // Docids: RAW absolute u32 block (bulk read), matching the writer's AppendRawU32. + SNII_RETURN_IF_ERROR(read_raw_u32(static_cast(n_docs), ¤t_.docids)); + // Freqs: RAW u32 block (bulk read), matching the writer's AppendRawU32. + SNII_RETURN_IF_ERROR(read_raw_u32(static_cast(n_docs), ¤t_.freqs)); + uint64_t n_pos = 0; + SNII_RETURN_IF_ERROR(read_varint(&n_pos)); + // Positions are LAZY: record the block count and leave the window cursor parked + // at the block start. The owner picks materialize_positions() (default) or + // stream_positions() (wide-term merge pump). The widest term's tens-of-MiB + // position block is thus never resident unless the owner asks for it whole. + current_.positions_flat.clear(); + pos_count_ = n_pos; + pos_remaining_ = n_pos; + return Status::OK(); +} + +// --------------------------------------------------------------------------- +// K-way merge +// --------------------------------------------------------------------------- + +namespace { + +// Min-heap entry: orders by the run's current term-id's VOCAB STRING, tie-broken +// by run index so equal terms are gathered run-order (keeping concatenated +// docids ascending). The comparator resolves id -> string via the shared vocab, +// so the merged stream is lexicographic (the dictionary order the writer needs). 
+struct HeapItem { + uint32_t term_id; + size_t run; +}; +struct HeapGreater { + const std::vector* vocab; + bool operator()(const HeapItem& a, const HeapItem& b) const { + const std::string& sa = (*vocab)[a.term_id]; + const std::string& sb = (*vocab)[b.term_id]; + if (sa != sb) return sa > sb; + return a.run > b.run; + } +}; + +// Appends src's postings onto dst (run order). Later runs only cover docids +// >= dst's last, so docids stay ascending. COALESCE the boundary doc: if a spill +// fell BETWEEN two tokens of the same doc, that doc ends one run and begins the +// next with the SAME docid -- merge them (sum freqs, splice positions) so the +// merged term has exactly one entry per docid (matching the in-memory build). +// +// Positions are FLAT: doc order, partitioned by freqs. Because both dst and src +// already store doc-ordered flat positions, the common (no-boundary-overlap) case +// is a single bulk append. The boundary-overlap case must INSERT src's first +// doc's positions right after dst's last doc's positions so flat order stays +// consistent with the merged (coalesced) freqs. +void Concat(TermPostings* dst, const TermPostings& src, bool has_positions) { + if (src.docids.empty()) return; + size_t start = 0; + size_t src_pos_start = 0; // flat offset of src positions to append after splice + if (!dst->docids.empty() && dst->docids.back() == src.docids.front()) { + const uint32_t head_fc = src.freqs.front(); + if (has_positions && head_fc != 0) { + // Splice src's first-doc positions in right after dst's last-doc positions. + // dst's last doc owns dst->freqs.back() entries at the tail of positions_flat + // BEFORE we bump that freq, so insert at end() (last doc is the tail run). 
+ auto& flat = dst->positions_flat; + flat.insert(flat.end(), src.positions_flat.begin(), + src.positions_flat.begin() + head_fc); + } + dst->freqs.back() += head_fc; + src_pos_start = head_fc; + start = 1; // boundary doc folded in; append the rest + } + dst->docids.insert(dst->docids.end(), src.docids.begin() + start, src.docids.end()); + dst->freqs.insert(dst->freqs.end(), src.freqs.begin() + start, src.freqs.end()); + if (has_positions) { + dst->positions_flat.insert(dst->positions_flat.end(), + src.positions_flat.begin() + src_pos_start, + src.positions_flat.end()); + } +} + +// Coalesces ONLY docids/freqs (no positions). Used by the WIDE-term path, whose +// positions are streamed via a pos_pump instead of materialized. The boundary-doc +// freq merge (dst->freqs.back() += head_fc) is identical to Concat's, so the +// merged df / freqs / ttf are bit-for-bit the same; positions are emitted in pure +// run-order concatenation by the pump (the same byte stream Concat would build). +void ConcatDocsFreqs(TermPostings* dst, const TermPostings& src) { + if (src.docids.empty()) return; + size_t start = 0; + if (!dst->docids.empty() && dst->docids.back() == src.docids.front()) { + dst->freqs.back() += src.freqs.front(); + start = 1; // boundary doc folded in; append the rest + } + dst->docids.insert(dst->docids.end(), src.docids.begin() + start, src.docids.end()); + dst->freqs.insert(dst->freqs.end(), src.freqs.begin() + start, src.freqs.end()); +} + +// A merged term is emitted with a STREAMED position pump (instead of a +// materialized positions_flat) when it is wide enough that its full flat +// positions would dominate the merge-phase peak RSS. The writer routes any term +// with df >= kSlimDfThreshold through the windowed path (build_windowed_entry), +// which is the only path that consumes pos_pump; a slim term reads positions_flat +// directly, so it must always be materialized. 
// Gating on the same df threshold the writer uses keeps the two in lockstep and is
// conservative: only the few genuinely-wide terms (led by the single widest, the
// merge-phase peak driver) take the streamed path. total_pos != 0 is also
// required, so a degenerate wide term with no positions is never routed to the
// pump (there would be nothing to stream).
bool ShouldStreamPositions(uint64_t total_docs, uint64_t total_pos, bool has_positions) {
    return has_positions && total_pos != 0 && total_docs >= snii::format::kSlimDfThreshold;
}

} // namespace

// K-way merges the spill runs in `run_paths` and hands each merged term to `fn`
// exactly once, in lexicographic order of its vocab string.
//
//   run_paths              run files, in spill order (run index breaks ties so
//                          concatenated docids stay ascending)
//   vocab                  id -> term string table; every term id read from a run
//                          must be < vocab.size()
//   has_positions          whether position blocks are carried into the output
//   fn                     sink for each merged term; invoked synchronously
//   allow_stream_positions lets wide terms deliver positions through
//                          TermPostings::pos_pump instead of positions_flat
//
// Returns Corruption for an out-of-range term id, or the first error reported by
// a run reader (including one latched by the position pump while fn() ran).
//
// NOTE(review): the template arguments of `fn` are not visible in this view of
// the file; void(TermPostings&&) is inferred from the `fn(std::move(merged))`
// call sites whose result is unused -- confirm against the declaration in
// snii/writer/spill_run_codec.h.
Status MergeRuns(const std::vector<std::string>& run_paths, const std::vector<std::string>& vocab,
                 bool has_positions, const std::function<void(TermPostings&&)>& fn,
                 bool allow_stream_positions) {
    std::vector<std::unique_ptr<RunReader>> readers;
    readers.reserve(run_paths.size());
    std::priority_queue<HeapItem, std::vector<HeapItem>, HeapGreater> heap(HeapGreater {&vocab});
    // Open every run (open() loads its first term) and seed the heap with each
    // non-empty run's head term. Exhausted readers are still stored so that
    // readers[i] always corresponds to run_paths[i].
    for (size_t i = 0; i < run_paths.size(); ++i) {
        auto r = std::make_unique<RunReader>();
        SNII_RETURN_IF_ERROR(r->open(run_paths[i], has_positions));
        if (!r->exhausted()) {
            if (r->current_id() >= vocab.size()) {
                return Status::Corruption("run term_id out of vocab range");
            }
            heap.push({r->current_id(), i});
        }
        readers.push_back(std::move(r));
    }

    std::vector<size_t> matching; // run indices contributing the current term
    while (!heap.empty()) {
        const uint32_t id = heap.top().term_id;
        TermPostings merged;
        merged.term = vocab[id]; // resolve the id -> dictionary string once
        // Gather every run whose head id maps to the same string (the heap's run
        // tie-break keeps them in run order, so concatenated docids stay ascending).
        // Equal strings imply equal ids for a dense vocab; compare by string so a
        // duplicate string still groups correctly. The matching runs' current slices
        // are already loaded in their readers (they were read to seed the heap), so
        // summing their sizes here costs nothing extra in RAM.
        matching.clear();
        uint64_t total_docs = 0, total_pos = 0;
        while (!heap.empty() && vocab[heap.top().term_id] == merged.term) {
            const size_t ri = heap.top().run;
            heap.pop();
            const RunReader* r = readers[ri].get();
            total_docs += r->current().docids.size();
            total_pos += r->current_pos_count(); // positions are LAZY: use the count
            matching.push_back(ri);
        }
        // Reserve EXACTLY the summed sizes (an upper bound -- boundary-doc coalescing
        // only shrinks the final size). This eliminates std::vector's geometric
        // over-allocation, which left ~32 MiB of dead capacity on the widest term (df
        // in the millions split across spills) -- a dominant merge-phase peak-RSS
        // overhang at 5M. The reserved-but-unwritten pages are not faulted in, so the
        // empty reservation itself does not raise RSS; only the actual data does.
        merged.docids.reserve(static_cast<size_t>(total_docs));
        merged.freqs.reserve(static_cast<size_t>(total_docs));

        bool stream = allow_stream_positions &&
                      ShouldStreamPositions(total_docs, total_pos, has_positions);
        if (!stream && has_positions) {
            merged.positions_flat.reserve(static_cast<size_t>(total_pos));
        }
        // Coalesce docids/freqs from every matching run (always materialized -- a few
        // u32 vectors). For the non-wide case, also coalesce positions here. For the
        // wide case, leave positions for the streamed pump and keep the readers PARKED
        // at their position blocks until fn() drains the pump.
        for (size_t ri : matching) {
            RunReader* r = readers[ri].get();
            if (stream) {
                ConcatDocsFreqs(&merged, r->current());
            } else {
                if (has_positions) SNII_RETURN_IF_ERROR(r->materialize_positions());
                Concat(&merged, r->current(), has_positions);
            }
        }

        // The stream gate keyed on PRE-coalesce total_docs, but the writer's slim vs
        // windowed dispatch keys on the POST-coalesce df (merged.docids.size()).
        // Boundary-doc coalescing across spill seams can drop df below kSlimDfThreshold
        // while total_docs stayed above it; that term routes to build_slim_entry, which
        // reads positions_flat directly and ignores pos_pump. Materialize positions now
        // from the still-parked readers (mirrors drain_sorted()'s slim fallback).
        if (stream && merged.docids.size() < snii::format::kSlimDfThreshold) {
            merged.positions_flat.reserve(static_cast<size_t>(total_pos));
            for (size_t ri : matching) {
                RunReader* r = readers[ri].get();
                SNII_RETURN_IF_ERROR(r->materialize_positions());
                const std::vector<uint32_t>& pf = r->current().positions_flat;
                merged.positions_flat.insert(merged.positions_flat.end(), pf.begin(), pf.end());
            }
            stream = false;
        }

        if (stream) {
            // WIDE term: STREAM positions via a pump that walks the matching readers in
            // run order (pure flat concatenation == the coalesced positions_flat,
            // byte-for-byte). positions_flat stays empty -- the widest term's tens-of-MiB
            // position buffer is never resident; only one ~64 KiB window per pull is. The
            // readers are still parked at this term's blocks, so the pump pulls from them
            // synchronously while fn() runs (fn consumes synchronously -- the windowed
            // writer does). After fn(), advance the readers past the (now-drained) blocks.
            merged.pos_total = total_pos;
            size_t cursor = 0; // index into `matching` for the run currently being drained
            Status pump_status = Status::OK();
            std::vector<std::unique_ptr<RunReader>>* rd = &readers;
            const std::vector<size_t>* match = &matching;
            // Self-contained liveness guard. The pump captures references into THIS stack
            // frame (&cursor, &pump_status) and the parked run readers (rd/match), valid
            // ONLY while fn() runs synchronously -- after fn() the readers advance past the
            // drained blocks. `pump_alive` is heap-owned and captured BY VALUE, so a
            // stored/deferred pos_pump fails loudly (throws) instead of dereferencing
            // dangling state. See the contract on TermPostings::pos_pump.
            auto pump_alive = std::make_shared<bool>(true);
            // Fills dst[0, n) with the next n positions of the merged term. Every slot
            // is always written: real positions first, zeros for any tail that could
            // not be read.
            merged.pos_pump = [rd, match, &cursor, &pump_status, pump_alive](uint32_t* dst,
                                                                             size_t n) {
                if (!*pump_alive) {
                    throw std::logic_error(
                            "TermPostings::pos_pump invoked after its producing merge scope ended; "
                            "the streamed TermPostings must be consumed synchronously inside fn() "
                            "and never stored for later use");
                }
                size_t off = 0;
                while (off < n) {
                    // Advance to the next run that still has positions to yield.
                    while (cursor < match->size() &&
                           (*rd)[(*match)[cursor]]->positions_remaining() == 0) {
                        ++cursor;
                    }
                    if (cursor >= match->size()) break; // defensive: pump over-pulled
                    RunReader* r = (*rd)[(*match)[cursor]].get();
                    const size_t take =
                            std::min(n - off, static_cast<size_t>(r->positions_remaining()));
                    Status s = r->stream_positions(dst + off, take);
                    if (!s.ok()) {
                        // Mid-stream I/O / corruption: zero-fill the UNFILLED tail before
                        // returning. fn() has the pump and will consume dst BEFORE
                        // pump_status is surfaced after fn(); never hand it bytes of
                        // unknown content. Only the FIRST error is latched; it is
                        // surfaced after fn(), so the build aborts -- the zero fill only
                        // guarantees deterministic, defined bytes meanwhile.
                        std::memset(dst + off, 0, (n - off) * sizeof(uint32_t));
                        if (pump_status.ok()) pump_status = std::move(s);
                        return;
                    }
                    off += take;
                }
                // Short-fill on over-pull (cursor ran past the matching runs without an
                // error status): the readers held fewer positions than n. Zero-fill the
                // unfilled tail so the writer never reads uninitialized storage. With
                // valid runs n == pos_total == sum(positions_remaining), so off == n and
                // this memset spans zero bytes -- the produced .idx is unchanged.
                if (off < n) std::memset(dst + off, 0, (n - off) * sizeof(uint32_t));
            };
            fn(std::move(merged));
            *pump_alive = false;               // any later pos_pump call now throws instead of UAF
            SNII_RETURN_IF_ERROR(pump_status); // surface a streamed-read I/O error
        } else {
            fn(std::move(merged));
        }

        // Advance every matching reader to its next term and re-seed the heap. For the
        // wide path this also skips any positions the pump did not pull (none, when fn
        // drained the whole stream); for the non-wide path positions were already
        // materialized so nothing remains.
        for (size_t ri : matching) {
            RunReader* r = readers[ri].get();
            SNII_RETURN_IF_ERROR(r->advance()); // frees this run's slice, loads next term
            if (!r->exhausted()) {
                if (r->current_id() >= vocab.size()) {
                    return Status::Corruption("run term_id out of vocab range");
                }
                heap.push({r->current_id(), ri});
            }
        }
    }
    return Status::OK();
}

} // namespace snii::writer
diff --git a/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp b/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp new file mode 100644 index 00000000000000..7fc8cd58ec0bf6 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp @@ -0,0 +1,594 @@ +#include "snii/writer/spimi_term_buffer.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "snii/encoding/varint.h" +#include "snii/format/format_constants.h" +#include "snii/writer/spill_run_codec.h" +#include "snii/writer/temp_dir.h" + +#if defined(__GLIBC__) +#include +#endif + +namespace snii::writer { + +namespace { + +// Returns freed heap arenas to the OS (glibc only). The spill encode churns many +// small allocations whose freed chunks glibc retains in its arenas; trimming +// before the peak-RSS-defining merge phase recovers that retention. No-op (and +// harmless) on non-glibc libcs.
+void TrimMalloc() { +#if defined(__GLIBC__) + ::malloc_trim(0); +#endif +} + +// Process-unique temp path for a spill run under `dir` (pid + monotonic counter so +// parallel builds / multiple buffers never collide). +std::string MakeRunPath(const std::string& dir) { + static std::atomic counter {0}; + const uint64_t n = counter.fetch_add(1); + return dir + "/snii_spill_" + std::to_string(::getpid()) + "_" + std::to_string(n) + ".run"; +} + +} // namespace + +SpimiTermBuffer::SpimiTermBuffer(const std::vector* vocab, bool has_positions, + size_t spill_threshold_bytes, MemoryReporter* reporter) + : vocab_(vocab), + has_positions_(has_positions), + spill_threshold_bytes_(spill_threshold_bytes), + mem_reporter_(reporter) { + // Borrowed-vocab mode: only the 4 B/id slot-index array is sized to the + // vocabulary; the Term pool (slots_) grows with the LIVE touched count, so an + // all-but-empty vocabulary costs ~4 B/id instead of ~80 B/id. + slot_of_.assign(vocab_->size(), 0); + // The vocab-sized slot index is resident immediately and survives spills; report + // its initial positive delta now. + report_arena_delta(); +} + +SpimiTermBuffer::SpimiTermBuffer(bool has_positions, size_t spill_threshold_bytes, + MemoryReporter* reporter) + : vocab_(&owned_vocab_), + has_positions_(has_positions), + spill_threshold_bytes_(spill_threshold_bytes), + mem_reporter_(reporter) { + // Owned-vocab mode: the vocabulary grows as strings are interned; terms_ / + // present_ grow alongside it in add_token(string_view, ...). +} + +SpimiTermBuffer::~SpimiTermBuffer() { + // Balance the writer-level / Doris tracker on the error path: if the buffer is + // destroyed while resident bytes were reported but not yet freed-and-reported + // (e.g. a build aborts before draining), return them here so nothing leaks. 
+ if (mem_reporter_ != nullptr && reported_resident_ != 0) { + mem_reporter_->report(-reported_resident_); + reported_resident_ = 0; + } + cleanup_runs(); +} + +void SpimiTermBuffer::report_arena_delta() { + if (mem_reporter_ == nullptr) return; + // Diff the REAL resident bytes (arena + slot index) against the last reported + // total; emit the signed delta exactly once. + const int64_t now = static_cast(resident_bytes()); + mem_reporter_->report(now - reported_resident_); + reported_resident_ = now; +} + +size_t SpimiTermBuffer::unique_terms() const { + return live_term_count_; +} + +uint64_t SpimiTermBuffer::resident_bytes() const { + // REAL resident accumulator bytes: the posting arena plus the vocab-sized slot + // index (capacity, since the reserved-but-unused tail is still resident RSS and + // survives spills -- spill_to_run does NOT free slot_of_). This is the gate-2 + // spill trigger metric and the spill space-precheck figure -- NOT the old gated + // live_bytes_ estimate. + return pool_.arena_bytes() + static_cast(slot_of_.capacity()) * sizeof(uint32_t); +} + +// Returns the live Term for `term_id`, claiming a pool slot on first touch (1 == +// new). Reuses a freed slot from free_slots_ when available; otherwise appends a +// fresh Term to slots_. slot_of_[term_id] holds (slot index + 1); 0 means empty. +SpimiTermBuffer::Term& SpimiTermBuffer::term_slot(uint32_t term_id, bool* new_term) { + uint32_t enc = slot_of_[term_id]; + if (enc != 0) { + *new_term = false; + return slots_[enc - 1]; + } + *new_term = true; + uint32_t slot; + if (!free_slots_.empty()) { + slot = free_slots_.back(); + free_slots_.pop_back(); + } else { + slot = static_cast(slots_.size()); + slots_.emplace_back(); + } + slot_of_[term_id] = slot + 1; + return slots_[slot]; +} + +// Appends one byte to a term's chain, starting the chain lazily on first use. 
+void SpimiTermBuffer::put_byte(Term* t, uint8_t b) { + if (t->head == kNoChain) t->head = pool_.start_chain(&t->w, &t->level); + pool_.append_byte(&t->w, &t->level, b); +} + +void SpimiTermBuffer::put_varint(Term* t, uint64_t v) { + uint8_t tmp[10]; + const size_t n = encode_varint64(v, tmp); + for (size_t i = 0; i < n; ++i) put_byte(t, tmp[i]); +} + +void SpimiTermBuffer::accumulate(uint32_t term_id, uint32_t docid, uint32_t pos) { + bool new_term = false; + Term& t = term_slot(term_id, &new_term); + if (new_term) { + touched_ids_.push_back(term_id); + ++live_term_count_; + } + // A token starts a new doc unless it continues the most-recent doc for this term. + const bool new_doc = !t.started || t.cur_docid != docid; + // Tagged entry: varint((pos << 1) | new_doc). Positions are tagged 0 when + // disabled. The new_doc bit lets the decoder recover per-doc freqs by counting. + // Widen to 64-bit so a full 32-bit position survives the << 1 without truncation. + const uint64_t tagged = has_positions_ + ? ((static_cast(pos) << 1) | (new_doc ? 1u : 0u)) + : (new_doc ? 1u : 0u); + put_varint(&t, tagged); + if (new_doc) { + // Out-of-order docids are tolerated (zigzag delta is signed) and reordered at + // finalize; flag them so to_postings sorts. The delta base is the previous + // distinct doc (cur_docid), which is 0 for the very first doc (started==false). + const int64_t base = t.started ? static_cast(t.cur_docid) : 0; + if (t.started && docid < t.cur_docid) t.sorted = false; + const int64_t delta = static_cast(docid) - base; + put_varint(&t, zigzag_encode(delta)); + t.cur_docid = docid; + t.started = true; + } + ++t.ntok; + ++total_tokens_; + + // Gate-2 spill: trigger on REAL resident bytes (arena + slot index), NOT the old + // gated live_bytes_ estimate. arena_bytes() is monotonic per fill and reset to 0 + // by spill_to_run()'s pool_.reset(), so the trigger self-rearms after each spill. 
+ // The OTHER trigger is the hard arena safety stop (active even in unlimited mode): + // when the arena nears the 4 GiB uint32-offset limit -- without it, a single + // >4 GiB in-memory segment wraps alloc_run and silently corrupts data. A forced + // spill + final k-way merge stays byte-identical regardless of when it fires. + constexpr uint64_t kArenaSpillCap = 0xE0000000ull; // 3.5 GiB, < UINT32_MAX margin + // Report this token's REAL resident growth FIRST so the writer's unified total + // (reporter_->current_bytes()) reflects it before the gate-2 check. Single-source + // diff: cheap (subtraction + relaxed atomic add; arena_bytes() is two field reads). + report_arena_delta(); + // Gate-2 spill (UNIFIED): when a reporter is attached, trigger on the writer's TOTAL + // build RAM (arena + slot index + dict) crossing the one configured cap -- the same + // total and cap every buffer of this writer shares, not a per-buffer threshold. Off + // Doris (no reporter) fall back to the local spill_threshold_bytes_. The hard arena + // safety stop (4 GiB uint32-offset limit) is always active. spill_to_run() resets the + // arena and reports its negative internally, so the unified total drops after a spill. + const bool over_cap = mem_reporter_ != nullptr ? mem_reporter_->over_cap() + : (spill_threshold_bytes_ != 0 && + resident_bytes() >= spill_threshold_bytes_); + const bool arena_near_limit = pool_.arena_bytes() >= kArenaSpillCap; + if ((over_cap || arena_near_limit) && spill_status_.ok()) { + spill_status_ = spill_to_run(); + } +} + +void SpimiTermBuffer::add_token(uint32_t term_id, uint32_t docid, uint32_t pos) { + // Hot path: a pooled slot lookup + a couple of pushes. No hashing, no string + // construction per token. Reject (and latch) an out-of-range id. 
+ if (term_id >= slot_of_.size()) { + if (spill_status_.ok()) { + spill_status_ = Status::InvalidArgument("spimi: term_id out of vocab range"); + } + return; + } + accumulate(term_id, docid, pos); +} + +void SpimiTermBuffer::add_token(std::string_view term, uint32_t docid, uint32_t pos) { + // Compatibility path: intern the term into the owned vocabulary on first + // occurrence, then accumulate by its id. ONLY valid in OWNED-vocab mode. In + // BORROWED-vocab mode vocab_ points at the caller's vector, NOT &owned_vocab_: + // interning here would grow owned_vocab_ / intern_ / slot_of_ out of step with + // the active (borrowed) vocab, so the new id indexes the WRONG string and writes + // a slot_of_ entry the borrowed-vocab build never reconciles -- silent + // corruption. Reject (and latch) instead of forwarding by a bogus id. + if (vocab_ != &owned_vocab_) { + if (spill_status_.ok()) { + spill_status_ = Status::InvalidArgument( + "spimi: add_token(string_view) requires owned-vocab mode"); + } + return; + } + auto it = intern_.find(std::string(term)); + uint32_t term_id; + if (it == intern_.end()) { + term_id = static_cast(owned_vocab_.size()); + owned_vocab_.emplace_back(term); + intern_.emplace(owned_vocab_.back(), term_id); + slot_of_.push_back(0); // vocab grows: new id starts with no live slot + } else { + term_id = it->second; + } + accumulate(term_id, docid, pos); +} + +namespace { + +// Reorders a term's flat arrays into ascending-docid order, COALESCING any +// same-docid groups so the result has exactly one entry per docid -- matching the +// k-way-merge path's boundary-doc coalescing and the writer's strictly-ascending +// precondition. Only invoked for the rare term that received out-of-order docids +// (the common ascending path leaves t.sorted true and skips it). +// +// A docid may REVISIT (e.g. feed 5,1,5): the chain holds two separate doc-groups +// for doc 5. 
A STABLE sort keeps equal-docid groups in arrival order, then the +// coalesce pass sums their freqs and concatenates their positions in that same +// (document/arrival) order -- so the merged positions stay consistent with the +// merged freqs, exactly as the run-order merge would have produced. +void SortByDocid(std::vector* docids, std::vector* freqs, + std::vector* positions_flat, bool has_positions) { + const size_t n = docids->size(); + std::vector order(n); + std::iota(order.begin(), order.end(), 0); + // STABLE so equal docids keep arrival order: their positions then concatenate in + // document order, the same order the merge path's run concatenation yields. + std::stable_sort(order.begin(), order.end(), + [&](size_t a, size_t b) { return (*docids)[a] < (*docids)[b]; }); + + std::vector pos_off; + if (has_positions) { + pos_off.resize(n); + uint32_t running = 0; + for (size_t i = 0; i < n; ++i) { + pos_off[i] = running; + running += (*freqs)[i]; + } + } + std::vector nd, nf, np; + nd.reserve(n); + nf.reserve(n); + if (has_positions) np.reserve(positions_flat->size()); + for (size_t k : order) { + // Coalesce a revisited docid into the previous entry (it sorts adjacent now): + // sum freqs and append this group's positions right after the prior group's, + // so flat doc order stays partitioned by the merged freqs. + if (!nd.empty() && nd.back() == (*docids)[k]) { + nf.back() += (*freqs)[k]; + } else { + nd.push_back((*docids)[k]); + nf.push_back((*freqs)[k]); + } + if (has_positions) { + np.insert(np.end(), positions_flat->begin() + pos_off[k], + positions_flat->begin() + pos_off[k] + (*freqs)[k]); + } + } + *docids = std::move(nd); + *freqs = std::move(nf); + if (has_positions) *positions_flat = std::move(np); +} + +} // namespace + +namespace { + +// Decodes one varint from a pool chain cursor. The chain was written by +// encode_varint*, so the same LEB128 continuation-bit loop reconstructs it. 
+uint64_t DecodeChainVarint(CompactPostingPool::Cursor* c) { + uint64_t result = 0; + int shift = 0; + for (;;) { + const uint8_t b = c->next(); + result |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) break; + shift += 7; + } + return result; +} + +} // namespace + +// Decodes a term's compact tagged chain back into a flat TermPostings (the exact +// docids/freqs/positions_flat the writer consumes), so the produced index is +// byte-identical to the legacy raw-uint32 accumulator. The chain holds one entry +// per token: varint((pos << 1) | new_doc); each new_doc entry is followed by a +// zigzag(docid-delta). A doc's freq is the run length of consecutive same-doc +// tokens; positions stream out in document order (empty when positions disabled). +// Stream positions for a sorted term whose token count exceeds this: such a term's +// flat positions buffer (uint32 per token) would be the peak-RSS transient (tens of +// MiB for the widest term). Below it, the flat buffer is cheap and simpler. +static constexpr uint32_t kStreamPositionsTokenThreshold = 1u << 16; // 65536 + +TermPostings SpimiTermBuffer::to_postings(std::string term, Term&& t, + bool allow_stream_positions) const { + TermPostings tp; + tp.term = std::move(term); + if (t.ntok == 0 || t.head == kNoChain) return tp; + + // Reserve docids/freqs by ntok (an upper bound on the doc count: ntok >= ndocs). + // The doc count is not stored separately to keep Term compact; since the corpus + // is freq~1 per (term, doc), ntok ~= ndocs so the over-reserve is negligible. + tp.docids.reserve(t.ntok); + tp.freqs.reserve(t.ntok); + + // For a large SORTED term, stream positions on demand instead of materializing a + // multi-MiB flat buffer: the writer (prx builder) pulls them window by window via + // pos_pump, decoding straight from the still-resident arena chain. Out-of-order + // terms (rare, defensive) need a full sort, so they always use the flat path. 
+ const bool stream_pos = allow_stream_positions && has_positions_ && t.sorted && + t.ntok >= kStreamPositionsTokenThreshold; + if (has_positions_ && !stream_pos) tp.positions_flat.reserve(t.ntok); + + CompactPostingPool::Cursor c = pool_.cursor(t.head, t.w.cur); + int64_t prev = 0; + for (uint32_t i = 0; i < t.ntok; ++i) { + const uint64_t tagged = DecodeChainVarint(&c); + const bool new_doc = (tagged & 1u) != 0; + if (new_doc) { + prev += zigzag_decode(DecodeChainVarint(&c)); + tp.docids.push_back(static_cast(prev)); + tp.freqs.push_back(0); + } + ++tp.freqs.back(); // count this token toward the current doc's freq + if (has_positions_ && !stream_pos) { + tp.positions_flat.push_back(static_cast(tagged >> 1)); + } + } + + // Decide the FINAL position handling now that df (= docids.size()) is known. + // pos_pump is honored ONLY by the windowed writer path (build_windowed_entry), + // taken when df >= kSlimDfThreshold. A SLIM term (df below it) goes through + // build_slim_entry, which reads positions_flat directly -- so streaming would + // leave it empty and crash. A high-ntok but low-df term (many repeats in few + // docs) therefore falls back to materializing its df-bounded positions here. + const bool windowed_path = tp.docids.size() >= snii::format::kSlimDfThreshold; + if (stream_pos && windowed_path) { + // Hand the writer a sequential position source backed by a SECOND pass over the + // same chain (the chain stays resident in pool_ for the whole drain). The pump + // yields positions in document order -- identical to positions_flat -- so the + // produced .prx is byte-for-byte the same. The cursor is shared/advanced across + // calls (the writer pulls in order, exactly pos_total positions total). + tp.pos_total = t.ntok; + auto cur = std::make_shared(pool_.cursor(t.head, t.w.cur)); + tp.pos_pump = [cur](uint32_t* dst, size_t count) { + // Re-walk the tagged token stream, yielding one position per token. 
A new-doc + // token is followed by a zigzag docid-delta varint that must be consumed and + // discarded so the cursor stays aligned with the encoding. + for (size_t k = 0; k < count; ++k) { + const uint64_t tagged = DecodeChainVarint(cur.get()); + if ((tagged & 1u) != 0) (void)DecodeChainVarint(cur.get()); // skip docid delta + dst[k] = static_cast(tagged >> 1); + } + }; + } else if (stream_pos && has_positions_) { + // Slim fallback: the decode loop skipped positions (stream candidate) but the + // term is slim, so materialize positions_flat in a second pass for build_slim. + tp.positions_flat.reserve(t.ntok); + CompactPostingPool::Cursor pc = pool_.cursor(t.head, t.w.cur); + for (uint32_t i = 0; i < t.ntok; ++i) { + const uint64_t tagged = DecodeChainVarint(&pc); + if ((tagged & 1u) != 0) (void)DecodeChainVarint(&pc); // skip docid delta + tp.positions_flat.push_back(static_cast(tagged >> 1)); + } + } else if (!t.sorted) { + // Defensive reorder for the rare out-of-order-docid feed (merge of pre-sorted + // runs). The common ascending path leaves t.sorted true and skips it. + SortByDocid(&tp.docids, &tp.freqs, &tp.positions_flat, has_positions_); + } + return tp; +} + +void SpimiTermBuffer::ensure_string_rank() const { + const std::vector& v = vocab(); + if (string_rank_.size() == v.size()) return; // already built (or empty vocab) + // One full lexicographic sort of the vocabulary, amortized over every spill. 
+ std::vector order(v.size()); + std::iota(order.begin(), order.end(), 0u); + std::sort(order.begin(), order.end(), [&](uint32_t a, uint32_t b) { return v[a] < v[b]; }); + string_rank_.assign(v.size(), 0u); + for (uint32_t rank = 0; rank < order.size(); ++rank) { + string_rank_[order[rank]] = rank; + } +} + +std::vector SpimiTermBuffer::sorted_ids() const { + ensure_string_rank(); + std::vector ids = touched_ids_; + const std::vector& rank = string_rank_; + // Integer rank compare instead of full std::string compare: equal-string ids + // cannot occur for a dense vocab, so a strict rank order matches the original + // lexicographic order exactly. + std::sort(ids.begin(), ids.end(), [&](uint32_t a, uint32_t b) { return rank[a] < rank[b]; }); + return ids; +} + +void SpimiTermBuffer::release_term(uint32_t term_id) { + const uint32_t enc = slot_of_[term_id]; + if (enc == 0) return; // not live (defensive) + const uint32_t slot = enc - 1; + slots_[slot] = Term(); // free this term's arrays; the empty Term slot is reusable + free_slots_.push_back(slot); + slot_of_[term_id] = 0; + --live_term_count_; +} + +Status SpimiTermBuffer::drain_sorted(const std::function& fn, + bool allow_stream_positions) { + const std::vector& v = vocab(); + for (uint32_t id : sorted_ids()) { + Term term = std::move(slots_[slot_of_[id] - 1]); + release_term(id); // release this term's slot before building the next + // Allow streaming positions only when the caller consumes synchronously (the + // arena chain stays resident for the whole drain, so the pump can read from it). + TermPostings tp = to_postings(v[id], std::move(term), allow_stream_positions); + fn(std::move(tp)); + } + touched_ids_.clear(); + // Drop the arena + the slot pool (their bytes are fully decoded) and return the + // freed chunks to the OS so the process peak reflects only what survives the + // drain, not retained input-phase arena memory. 
+ pool_.reset(); + std::vector().swap(slots_); + std::vector().swap(free_slots_); + std::vector().swap(slot_of_); + TrimMalloc(); + // Arena reset + slot_of_ freed: now real resident ~0, so this emits the final + // negative that returns every reported byte (no leak after the in-memory drain). + report_arena_delta(); + return Status::OK(); +} + +Status SpimiTermBuffer::drain_to_writer(RunWriter* w) { + Status st = Status::OK(); + const std::vector& v = vocab(); + // Spill writes by term-id (no string IO). Iterate touched ids in vocab-string + // order so each run is sorted; the k-way merge re-orders runs by the same key. + for (uint32_t id : sorted_ids()) { + Term term = std::move(slots_[slot_of_[id] - 1]); + release_term(id); + // Spill path: the run codec serializes positions_flat directly, so positions + // must be materialized (no streaming pump). + TermPostings tp = to_postings(v[id], std::move(term), /*allow_stream=*/false); + if (st.ok()) st = w->write_term(id, tp); + } + touched_ids_.clear(); + pool_.reset(); // all chains decoded into the run; free the arena for the refill + // The spill returns the arena to 0; slot_of_ keeps its capacity (survives + // the spill). Report the arena-drop negative now so the gate-2 spill is balanced + // immediately, not deferred to the next token. + report_arena_delta(); + return st; +} + +Status SpimiTermBuffer::spill_to_run() { + const std::string dir = resolve_temp_dir(); + // Best-effort space pre-check: fail with a clear, early error rather than a + // mid-write IoError that leaves a half-written run. Best-effort only (TOCTOU; on + // tmpfs this reports RAM). resident_bytes() (arena + slot index) is the REAL + // resident figure about to drain -- a conservative over-estimate of the run size. 
+ const uint64_t resident = resident_bytes(); + const uint64_t avail = temp_dir_available_bytes(dir); + if (avail < resident) { + return Status::IoError("spimi: insufficient temp space in '" + dir + "' to spill ~" + + std::to_string(resident) + " B (~" + std::to_string(avail) + + " B free); set SNII_TEMP_DIR/TMPDIR to a larger disk"); + } + const std::string path = MakeRunPath(dir); + RunWriter w; + SNII_RETURN_IF_ERROR(w.open(path)); + run_paths_.push_back(path); // tracked for cleanup even if a later step fails + SNII_RETURN_IF_ERROR(drain_to_writer(&w)); + // drain emptied touched_ids_ and freed each term's arrays; terms_/present_ keep + // their (vocab-sized) capacity so the next fill reuses the dense slots with no + // re-allocation. present_ is already all-zero after release_term per id. + return w.close(); +} + +Status SpimiTermBuffer::merge_runs(const std::function& fn, + bool allow_stream_positions) { + // Flush whatever is still resident as one final sorted run so the k-way merge + // sees a uniform set of run files (and never holds two term sources at once). + if (!touched_ids_.empty()) { + Status s = spill_to_run(); + if (!s.ok() && spill_status_.ok()) spill_status_ = s; + } + if (!spill_status_.ok()) return spill_status_; // a spill or add_token error; emit nothing + // All terms are now spilled; the merge reads runs and never touches the + // accumulators. Free the pool + the vocab-sized slot index so the merge phase + // holds none of the input-side arrays resident -- keeps spill-mode peak RSS + // down. malloc_trim(0) returns the freed glibc arenas to the OS so the peak RSS + // measurement reflects the merge transient, not retained input-phase chunks. + std::vector().swap(slots_); + std::vector().swap(free_slots_); + std::vector().swap(slot_of_); + TrimMalloc(); + // pool_ was already reset by the final spill_to_run -> drain_to_writer (reported + // there); this swap frees slot_of_, so report the remaining negative now. 
After a + // full spilled drain reported_resident_ returns to 0 (no leak). + report_arena_delta(); + Status s = MergeRuns(run_paths_, vocab(), has_positions_, fn, allow_stream_positions); + // The merge churns one large coalesced TermPostings per term (the widest term's + // arrays are tens of MiB) plus per-run reader windows; on completion glibc + // retains those freed chunks in its arenas. Trim again so the post-merge resident + // set (and thus the process peak high-water if a later phase allocates) reflects + // only live state, not merge-transient retention. + TrimMalloc(); + return s; +} + +Status SpimiTermBuffer::for_each_term_sorted(const std::function& fn) { + // Single-drain contract: a second call would re-merge the (still-present) run + // files and re-emit every term, or emit nothing in the in-memory path. Return + // an error and emit NOTHING rather than produce a wrong second stream. + if (drained_) { + return Status::Internal("spimi: already drained (single-drain contract)"); + } + drained_ = true; + // The callback is invoked synchronously while the arena is resident, so large + // sorted terms may stream positions via pos_pump (peak-RSS win for the writer). + if (run_paths_.empty() && spill_status_.ok()) { + return drain_sorted(fn, /*allow_stream_positions=*/true); // pure in-memory path + } + // Spilled path (or add_token latched a validation error): the merge may STREAM + // a wide term's positions via pos_pump (fn consumes each term synchronously + // while the run readers stay parked). merge_runs returns the I/O status + // directly; add_token validation errors surface via spill_status_ inside it. + return merge_runs(fn, /*allow_stream_positions=*/true); +} + +std::vector SpimiTermBuffer::finalize_sorted() { + std::vector out; + // Single-drain contract (mirrors for_each_term_sorted): a second drain (including + // a finalize_sorted after a for_each_term_sorted, or vice versa) would re-emit or + // emit nothing. 
Latch an error and return EMPTY rather than a wrong result. + if (drained_) { + if (spill_status_.ok()) { + spill_status_ = Status::Internal("spimi: already drained (single-drain contract)"); + } + return out; + } + drained_ = true; + out.reserve(touched_ids_.size()); + // RETAINS each TermPostings past the drain, so positions must be MATERIALIZED + // (a streamed pos_pump would reference the arena, freed when the drain ends). + if (run_paths_.empty() && spill_status_.ok()) { + Status s = drain_sorted([&out](TermPostings&& tp) { out.push_back(std::move(tp)); }, + /*allow_stream_positions=*/false); + if (!s.ok() && spill_status_.ok()) spill_status_ = s; + } else { + // RETAINS each TermPostings past the merge, so positions MUST be materialized + // (a streamed pos_pump would reference run readers freed when the merge ends). + Status s = merge_runs([&out](TermPostings&& tp) { out.push_back(std::move(tp)); }, + /*allow_stream_positions=*/false); + if (!s.ok() && spill_status_.ok()) spill_status_ = s; + } + return out; +} + +void SpimiTermBuffer::cleanup_runs() { + for (const std::string& p : run_paths_) std::remove(p.c_str()); + run_paths_.clear(); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/snii_doris_adapter.cpp b/be/src/storage/index/snii/snii_doris_adapter.cpp new file mode 100644 index 00000000000000..5756bdc8678540 --- /dev/null +++ b/be/src/storage/index/snii/snii_doris_adapter.cpp @@ -0,0 +1,249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/snii/snii_doris_adapter.h" + +#include + +#include +#include +#include + +#include "common/cast_set.h" + +namespace doris::segment_v2::snii_doris { + +thread_local const io::IOContext* DorisSniiFileReader::_scoped_io_ctx = nullptr; + +Status to_doris_status(const ::snii::Status& status) { + if (status.ok()) { + return Status::OK(); + } + switch (status.code()) { + case ::snii::StatusCode::kNotFound: + return Status::Error("SNII: {}", + status.message()); + case ::snii::StatusCode::kUnsupported: + return Status::Error("SNII: {}", status.message()); + case ::snii::StatusCode::kInvalidArgument: + return Status::Error("SNII: {}", status.message()); + case ::snii::StatusCode::kCorruption: + return Status::Error("SNII: {}", + status.message()); + case ::snii::StatusCode::kIoError: + return Status::IOError("SNII: {}", status.message()); + case ::snii::StatusCode::kInternal: + return Status::InternalError("SNII: {}", status.message()); + case ::snii::StatusCode::kOk: + break; + } + return Status::InternalError("SNII: {}", status.message()); +} + +::snii::Status to_snii_status(const Status& status) { + if (status.ok()) { + return ::snii::Status::OK(); + } + return ::snii::Status::IoError(status.to_string_no_stack()); +} + +::snii::Status DorisSniiFileWriter::append(::snii::Slice data) { + if (_writer == nullptr) { + return ::snii::Status::InvalidArgument("doris writer is null"); + } + return to_snii_status( + _writer->append(Slice(reinterpret_cast(data.data()), data.size()))); +} + +::snii::Status DorisSniiFileWriter::finalize() { 
+ if (_writer == nullptr) { + return ::snii::Status::InvalidArgument("doris writer is null"); + } + return ::snii::Status::OK(); +} + +uint64_t DorisSniiFileWriter::bytes_written() const { + return _writer == nullptr ? 0 : _writer->bytes_appended(); +} + +DorisSniiFileReader::DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx) + : _reader(std::move(reader)), _default_io_ctx(_make_index_io_context(io_ctx)) {} + +io::IOContext DorisSniiFileReader::_make_index_io_context(const io::IOContext* io_ctx) { + io::IOContext index_io_ctx; + if (io_ctx != nullptr) { + index_io_ctx = *io_ctx; + } + index_io_ctx.is_inverted_index = true; + index_io_ctx.is_index_data = true; + return index_io_ctx; +} + +DorisSniiFileReader::ScopedIOContext::ScopedIOContext(const io::IOContext* io_ctx) + : _previous(_scoped_io_ctx), _io_ctx(DorisSniiFileReader::_make_index_io_context(io_ctx)) { + _scoped_io_ctx = &_io_ctx; +} + +DorisSniiFileReader::ScopedIOContext::~ScopedIOContext() { + _scoped_io_ctx = _previous; +} + +::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, + std::vector* const out) { + SNII_RETURN_IF_ERROR(_read_at(offset, len, out)); + if (len > 0) { + _record_read_stats(cast_set(len), cast_set(len), 1, 1); + } + return ::snii::Status::OK(); +} + +::snii::Status DorisSniiFileReader::_read_at(uint64_t offset, size_t len, + std::vector* const out) const { + if (_reader == nullptr) { + return ::snii::Status::InvalidArgument("doris reader is null"); + } + if (out == nullptr) { + return ::snii::Status::InvalidArgument("output buffer is null"); + } + SNII_RETURN_IF_ERROR(_check_read_range(offset, len)); + if (len == 0) { + out->clear(); + return ::snii::Status::OK(); + } + out->resize(len); + size_t bytes_read = 0; + auto status = _reader->read_at(offset, Slice(out->data(), len), &bytes_read, _current_io_ctx()); + if (!status.ok()) { + return to_snii_status(status); + } + if (bytes_read != len) { + return ::snii::Status::IoError( + 
fmt::format("short read at offset {}, expect {}, got {}", offset, len, bytes_read)); + } + return ::snii::Status::OK(); +} + +::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Range>& ranges, + std::vector>* const outs) { + if (outs == nullptr) { + return ::snii::Status::InvalidArgument("output buffers is null"); + } + outs->clear(); + outs->resize(ranges.size()); + if (ranges.empty()) { + return ::snii::Status::OK(); + } + + struct IndexedRange { + uint64_t offset = 0; + size_t len = 0; + size_t index = 0; + }; + int64_t request_bytes = 0; + std::vector sorted; + sorted.reserve(ranges.size()); + for (size_t i = 0; i < ranges.size(); ++i) { + SNII_RETURN_IF_ERROR(_check_read_range(ranges[i].offset, ranges[i].len)); + request_bytes += cast_set(ranges[i].len); + if (ranges[i].len == 0) { + continue; + } + sorted.push_back({ranges[i].offset, ranges[i].len, i}); + } + if (sorted.empty()) { + return ::snii::Status::OK(); + } + std::sort(sorted.begin(), sorted.end(), [](const IndexedRange& lhs, const IndexedRange& rhs) { + return lhs.offset < rhs.offset; + }); + + constexpr uint64_t max_coalesced_gap = 4096; + constexpr uint64_t max_coalesced_read = 1ULL << 20; + int64_t read_bytes = 0; + int64_t range_read_count = 0; + for (size_t begin = 0; begin < sorted.size();) { + uint64_t read_offset = sorted[begin].offset; + uint64_t read_end = sorted[begin].offset + sorted[begin].len; + size_t end = begin + 1; + while (end < sorted.size()) { + const uint64_t next_end = sorted[end].offset + sorted[end].len; + if ((sorted[end].offset > read_end && + sorted[end].offset - read_end > max_coalesced_gap) || + next_end - read_offset > max_coalesced_read) { + break; + } + read_end = std::max(read_end, next_end); + ++end; + } + + std::vector bytes; + const size_t read_len = cast_set(read_end - read_offset); + SNII_RETURN_IF_ERROR(_read_at(read_offset, read_len, &bytes)); + read_bytes += cast_set(read_len); + ++range_read_count; + for (size_t i = begin; i < end; 
++i) { + const uint64_t pos = sorted[i].offset - read_offset; + auto& out = (*outs)[sorted[i].index]; + out.assign(bytes.begin() + cast_set(pos), + bytes.begin() + cast_set(pos + sorted[i].len)); + } + begin = end; + } + _record_read_stats(request_bytes, read_bytes, range_read_count, range_read_count); + return ::snii::Status::OK(); +} + +uint64_t DorisSniiFileReader::size() const { + return _reader == nullptr ? 0 : _reader->size(); +} + +const io::IOContext* DorisSniiFileReader::_current_io_ctx() const { + return _scoped_io_ctx != nullptr ? _scoped_io_ctx : &_default_io_ctx; +} + +void DorisSniiFileReader::_record_read_stats(int64_t request_bytes, int64_t read_bytes, + int64_t range_read_count, + int64_t serial_read_rounds) const { + const auto* io_ctx = _current_io_ctx(); + if (io_ctx->file_cache_stats == nullptr) { + return; + } + auto* stats = io_ctx->file_cache_stats; + stats->inverted_index_request_bytes += request_bytes; + stats->inverted_index_read_bytes += read_bytes; + stats->inverted_index_range_read_count += range_read_count; + stats->inverted_index_serial_read_rounds += serial_read_rounds; +} + +::snii::Status DorisSniiFileReader::_check_read_range(uint64_t offset, size_t len) const { + if (_reader == nullptr) { + return ::snii::Status::InvalidArgument("doris reader is null"); + } + if (offset > std::numeric_limits::max() - len) { + return ::snii::Status::Corruption( + fmt::format("read range overflows: offset {}, len {}", offset, len)); + } + const uint64_t end = offset + len; + if (end > _reader->size()) { + return ::snii::Status::Corruption( + fmt::format("read range exceeds file size: offset {}, len {}, file size {}", offset, + len, _reader->size())); + } + return ::snii::Status::OK(); +} + +} // namespace doris::segment_v2::snii_doris diff --git a/be/src/storage/index/snii/snii_doris_adapter.h b/be/src/storage/index/snii/snii_doris_adapter.h new file mode 100644 index 00000000000000..7f099466704d5b --- /dev/null +++ 
b/be/src/storage/index/snii/snii_doris_adapter.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "common/status.h" +#include "io/fs/file_reader.h" +#include "io/fs/file_writer.h" +#include "io/io_common.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" +#include "util/slice.h" + +namespace doris::segment_v2::snii_doris { + +Status to_doris_status(const ::snii::Status& status); +::snii::Status to_snii_status(const Status& status); + +class DorisSniiFileWriter final : public ::snii::io::FileWriter { +public: + explicit DorisSniiFileWriter(io::FileWriter* writer) : _writer(writer) {} + + ::snii::Status append(::snii::Slice data) override; + ::snii::Status finalize() override; + uint64_t bytes_written() const override; + +private: + io::FileWriter* _writer = nullptr; +}; + +class DorisSniiFileReader final : public ::snii::io::FileReader { +public: + class ScopedIOContext { + public: + explicit ScopedIOContext(const io::IOContext* io_ctx); + ~ScopedIOContext(); + + ScopedIOContext(const ScopedIOContext&) = delete; + ScopedIOContext& operator=(const ScopedIOContext&) = delete; + + private: + const 
io::IOContext* _previous = nullptr; + io::IOContext _io_ctx; + }; + + explicit DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx = nullptr); + + ::snii::Status read_at(uint64_t offset, size_t len, std::vector* const out) override; + ::snii::Status read_batch(const std::vector<::snii::io::Range>& ranges, + std::vector>* const outs) override; + uint64_t size() const override; + +private: + static io::IOContext _make_index_io_context(const io::IOContext* io_ctx); + ::snii::Status _check_read_range(uint64_t offset, size_t len) const; + ::snii::Status _read_at(uint64_t offset, size_t len, std::vector* const out) const; + const io::IOContext* _current_io_ctx() const; + void _record_read_stats(int64_t request_bytes, int64_t read_bytes, int64_t range_read_count, + int64_t serial_read_rounds) const; + + io::FileReaderSPtr _reader; + io::IOContext _default_io_ctx; + static thread_local const io::IOContext* _scoped_io_ctx; +}; + +} // namespace doris::segment_v2::snii_doris diff --git a/be/src/storage/index/snii/snii_index_reader.cpp b/be/src/storage/index/snii/snii_index_reader.cpp new file mode 100644 index 00000000000000..2b7129074d92a7 --- /dev/null +++ b/be/src/storage/index/snii/snii_index_reader.cpp @@ -0,0 +1,398 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/snii/snii_index_reader.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "runtime/runtime_profile.h" +#include "runtime/runtime_state.h" +#include "snii/format/null_bitmap.h" +#include "snii/query/boolean_query.h" +#include "snii/query/docid_sink.h" +#include "snii/query/phrase_query.h" +#include "snii/query/prefix_query.h" +#include "snii/query/regexp_query.h" +#include "snii/query/term_query.h" +#include "snii/query/wildcard_query.h" +#include "snii/reader/logical_index_reader.h" +#include "storage/index/index_file_reader.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/inverted_index_cache.h" +#include "storage/index/inverted/inverted_index_iterator.h" +#include "storage/index/snii/snii_doris_adapter.h" + +namespace doris::segment_v2 { + +namespace { + +class RoaringDocIdSink final : public snii::query::DocIdSink { +public: + explicit RoaringDocIdSink(roaring::Roaring* bitmap) : _bitmap(bitmap) { + DCHECK(_bitmap != nullptr); + } + + snii::Status append_sorted(std::span docids) override { + if (!docids.empty()) { + _bitmap->addMany(docids.size(), docids.data()); + } + return snii::Status::OK(); + } + + snii::Status append_range(uint32_t first, uint64_t last_exclusive) override { + if (last_exclusive > first) { + _bitmap->addRange(first, last_exclusive); + } + return snii::Status::OK(); + } + +private: + roaring::Roaring* _bitmap; +}; + +struct SniiQueryExecutionResult { + std::shared_ptr bitmap; +}; + +std::vector to_terms(const InvertedIndexQueryInfo& query_info) { + std::vector terms; + terms.reserve(query_info.term_infos.size()); + for (const auto& term_info : query_info.term_infos) { + DCHECK(term_info.is_single_term()); + terms.push_back(term_info.get_single_term()); + } + return terms; 
+} + +void parse_phrase_slop(std::string* query, InvertedIndexQueryInfo* query_info) { + DCHECK(query != nullptr); + DCHECK(query_info != nullptr); + const auto is_digits = [](std::string_view str) { + return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); }); + }; + + const size_t last_space_pos = query->find_last_of(' '); + if (last_space_pos == std::string::npos) { + return; + } + const size_t tilde_pos = last_space_pos + 1; + if (tilde_pos >= query->size() - 1 || (*query)[tilde_pos] != '~') { + return; + } + + const size_t slop_pos = tilde_pos + 1; + std::string_view slop_str(query->data() + slop_pos, query->size() - slop_pos); + if (slop_str.empty()) { + return; + } + + bool ordered = false; + if (slop_str.size() == 1) { + if (!std::isdigit(static_cast(slop_str[0]))) { + return; + } + } else if (slop_str.back() == '+') { + ordered = true; + slop_str.remove_suffix(1); + } + + if (!is_digits(slop_str)) { + return; + } + auto result = std::from_chars(slop_str.begin(), slop_str.end(), query_info->slop); + if (result.ec != std::errc()) { + return; + } + query_info->ordered = ordered; + *query = query->substr(0, last_space_pos); +} + +std::string build_snii_query_cache_value(const InvertedIndexQueryInfo& query_info) { + std::string cache_value; + for (const auto& term_info : query_info.term_infos) { + DCHECK(term_info.is_single_term()); + const auto& term = term_info.get_single_term(); + cache_value.append(std::to_string(term.size())); + cache_value.push_back(':'); + cache_value.append(term); + cache_value.push_back('@'); + cache_value.append(std::to_string(term_info.position)); + cache_value.push_back(';'); + } + return cache_value; +} + +std::shared_ptr docids_to_bitmap(const std::vector& docids) { + auto result = std::make_shared(); + if (!docids.empty()) { + result->addMany(docids.size(), docids.data()); + } + result->runOptimize(); + return result; +} + +Status execute_snii_query(const snii::reader::LogicalIndexReader& 
logical_reader, + InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, std::string_view search_str, + const std::vector& terms, int32_t max_expansions, + SniiQueryExecutionResult* result) { + result->bitmap = std::make_shared(); + RoaringDocIdSink sink(result->bitmap.get()); + std::vector docids; + bool emitted_to_sink = false; + snii::Status status; + switch (query_type) { + case InvertedIndexQueryType::EQUAL_QUERY: + case InvertedIndexQueryType::MATCH_ANY_QUERY: + status = terms.size() == 1 ? snii::query::term_query(logical_reader, terms.front(), &sink) + : snii::query::boolean_or(logical_reader, terms, &sink); + emitted_to_sink = true; + break; + case InvertedIndexQueryType::MATCH_ALL_QUERY: + if (terms.size() == 1) { + status = snii::query::term_query(logical_reader, terms.front(), &sink); + emitted_to_sink = true; + } else { + status = snii::query::boolean_and(logical_reader, terms, &docids); + } + break; + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: + if (query_info.slop != 0) { + return Status::Error( + "SNII does not support sloppy phrase query yet"); + } + if (terms.size() == 1) { + status = snii::query::term_query(logical_reader, terms.front(), &sink); + emitted_to_sink = true; + } else { + status = snii::query::phrase_query(logical_reader, terms, &docids); + } + break; + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: + if (terms.size() == 1) { + status = + snii::query::prefix_query(logical_reader, terms.front(), &sink, max_expansions); + emitted_to_sink = true; + } else { + status = snii::query::phrase_prefix_query(logical_reader, terms, &docids, + max_expansions); + } + break; + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: + status = snii::query::regexp_query(logical_reader, search_str, &sink, max_expansions); + emitted_to_sink = true; + break; + case InvertedIndexQueryType::WILDCARD_QUERY: + status = snii::query::wildcard_query(logical_reader, search_str, &sink, max_expansions); + emitted_to_sink = true; + 
break; + case InvertedIndexQueryType::LESS_THAN_QUERY: + case InvertedIndexQueryType::LESS_EQUAL_QUERY: + case InvertedIndexQueryType::GREATER_THAN_QUERY: + case InvertedIndexQueryType::GREATER_EQUAL_QUERY: + case InvertedIndexQueryType::RANGE_QUERY: + return Status::Error( + "SNII inverted index storage format does not support BKD/range query"); + default: + return Status::Error( + "SNII unsupported inverted index query type {}", query_type_to_string(query_type)); + } + RETURN_IF_ERROR(snii_doris::to_doris_status(status)); + if (emitted_to_sink) { + result->bitmap->runOptimize(); + } else { + result->bitmap = docids_to_bitmap(docids); + } + return Status::OK(); +} + +} // namespace + +Status SniiIndexReader::new_iterator(std::unique_ptr* iterator) { + if (*iterator == nullptr) { + *iterator = InvertedIndexIterator::create_unique(); + } + dynamic_cast(iterator->get()) + ->add_reader(_reader_type, + dynamic_pointer_cast(shared_from_this())); + return Status::OK(); +} + +Status SniiIndexReader::_parse_query_terms(const IndexQueryContextPtr& context, + std::string search_str, + InvertedIndexQueryType query_type, + const InvertedIndexAnalyzerCtx* analyzer_ctx, + InvertedIndexQueryInfo* query_info) { + DCHECK(query_info != nullptr); + if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY || + query_type == InvertedIndexQueryType::WILDCARD_QUERY) { + query_info->term_infos.emplace_back(search_str, 0); + return Status::OK(); + } + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { + parse_phrase_slop(&search_str, query_info); + SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer); + try { + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, _index_meta.properties()); + } catch (const CLuceneError& e) { + return Status::Error( + "SNII analyze query failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( 
+ "SNII analyze query failed: {}", e.what()); + } + return Status::OK(); + } + + SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer); + try { + if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) { + query_info->term_infos.emplace_back(search_str); + } else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) { + auto reader = inverted_index::InvertedIndexAnalyzer::create_reader( + analyzer_ctx->char_filter_map); + reader->init(search_str.data(), static_cast(search_str.size()), true); + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader, analyzer_ctx->analyzer.get()); + } else { + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, _index_meta.properties()); + } + } catch (const CLuceneError& e) { + return Status::Error( + "SNII analyze query failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( + "SNII analyze query failed: {}", e.what()); + } + return Status::OK(); +} + +Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::string& column_name, + const Field& query_value, InvertedIndexQueryType query_type, + std::shared_ptr& bit_map, + const InvertedIndexAnalyzerCtx* analyzer_ctx) { + SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer); + std::string search_str = query_value.get(); + + if (int ignore_above = + std::stoi(get_parser_ignore_above_value_from_properties(_index_meta.properties())); + _reader_type == InvertedIndexReaderType::STRING_TYPE && search_str.size() > ignore_above) { + return Status::Error( + "query value is too long, evaluate skipped."); + } + + InvertedIndexQueryInfo query_info; + RETURN_IF_ERROR(_parse_query_terms(context, search_str, query_type, analyzer_ctx, &query_info)); + if (query_info.term_infos.empty()) { + auto msg = fmt::format("token parser result is empty for SNII query '{}'", search_str); + if (is_match_query(query_type)) { + LOG(WARNING) << 
msg; + bit_map = std::make_shared(); + return Status::OK(); + } + return Status::Error(msg); + } + + auto terms = to_terms(query_info); + const int32_t max_expansions = + context->runtime_state == nullptr + ? 50 + : context->runtime_state->query_options().inverted_index_max_expansions; + std::string cache_value = build_snii_query_cache_value(query_info); + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + cache_value += " " + std::to_string(query_info.slop); + cache_value += " " + std::to_string(query_info.ordered); + } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY || + query_type == InvertedIndexQueryType::WILDCARD_QUERY) { + cache_value += " " + std::to_string(max_expansions); + } + auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta); + InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type, + std::move(cache_value)}; + auto* cache = InvertedIndexQueryCache::instance(); + InvertedIndexQueryCacheHandle cache_handler; + if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) { + return Status::OK(); + } + + snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(context->io_ctx); + RETURN_IF_ERROR( + _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx)); + auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta)); + + SniiQueryExecutionResult query_result; + RETURN_IF_ERROR(execute_snii_query(*logical_reader, query_type, query_info, search_str, terms, + max_expansions, &query_result)); + bit_map = std::move(query_result.bitmap); + cache->insert(cache_key, bit_map, &cache_handler); + return Status::OK(); +} + +Status SniiIndexReader::try_query(const IndexQueryContextPtr& /*context*/, + const std::string& /*column_name*/, const Field& /*query_value*/, + InvertedIndexQueryType /*query_type*/, size_t* /*count*/) { + return 
Status::Error("SNII does not support try_query"); +} + +Status SniiIndexReader::read_null_bitmap(const IndexQueryContextPtr& context, + InvertedIndexQueryCacheHandle* cache_handle, + lucene::store::Directory* /*dir*/) { + SCOPED_RAW_TIMER(&context->stats->inverted_index_query_null_bitmap_timer); + auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta); + InvertedIndexQueryCache::CacheKey cache_key { + index_file_key, "", InvertedIndexQueryType::UNKNOWN_QUERY, "null_bitmap"}; + auto* cache = InvertedIndexQueryCache::instance(); + if (cache->lookup(cache_key, cache_handle)) { + return Status::OK(); + } + + snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(context->io_ctx); + RETURN_IF_ERROR( + _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx)); + auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta)); + auto null_bitmap = std::make_shared(); + const auto& ref = logical_reader->section_refs().null_bitmap; + if (ref.length > 0) { + std::vector bytes; + RETURN_IF_ERROR(snii_doris::to_doris_status( + logical_reader->reader()->read_at(ref.offset, ref.length, &bytes))); + snii::format::NullBitmapReader reader; + RETURN_IF_ERROR(snii_doris::to_doris_status( + snii::format::NullBitmapReader::open(snii::Slice(bytes), &reader))); + reader.copy_to(null_bitmap.get()); + null_bitmap->runOptimize(); + } + cache->insert(cache_key, null_bitmap, cache_handle); + return Status::OK(); +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/snii/snii_index_reader.h b/be/src/storage/index/snii/snii_index_reader.h new file mode 100644 index 00000000000000..5b504802a28f9f --- /dev/null +++ b/be/src/storage/index/snii/snii_index_reader.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "storage/index/inverted/inverted_index_query_type.h" +#include "storage/index/inverted/inverted_index_reader.h" + +namespace doris::segment_v2 { + +class SniiIndexReader final : public InvertedIndexReader { + ENABLE_FACTORY_CREATOR(SniiIndexReader); + +public: + SniiIndexReader(const TabletIndex* index_meta, + const std::shared_ptr& index_file_reader, + InvertedIndexReaderType reader_type) + : InvertedIndexReader(index_meta, index_file_reader), _reader_type(reader_type) {} + + Status new_iterator(std::unique_ptr* iterator) override; + Status query(const IndexQueryContextPtr& context, const std::string& column_name, + const Field& query_value, InvertedIndexQueryType query_type, + std::shared_ptr& bit_map, + const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) override; + Status try_query(const IndexQueryContextPtr& context, const std::string& column_name, + const Field& query_value, InvertedIndexQueryType query_type, + size_t* count) override; + Status read_null_bitmap(const IndexQueryContextPtr& context, + InvertedIndexQueryCacheHandle* cache_handle, + lucene::store::Directory* dir = nullptr) override; + InvertedIndexReaderType type() override { return _reader_type; } + +private: + Status _parse_query_terms(const IndexQueryContextPtr& context, std::string search_str, + InvertedIndexQueryType query_type, + const 
InvertedIndexAnalyzerCtx* analyzer_ctx, + InvertedIndexQueryInfo* query_info); + + InvertedIndexReaderType _reader_type; +}; + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/snii/snii_index_writer.cpp b/be/src/storage/index/snii/snii_index_writer.cpp new file mode 100644 index 00000000000000..37f2d41963fb9a --- /dev/null +++ b/be/src/storage/index/snii/snii_index_writer.cpp @@ -0,0 +1,204 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/snii/snii_index_writer.h" + +#include + +#include + +#include "common/cast_set.h" +#include "common/config.h" +#include "storage/index/index_file_writer.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/query/query_info.h" +#include "storage/tablet/tablet_schema.h" + +namespace doris::segment_v2 { + +SniiIndexColumnWriter::SniiIndexColumnWriter(IndexFileWriter* index_file_writer, + const TabletIndex* index_meta, bool /*single_field*/) + : _index_file_writer(index_file_writer), _index_meta(index_meta) {} + +Status SniiIndexColumnWriter::init() { + _should_analyzer = + inverted_index::InvertedIndexAnalyzer::should_analyzer(_index_meta->properties()); + _has_positions = get_parser_phrase_support_string_from_properties(_index_meta->properties()) == + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES; + _config = _has_positions ? snii::format::IndexConfig::kDocsPositions + : snii::format::IndexConfig::kDocsOnly; + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + _ignore_above = cast_set(std::stoul(ignore_above_value)); + const auto spill_threshold = + static_cast(config::inverted_index_ram_buffer_size * 1024 * 1024); + _memory_reporter = std::make_unique(nullptr, spill_threshold); + _term_buffer = std::make_unique(_has_positions, spill_threshold, + _memory_reporter.get()); + _analyzer_config.analyzer_name = get_analyzer_name_from_properties(_index_meta->properties()); + _analyzer_config.parser_type = get_inverted_index_parser_type_from_string( + get_parser_string_from_properties(_index_meta->properties())); + _analyzer_config.parser_mode = + get_parser_mode_string_from_properties(_index_meta->properties()); + _analyzer_config.char_filter_map = + get_parser_char_filter_map_from_properties(_index_meta->properties()); + _analyzer_config.lower_case = + get_parser_lowercase_from_properties(_index_meta->properties()); + _analyzer_config.stop_words = 
get_parser_stopwords_from_properties(_index_meta->properties()); + try { + _char_string_reader = inverted_index::InvertedIndexAnalyzer::create_reader( + _analyzer_config.char_filter_map); + if (_should_analyzer) { + _analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer(&_analyzer_config); + } + } catch (const CLuceneError& e) { + return Status::Error( + "SNII create analyzer failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( + "SNII create analyzer failed: {}", e.what()); + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::_analyze(const Slice& value, std::vector* terms) { + terms->clear(); + if (!_should_analyzer) { + TermInfo term; + term.term = std::string(value.data, value.size); + term.position = 0; + terms->emplace_back(std::move(term)); + return Status::OK(); + } + try { + _char_string_reader->init(value.data, cast_set(value.size), false); + *terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result(_char_string_reader, + _analyzer.get()); + } catch (const CLuceneError& e) { + return Status::Error( + "SNII analyze value failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( + "SNII analyze value failed: {}", e.what()); + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::_add_value_tokens(const Slice& value, uint32_t docid, + uint32_t position_base, uint32_t* max_position) { + DCHECK(max_position != nullptr); + *max_position = position_base; + if ((!_should_analyzer && value.size > _ignore_above) || (_should_analyzer && value.empty())) { + return Status::OK(); + } + + std::vector terms; + RETURN_IF_ERROR(_analyze(value, &terms)); + for (const auto& term_info : terms) { + DCHECK(term_info.is_single_term()); + const auto& term = term_info.get_single_term(); + const uint32_t position = + _has_positions ? 
position_base + cast_set(term_info.position) : 0; + _term_buffer->add_token(term, docid, position); + *max_position = std::max(*max_position, position); + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_values(const std::string /*name*/, const void* values, + size_t count) { + const auto* v = reinterpret_cast(values); + for (size_t i = 0; i < count; ++i) { + uint32_t max_position = 0; + RETURN_IF_ERROR(_add_value_tokens(*v, _rid, 0, &max_position)); + ++v; + ++_rid; + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, + const uint8_t* nested_null_map, + const uint8_t* offsets_ptr, size_t count) { + if (count == 0) { + return Status::OK(); + } + const auto* offsets = reinterpret_cast(offsets_ptr); + size_t start_off = 0; + for (size_t i = 0; i < count; ++i) { + auto array_elem_size = offsets[i + 1] - offsets[i]; + uint32_t position_base = 0; + for (auto j = start_off; j < start_off + array_elem_size; ++j) { + if (nested_null_map != nullptr && nested_null_map[j] == 1) { + continue; + } + const auto* value = reinterpret_cast( + reinterpret_cast(value_ptr) + j * field_size); + uint32_t max_position = position_base; + RETURN_IF_ERROR(_add_value_tokens(*value, _rid, position_base, &max_position)); + position_base = max_position + 1; + } + start_off += array_elem_size; + ++_rid; + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_nulls(uint32_t count) { + _null_docids.reserve(_null_docids.size() + count); + for (uint32_t i = 0; i < count; ++i) { + _null_docids.push_back(_rid + i); + } + _rid += count; + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_array_nulls(const uint8_t* null_map, size_t num_rows) { + DCHECK(_rid >= num_rows); + if (num_rows == 0 || null_map == nullptr) { + return Status::OK(); + } + const auto first_row = _rid - num_rows; + for (size_t i = 0; i < num_rows; ++i) { + if (null_map[i] == 1) { + _null_docids.push_back(cast_set(first_row + 
i)); + } + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::finish() { + DCHECK(_term_buffer != nullptr); + auto status = _term_buffer->status(); + if (!status.ok()) { + return Status::InternalError("SNII term buffer error: {}", status.to_string()); + } + RETURN_IF_ERROR(_index_file_writer->add_snii_index(_index_meta, cast_set(_rid), + std::move(_null_docids), _term_buffer.get(), + _config, _memory_reporter.get())); + _index_file_writer->retain_snii_memory_reporter(std::move(_memory_reporter)); + _term_buffer.reset(); + return Status::OK(); +} + +void SniiIndexColumnWriter::close_on_error() { + _term_buffer.reset(); + _memory_reporter.reset(); + _null_docids.clear(); +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/snii/snii_index_writer.h b/be/src/storage/index/snii/snii_index_writer.h new file mode 100644 index 00000000000000..f9c6686bbed4cf --- /dev/null +++ b/be/src/storage/index/snii/snii_index_writer.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "snii/format/format_constants.h" +#include "snii/writer/memory_reporter.h" +#include "snii/writer/spimi_term_buffer.h" +#include "storage/index/index_writer.h" +#include "storage/index/inverted/inverted_index_parser.h" +#include "storage/index/inverted/query/query_info.h" +#include "storage/index/inverted/util/reader.h" +#include "util/slice.h" + +namespace lucene::analysis { +class Analyzer; +} + +namespace doris::segment_v2 { + +class SniiIndexColumnWriter final : public IndexColumnWriter { +public: + SniiIndexColumnWriter(IndexFileWriter* index_file_writer, const TabletIndex* index_meta, + bool single_field); + ~SniiIndexColumnWriter() override = default; + + Status init() override; + Status add_values(const std::string name, const void* values, size_t count) override; + Status add_array_values(size_t field_size, const void* value_ptr, + const uint8_t* nested_null_map, const uint8_t* offsets_ptr, + size_t count) override; + Status add_nulls(uint32_t count) override; + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override; + Status finish() override; + int64_t size() const override { return 0; } + void close_on_error() override; + +private: + Status _add_value_tokens(const Slice& value, uint32_t docid, uint32_t position_base, + uint32_t* max_position); + Status _analyze(const Slice& value, std::vector* terms); + + IndexFileWriter* _index_file_writer = nullptr; + const TabletIndex* _index_meta = nullptr; + bool _should_analyzer = false; + bool _has_positions = false; + uint32_t _ignore_above = 0; + uint32_t _rid = 0; + snii::format::IndexConfig _config = snii::format::IndexConfig::kDocsOnly; + InvertedIndexAnalyzerConfig _analyzer_config; + inverted_index::ReaderPtr _char_string_reader; + std::shared_ptr _analyzer; + std::unique_ptr _memory_reporter; + std::unique_ptr _term_buffer; + std::vector _null_docids; +}; + +} // namespace doris::segment_v2 diff --git 
a/be/src/storage/rowset/beta_rowset.cpp b/be/src/storage/rowset/beta_rowset.cpp index 70950dfe065634..4f6e038661958e 100644 --- a/be/src/storage/rowset/beta_rowset.cpp +++ b/be/src/storage/rowset/beta_rowset.cpp @@ -827,6 +827,9 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, case InvertedIndexStorageFormatPB::V3: format_str = "V3"; break; + case InvertedIndexStorageFormatPB::SNII: + format_str = "SNII"; + break; default: return Status::InternalError("inverted index storage format error"); break; @@ -836,6 +839,19 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str.c_str(), allocator), allocator); rapidjson::Value segments(rapidjson::kArrayType); + auto add_file_info_to_json = [&](const std::string& path, + rapidjson::Value& json_value) -> Status { + json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator), allocator); + int64_t idx_file_size = 0; + auto st = fs->file_size(path, &idx_file_size); + if (st != Status::OK()) { + LOG(WARNING) << "show nested index file get file size error, file: " << path + << ", error: " << st.msg(); + return st; + } + json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(), allocator); + return Status::OK(); + }; for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { rapidjson::Value segment(rapidjson::kObjectType); segment.AddMember("segment_id", rapidjson::Value(seg_id).Move(), allocator); @@ -846,24 +862,20 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, fs, std::string(index_file_path_prefix), storage_format, InvertedIndexFileInfo(), _rowset_meta->tablet_id()); RETURN_IF_ERROR(index_file_reader->init()); + if (storage_format == InvertedIndexStorageFormatPB::SNII) { + rapidjson::Value index_file(rapidjson::kObjectType); + auto index_file_path = + InvertedIndexDescriptor::get_index_file_path_v2(index_file_path_prefix); + 
RETURN_IF_ERROR(add_file_info_to_json(index_file_path, index_file)); + segment.AddMember("index_files", rapidjson::Value(rapidjson::kArrayType).Move(), + allocator); + auto& index_files = segment["index_files"]; + index_files.PushBack(index_file, allocator); + segments.PushBack(segment, allocator); + continue; + } auto dirs = index_file_reader->get_all_directories(); - auto add_file_info_to_json = [&](const std::string& path, - rapidjson::Value& json_value) -> Status { - json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator), - allocator); - int64_t idx_file_size = 0; - auto st = fs->file_size(path, &idx_file_size); - if (st != Status::OK()) { - LOG(WARNING) << "show nested index file get file size error, file: " << path - << ", error: " << st.msg(); - return st; - } - json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(), - allocator); - return Status::OK(); - }; - auto process_files = [&allocator, &index_file_reader](auto& index_meta, rapidjson::Value& indices, rapidjson::Value& index) -> Status { diff --git a/be/src/storage/segment/column_reader.cpp b/be/src/storage/segment/column_reader.cpp index ebb1887c8ee920..262c1dd048be16 100644 --- a/be/src/storage/segment/column_reader.cpp +++ b/be/src/storage/segment/column_reader.cpp @@ -55,6 +55,7 @@ #include "storage/index/index_reader.h" #include "storage/index/inverted/analyzer/analyzer.h" #include "storage/index/inverted/inverted_index_reader.h" +#include "storage/index/snii/snii_index_reader.h" #include "storage/index/zone_map/zone_map_index.h" #include "storage/iterators.h" #include "storage/olap_common.h" @@ -647,6 +648,17 @@ Status ColumnReader::_load_index(const std::shared_ptr& index_f } IndexReaderPtr index_reader; + if (index_file_reader->get_storage_format() == InvertedIndexStorageFormatPB::SNII) { + if (!is_string_type(type)) { + return Status::Error( + "SNII inverted index storage format does not support BKD index type {}", type); + } + auto reader_type = 
should_analyzer ? InvertedIndexReaderType::FULLTEXT + : InvertedIndexReaderType::STRING_TYPE; + index_reader = SniiIndexReader::create_shared(index_meta, index_file_reader, reader_type); + _index_readers[index_meta->index_id()] = index_reader; + return Status::OK(); + } if (is_string_type(type)) { if (should_analyzer) { diff --git a/be/src/storage/tablet/tablet_meta.cpp b/be/src/storage/tablet/tablet_meta.cpp index b289cda58e7d3b..1e0660339fb4ec 100644 --- a/be/src/storage/tablet/tablet_meta.cpp +++ b/be/src/storage/tablet/tablet_meta.cpp @@ -101,6 +101,9 @@ TabletMetaSharedPtr TabletMeta::create( case TInvertedIndexStorageFormat::V2: inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V2; break; + case TInvertedIndexStorageFormat::SNII: + inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::SNII; + break; default: break; } @@ -495,6 +498,9 @@ void TabletMeta::init_schema_from_thrift(const TTabletSchema& tablet_schema, case TInvertedIndexFileStorageFormat::V3: tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3); break; + case TInvertedIndexFileStorageFormat::SNII: + tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::SNII); + break; default: tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3); break; diff --git a/be/src/storage/task/index_builder.cpp b/be/src/storage/task/index_builder.cpp index ef49626e143ab5..0e0ffeeb1d1036 100644 --- a/be/src/storage/task/index_builder.cpp +++ b/be/src/storage/task/index_builder.cpp @@ -338,6 +338,13 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta if (_is_drop_op) { const auto& output_rs_tablet_schema = output_rowset_meta->tablet_schema(); + if (output_rs_tablet_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::SNII) { + LOG(INFO) << "skip physical SNII inverted index rewrite for drop index. 
tablet_id=" + << _tablet->tablet_id() + << " rowset_id=" << output_rowset_meta->rowset_id().to_string(); + return Status::OK(); + } if (output_rs_tablet_schema->get_inverted_index_storage_format() != InvertedIndexStorageFormatPB::V1) { const auto& fs = output_rowset_meta->fs(); @@ -421,6 +428,11 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta _olap_data_convertor->reserve(_alter_inverted_indexes.size()); std::unique_ptr index_file_writer = nullptr; + if (output_rowset_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::SNII) { + return Status::Error( + "BUILD INDEX is not supported for SNII inverted index storage format yet"); + } if (output_rowset_schema->get_inverted_index_storage_format() >= InvertedIndexStorageFormatPB::V2) { auto idx_file_reader_iter = _index_file_readers.find( diff --git a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp index e74ad758ac1db3..4e7bb6bb1d05a4 100644 --- a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp +++ b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp @@ -52,6 +52,10 @@ io::FileCacheStatistics make_file_cache_stats(int64_t multiplier) { stats.inverted_index_remote_io_timer = multiplier * 26; stats.inverted_index_peer_io_timer = multiplier * 27; stats.inverted_index_io_timer = multiplier * 28; + stats.inverted_index_request_bytes = multiplier * 29; + stats.inverted_index_read_bytes = multiplier * 30; + stats.inverted_index_range_read_count = multiplier * 31; + stats.inverted_index_serial_read_rounds = multiplier * 32; return stats; } @@ -89,6 +93,10 @@ void expect_file_cache_stats_eq(const io::FileCacheStatistics& actual, EXPECT_EQ(actual.inverted_index_remote_io_timer, expected.inverted_index_remote_io_timer); EXPECT_EQ(actual.inverted_index_peer_io_timer, expected.inverted_index_peer_io_timer); EXPECT_EQ(actual.inverted_index_io_timer, 
expected.inverted_index_io_timer); + EXPECT_EQ(actual.inverted_index_request_bytes, expected.inverted_index_request_bytes); + EXPECT_EQ(actual.inverted_index_read_bytes, expected.inverted_index_read_bytes); + EXPECT_EQ(actual.inverted_index_range_read_count, expected.inverted_index_range_read_count); + EXPECT_EQ(actual.inverted_index_serial_read_rounds, expected.inverted_index_serial_read_rounds); } } // namespace @@ -134,6 +142,14 @@ TEST(FileCacheProfileReporterTest, ReporterAggregatesDeltaReportsToExactFinalTot EXPECT_EQ(profile->get_counter("CacheGetOrSetTimer")->value(), after_second_report.cache_get_or_set_timer); EXPECT_EQ(profile->get_counter("LockWaitTimer")->value(), after_second_report.lock_wait_timer); + EXPECT_EQ(profile->get_counter("InvertedIndexRequestBytes")->value(), + after_second_report.inverted_index_request_bytes); + EXPECT_EQ(profile->get_counter("InvertedIndexReadBytes")->value(), + after_second_report.inverted_index_read_bytes); + EXPECT_EQ(profile->get_counter("InvertedIndexRangeReadCount")->value(), + after_second_report.inverted_index_range_read_count); + EXPECT_EQ(profile->get_counter("InvertedIndexSerialReadRounds")->value(), + after_second_report.inverted_index_serial_read_rounds); } } // namespace doris diff --git a/be/test/storage/index/snii_doris_adapter_test.cpp b/be/test/storage/index/snii_doris_adapter_test.cpp new file mode 100644 index 00000000000000..f307fb731daff5 --- /dev/null +++ b/be/test/storage/index/snii_doris_adapter_test.cpp @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/snii/snii_doris_adapter.h" + +#include + +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "io/fs/file_reader.h" +#include "io/fs/path.h" +#include "io/io_common.h" +#include "snii/io/file_reader.h" +#include "util/slice.h" + +namespace doris::segment_v2::snii_doris { +namespace { + +struct CapturedIOContext { + bool has_ctx = false; + bool is_inverted_index = false; + bool is_index_data = false; + bool read_file_cache = true; + bool is_disposable = false; + io::FileCacheStatistics* file_cache_stats = nullptr; +}; + +struct CapturedRead { + size_t offset = 0; + size_t len = 0; + CapturedIOContext io_ctx; +}; + +class RecordingFileReader final : public io::FileReader { +public: + explicit RecordingFileReader(std::string data) : _data(std::move(data)) {} + + Status close() override { + _closed = true; + return Status::OK(); + } + + const io::Path& path() const override { return _path; } + size_t size() const override { return _data.size(); } + bool closed() const override { return _closed; } + int64_t mtime() const override { return 0; } + + const std::vector& reads() const { return _reads; } + +protected: + Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, + const io::IOContext* io_ctx) override { + CapturedRead read; + read.offset = offset; + read.len = result.size; + if (io_ctx != nullptr) { + read.io_ctx.has_ctx = true; + read.io_ctx.is_inverted_index = io_ctx->is_inverted_index; + read.io_ctx.is_index_data = io_ctx->is_index_data; + read.io_ctx.read_file_cache 
= io_ctx->read_file_cache; + read.io_ctx.is_disposable = io_ctx->is_disposable; + read.io_ctx.file_cache_stats = io_ctx->file_cache_stats; + } + _reads.push_back(read); + + if (result.size > 0) { + std::memcpy(result.data, _data.data() + offset, result.size); + } + *bytes_read = result.size; + return Status::OK(); + } + +private: + std::string _data; + io::Path _path = "/tmp/snii_doris_adapter_test.idx"; + bool _closed = false; + std::vector _reads; +}; + +} // namespace + +TEST(DorisSniiFileReaderTest, ReadAtPropagatesIndexIOContextAndRecordsStats) { + auto recording_reader = std::make_shared("0123456789abcdef"); + DorisSniiFileReader reader(recording_reader); + + io::FileCacheStatistics stats; + io::IOContext io_ctx; + io_ctx.is_disposable = true; + io_ctx.is_index_data = false; + io_ctx.read_file_cache = false; + io_ctx.file_cache_stats = &stats; + + std::vector out; + { + DorisSniiFileReader::ScopedIOContext scope(&io_ctx); + auto status = reader.read_at(2, 5, &out); + ASSERT_TRUE(status.ok()) << status.message(); + } + + ASSERT_EQ(out.size(), 5); + EXPECT_EQ(std::string(out.begin(), out.end()), "23456"); + ASSERT_EQ(recording_reader->reads().size(), 1); + const auto& captured = recording_reader->reads()[0].io_ctx; + EXPECT_TRUE(captured.has_ctx); + EXPECT_TRUE(captured.is_inverted_index); + EXPECT_TRUE(captured.is_index_data); + EXPECT_FALSE(captured.read_file_cache); + EXPECT_TRUE(captured.is_disposable); + EXPECT_EQ(captured.file_cache_stats, &stats); + + EXPECT_EQ(stats.inverted_index_request_bytes, 5); + EXPECT_EQ(stats.inverted_index_read_bytes, 5); + EXPECT_EQ(stats.inverted_index_range_read_count, 1); + EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1); +} + +TEST(DorisSniiFileReaderTest, ReadBatchRecordsLogicalAndCoalescedPhysicalIO) { + auto recording_reader = + std::make_shared("0123456789abcdefghijklmnopqrstuvwxyz"); + DorisSniiFileReader reader(recording_reader); + + io::FileCacheStatistics stats; + io::IOContext io_ctx; + 
io_ctx.file_cache_stats = &stats; + + std::vector> outs; + { + DorisSniiFileReader::ScopedIOContext scope(&io_ctx); + std::vector<::snii::io::Range> ranges {{0, 4}, {6, 3}, {20, 2}}; + auto status = reader.read_batch(ranges, &outs); + ASSERT_TRUE(status.ok()) << status.message(); + } + + ASSERT_EQ(outs.size(), 3); + EXPECT_EQ(std::string(outs[0].begin(), outs[0].end()), "0123"); + EXPECT_EQ(std::string(outs[1].begin(), outs[1].end()), "678"); + EXPECT_EQ(std::string(outs[2].begin(), outs[2].end()), "kl"); + + ASSERT_EQ(recording_reader->reads().size(), 1); + EXPECT_EQ(recording_reader->reads()[0].offset, 0); + EXPECT_EQ(recording_reader->reads()[0].len, 22); + + EXPECT_EQ(stats.inverted_index_request_bytes, 9); + EXPECT_EQ(stats.inverted_index_read_bytes, 22); + EXPECT_EQ(stats.inverted_index_range_read_count, 1); + EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1); +} + +} // namespace doris::segment_v2::snii_doris diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp new file mode 100644 index 00000000000000..d735770d8402cc --- /dev/null +++ b/be/test/storage/index/snii_query_test.cpp @@ -0,0 +1,439 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/pfor.h" +#include "snii/format/format_constants.h" +#include "snii/format/prx_pod.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" +#include "snii/query/docid_sink.h" +#include "snii/query/phrase_query.h" +#include "snii/query/term_query.h" +#include "snii/reader/logical_index_reader.h" +#include "snii/reader/snii_segment_reader.h" +#include "snii/writer/snii_compound_writer.h" +#include "snii/writer/spimi_term_buffer.h" + +namespace snii::query { +namespace { + +class MemoryFile final : public snii::io::FileReader, public snii::io::FileWriter { +public: + Status append(Slice data) override { + data_.insert(data_.end(), data.data(), data.data() + data.size()); + return Status::OK(); + } + + Status finalize() override { + finalized_ = true; + return Status::OK(); + } + + uint64_t bytes_written() const override { return data_.size(); } + + // NOLINTBEGIN(readability-non-const-parameter): FileReader interface writes into out. 
+ Status read_at(uint64_t offset, size_t len, std::vector* out) override { + if (offset > data_.size() || len > data_.size() - offset) { + return Status::Corruption("memory file read past eof"); + } + out->resize(len); + if (len != 0) { + std::memcpy(out->data(), data_.data() + offset, len); + } + return Status::OK(); + } + // NOLINTEND(readability-non-const-parameter) + + uint64_t size() const override { return data_.size(); } + bool finalized() const { return finalized_; } + +private: + std::vector data_; + bool finalized_ = false; +}; + +class RecordingDocIdSink final : public DocIdSink { +public: + Status append_sorted(std::span docids) override { + out.insert(out.end(), docids.begin(), docids.end()); + return Status::OK(); + } + + Status append_range(uint32_t first, uint64_t last_exclusive) override { + ++range_calls; + for (uint64_t docid = first; docid < last_exclusive; ++docid) { + out.push_back(static_cast(docid)); + } + return Status::OK(); + } + + std::vector out; + size_t range_calls = 0; +}; + +struct PostingDoc { + uint32_t docid = 0; + std::vector positions; +}; + +writer::TermPostings make_term(std::string term, std::vector docs) { + std::ranges::sort(docs, [](const PostingDoc& lhs, const PostingDoc& rhs) { + return lhs.docid < rhs.docid; + }); + + writer::TermPostings posting; + posting.term = std::move(term); + posting.docids.reserve(docs.size()); + posting.freqs.reserve(docs.size()); + for (const PostingDoc& doc : docs) { + posting.docids.push_back(doc.docid); + posting.freqs.push_back(static_cast(doc.positions.size())); + posting.positions_flat.insert(posting.positions_flat.end(), doc.positions.begin(), + doc.positions.end()); + } + return posting; +} + +std::vector docs_with_one_position(uint32_t begin, uint32_t end, uint32_t position) { + std::vector docs; + docs.reserve(end - begin); + for (uint32_t docid = begin; docid < end; ++docid) { + docs.push_back({docid, {position}}); + } + return docs; +} + +void assert_ok(const Status& status) { + 
ASSERT_TRUE(status.ok()) << status.to_string(); +} + +Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, + reader::LogicalIndexReader* index_reader) { + constexpr uint32_t kDocCount = 9000; + auto failed_docs = docs_with_one_position(0, kDocCount, 0); + auto order_docs = docs_with_one_position(0, kDocCount, 2); + auto ordinal_docs = docs_with_one_position(0, kDocCount, 2); + auto driver_docs = docs_with_one_position(0, 8000, 0); + auto almost_docs = docs_with_one_position(0, kDocCount, 1); + std::vector sparse_left_docs; + std::vector sparse_right_docs; + std::vector repeat_docs; + sparse_left_docs.reserve(kDocCount / 3 + 1); + sparse_right_docs.reserve(kDocCount); + repeat_docs.reserve(kDocCount); + for (uint32_t docid = 0; docid < kDocCount; ++docid) { + if (docid % 3 == 0) { + sparse_left_docs.push_back({docid, {0}}); + } + if (docid % 4 != 1) { + sparse_right_docs.push_back({docid, {1}}); + } + repeat_docs.push_back({docid, {0, 1, 2}}); + } + almost_docs.erase(almost_docs.begin() + 4000); + failed_docs[8000].positions = {0, 4}; + for (PostingDoc& doc : order_docs) { + if (doc.docid == 5000 || doc.docid == 7000) { + doc.positions = {1}; + } else if (doc.docid == 8000) { + doc.positions = {5}; + } + } + for (PostingDoc& doc : ordinal_docs) { + if (doc.docid == 6000) { + doc.positions = {1}; + } + } + + writer::SniiIndexInput input; + input.index_id = 7; + input.index_suffix = "Body"; + input.config = format::IndexConfig::kDocsPositions; + input.doc_count = kDocCount; + input.terms = {make_term("almost", std::move(almost_docs)), + make_term("driver", std::move(driver_docs)), + make_term("failed", std::move(failed_docs)), + make_term("order", std::move(order_docs)), + make_term("ordinal", std::move(ordinal_docs)), + make_term("repeat", std::move(repeat_docs)), + make_term("sparse_left", std::move(sparse_left_docs)), + make_term("sparse_right", std::move(sparse_right_docs))}; + + writer::SniiCompoundWriter writer(file); + 
SNII_RETURN_IF_ERROR(writer.add_logical_index(input)); + SNII_RETURN_IF_ERROR(writer.finish()); + EXPECT_TRUE(file->finalized()); + + SNII_RETURN_IF_ERROR(reader::SniiSegmentReader::open(file, segment_reader)); + return segment_reader->open_index(input.index_id, input.index_suffix, index_reader); +} + +TEST(SniiPhraseQueryTest, WindowedPhraseQueryKeepsCorrectCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"failed", "order"}, &docids)); + + const std::vector expected {5000, 7000, 8000}; + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, WindowedPhrasePrefixQueryKeepsCorrectCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_prefix_query(index_reader, {"failed", "ord"}, &docids, 10)); + + const std::vector expected {5000, 6000, 7000, 8000}; + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, SingleTailPhrasePrefixUsesStreamingPhrasePath) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_prefix_query(index_reader, {"failed", "orde"}, &docids, 10)); + + const std::vector expected {5000, 7000, 8000}; + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, MultiTermPhraseUsesPairPrefilter) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"failed", "order", "ordinal"}, &docids)); + + const std::vector 
expected {5000, 7000}; + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, RepeatedTermPhraseUsesCachedPostingSpan) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"repeat", "repeat", "repeat"}, &docids)); + + std::vector expected(9000); + std::iota(expected.begin(), expected.end(), 0); + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, DenseTermWithMissingDocKeepsCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector driver_docids; + assert_ok(term_query(index_reader, "driver", &driver_docids)); + EXPECT_EQ(driver_docids.size(), 8000); + + std::vector almost_docids; + assert_ok(term_query(index_reader, "almost", &almost_docids)); + EXPECT_EQ(almost_docids.size(), 8999); + ASSERT_GT(almost_docids.size(), 6144); + EXPECT_EQ(almost_docids[3999], 3999); + EXPECT_EQ(almost_docids[4000], 4001); + EXPECT_EQ(almost_docids[6143], 6144); + EXPECT_EQ(almost_docids[6144], 6145); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"driver", "almost"}, &docids)); + + std::vector expected; + expected.reserve(7999); + for (uint32_t docid = 0; docid < 8000; ++docid) { + if (docid != 4000) { + expected.push_back(docid); + } + } + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, SparseWindowBitsetKeepsCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"sparse_left", "sparse_right"}, &docids)); + + std::vector expected; + for (uint32_t docid = 0; docid < 9000; ++docid) { + if (docid % 3 == 0 && 
docid % 4 != 1) { + expected.push_back(docid); + } + } + EXPECT_EQ(docids, expected); +} + +TEST(SniiTermQueryTest, WindowedDenseTermEmitsRangesToSink) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + RecordingDocIdSink sink; + assert_ok(term_query(index_reader, "failed", &sink)); + + std::vector expected(9000); + std::iota(expected.begin(), expected.end(), 0); + EXPECT_EQ(sink.out, expected); + EXPECT_GT(sink.range_calls, 0); +} + +TEST(SniiPrxPodTest, SelectivePforCsrMatchesFullCsrAcrossRuns) { + std::vector freqs; + std::vector positions; + freqs.reserve(320); + for (uint32_t doc = 0; doc < 320; ++doc) { + const uint32_t freq = (doc % 5 == 0) ? 2 : 1; + freqs.push_back(freq); + positions.push_back(doc * 3); + if (freq == 2) { + positions.push_back(doc * 3 + 2); + } + } + + ByteSink sink; + assert_ok(format::build_prx_window_flat(positions, freqs, -1, &sink)); + + std::vector full_positions; + std::vector full_offsets; + ByteSource full_source(sink.view()); + assert_ok(format::read_prx_window_csr(&full_source, &full_positions, &full_offsets)); + + auto assert_selected_matches_full = [&](const std::vector& selected_docs) { + std::vector selected_positions; + std::vector selected_offsets; + ByteSource selected_source(sink.view()); + assert_ok(format::read_prx_window_csr_selective(&selected_source, selected_docs, + &selected_positions, &selected_offsets)); + + ASSERT_EQ(selected_offsets.size(), selected_docs.size() + 1); + for (size_t i = 0; i < selected_docs.size(); ++i) { + const uint32_t doc = selected_docs[i]; + const std::vector expected(full_positions.begin() + full_offsets[doc], + full_positions.begin() + full_offsets[doc + 1]); + const std::vector actual( + selected_positions.begin() + selected_offsets[i], + selected_positions.begin() + selected_offsets[i + 1]); + EXPECT_EQ(actual, expected); + } + }; + + 
assert_selected_matches_full({0, 1, 2}); + assert_selected_matches_full({0, 1, 127, 128, 129, 255, 256, 319}); +} + +TEST(SniiPforTest, LowBitWidthFastPathsRoundTrip) { + auto assert_round_trip = [](const std::vector& values, uint8_t expected_width) { + ByteSink sink; + snii::pfor_encode(values.data(), values.size(), &sink); + ASSERT_FALSE(sink.buffer().empty()); + EXPECT_EQ(sink.buffer().front(), expected_width); + + std::vector decoded(values.size(), 0xFFFFFFFF); + ByteSource source(sink.view()); + assert_ok(snii::pfor_decode(&source, values.size(), decoded.data())); + EXPECT_TRUE(source.eof()); + EXPECT_EQ(decoded, values); + }; + + std::vector one_bit(128); + for (size_t i = 0; i < one_bit.size(); ++i) { + one_bit[i] = static_cast(i & 1); + } + assert_round_trip(one_bit, 1); + + one_bit[17] = 1000; + assert_round_trip(one_bit, 1); + + std::vector two_bit(128); + for (size_t i = 0; i < two_bit.size(); ++i) { + two_bit[i] = static_cast(i & 3); + } + assert_round_trip(two_bit, 2); + + std::vector three_bit(131); + for (size_t i = 0; i < three_bit.size(); ++i) { + three_bit[i] = static_cast(i & 7); + } + assert_round_trip(three_bit, 3); + + std::vector four_bit(128); + for (size_t i = 0; i < four_bit.size(); ++i) { + four_bit[i] = static_cast(i & 15); + } + assert_round_trip(four_bit, 4); + + std::vector five_bit(129); + for (size_t i = 0; i < five_bit.size(); ++i) { + five_bit[i] = static_cast(i & 31); + } + assert_round_trip(five_bit, 5); + + std::vector six_bit(130); + for (size_t i = 0; i < six_bit.size(); ++i) { + six_bit[i] = static_cast(i & 63); + } + assert_round_trip(six_bit, 6); + + std::vector seven_bit(131); + for (size_t i = 0; i < seven_bit.size(); ++i) { + seven_bit[i] = static_cast(i & 127); + } + assert_round_trip(seven_bit, 7); + + std::vector eight_bit(256); + for (size_t i = 0; i < eight_bit.size(); ++i) { + eight_bit[i] = static_cast(i); + } + assert_round_trip(eight_bit, 8); +} + +} // namespace +} // namespace snii::query diff --git 
a/be/test/storage/segment/inverted_index_fs_directory_test.cpp b/be/test/storage/segment/inverted_index_fs_directory_test.cpp index d42559a0e39975..99cd9d8b613cc7 100644 --- a/be/test/storage/segment/inverted_index_fs_directory_test.cpp +++ b/be/test/storage/segment/inverted_index_fs_directory_test.cpp @@ -287,6 +287,58 @@ TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalWithBytesReadError) { _CLDELETE(input); } +TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalRecordsIndexIOStatsAndContext) { + std::filesystem::path test_file = _tmp_dir / "test_file_with_stats"; + std::ofstream ofs(test_file); + ofs << "test content for stats"; + ofs.close(); + + lucene::store::IndexInput* input = nullptr; + CLuceneError error; + + bool result = + DorisFSDirectory::FSIndexInput::open(_fs, test_file.string().c_str(), input, error); + EXPECT_TRUE(result); + + io::FileCacheStatistics stats; + io::IOContext io_ctx; + io_ctx.is_disposable = true; + io_ctx.is_index_data = false; + io_ctx.read_file_cache = false; + io_ctx.file_cache_stats = &stats; + + input->setIoContext(&io_ctx); + input->setIndexFile(true); + + uint8_t buffer[6]; + input->readBytes(buffer, 6, false); + EXPECT_EQ(std::string(reinterpret_cast(buffer), 6), "test c"); + + const auto* captured = static_cast(input->getIoContext()); + EXPECT_TRUE(captured->is_inverted_index); + EXPECT_TRUE(captured->is_index_data); + EXPECT_FALSE(captured->read_file_cache); + EXPECT_TRUE(captured->is_disposable); + EXPECT_EQ(captured->file_cache_stats, &stats); + + EXPECT_EQ(stats.inverted_index_request_bytes, 6); + EXPECT_EQ(stats.inverted_index_read_bytes, 6); + EXPECT_EQ(stats.inverted_index_range_read_count, 1); + EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1); + + input->setIoContext(nullptr); + captured = static_cast(input->getIoContext()); + EXPECT_TRUE(captured->is_inverted_index); + EXPECT_TRUE(captured->is_index_data); + EXPECT_EQ(captured->file_cache_stats, nullptr); + + input->setIndexFile(false); + captured = 
static_cast(input->getIoContext()); + EXPECT_FALSE(captured->is_index_data); + + _CLDELETE(input); +} + // Test 19: FSIndexOutput init error TEST_F(DorisFSDirectoryTest, FSIndexOutputInitError) { DebugPoints::instance()->add( @@ -841,4 +893,4 @@ TEST_F(DorisFSDirectoryTest, PrivGetFN) { } } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 54129adf81bed0..3e8def3c9710a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -105,6 +105,12 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c checkInvertedIndexProperties(properties, colType, invertedIndexFileStorageFormat); } + if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII + && !colType.isStringType() && !colType.isArrayType()) { + throw new AnalysisException("SNII inverted index storage format only supports string columns, column: " + + indexColName + " type: " + colType); + } + // default is "none" if not set if (parser == null) { parser = INVERTED_INDEX_PARSER_NONE; diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java index b208e712c273c4..24337cd4929316 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java @@ -379,11 +379,15 @@ public OlapFile.TabletMetaCloudPB.Builder createTabletMetaBuilder(long tableId, schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2); } else if (invertedIndexFileStorageFormat == 
TInvertedIndexFileStorageFormat.V3) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3); + } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.SNII); } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.DEFAULT) { if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V1); } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2); + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) { + schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.SNII); } else { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index b27db96bbe176b..392131a8cd4ea1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -1219,6 +1219,8 @@ public static TInvertedIndexFileStorageFormat analyzeInvertedIndexFileStorageFor return TInvertedIndexFileStorageFormat.V1; } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) { return TInvertedIndexFileStorageFormat.V2; + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) { + return TInvertedIndexFileStorageFormat.SNII; } else { return TInvertedIndexFileStorageFormat.V3; } @@ -1230,11 +1232,15 @@ public static TInvertedIndexFileStorageFormat analyzeInvertedIndexFileStorageFor return TInvertedIndexFileStorageFormat.V2; } else if 
(invertedIndexFileStorageFormat.equalsIgnoreCase("v3")) { return TInvertedIndexFileStorageFormat.V3; + } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("snii")) { + return TInvertedIndexFileStorageFormat.SNII; } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("default")) { if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) { return TInvertedIndexFileStorageFormat.V1; } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) { return TInvertedIndexFileStorageFormat.V2; + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) { + return TInvertedIndexFileStorageFormat.SNII; } else { return TInvertedIndexFileStorageFormat.V3; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java index 494e756538b112..bf5aac95225629 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java @@ -31,6 +31,7 @@ import org.apache.doris.common.Config; import org.apache.doris.common.UserException; import org.apache.doris.qe.ConnectContext; +import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import com.google.common.collect.Maps; import org.apache.commons.lang3.StringUtils; @@ -134,6 +135,10 @@ public void validate(ConnectContext ctx) throws UserException { } IndexType indexType = existedIdx.getIndexType(); + OlapTable olapTable = (OlapTable) table; + if (olapTable.getInvertedIndexFileStorageFormat() == TInvertedIndexFileStorageFormat.SNII) { + throw new AnalysisException("BUILD INDEX is not supported for SNII inverted index storage format yet"); + } if ((Config.isNotCloudMode() && indexType == IndexType.NGRAM_BF) || indexType == IndexType.BLOOMFILTER || (Config.isCloudMode() diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java index 14869e7925cf86..9303ebf95bcb7b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java @@ -848,8 +848,10 @@ public void validate(ConnectContext ctx) { } if (indexDef.getIndexType() == IndexType.ANN) { if (invertedIndexFileStorageFormat != null - && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) { - throw new AnalysisException("ANN index is not supported in index format V1"); + && (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1 + || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII)) { + throw new AnalysisException("ANN index is not supported in index format " + + invertedIndexFileStorageFormat); } } for (String indexColName : indexDef.getColumnNames()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java index 8630d80b7dc0ab..36f256994a7116 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java @@ -164,6 +164,11 @@ public void checkColumn(ColumnDefinition column, KeysType keysType, "ANN index can only be used in DUP_KEYS table or UNIQUE_KEYS table with" + " merge-on-write enabled"); } + if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1 + || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + throw new AnalysisException("ANN index is not supported in index format " + + 
invertedIndexFileStorageFormat); + } return; } @@ -177,6 +182,17 @@ public void checkColumn(ColumnDefinition column, KeysType keysType, throw new AnalysisException(colType + " is not supported in " + indexType.toString() + " index. " + "invalid index: " + name); } + if (indexType == IndexType.INVERTED + && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + boolean isStringIndex = colType.isStringLikeType() + || (colType.isArrayType() + && ((ArrayType) colType).getItemType().isStringLikeType()); + if (!isStringIndex) { + throw new AnalysisException( + "SNII inverted index storage format does not support BKD index on column: " + + indexColName); + } + } // In inverted index format v1, each subcolumn of a variant has its own index file, leading to high IOPS. // when the subcolumn type changes, it may result in missing files, causing link file failure. @@ -264,8 +280,10 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe "ANN index can only be used in DUP_KEYS table or UNIQUE_KEYS table with" + " merge-on-write enabled"); } - if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) { - throw new AnalysisException("ANN index is not supported in index format V1"); + if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1 + || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + throw new AnalysisException("ANN index is not supported in index format " + + invertedIndexFileStorageFormat); } return; } @@ -280,9 +298,16 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe throw new AnalysisException(colType + " is not supported in " + indexType.toString() + " index. 
" + "invalid index: " + name); } - - if (indexType == IndexType.ANN && !colType.isArrayType()) { - throw new AnalysisException("ANN index column must be array type"); + if (indexType == IndexType.INVERTED + && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + boolean isStringIndex = colType.isStringType() + || (colType.isArrayType() + && ((org.apache.doris.catalog.ArrayType) columnType).getItemType().isStringType()); + if (!isStringIndex) { + throw new AnalysisException( + "SNII inverted index storage format does not support BKD index on column: " + + indexColName); + } } // In inverted index format v1, each subcolumn of a variant has its own index file, leading to high IOPS. diff --git a/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java index fa6260d19f7a8d..8a836b6b5d6f2c 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java @@ -46,6 +46,7 @@ import org.apache.doris.qe.ConnectContext; import org.apache.doris.task.AgentTask; import org.apache.doris.task.AgentTaskQueue; +import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import org.apache.doris.thrift.TStatusCode; import org.apache.doris.thrift.TTaskType; import org.apache.doris.transaction.FakeTransactionIDGenerator; @@ -195,6 +196,47 @@ public void testBuildIndexIndexChange() throws UserException { Assert.assertEquals(OlapTableState.NORMAL, olapTable.getState()); } + @Test + public void testBuildIndexRejectedForSniiStorageFormat() throws UserException { + if (fakeEnv != null) { + fakeEnv.close(); + } + fakeEnv = new FakeEnv(); + if (fakeEditLog != null) { + fakeEditLog.close(); + } + fakeEditLog = new FakeEditLog(); + FakeEnv.setEnv(masterEnv); + SchemaChangeHandler schemaChangeHandler = Env.getCurrentEnv().getSchemaChangeHandler(); + ArrayList alterOps = new ArrayList<>(); 
+ Database db = masterEnv.getInternalCatalog().getDbOrDdlException(CatalogTestUtil.testDbId1); + OlapTable olapTable = (OlapTable) db.getTableOrDdlException(CatalogTestUtil.testTableId1); + String indexName = "index1"; + TableNameInfo tableNameInfo = new TableNameInfo(masterEnv.getInternalCatalog().getName(), db.getName(), + olapTable.getName()); + IndexDefinition indexDefinition = new IndexDefinition(indexName, false, + Lists.newArrayList(olapTable.getBaseSchema().get(1).getName()), + "INVERTED", + Maps.newHashMap(), "balabala"); + CreateIndexOp createIndexClause = new CreateIndexOp(tableNameInfo, indexDefinition, false); + ConnectContext connectContext = new ConnectContext(); + createIndexClause.validate(connectContext); + alterOps.add(createIndexClause); + schemaChangeHandler.process(alterOps, db, olapTable); + TInvertedIndexFileStorageFormat originalFormat = olapTable.getInvertedIndexFileStorageFormat(); + try { + olapTable.setInvertedIndexFileStorageFormat(TInvertedIndexFileStorageFormat.SNII); + BuildIndexOp buildIndexClause = new BuildIndexOp(tableNameInfo, indexName, null, false); + buildIndexClause.validate(connectContext); + Assert.fail("BUILD INDEX should be rejected for SNII inverted index storage format."); + } catch (AnalysisException e) { + Assert.assertTrue(e.getMessage().contains( + "BUILD INDEX is not supported for SNII inverted index storage format yet")); + } finally { + olapTable.setInvertedIndexFileStorageFormat(originalFormat); + } + } + @Test public void testDropIndexIndexChange() throws UserException { if (fakeEnv != null) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java index 7b41ddc95cf840..060e687b495242 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java +++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java @@ -18,7 +18,9 @@ package org.apache.doris.nereids.trees.plans.commands; import org.apache.doris.catalog.AggregateType; +import org.apache.doris.catalog.Column; import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.Type; import org.apache.doris.catalog.info.IndexType; import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.trees.plans.commands.info.ColumnDefinition; @@ -57,6 +59,68 @@ void testVariantIndexFormatV1() throws AnalysisException { } } + @Test + void testSniiInvertedIndexColumnTypes() throws AnalysisException { + IndexDefinition def = new IndexDefinition("snii_index", false, Lists.newArrayList("col1"), + "INVERTED", null, "comment"); + + def.checkColumn(new ColumnDefinition("col1", StringType.INSTANCE, false, AggregateType.NONE, true, + null, "comment"), KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII); + def.checkColumn(new ColumnDefinition("col1", ArrayType.of(StringType.INSTANCE), false, + AggregateType.NONE, true, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII); + + AnalysisException intException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new ColumnDefinition("col1", IntegerType.INSTANCE, false, AggregateType.NONE, + true, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(intException.getMessage().contains("does not support BKD index")); + + AnalysisException arrayIntException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new ColumnDefinition("col1", ArrayType.of(IntegerType.INSTANCE), false, + AggregateType.NONE, true, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(arrayIntException.getMessage().contains("does not support BKD index")); + } + + @Test + void 
testSniiInvertedIndexCatalogColumnTypes() throws AnalysisException { + IndexDefinition def = new IndexDefinition("snii_index", false, Lists.newArrayList("col1"), + "INVERTED", null, "comment"); + + def.checkColumn(new Column("col1", Type.STRING, true), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII); + def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.STRING), true), + KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII); + + AnalysisException intException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new Column("col1", Type.INT, true), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(intException.getMessage().contains("does not support BKD index")); + + AnalysisException arrayIntException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.INT), true), + KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(arrayIntException.getMessage().contains("does not support BKD index")); + } + + @Test + void testSniiRejectsAnnIndex() { + IndexDefinition def = new IndexDefinition("ann_index", false, Lists.newArrayList("col1"), + "ANN", null, "comment"); + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new ColumnDefinition("col1", ArrayType.of(FloatType.INSTANCE), false, + AggregateType.NONE, false, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(exception.getMessage().contains("ANN index is not supported in index format SNII")); + + AnalysisException catalogException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.FLOAT), false), + KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII)); + 
Assertions.assertTrue(catalogException.getMessage().contains( + "ANN index is not supported in index format SNII")); + } + void testArrayTypeSupport() throws AnalysisException { IndexDefinition def = new IndexDefinition("array_index", false, Lists.newArrayList("col1"), "INVERTED", null, "array test"); diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 210a5ba0a1cf89..8577957927e64d 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -446,6 +446,7 @@ enum InvertedIndexStorageFormatPB { V1 = 0; V2 = 1; V3 = 2; + SNII = 3; } // Tablet-level storage format. Values match TStorageFormat (Thrift) integer values so diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift index 4b4780d933c6e2..8917efd68cd31a 100644 --- a/gensrc/thrift/AgentService.thrift +++ b/gensrc/thrift/AgentService.thrift @@ -196,7 +196,8 @@ enum TCompressionType { enum TInvertedIndexStorageFormat { DEFAULT = 0, // Default format, unspecified storage method. V1 = 1, // Index per idx: Each index is stored separately based on its identifier. - V2 = 2 // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. + V2 = 2, // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. + SNII = 4 // SNII native inverted index storage format } enum TBinlogFormat { diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift index c6b9c705307380..d088a936b9e05f 100644 --- a/gensrc/thrift/Types.thrift +++ b/gensrc/thrift/Types.thrift @@ -130,7 +130,8 @@ enum TInvertedIndexFileStorageFormat { DEFAULT = 0, // Default format, unspecified storage method. V1 = 1, // Index per idx: Each index is stored separately based on its identifier. V2 = 2, // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. 
- V3 = 3 // Position and dictionary compression + V3 = 3, // Position and dictionary compression + SNII = 4 // SNII native inverted index storage format } struct TScalarType { diff --git a/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out new file mode 100644 index 00000000000000..33e05cf4214d2f --- /dev/null +++ b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out @@ -0,0 +1,16 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !match_any -- +1 +2 + +-- !match_all -- +1 + +-- !match_phrase -- +5 + +-- !null_bitmap -- +4 + +-- !array_contains -- +1 diff --git a/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy new file mode 100644 index 00000000000000..7800350fb6b753 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_storage_format_snii", "p0, nonConcurrent") { + sql "DROP TABLE IF EXISTS test_storage_format_snii" + sql "DROP TABLE IF EXISTS test_storage_format_snii_array" + sql "DROP TABLE IF EXISTS test_storage_format_snii_add_index" + sql "DROP TABLE IF EXISTS test_storage_format_snii_bkd" + sql "DROP TABLE IF EXISTS test_storage_format_snii_array_bkd" + sql "DROP TABLE IF EXISTS test_storage_format_snii_ann" + + sql """ + CREATE TABLE test_storage_format_snii ( + id INT NULL, + body TEXT NULL, + INDEX idx_body (`body`) USING INVERTED PROPERTIES( + "parser" = "english", + "support_phrase" = "true", + "lower_case" = "true" + ) COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true", + "inverted_index_storage_format" = "SNII" + ); + """ + + sql """ + INSERT INTO test_storage_format_snii VALUES + (1, 'alpha beta gamma'), + (2, 'alpha delta'), + (3, 'beta epsilon'), + (4, NULL), + (5, 'quick brown fox'), + (6, 'quick fox'); + """ + sql "sync" + + order_qt_match_any """ + SELECT id FROM test_storage_format_snii + WHERE body MATCH_ANY 'alpha' + ORDER BY id + """ + order_qt_match_all """ + SELECT id FROM test_storage_format_snii + WHERE body MATCH_ALL 'alpha beta' + ORDER BY id + """ + order_qt_match_phrase """ + SELECT id FROM test_storage_format_snii + WHERE body MATCH_PHRASE 'quick brown' + ORDER BY id + """ + order_qt_null_bitmap """ + SELECT id FROM test_storage_format_snii + WHERE body IS NULL + ORDER BY id + """ + + sql """ + CREATE TABLE test_storage_format_snii_array ( + id INT NULL, + tags ARRAY NULL, + INDEX idx_tags (`tags`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + + sql """ + INSERT INTO test_storage_format_snii_array VALUES + (1, '["alpha", 
"beta"]'), + (2, '["gamma"]'), + (3, NULL); + """ + sql "sync" + + order_qt_array_contains """ + SELECT id FROM test_storage_format_snii_array + WHERE array_contains(tags, 'alpha') + ORDER BY id + """ + + test { + if (isCloudMode()) { + sql "BUILD INDEX ON test_storage_format_snii" + } else { + sql "BUILD INDEX idx_body ON test_storage_format_snii" + } + exception "BUILD INDEX is not supported for SNII inverted index storage format yet" + } + + sql """ + CREATE TABLE test_storage_format_snii_add_index ( + id INT NULL, + body TEXT NULL, + score INT NULL, + scores ARRAY NULL, + embedding ARRAY NOT NULL, + INDEX idx_body_added_table (`body`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + + test { + sql """ + ALTER TABLE test_storage_format_snii_add_index + ADD INDEX idx_score_added (`score`) USING INVERTED COMMENT '' + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + ALTER TABLE test_storage_format_snii_add_index + ADD INDEX idx_scores_added (`scores`) USING INVERTED COMMENT '' + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + CREATE INDEX idx_ann_added ON test_storage_format_snii_add_index (`embedding`) USING ANN PROPERTIES( + "index_type" = "hnsw", + "metric_type" = "l2_distance", + "dim" = "1" + ) + """ + exception "ANN index is not supported in index format SNII" + } + + test { + sql """ + CREATE TABLE test_storage_format_snii_bkd ( + id INT NULL, + score INT NULL, + INDEX idx_score (`score`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + CREATE TABLE 
test_storage_format_snii_array_bkd ( + id INT NULL, + scores ARRAY NULL, + INDEX idx_scores (`scores`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + CREATE TABLE test_storage_format_snii_ann ( + id INT NULL, + embedding ARRAY NOT NULL, + INDEX idx_ann (`embedding`) USING ANN PROPERTIES( + "index_type" = "hnsw", + "metric_type" = "l2_distance", + "dim" = "1" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + exception "ANN index is not supported in index format SNII" + } +}