From 44dc0ed858b5ca1a984ca6d963d642256049c304 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sat, 27 Jun 2026 01:24:40 +0800 Subject: [PATCH 01/12] [feature](be) Add SNII inverted index storage format ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: Doris only routed inverted index files through the existing V1/V2 storage implementations. This change adds SNII as an independent inverted index storage format, copies the SNII core reader/writer/query implementation into BE, and branches the Doris index file reader/writer paths so SNII reads, writes, queries, and null bitmap handling go through SNII code. SNII reuses Doris analyzer integration only; it does not route SNII storage through the existing CLucene directory/compound reader paths. SNII currently supports string and array string inverted indexes, while numeric/BKD indexes are rejected for this format until BKD support is implemented. ### Release note Add SNII as an inverted index storage format for string inverted indexes. BKD indexes are not supported with SNII yet. ### Check List (For Author) - Test: Build - `./build.sh --be` - `./build.sh --fe` - `build-support/clang-format.sh` - `build-support/check-format.sh` - `build-support/run-clang-tidy.sh --build-dir be/build_Release` attempted; it failed because clang-tidy could not resolve `stddef.h` in this toolchain and also reported pre-existing unrelated diagnostics. - Behavior changed: Yes. Tables using `inverted_index_storage_format=SNII` route string inverted index storage/query/null handling through SNII and reject BKD indexes. - Does this need documentation: Yes. No doc PR yet. 
--- be/src/snii/common/slice.h | 39 + be/src/snii/common/status.h | 57 ++ be/src/snii/encoding/byte_sink.h | 44 ++ be/src/snii/encoding/byte_source.h | 37 + be/src/snii/encoding/crc32c.h | 16 + be/src/snii/encoding/pfor.h | 22 + be/src/snii/encoding/section_framer.h | 27 + be/src/snii/encoding/varint.h | 26 + be/src/snii/encoding/zstd_codec.h | 16 + be/src/snii/format/bootstrap_header.h | 54 ++ be/src/snii/format/bsbf.h | 117 +++ be/src/snii/format/dict_block.h | 144 ++++ be/src/snii/format/dict_block_directory.h | 72 ++ be/src/snii/format/dict_entry.h | 112 +++ be/src/snii/format/format_constants.h | 111 +++ be/src/snii/format/frq_pod.h | 101 +++ be/src/snii/format/frq_prelude.h | 178 +++++ be/src/snii/format/logical_index_directory.h | 73 ++ be/src/snii/format/norms_pod.h | 68 ++ be/src/snii/format/null_bitmap.h | 87 +++ be/src/snii/format/per_index_meta.h | 150 ++++ be/src/snii/format/prx_pod.h | 90 +++ be/src/snii/format/sampled_term_index.h | 68 ++ be/src/snii/format/stats_block.h | 36 + be/src/snii/format/tail_meta_region.h | 74 ++ be/src/snii/format/tail_pointer.h | 55 ++ be/src/snii/io/batch_range_fetcher.h | 53 ++ be/src/snii/io/file_reader.h | 49 ++ be/src/snii/io/file_writer.h | 23 + be/src/snii/io/io_metrics.h | 26 + be/src/snii/io/local_file.h | 67 ++ be/src/snii/io/metered_file_reader.h | 50 ++ be/src/snii/io/s3_object_store.h | 122 ++++ be/src/snii/query/bm25_scorer.h | 63 ++ be/src/snii/query/boolean_query.h | 35 + be/src/snii/query/docid_sink.h | 32 + .../snii/query/internal/docid_conjunction.h | 75 ++ .../query/internal/docid_posting_reader.h | 32 + be/src/snii/query/internal/docid_set_ops.h | 15 + be/src/snii/query/internal/docid_union.h | 21 + be/src/snii/query/internal/position_math.h | 30 + be/src/snii/query/internal/term_expansion.h | 21 + be/src/snii/query/phrase_query.h | 37 + be/src/snii/query/prefix_query.h | 24 + be/src/snii/query/query_profile.h | 38 + be/src/snii/query/regexp_query.h | 24 + be/src/snii/query/scoring_query.h | 62 ++ 
be/src/snii/query/term_query.h | 22 + be/src/snii/query/wildcard_query.h | 24 + be/src/snii/reader/logical_index_reader.h | 123 ++++ be/src/snii/reader/snii_segment_reader.h | 50 ++ be/src/snii/reader/windowed_posting.h | 105 +++ be/src/snii/stats/snii_stats_provider.h | 67 ++ be/src/snii/version.h | 4 + be/src/snii/writer/compact_posting_pool.h | 180 +++++ be/src/snii/writer/logical_index_writer.h | 238 ++++++ be/src/snii/writer/memory_reporter.h | 51 ++ be/src/snii/writer/snii_compound_writer.h | 92 +++ be/src/snii/writer/spill_run_codec.h | 181 +++++ be/src/snii/writer/spillable_byte_buffer.h | 158 ++++ be/src/snii/writer/spimi_term_buffer.h | 362 +++++++++ be/src/snii/writer/temp_dir.h | 40 + be/src/storage/CMakeLists.txt | 1 + be/src/storage/index/index_file_reader.cpp | 76 +- be/src/storage/index/index_file_reader.h | 20 +- be/src/storage/index/index_file_writer.cpp | 61 +- be/src/storage/index/index_file_writer.h | 19 + be/src/storage/index/index_writer.cpp | 17 + .../index/inverted/inverted_index_reader.h | 7 +- .../index/snii/core/src/common/status.cpp | 24 + .../snii/core/src/encoding/byte_sink.cpp | 39 + .../snii/core/src/encoding/byte_source.cpp | 70 ++ .../index/snii/core/src/encoding/crc32c.cpp | 111 +++ .../index/snii/core/src/encoding/pfor.cpp | 182 +++++ .../snii/core/src/encoding/section_framer.cpp | 37 + .../index/snii/core/src/encoding/varint.cpp | 53 ++ .../snii/core/src/encoding/zstd_codec.cpp | 32 + .../snii/core/src/format/bootstrap_header.cpp | 91 +++ .../index/snii/core/src/format/bsbf.cpp | 218 ++++++ .../index/snii/core/src/format/dict_block.cpp | 293 ++++++++ .../core/src/format/dict_block_directory.cpp | 89 +++ .../index/snii/core/src/format/dict_entry.cpp | 293 ++++++++ .../index/snii/core/src/format/frq_pod.cpp | 196 +++++ .../snii/core/src/format/frq_prelude.cpp | 470 ++++++++++++ .../src/format/logical_index_directory.cpp | 116 +++ .../index/snii/core/src/format/norms_pod.cpp | 46 ++ .../snii/core/src/format/null_bitmap.cpp | 99 
+++ .../snii/core/src/format/per_index_meta.cpp | 191 +++++ .../index/snii/core/src/format/prx_pod.cpp | 627 ++++++++++++++++ .../core/src/format/sampled_term_index.cpp | 154 ++++ .../snii/core/src/format/stats_block.cpp | 46 ++ .../snii/core/src/format/tail_meta_region.cpp | 129 ++++ .../snii/core/src/format/tail_pointer.cpp | 95 +++ .../snii/core/src/io/batch_range_fetcher.cpp | 81 +++ .../index/snii/core/src/io/local_file.cpp | 113 +++ .../snii/core/src/io/metered_file_reader.cpp | 117 +++ .../snii/core/src/io/s3_object_store.cpp | 217 ++++++ .../index/snii/core/src/query/bm25_scorer.cpp | 42 ++ .../snii/core/src/query/boolean_query.cpp | 99 +++ .../snii/core/src/query/docid_conjunction.cpp | 518 +++++++++++++ .../core/src/query/docid_posting_reader.cpp | 222 ++++++ .../snii/core/src/query/docid_set_ops.cpp | 105 +++ .../index/snii/core/src/query/docid_union.cpp | 31 + .../snii/core/src/query/phrase_query.cpp | 644 ++++++++++++++++ .../snii/core/src/query/prefix_query.cpp | 41 ++ .../snii/core/src/query/query_profile.cpp | 46 ++ .../snii/core/src/query/regexp_query.cpp | 82 +++ .../snii/core/src/query/scoring_query.cpp | 684 +++++++++++++++++ .../snii/core/src/query/term_expansion.cpp | 28 + .../index/snii/core/src/query/term_query.cpp | 33 + .../snii/core/src/query/wildcard_query.cpp | 71 ++ .../core/src/reader/logical_index_reader.cpp | 341 +++++++++ .../core/src/reader/snii_segment_reader.cpp | 97 +++ .../snii/core/src/reader/windowed_posting.cpp | 253 +++++++ .../core/src/stats/snii_stats_provider.cpp | 93 +++ .../core/src/writer/compact_posting_pool.cpp | 155 ++++ .../core/src/writer/logical_index_writer.cpp | 686 ++++++++++++++++++ .../core/src/writer/snii_compound_writer.cpp | 146 ++++ .../snii/core/src/writer/spill_run_codec.cpp | 597 +++++++++++++++ .../core/src/writer/spimi_term_buffer.cpp | 594 +++++++++++++++ .../storage/index/snii/snii_doris_adapter.cpp | 100 +++ .../storage/index/snii/snii_doris_adapter.h | 61 ++ 
.../storage/index/snii/snii_index_reader.cpp | 295 ++++++++ be/src/storage/index/snii/snii_index_reader.h | 62 ++ .../storage/index/snii/snii_index_writer.cpp | 197 +++++ be/src/storage/index/snii/snii_index_writer.h | 74 ++ be/src/storage/rowset/beta_rowset.cpp | 44 +- be/src/storage/segment/column_reader.cpp | 12 + be/src/storage/tablet/tablet_meta.cpp | 6 + be/src/storage/task/index_builder.cpp | 5 + .../doris/analysis/InvertedIndexUtil.java | 6 + .../datasource/CloudInternalCatalog.java | 4 + .../doris/common/util/PropertyAnalyzer.java | 6 + .../plans/commands/info/IndexDefinition.java | 22 + gensrc/proto/olap_file.proto | 1 + gensrc/thrift/AgentService.thrift | 3 +- gensrc/thrift/Types.thrift | 3 +- 137 files changed, 15382 insertions(+), 27 deletions(-) create mode 100644 be/src/snii/common/slice.h create mode 100644 be/src/snii/common/status.h create mode 100644 be/src/snii/encoding/byte_sink.h create mode 100644 be/src/snii/encoding/byte_source.h create mode 100644 be/src/snii/encoding/crc32c.h create mode 100644 be/src/snii/encoding/pfor.h create mode 100644 be/src/snii/encoding/section_framer.h create mode 100644 be/src/snii/encoding/varint.h create mode 100644 be/src/snii/encoding/zstd_codec.h create mode 100644 be/src/snii/format/bootstrap_header.h create mode 100644 be/src/snii/format/bsbf.h create mode 100644 be/src/snii/format/dict_block.h create mode 100644 be/src/snii/format/dict_block_directory.h create mode 100644 be/src/snii/format/dict_entry.h create mode 100644 be/src/snii/format/format_constants.h create mode 100644 be/src/snii/format/frq_pod.h create mode 100644 be/src/snii/format/frq_prelude.h create mode 100644 be/src/snii/format/logical_index_directory.h create mode 100644 be/src/snii/format/norms_pod.h create mode 100644 be/src/snii/format/null_bitmap.h create mode 100644 be/src/snii/format/per_index_meta.h create mode 100644 be/src/snii/format/prx_pod.h create mode 100644 be/src/snii/format/sampled_term_index.h create mode 100644 
be/src/snii/format/stats_block.h create mode 100644 be/src/snii/format/tail_meta_region.h create mode 100644 be/src/snii/format/tail_pointer.h create mode 100644 be/src/snii/io/batch_range_fetcher.h create mode 100644 be/src/snii/io/file_reader.h create mode 100644 be/src/snii/io/file_writer.h create mode 100644 be/src/snii/io/io_metrics.h create mode 100644 be/src/snii/io/local_file.h create mode 100644 be/src/snii/io/metered_file_reader.h create mode 100644 be/src/snii/io/s3_object_store.h create mode 100644 be/src/snii/query/bm25_scorer.h create mode 100644 be/src/snii/query/boolean_query.h create mode 100644 be/src/snii/query/docid_sink.h create mode 100644 be/src/snii/query/internal/docid_conjunction.h create mode 100644 be/src/snii/query/internal/docid_posting_reader.h create mode 100644 be/src/snii/query/internal/docid_set_ops.h create mode 100644 be/src/snii/query/internal/docid_union.h create mode 100644 be/src/snii/query/internal/position_math.h create mode 100644 be/src/snii/query/internal/term_expansion.h create mode 100644 be/src/snii/query/phrase_query.h create mode 100644 be/src/snii/query/prefix_query.h create mode 100644 be/src/snii/query/query_profile.h create mode 100644 be/src/snii/query/regexp_query.h create mode 100644 be/src/snii/query/scoring_query.h create mode 100644 be/src/snii/query/term_query.h create mode 100644 be/src/snii/query/wildcard_query.h create mode 100644 be/src/snii/reader/logical_index_reader.h create mode 100644 be/src/snii/reader/snii_segment_reader.h create mode 100644 be/src/snii/reader/windowed_posting.h create mode 100644 be/src/snii/stats/snii_stats_provider.h create mode 100644 be/src/snii/version.h create mode 100644 be/src/snii/writer/compact_posting_pool.h create mode 100644 be/src/snii/writer/logical_index_writer.h create mode 100644 be/src/snii/writer/memory_reporter.h create mode 100644 be/src/snii/writer/snii_compound_writer.h create mode 100644 be/src/snii/writer/spill_run_codec.h create mode 100644 
be/src/snii/writer/spillable_byte_buffer.h create mode 100644 be/src/snii/writer/spimi_term_buffer.h create mode 100644 be/src/snii/writer/temp_dir.h create mode 100644 be/src/storage/index/snii/core/src/common/status.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/byte_sink.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/byte_source.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/crc32c.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/pfor.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/section_framer.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/varint.cpp create mode 100644 be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp create mode 100644 be/src/storage/index/snii/core/src/format/bootstrap_header.cpp create mode 100644 be/src/storage/index/snii/core/src/format/bsbf.cpp create mode 100644 be/src/storage/index/snii/core/src/format/dict_block.cpp create mode 100644 be/src/storage/index/snii/core/src/format/dict_block_directory.cpp create mode 100644 be/src/storage/index/snii/core/src/format/dict_entry.cpp create mode 100644 be/src/storage/index/snii/core/src/format/frq_pod.cpp create mode 100644 be/src/storage/index/snii/core/src/format/frq_prelude.cpp create mode 100644 be/src/storage/index/snii/core/src/format/logical_index_directory.cpp create mode 100644 be/src/storage/index/snii/core/src/format/norms_pod.cpp create mode 100644 be/src/storage/index/snii/core/src/format/null_bitmap.cpp create mode 100644 be/src/storage/index/snii/core/src/format/per_index_meta.cpp create mode 100644 be/src/storage/index/snii/core/src/format/prx_pod.cpp create mode 100644 be/src/storage/index/snii/core/src/format/sampled_term_index.cpp create mode 100644 be/src/storage/index/snii/core/src/format/stats_block.cpp create mode 100644 be/src/storage/index/snii/core/src/format/tail_meta_region.cpp create mode 100644 
be/src/storage/index/snii/core/src/format/tail_pointer.cpp create mode 100644 be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp create mode 100644 be/src/storage/index/snii/core/src/io/local_file.cpp create mode 100644 be/src/storage/index/snii/core/src/io/metered_file_reader.cpp create mode 100644 be/src/storage/index/snii/core/src/io/s3_object_store.cpp create mode 100644 be/src/storage/index/snii/core/src/query/bm25_scorer.cpp create mode 100644 be/src/storage/index/snii/core/src/query/boolean_query.cpp create mode 100644 be/src/storage/index/snii/core/src/query/docid_conjunction.cpp create mode 100644 be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp create mode 100644 be/src/storage/index/snii/core/src/query/docid_set_ops.cpp create mode 100644 be/src/storage/index/snii/core/src/query/docid_union.cpp create mode 100644 be/src/storage/index/snii/core/src/query/phrase_query.cpp create mode 100644 be/src/storage/index/snii/core/src/query/prefix_query.cpp create mode 100644 be/src/storage/index/snii/core/src/query/query_profile.cpp create mode 100644 be/src/storage/index/snii/core/src/query/regexp_query.cpp create mode 100644 be/src/storage/index/snii/core/src/query/scoring_query.cpp create mode 100644 be/src/storage/index/snii/core/src/query/term_expansion.cpp create mode 100644 be/src/storage/index/snii/core/src/query/term_query.cpp create mode 100644 be/src/storage/index/snii/core/src/query/wildcard_query.cpp create mode 100644 be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp create mode 100644 be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp create mode 100644 be/src/storage/index/snii/core/src/reader/windowed_posting.cpp create mode 100644 be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp create mode 100644 be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp create mode 100644 be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp create mode 100644 
be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp create mode 100644 be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp create mode 100644 be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp create mode 100644 be/src/storage/index/snii/snii_doris_adapter.cpp create mode 100644 be/src/storage/index/snii/snii_doris_adapter.h create mode 100644 be/src/storage/index/snii/snii_index_reader.cpp create mode 100644 be/src/storage/index/snii/snii_index_reader.h create mode 100644 be/src/storage/index/snii/snii_index_writer.cpp create mode 100644 be/src/storage/index/snii/snii_index_writer.h diff --git a/be/src/snii/common/slice.h b/be/src/snii/common/slice.h new file mode 100644 index 00000000000000..db10b2dfc52b6f --- /dev/null +++ b/be/src/snii/common/slice.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace snii { + +// Read-only byte view (does not own memory). Lifetime is managed by the underlying buffer. +class Slice { +public: + Slice() = default; + Slice(const uint8_t* d, size_t n) : data_(d), size_(n) {} + explicit Slice(const std::vector& v) : data_(v.data()), size_(v.size()) {} + explicit Slice(std::string_view sv) + : data_(reinterpret_cast(sv.data())), size_(sv.size()) {} + + const uint8_t* data() const { return data_; } + size_t size() const { return size_; } + bool empty() const { return size_ == 0; } + + uint8_t operator[](size_t i) const { + assert(i < size_); + return data_[i]; + } + + Slice subslice(size_t off, size_t n) const { + assert(off + n <= size_); + return Slice(data_ + off, n); + } + +private: + const uint8_t* data_ = nullptr; + size_t size_ = 0; +}; + +} // namespace snii diff --git a/be/src/snii/common/status.h b/be/src/snii/common/status.h new file mode 100644 index 00000000000000..a8e21da814184a --- /dev/null +++ b/be/src/snii/common/status.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include + +namespace snii { + +enum class StatusCode { + kOk, + 
kCorruption, + kNotFound, + kInvalidArgument, + kIoError, + kUnsupported, + kInternal, +}; + +// Lightweight error type: success is kOk with no message; failure carries a code + human-readable message. +// Always return Status across API boundaries; silent failures are not allowed. +class Status { +public: + Status() = default; + + static Status OK() { return Status(); } + static Status Corruption(std::string m) { + return Status(StatusCode::kCorruption, std::move(m)); + } + static Status NotFound(std::string m) { return Status(StatusCode::kNotFound, std::move(m)); } + static Status InvalidArgument(std::string m) { + return Status(StatusCode::kInvalidArgument, std::move(m)); + } + static Status IoError(std::string m) { return Status(StatusCode::kIoError, std::move(m)); } + static Status Unsupported(std::string m) { + return Status(StatusCode::kUnsupported, std::move(m)); + } + static Status Internal(std::string m) { return Status(StatusCode::kInternal, std::move(m)); } + + bool ok() const { return code_ == StatusCode::kOk; } + StatusCode code() const { return code_; } + const std::string& message() const { return message_; } + std::string to_string() const; + +private: + Status(StatusCode c, std::string m) : code_(c), message_(std::move(m)) {} + + StatusCode code_ = StatusCode::kOk; + std::string message_; +}; + +} // namespace snii + +// Short-circuit return for expressions returning Status (propagate errors upward). +#define SNII_RETURN_IF_ERROR(expr) \ + do { \ + ::snii::Status _s = (expr); \ + if (!_s.ok()) return _s; \ + } while (0) diff --git a/be/src/snii/encoding/byte_sink.h b/be/src/snii/encoding/byte_sink.h new file mode 100644 index 00000000000000..604e307228cf39 --- /dev/null +++ b/be/src/snii/encoding/byte_sink.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" + +namespace snii { + +// append-only write cursor: all section serialization goes through this; manual byte assembly is forbidden. 
+// All multi-byte fixed-width fields are little-endian. +class ByteSink { +public: + void put_u8(uint8_t v) { buf_.push_back(v); } + void put_fixed16(uint16_t v); + void put_fixed32(uint32_t v); + void put_fixed64(uint64_t v); + void put_varint32(uint32_t v); + void put_varint64(uint64_t v); + void put_zigzag(int64_t v); + void put_bytes(Slice s); + + size_t size() const { return buf_.size(); } + const std::vector& buffer() const { return buf_; } + Slice view() const { return Slice(buf_); } + + // Resets the cursor to empty while RETAINING the backing capacity, so a sink can + // be reused across many small encodes (e.g. per-window region/prx scratch in the + // windowed posting builder) without re-allocating each time -- this avoids the + // cumulative small-allocation churn that fragments the heap arena and inflates + // peak RSS during the merge of a high-df term split into thousands of windows. + void clear() { buf_.clear(); } + + // Moves the backing buffer OUT to the caller (the sink is left empty), so an encoded + // section can be handed off without the copy (+ copy-induced capacity slack) that + // reading buffer() and copy-assigning would incur. Use only when the sink is not + // reused afterward (a stack-local about to die, or one that is clear()'d next). + std::vector take() { return std::move(buf_); } + +private: + std::vector buf_; +}; + +} // namespace snii diff --git a/be/src/snii/encoding/byte_source.h b/be/src/snii/encoding/byte_source.h new file mode 100644 index 00000000000000..96cf4eed665269 --- /dev/null +++ b/be/src/snii/encoding/byte_source.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" + +namespace snii { + +// Slice read cursor: all section deserialization goes through this; any overrun returns Corruption. 
+class ByteSource { +public: + explicit ByteSource(Slice s) : s_(s) {} + + Status get_u8(uint8_t* v); + Status get_fixed16(uint16_t* v); + Status get_fixed32(uint32_t* v); + Status get_fixed64(uint64_t* v); + Status get_varint32(uint32_t* v); + Status get_varint64(uint64_t* v); + Status get_zigzag(int64_t* v); + Status get_bytes(size_t n, Slice* out); + + size_t remaining() const { return s_.size() - pos_; } + size_t position() const { return pos_; } + bool eof() const { return pos_ == s_.size(); } + + // Returns a sub-view starting at absolute offset start with length len (used by framer etc. to rewind over the CRC coverage region). + Slice slice_from(size_t start, size_t len) const { return s_.subslice(start, len); } + +private: + Slice s_; + size_t pos_ = 0; +}; + +} // namespace snii diff --git a/be/src/snii/encoding/crc32c.h b/be/src/snii/encoding/crc32c.h new file mode 100644 index 00000000000000..08210379064d91 --- /dev/null +++ b/be/src/snii/encoding/crc32c.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +#include "snii/common/slice.h" + +namespace snii { + +// CRC32C (Castagnoli, polynomial 0x1EDC6F41). Used to checksum the tail of each format block. +uint32_t crc32c_extend(uint32_t crc, Slice data); + +inline uint32_t crc32c(Slice data) { + return crc32c_extend(0, data); +} + +} // namespace snii diff --git a/be/src/snii/encoding/pfor.h b/be/src/snii/encoding/pfor.h new file mode 100644 index 00000000000000..743cfe6f58e1a7 --- /dev/null +++ b/be/src/snii/encoding/pfor.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" + +namespace snii { + +// PFOR integer block encoder/decoder (unsigned uint32 array). +// Encoded layout: [u8 bit_width][varint n_exceptions][bit-packed low +// bits][exception table]. Selects the bit_width that minimizes total byte size; +// values exceeding it go into the exception table (index_delta, full_value). 
+// delta/zigzag is handled by the upper layer (.frq window); PFOR only processes +// unsigned integer arrays. +void pfor_encode(const uint32_t* values, size_t n, ByteSink* out); +Status pfor_decode(ByteSource* src, size_t n, uint32_t* out); +Status pfor_skip(ByteSource* src, size_t n); + +} // namespace snii diff --git a/be/src/snii/encoding/section_framer.h b/be/src/snii/encoding/section_framer.h new file mode 100644 index 00000000000000..cd8594f589a8da --- /dev/null +++ b/be/src/snii/encoding/section_framer.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" + +namespace snii { + +// A framed section: type + payload view. +struct FramedSection { + uint8_t type = 0; + Slice payload; +}; + +// Unified section framing: [u8 type][varint64 len][payload][fixed32 crc32c(type+len+payload)]. +// All full-format sections reuse this encode/checksum path to avoid ad-hoc hand-assembly. +// Unknown optional sections are dispatched by the caller based on type; read still verifies the CRC and skips the payload. +class SectionFramer { +public: + static void write(ByteSink& sink, uint8_t section_type, Slice payload); + static Status read(ByteSource& src, FramedSection* out); +}; + +} // namespace snii diff --git a/be/src/snii/encoding/varint.h b/be/src/snii/encoding/varint.h new file mode 100644 index 00000000000000..8a878b1d2928b4 --- /dev/null +++ b/be/src/snii/encoding/varint.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" + +namespace snii { + +// LEB128 variable-length integer encoding + zigzag. out buffer must be >=10 bytes; returns number of bytes written. 
+size_t varint_len(uint64_t v); +size_t encode_varint32(uint32_t v, uint8_t* out); +size_t encode_varint64(uint64_t v, uint8_t* out); + +// Decode a varint from the range [p, end); on success *next points to the next byte after the consumed input. +Status decode_varint32(const uint8_t* p, const uint8_t* end, uint32_t* v, const uint8_t** next); +Status decode_varint64(const uint8_t* p, const uint8_t* end, uint64_t* v, const uint8_t** next); + +inline uint64_t zigzag_encode(int64_t v) { + return (static_cast(v) << 1) ^ static_cast(v >> 63); +} +inline int64_t zigzag_decode(uint64_t v) { + return static_cast(v >> 1) ^ -static_cast(v & 1); +} + +} // namespace snii diff --git a/be/src/snii/encoding/zstd_codec.h b/be/src/snii/encoding/zstd_codec.h new file mode 100644 index 00000000000000..838df9af41b617 --- /dev/null +++ b/be/src/snii/encoding/zstd_codec.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" + +namespace snii { + +// Thin ZSTD wrapper. Used for compressing large payloads such as .prx windows. Decompression requires the caller to supply the original uncompressed length (from the block header). +Status zstd_compress(Slice input, int level, std::vector* out); +Status zstd_decompress(Slice input, size_t expected_uncomp_len, std::vector* out); + +} // namespace snii diff --git a/be/src/snii/format/bootstrap_header.h b/be/src/snii/format/bootstrap_header.h new file mode 100644 index 00000000000000..1face0347596c6 --- /dev/null +++ b/be/src/snii/format/bootstrap_header.h @@ -0,0 +1,54 @@ +#pragma once + +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +// Fixed container header at the very start of a {rowset_id}_{seg_id}.idx file. 
+// Identifies the SNII container and carries basic compatibility info so a +// reader can fail fast before touching any streamed section or the tail meta +// region. +// +// On-disk layout (all multi-byte fields little-endian, fixed width; NOT framed +// by SectionFramer because it must be parseable without prior knowledge of the +// file): +//   u32 magic               == kContainerMagic +//   u16 format_version      == kFormatVersion +//   u16 min_reader_version  readers with kFormatVersion < this MUST refuse to read +//   u32 flags               container-level feature flags +//   u32 header_length       total bytes of this header including the checksum +//   u8  tail_pointer_size   size of the fixed tail pointer at EOF (hint for the reader) +//   u32 header_checksum     crc32c over all preceding header bytes +struct BootstrapHeader { + uint32_t magic = kContainerMagic; + uint16_t format_version = kFormatVersion; + uint16_t min_reader_version = kMinReaderVersion; + uint32_t flags = 0; + uint32_t header_length = 0; + uint8_t tail_pointer_size = 0; +}; + +// Total fixed on-disk size of the header, including the trailing crc32c. +inline constexpr uint32_t kBootstrapHeaderSize = + 4 /*magic*/ + 2 /*format_version*/ + 2 /*min_reader_version*/ + 4 /*flags*/ + + 4 /*header_length*/ + 1 /*tail_pointer_size*/ + 4 /*header_checksum*/; + +// Serializes the header to sink: writes header_length = kBootstrapHeaderSize +// and appends a crc32c over all preceding bytes. The caller's header_length +// field is ignored on input (it is always derived). Returns OK. +Status encode_bootstrap_header(const BootstrapHeader& header, ByteSink* sink); + +// Parses and validates a bootstrap header from the front of data. 
+// - too short / trailing bytes beyond the fixed header -> kCorruption +// - magic != kContainerMagic -> kCorruption +// - checksum mismatch -> kCorruption +// - format_version != kFormatVersion -> kUnsupported +// - min_reader_version > kFormatVersion -> kUnsupported +Status decode_bootstrap_header(Slice data, BootstrapHeader* out); + +} // namespace snii::format diff --git a/be/src/snii/format/bsbf.h b/be/src/snii/format/bsbf.h new file mode 100644 index 00000000000000..42a4e80f4dac12 --- /dev/null +++ b/be/src/snii/format/bsbf.h @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/io/file_reader.h" + +// Block-split bloom filter (BSBF) -- Apache Parquet split-block spec, with an +// S3-native on-demand single-block probe that none of the reference implementations +// (Apache Parquet, Doris storage, Doris format/parquet) ship. +// +// BIT FORMAT IS PARQUET-CANONICAL (interoperable with Apache Parquet / Doris +// format/parquet for the bitset bytes): +// - 256-bit (32-byte) blocks, 8 bits set per block. +// - key = XXH64(term, seed=0); high 32 bits select the block via FASTRANGE +// `block = ((hash>>32) * num_blocks) >> 32` (no power-of-2 requirement); low 32 +// bits select 8 in-block positions `1 << ((key * SALT[i]) >> 27)`. +// - num_bytes via Parquet OptimalNumOfBytes: power of 2 in [32, 128 MiB]. +// +// SNII WRAPPER (NOT Parquet's variable thrift header): a FIXED 28-byte header, then +// the contiguous, uncompressed, little-endian bitset. Because the header size is a +// constant, the bitset start is a constant offset (`section_base + 28`) and block i +// is at `section_base + 28 + i*32` -- so a single 32-byte block can be range-read on +// demand WITHOUT parsing a variable-length header and WITHOUT loading the whole blob. 
+namespace snii::format { + +constexpr uint32_t kBsbfBytesPerBlock = 32; // 256-bit block +constexpr uint32_t kBsbfBitsSetPerBlock = 8; // 8 uint32 words / block +constexpr uint32_t kBsbfMinBytes = 32; +constexpr uint32_t kBsbfMaxBytes = 128u * 1024 * 1024; // Parquet kMaximumBloomFilterBytes +constexpr uint32_t kBsbfHeaderSize = 28; // FIXED (constant bitset offset) +// L0/L1 tiering threshold (design "fast filtering of non-existent terms"): a bsbf section +// whose total size is <= this is loaded WHOLE into the resident reader at open (L0 -> free +// in-memory probe, no per-lookup round); larger filters stay L1 (header-only, probed one +// 32-byte block on demand). 256 KiB fits in a single cloud FileCache block. +constexpr uint32_t kBsbfResidentMaxBytes = 256u * 1024; + +// Canonical Parquet/Doris split-block SALT (8 odd 32-bit constants). +extern const uint32_t kBsbfSalt[kBsbfBitsSetPerBlock]; + +// XXH64(term, seed=0) -- the Parquet-canonical key (NOT XXH3, NOT Doris murmur). +uint64_t bsbf_hash(std::string_view term); + +// Parquet OptimalNumOfBytes(ndv, fpp): power of 2 in [32, 128 MiB]. +uint32_t bsbf_optimal_num_bytes(uint32_t ndv, double fpp); + +// Fastrange block index from a 64-bit hash and the block count. +inline uint32_t bsbf_block_index(uint64_t hash, uint32_t num_blocks) { + return static_cast(((hash >> 32) * num_blocks) >> 32); +} + +// Pure 32-byte-block kernel: does `block` contain the key's 8 bits? SIMD (AVX2) +// accelerated at runtime when available, scalar otherwise. Returns true => the term +// MAY be present (could be a false positive); false => DEFINITELY ABSENT. +bool bsbf_block_contains(uint64_t hash, const uint8_t block[kBsbfBytesPerBlock]); + +// In-memory builder + serializer. +class BsbfBuilder { +public: + BsbfBuilder() = default; + + // Sizes the filter for `ndv` distinct keys at target `fpp`. fpp in (0,1). + static Status create(uint32_t ndv, double fpp, BsbfBuilder* out); + + // Insert a key / term. SIMD-accelerated. 
+ void insert(uint64_t hash); + void insert_term(std::string_view term) { insert(bsbf_hash(term)); } + + // In-memory probe over the resident bitset (build/warm path). SIMD-accelerated. + bool maybe_contains(uint64_t hash) const; + bool maybe_contains_term(std::string_view term) const { + return maybe_contains(bsbf_hash(term)); + } + + // Serialize [28-byte header][contiguous LE bitset] into `sink`. The header carries + // magic/version/hash+index strategy/num_bytes/num_blocks/ndv + header & bitset + // crc32c. The bitset is Parquet-canonical bytes. + Status serialize(ByteSink* sink) const; + + uint32_t num_bytes() const { return num_bytes_; } + uint32_t num_blocks() const { return num_blocks_; } + +private: + std::vector words_; // num_bytes_/4, blocks of 8 words + uint32_t num_bytes_ = 0; + uint32_t num_blocks_ = 0; + uint32_t ndv_ = 0; +}; + +// Resident header (28 bytes), parsed once at open. Validates magic/version/crc/bounds. +struct BsbfHeader { + uint32_t num_bytes = 0; + uint32_t num_blocks = 0; + uint32_t bitset_crc = 0; // stored crc32c of the bitset body (for L0 verification) + uint64_t bitset_base = 0; // absolute file offset of block 0 = section_base + 28 + + // Parse a 28-byte header located at `section_base` in the file. The bitset_base + // is set to section_base + kBsbfHeaderSize. + static Status parse(Slice header28, uint64_t section_base, BsbfHeader* out); + + // Absolute file offset of the 32-byte block this hash maps to. + uint64_t block_offset(uint64_t hash) const { + return bitset_base + + static_cast(bsbf_block_index(hash, num_blocks)) * kBsbfBytesPerBlock; + } +}; + +// On-demand probe: read EXACTLY ONE 32-byte block via `reader`, then test. No whole +// blob load, no deep copy. *maybe_present=false means DEFINITELY ABSENT. 
+Status bsbf_probe(snii::io::FileReader* reader, const BsbfHeader& header, uint64_t hash, + bool* maybe_present); + +} // namespace snii::format diff --git a/be/src/snii/format/dict_block.h b/be/src/snii/format/dict_block.h new file mode 100644 index 00000000000000..82ae2476c53561 --- /dev/null +++ b/be/src/snii/format/dict_block.h @@ -0,0 +1,144 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" + +// DICT block —— a positioning unit mapping term → postings read plan, and also +// the unit for remote on-demand fetching, caching, and CRC checksum +// verification (see docs/design/SNII-design-spec.source.md "DICT block" and +// "dict lookup flow summary" sections). +// +// Byte layout (strictly implemented; multi-byte fixed-width fields are +// little-endian, variable-length integers use LEB128): +// header: +// n_entries varint +// entry_format_ver u8 # = kDictBlockFormatVer +// block_flags u8 # bit0 = has_positions (consistency check +// # against the value passed to reader) +// frq_base varint64 +// prx_base varint64 # present only when has_positions is set +// entries[n_entries] # variable-length DictEntry, front-coded in +// # lexicographic order +// anchor_offsets[n_anchors] # u32 * n_anchors, byte offset of each anchor +// # entry within the block +// n_anchors u32 +// crc32c u32 # covers [header .. n_anchors], detects corruption +// # (sole CRC layer) +// +// Anchor rule: every anchor_interval entries, one "term anchor" is forced — +// that entry is encoded with prev_term="" (prefix_len=0, storing the full +// term), and its byte offset is recorded in anchor_offsets; non-anchor entries +// use the preceding entry's term as prev_term for front coding.
The reader can +// start from any anchor and scan independently without needing earlier terms, +// enabling anchor binary search + local scan for exact term lookup. +namespace snii::format { + +// DICT block entry_format_ver: self-describing version of the DictEntry +// encoding. Reader rejects a mismatch so a query-only run cannot silently read +// an older dict-entry layout as the current one. +inline constexpr uint8_t kDictBlockFormatVer = 2; + +// block_flags bit definitions. +namespace dict_block_flags { +inline constexpr uint8_t kHasPositions = 1u << 0; // whether to write prx_base / .prx fields +// bit1-7 reserved +} // namespace dict_block_flags + +// DICT block writer: entries are added in lexicographic order via add_entry; +// internally maintains prev_term, determines anchors, accumulates size +// estimates, and on finish serializes header + entries + anchor table + CRC in +// one pass. +class DictBlockBuilder { +public: + DictBlockBuilder(IndexTier tier, bool has_positions, uint64_t frq_base, uint64_t prx_base, + uint32_t anchor_interval = 16); + + // Append one entry (caller must guarantee lexicographic term order). + // Internally decides whether it becomes an anchor. + void add_entry(const DictEntry& entry); + + // Upper-bound estimate of the serialized size of the current block (including + // header + entries + anchor table + CRC footer), used by the upper layer to + // decide when to cut a new block based on target_dict_block_bytes. + size_t estimated_bytes() const; + + // Number of entries. + uint32_t n_entries() const { return n_entries_; } + + // Serialize the entire block and append it to sink. 
+ void finish(ByteSink* sink) const; + +private: + bool is_anchor(uint32_t index) const { return index % anchor_interval_ == 0; } + + IndexTier tier_; + bool has_positions_; + uint64_t frq_base_; + uint64_t prx_base_; + uint32_t anchor_interval_; + + uint32_t n_entries_ = 0; + std::vector entries_; + std::string prev_term_; // term of the previous entry (front coding base) + size_t entries_est_ = 0; // accumulated byte estimate for the entries section + size_t n_anchors_ = 0; // number of anchors +}; + +// DICT block reader: on open, verifies the CRC and parses the header / anchor +// table; find_term uses anchor binary search + local scan to locate a +// DictEntry. Holds a byte view of the block (non-owning); lifetime is managed +// by the caller. +class DictBlockReader { +public: + DictBlockReader() = default; + + // Parse and verify the entire block. CRC mismatch / truncation / invalid + // structure → Corruption; has_positions in the header inconsistent with the + // supplied argument → InvalidArgument. + static Status open(Slice block, IndexTier tier, bool has_positions, DictBlockReader* out); + + // Anchor binary search + local scan to locate target. Hit → *found=true and + // *out is filled; miss (including out-of-range, gap) → *found=false. + // Structural error → non-OK Status. + Status find_term(std::string_view target, bool* found, DictEntry* out) const; + + // Decodes EVERY entry in the block in lexicographic order into *out (each a + // self-contained DictEntry, owning its term). Used for ordered term + // enumeration (prefix / range scans). Resets the front-coding base at each + // anchor segment. + Status decode_all(std::vector* out) const; + + uint64_t frq_base() const { return frq_base_; } + uint64_t prx_base() const { return prx_base_; } + uint32_t n_entries() const { return n_entries_; } + +private: + // Sequentially scan from anchor anchor_idx to the end of that anchor segment, + // searching for target. 
+ Status scan_from_anchor(size_t anchor_idx, std::string_view target, bool* found, + DictEntry* out) const; + + // Find the last anchor index where first_term(anchor) <= target; return false + // if none exists. + bool locate_anchor(std::string_view target, size_t* anchor_idx) const; + + Slice block_; // [header .. crc) full block view + IndexTier tier_ = IndexTier::kT1; + bool has_positions_ = false; + uint64_t frq_base_ = 0; + uint64_t prx_base_ = 0; + uint32_t n_entries_ = 0; + + size_t entries_begin_ = 0; // absolute offset of the start of the entries section + std::vector anchor_offsets_; // byte offset within the block for each anchor entry + std::vector + anchor_terms_; // full term of each anchor entry (used for binary search) +}; + +} // namespace snii::format diff --git a/be/src/snii/format/dict_block_directory.h b/be/src/snii/format/dict_block_directory.h new file mode 100644 index 00000000000000..a1d70e9ed5aec9 --- /dev/null +++ b/be/src/snii/format/dict_block_directory.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +namespace snii::format { + +// BlockRef.flags bit definitions. +namespace block_ref_flags { +// bit0: the on-disk block bytes are zstd(uncompressed_block). When set, the +// directory also stores uncomp_len, and the reader zstd-decompresses the fetched +// [offset, offset+length) range to uncomp_len before parsing the dict block. The +// block-level crc32c (and BlockRef.checksum) cover the UNCOMPRESSED bytes, so a +// zstd block shrinks the bytes fetched from S3 while keeping the same integrity +// guarantees after decompression in RAM. +inline constexpr uint8_t kZstd = 1u << 0; +} // namespace block_ref_flags + +// Physical location and checksum info for a single DICT block. 
Aligned with SampledTermIndex by ordinal: +// SampledTermIndex[i]'s first_term corresponds to DictBlockDirectory[i] (see design spec +// "sampled dict index"). The read path issues a single range read over [offset, offset+length). +struct BlockRef { + uint64_t offset = 0; // absolute byte offset of the block within the container + uint64_t length = 0; // ON-DISK byte length of the block (compressed when kZstd) + uint32_t n_entries = 0; // number of DictEntry records within this block + uint8_t flags = 0; // block-level flags (block_ref_flags::*) + uint32_t checksum = 0; // crc32c of the block's UNCOMPRESSED content (verified after read) + uint64_t uncomp_len = 0; // uncompressed block byte length (stored only when kZstd set) +}; + +// DICT block directory: block ordinal → physical location mapping. +// +// on-disk layout (framed by SectionFramer with a unified type+len+crc32c wrapper): +// [u8 type=kDictBlockDirectory][varint64 payload_len][payload][fixed32 crc32c] +// payload = varint32 n_blocks +// then n_blocks × block_ref{ +// varint64 offset, varint64 length, varint32 n_entries, +// u8 flags, fixed32 checksum } +// plus uncomp_len for every block_ref whose flags has block_ref_flags::kZstd set (see +// BlockRef). NOTE(review): confirm uncomp_len's wire encoding and position against the encoder. +// Section-level crc detects truncation/corruption; block_ref.checksum is the per-block crc. +class DictBlockDirectoryBuilder { +public: + void add(const BlockRef& ref) { refs_.push_back(ref); } + + // Encodes as a kDictBlockDirectory framed section (with embedded crc32c) and appends to sink. + void finish(ByteSink* sink) const; + +private: + std::vector refs_; +}; + +// Reads and verifies a kDictBlockDirectory framed section; provides ordinal → BlockRef lookup. +// After parsing, all block_refs reside in the reader (entering the searcher cache along with meta). +class DictBlockDirectoryReader { +public: + // Verifies the section crc and deserializes all block_refs. + // crc mismatch / truncation / trailing bytes → kCorruption; wrong section type → kInvalidArgument.
+ static Status open(Slice section, DictBlockDirectoryReader* out); + + uint32_t n_blocks() const { return static_cast(refs_.size()); } + + // Returns the ordinal-th block_ref; ordinal >= n_blocks → kNotFound. + Status get(uint32_t ordinal, BlockRef* out) const; + +private: + std::vector refs_; +}; + +} // namespace snii::format diff --git a/be/src/snii/format/dict_entry.h b/be/src/snii/format/dict_entry.h new file mode 100644 index 00000000000000..e2b434ece3a22f --- /dev/null +++ b/be/src/snii/format/dict_entry.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/format_constants.h" +#include "snii/format/frq_pod.h" + +// DictEntry —— on-disk encoding/decoding of a dict entry. +// +// Byte layout (see docs/design/SNII-design-spec.source.md "dict entry" +// section): +// entry_len varint # byte length of entry body, allowing reader to skip +// # unknown extensions or fast-skip entries +// --- entry body begins here, covered by entry_len --- +// prefix_len varint # length of shared prefix with prev_term +// suffix_len varint # number of suffix bytes +// suffix u8[] # suffix bytes that differ from prev_term +// flags u8 # bit0 kind / bit1 enc / bit2 has_sb / +// # bit3 has_champion(=0) / bit4 offsets_ref(=0) +// df varint +// ttf_delta varint # only when tier>=T2 +// max_freq varint # only when tier>=T2 +// locator: +// pod_ref: frq_off_delta varint, frq_len varint, +// [prelude_len varint, frq_docs_len varint when enc=windowed] +// # docs-only prefix [prelude][dd-block]; windowed entries carry +// # per-window region metadata in the prelude. +// [frq_docs_len varint, slim region meta when enc=slim]: +// # frq_docs_len == dd region on-disk length; the docs-only prefix +// # [frq_off, frq_off+frq_docs_len) a docid-only reader fetches +// # without the freq region.
+// win_mode u8 (bit0 dd_zstd, bit1 freq_zstd) +// dd_uncomp_len varint, crc_dd u32 +// [freq_uncomp_len varint, crc_freq u32 when tier>=T2] +// # The single slim window is [dd_region][freq_region]; +// # dd_disk_len = frq_docs_len, freq_disk_len = frq_len - frq_docs_len. +// [prx_off_delta varint, prx_len varint when tier>=T2] +// inline: frq_len varint, frq_bytes u8[], # frq_bytes = [dd_region][freq_region] +// slim region meta (as above, sans frq_docs_len which == dd disk len, +// carried as inline_dd_disk_len varint), +// [prx_len varint, prx_bytes u8[] when tier>=T2] +// --- entry body ends --- +// +// CRC verification is performed at the DICT block level (covering block header +// + all entries + anchor offset table), no per-entry CRC to keep slim/inline +// low-frequency terms compact (spec §DICT block line 330/348). tier and +// positions capability are provided by per-index meta (not stored redundantly +// inside entries): when tier>=T2, ttf_delta / max_freq and .prx locator/bytes +// are written. +namespace snii::format { + +// Dict entry: inline or pod-ref (two states), self-described length, supports +// intra-block front coding. +struct DictEntry { + // term key (front coding relative to prev_term is applied during + // encode/decode; full term stored here). + std::string term; + + // flags. + DictEntryKind kind = DictEntryKind::kPodRef; + DictEntryEnc enc = DictEntryEnc::kSlim; + bool has_sb = false; + + // term stats. + uint32_t df = 0; + uint64_t ttf_delta = 0; // only when tier>=T2 + uint64_t max_freq = 0; // only when tier>=T2 + + // pod_ref locator. + uint64_t frq_off_delta = 0; + uint64_t frq_len = 0; + uint64_t prelude_len = 0; // only when enc=windowed + uint64_t frq_docs_len = 0; // pod_ref docs-only prefix length + uint64_t prx_off_delta = 0; // only when tier>=T2 + uint64_t prx_len = 0; // only when tier>=T2 + + // slim/inline single-window region codecs. The window is + // [dd_region][freq_region] (no self-describing header).
dd_meta drives the +// docs-only decode; freq_meta the scoring decode (only when tier>=T2). For +// slim pod_ref dd_meta.disk_len == frq_docs_len; for inline it is stored as +// inline_dd_disk_len. + FrqRegionMeta dd_meta; + FrqRegionMeta freq_meta; // only when tier>=T2 + uint64_t inline_dd_disk_len = 0; // only for inline: dd region on-disk length + + // inline payload. + std::vector frq_bytes; // = [dd_region][freq_region] + std::vector prx_bytes; // only when tier>=T2 +}; + +// Encodes an entry into sink (appending) using the layout above, with front +// coding relative to prev_term. tier determines whether optional fields are +// written. +Status encode_dict_entry(const DictEntry& entry, std::string_view prev_term, IndexTier tier, + ByteSink* sink); + +// Decodes one entry from the current position of src; term is reconstructed +// from prev_term + suffix. The entry layout above has no trailing per-entry CRC +// (integrity is covered by the enclosing DICT block's crc32c), so no CRC is +// checked here; out-of-range reads / invalid prefix_len return Corruption. +Status decode_dict_entry(ByteSource* src, std::string_view prev_term, IndexTier tier, + DictEntry* out); + +// Skips one entry using only entry_len (does not parse internal fields or +// verify CRC). +Status skip_dict_entry(ByteSource* src); + +} // namespace snii::format diff --git a/be/src/snii/format/format_constants.h b/be/src/snii/format/format_constants.h new file mode 100644 index 00000000000000..188266d02910cf --- /dev/null +++ b/be/src/snii/format/format_constants.h @@ -0,0 +1,111 @@ +#pragma once + +#include + +// SNII container and per-section on-disk contract constants. +// Once published, these values are format semantics; changes require bumping +// format_version and maintaining a compatibility policy. All multi-byte +// fixed-width fields are little-endian; variable-length integers use LEB128 +// (see snii/encoding/varint.h). +namespace snii::format { + +// ---- Container-level magic / version ---- +// "SNII" reads as 0x49494E53 in little-endian.
+inline constexpr uint32_t kContainerMagic = 0x49494E53u; // 'S''N''I''I' +inline constexpr uint32_t kTailMagic = 0x4C494154u; // 'T''A''I''L' +inline constexpr uint16_t kFormatVersion = 2; +inline constexpr uint16_t kMinReaderVersion = 2; +// Self-describing version of the meta layout (the per-index meta header AND the +// tail meta region share this single constant; a reader fails fast with +// Corruption on any mismatch). This is a from-scratch, pre-launch format: there +// is exactly ONE meta layout, so the value is 1. Bump it only AFTER launch, +// when a real on-disk change must coexist with already-written indexes -- +// pre-launch changes just fold into v1. +inline constexpr uint16_t kMetaFormatVersion = 1; + +// ---- SectionFramer section type ids (within per-index meta / tail region) +// ---- +enum class SectionType : uint8_t { + kStatsBlock = 1, + kSampledTermIndex = 2, + kDictBlockDirectory = 3, + kXFilter = 4, // reserved: legacy embedded XFilter; meta no longer emits/reads it + kSectionRefs = 5, + kPerIndexMetaHeader = 6, + kLogicalIndexDirectory = 7, + kTailMetaHeader = 8, + kFeatureBits = 9, +}; + +// ---- Logical index postings storage content configuration (fixed per logical +// index, not per-term) ---- Determines whether to write freq / positions / +// norms+stats. +enum class IndexConfig : uint8_t { + kDocsOnly = 0, // docid only: term/match filtering + kDocsPositions = 1, // docid+freq+positions: MATCH_PHRASE + kDocsPositionsScoring = 2, // + norms + stats: phrase + BM25 + kPositionsOffsets = 3, // reserved (highlight/RAG), not implemented in this release +}; + +// term stats / postings capability tiers: only tier>=kT2 writes +// ttf_delta/max_freq and .prx. +enum class IndexTier : uint8_t { + kT1 = 1, // docs-only + kT2 = 2, // docs-positions + kT3 = 3, // docs-positions-scoring +}; + +inline constexpr IndexTier tier_of(IndexConfig cfg) { + return cfg == IndexConfig::kDocsOnly ? IndexTier::kT1 + : cfg == IndexConfig::kDocsPositions ? 
IndexTier::kT2 + : IndexTier::kT3; // scoring / offsets +} +inline constexpr bool has_positions(IndexConfig cfg) { + return cfg != IndexConfig::kDocsOnly; +} +inline constexpr bool has_scoring(IndexConfig cfg) { + return cfg == IndexConfig::kDocsPositionsScoring; +} + +// ---- DictEntry flags bit definitions ---- +namespace dict_flags { +inline constexpr uint8_t kKind = 1u << 0; // 0=pod_ref / 1=inline +inline constexpr uint8_t kEnc = 1u << 1; // 0=slim / 1=windowed +inline constexpr uint8_t kHasSb = 1u << 2; // posting prelude includes sub-block directory +inline constexpr uint8_t kHasChampion = 1u << 3; // v1 always 0 +inline constexpr uint8_t kOffsetsRef = 1u << 4; // v1 always 0 +// bit5-7 reserved +} // namespace dict_flags + +enum class DictEntryKind : uint8_t { kPodRef = 0, kInline = 1 }; +enum class DictEntryEnc : uint8_t { kSlim = 0, kWindowed = 1 }; + +// ---- .prx window codec (codec byte bit0-5) ---- +// kRaw : plaintext varint payload (doc_count, per-doc pos_count + position deltas). +// kZstd : zstd-compressed plaintext payload (legacy reader still supported). +// kPfor : doc_count + per-doc pos_count (varint), then position deltas bit-packed +// as PFOR runs (kFrqBaseUnit each). No entropy coding -> far cheaper +// build CPU than zstd while staying competitive on size for ascending +// deltas. +enum class PrxCodec : uint8_t { + kRaw = 0, + kZstd = 1, + kPfor = 2 /* bit7 cont-reserved */ +}; + +// ---- Build-time parameters (not format semantics; may be tuned against real +// metrics) ---- +inline constexpr uint32_t kFrqBaseUnit = 256; // window base unit +inline constexpr uint32_t kSlimDfThreshold = 512; // df < this → slim +inline constexpr uint32_t kDefaultInlineThreshold = 256; // slim encoded bytes ≤ this → inline +// Adaptive window sizing (design #4): high-df windowed terms use larger windows +// to cut prelude rows + per-window header/crc overhead.
Windows remain a whole +// multiple of kFrqBaseUnit so .prx alignment and win_base/last_docid semantics +// are preserved. A term whose df >= kAdaptiveWindowDfThreshold splits into +// kAdaptiveWindowDocs-sized windows instead of kFrqBaseUnit-sized ones. +inline constexpr uint32_t kAdaptiveWindowDfThreshold = 8192; // df >= this -> larger windows +inline constexpr uint32_t kAdaptiveWindowDocs = 1024; // larger window size (4 * base unit) +inline constexpr uint32_t kDefaultTargetDictBlockBytes = 64 * 1024; + +} // namespace snii::format diff --git a/be/src/snii/format/frq_pod.h b/be/src/snii/format/frq_pod.h new file mode 100644 index 00000000000000..aa3b36b23a4af5 --- /dev/null +++ b/be/src/snii/format/frq_pod.h @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +// .frq region codec (FrqPod): doc-delta (dd) and freq postings, columnar + PFOR +// (see docs/design SNII "frq design" and the read-byte-optimizations +// design 1.6). +// +// PHASE D (posting-level dd/freq grouping): windows are NO LONGER +// self-describing. A windowed .frq payload is laid out as +// [prelude][dd-block][freq-block] +// where the dd-block concatenates every window's dd_region and the freq-block +// concatenates every window's freq_region. Each region is independently encoded +// (raw or zstd, chosen by size) and the per-window codec metadata (mode, +// lengths, crc, offsets) is hoisted into the frq_prelude rows -- the region +// bytes carry NO header. This makes the docs-only prefix ([prelude][dd-block]) +// ONE contiguous run a docid-only / phrase reader can fetch in a single range, +// skipping the freq-block entirely. +// +// dd_region plaintext = VInt n ++ PFOR_runs(doc_delta) # n = doc count +// dd[0] = first_docid - win_base; dd[i] = docid[i] - docid[i-1]; win_base is +// the previous window's last docid (first window = 0). 
+// freq_region plaintext = PFOR_runs(freq) # present iff +// has_freq PFOR runs are segmented at 256 docs (kFrqBaseUnit); a partial +// segment writes the remainder. Variable-length integers reuse +// snii/encoding/varint; PFOR reuses snii/encoding/pfor; crc32c covers each +// region's ON-DISK bytes. +namespace snii::format { + +// Codec metadata for ONE encoded region (dd or freq), hoisted into the prelude. +// The region's on-disk bytes are pure payload (no header); these fields drive +// the decode. crc covers the on-disk (disk_len) bytes. +struct FrqRegionMeta { + bool zstd = false; // true => disk bytes are zstd(plaintext); false => raw + uint64_t uncomp_len = 0; // plaintext byte length (== disk_len when raw) + uint64_t disk_len = 0; // on-disk byte length of this region + uint32_t crc = 0; // crc32c of the on-disk (disk_len) bytes + // When false, decode_*_region SKIPS the per-region crc check (and the writer + // omits the 4-byte crc from the dict entry). Set false for INLINE entries: + // their region bytes live inside the dict block, whose own block-level crc32c + // already covers them, so a per-region crc is fully redundant. POD-ref + // regions (slim/windowed) live in the separately-fetched .frq POD -- their + // crc stays. + bool verify_crc = true; +}; + +// Encodes a window's dd_region plaintext (VInt n ++ PFOR_runs(doc_delta)) into +// raw or zstd (per zstd_level_or_neg_for_auto), APPENDS the on-disk bytes to +// out, and fills meta (mode/uncomp_len/disk_len/crc). The region carries no +// header. docids_ascending: ascending docids in this window (single doc or +// empty allowed). win_base: previous window's last docid (first window = 0); +// requires docids[0] >= win_base. zstd_level_or_neg_for_auto: <0 auto (zstd +// when large enough, else raw); 0 force +// raw; >0 force zstd at that level. +// Non-ascending docids / first_docid < win_base / null out returns +// InvalidArgument. 
+Status build_dd_region(std::span docids_ascending, uint64_t win_base, + int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta); + +// Vector convenience overload (forwards a span view; no copy of the elements). +inline Status build_dd_region(const std::vector& docids_ascending, uint64_t win_base, + int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta) { + return build_dd_region(std::span(docids_ascending), win_base, + zstd_level_or_neg_for_auto, out, meta); +} + +// Encodes a window's freq_region plaintext (PFOR_runs(freq)) into raw or zstd, +// APPENDS the on-disk bytes to out, and fills meta. Empty freqs yields a +// zero-length region. Null out returns InvalidArgument. +Status build_freq_region(std::span freqs, int zstd_level_or_neg_for_auto, + ByteSink* out, FrqRegionMeta* meta); + +// Vector convenience overload (forwards a span view; no copy of the elements). +inline Status build_freq_region(const std::vector& freqs, int zstd_level_or_neg_for_auto, + ByteSink* out, FrqRegionMeta* meta) { + return build_freq_region(std::span(freqs), zstd_level_or_neg_for_auto, out, + meta); +} + +// Decodes a dd_region from its on-disk slice (exactly disk_len bytes) + meta + +// win_base, reconstructing ascending docids. Verifies meta.crc against the +// slice. crc mismatch / wrong slice length / truncation / decompression / +// oversized count all return a non-OK Status. The freq region is irrelevant +// here (docs-only path). +Status decode_dd_region(Slice dd_disk, const FrqRegionMeta& meta, uint64_t win_base, + std::vector* docids); + +// Decodes a freq_region from its on-disk slice (exactly disk_len bytes) + meta, +// producing doc_count freqs. Verifies meta.crc. doc_count == 0 yields empty +// freqs (and requires a zero-length region). crc mismatch / wrong slice length +// / etc. return a non-OK Status. 
+Status decode_freq_region(Slice freq_disk, const FrqRegionMeta& meta, size_t doc_count, + std::vector* freqs); + +} // namespace snii::format diff --git a/be/src/snii/format/frq_prelude.h b/be/src/snii/format/frq_prelude.h new file mode 100644 index 00000000000000..848e2bf0e2926b --- /dev/null +++ b/be/src/snii/format/frq_prelude.h @@ -0,0 +1,178 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" + +// FrqPrelude: a TWO-LEVEL (super-block -> window) skippable directory that +// precedes a windowed .frq posting whose payload is laid out (PHASE D, design +// 1.6) with dd and freq regions GROUPED at posting level: +// windowed .frq payload = [prelude][dd-block][freq-block] +// dd-block = dd_region_0 ++ dd_region_1 ++ ... ++ dd_region_{N-1} +// freq-block = freq_region_0 ++ ... ++ freq_region_{N-1} (iff has_freq) +// Windows are NOT self-describing: each window's full codec metadata (region +// offsets, on-disk/uncompressed lengths, modes, crcs) lives in the prelude rows. +// The docs-only prefix [prelude][dd-block] is therefore ONE contiguous run a +// docid-only / phrase reader fetches in a single range, skipping the freq-block. +// +// DictEntry records prelude_len, frq_len (whole payload) and frq_docs_len +// (= prelude_len + dd_block_len) so a reader can range-fetch the prelude first, +// then fetch either the contiguous dd-block (docs-only) or both blocks (scoring). 
+// +// On-disk layout (strict; all multi-byte fixed fields little-endian, VInt = +// LEB128 via snii/encoding): +// header: +// u8 flags # bit0 has_freq, bit1 has_prx +// VInt N # number of .frq windows +// VInt G # windows per super-block (group_size; >=1) +// VInt n_super # = ceil(N / G); 0 when N==0 +// VInt sbdir_len # byte length of the super_block_dir region +// u32 crc32c # covers header + super_block_dir (NOT the window blocks) +// super_block_dir[n_super]: # small, resident: one row per super-block +// VInt sb_last_docid_delta # cumulative across super-blocks => absolute last +// # docid of the super-block's last window +// VInt sb_block_off # byte offset of this super-block's window block, +// # measured from the start of the window_dir region +// VInt sb_block_len # byte length of this super-block's window block +// window_dir: n_super self-contained blocks, each holding <=G window rows. +// per window row: +// VInt last_docid_delta # cumulative WITHIN the block => absolute last docid +// # (previous window's absolute last docid = win_base; +// # first window of first block: win_base = 0) +// VInt doc_count # number of docs in the window (frq_pod needs it) +// u8 win_mode # bit0 dd_zstd, bit1 freq_zstd +// VInt dd_off # dd_region byte offset within the dd-block +// VInt dd_disk_len # dd_region on-disk byte length +// VInt dd_uncomp_len # dd_region plaintext byte length +// u32 crc_dd # crc32c of the dd_region on-disk bytes +// VInt freq_off # freq_region offset within the freq-block (has_freq) +// VInt freq_disk_len # freq_region on-disk byte length (has_freq) +// VInt freq_uncomp_len # freq_region plaintext byte length (has_freq) +// u32 crc_freq # crc32c of the freq_region on-disk bytes (has_freq) +// VInt prx_off # .prx payload byte offset (present iff has_prx) +// VInt prx_len # .prx payload byte length (present iff has_prx) +// VInt max_freq # window max term frequency (WAND block-max) +// u8 max_norm # window score-max norm (WAND); 0 acceptable 
+//
+// Reconstructing win_base / absolute last_docid (READER CONTRACT) is unchanged:
+// the writer chains absolute last docids across windows; each row stores the delta
+// of its absolute last docid from the previous window, and sb_last_docid seeds
+// each block, so super-block binary search then in-block window binary search
+// locate the window covering any docid without decoding the .frq blocks.
+//
+// The trailing crc32c covers only header + super_block_dir; every region carries
+// its own crc (crc_dd / crc_freq) in the row.
+namespace snii::format {
+
+namespace frq_prelude_flags {
+inline constexpr uint8_t kHasFreq = 1u << 0;
+inline constexpr uint8_t kHasPrx = 1u << 1;
+} // namespace frq_prelude_flags
+
+// Per-window codec mode bits (win_mode byte).
+namespace frq_win_mode {
+inline constexpr uint8_t kDdZstd = 1u << 0;
+inline constexpr uint8_t kFreqZstd = 1u << 1;
+inline constexpr uint8_t kKnownBits = kDdZstd | kFreqZstd;
+} // namespace frq_win_mode
+
+// Absolute, decoded metadata for one window (as the reader exposes it). The dd /
+// freq region locators are offsets WITHIN the dd-block / freq-block respectively
+// (both blocks follow the prelude). The reader derives the dd-block length from
+// the last window's dd_off + dd_disk_len.
+struct WindowMeta {
+    uint32_t last_docid = 0; // absolute last docid in the window
+    uint64_t win_base = 0;   // absolute last docid of the previous window (0 for w==0)
+    uint32_t doc_count = 0;
+
+    // dd_region locator (within the dd-block).
+    bool dd_zstd = false;
+    uint64_t dd_off = 0;
+    uint64_t dd_disk_len = 0;
+    uint64_t dd_uncomp_len = 0;
+    uint32_t crc_dd = 0;
+
+    // freq_region locator (within the freq-block); valid only when has_freq.
+    bool freq_zstd = false;
+    uint64_t freq_off = 0;
+    uint64_t freq_disk_len = 0;
+    uint64_t freq_uncomp_len = 0;
+    uint32_t crc_freq = 0;
+
+    uint64_t prx_off = 0; // valid only when has_prx
+    uint64_t prx_len = 0; // valid only when has_prx
+    uint32_t max_freq = 0;
+    uint8_t max_norm = 0;
+
+    // In-memory only (NOT serialized in the prelude row). When false, the dd/freq
+    // region decode skips crc verification -- used when these region bytes are
+    // covered by an enclosing crc (e.g. an INLINE entry inside its dict block).
+    // Windowed/slim POD-ref rows leave this true (their regions carry a crc).
+    bool verify_crc = true;
+};
+
+// Builder input: one fully-computed WindowMeta per window, in term order, plus the
+// super-block grouping factor. The writer fills last_docid (absolute), doc_count,
+// the region locators/crcs, prx locator, max_freq and max_norm; win_base is derived
+// during build (so callers may leave it 0). group_size must be >= 1.
+struct FrqPreludeColumns {
+    bool has_freq = true;
+    bool has_prx = false;
+    uint32_t group_size = 64; // windows per super-block (G)
+    std::vector<WindowMeta> windows;
+};
+
+// Builds the prelude bytes and appends them to out.
+// Returns InvalidArgument when out is null, group_size is 0, or the windows are
+// not in non-decreasing last_docid order (a window's absolute last docid must be
+// >= the previous window's).
+Status build_frq_prelude(const FrqPreludeColumns& cols, ByteSink* out);
+
+// Reads and verifies a prelude buffer, exposing two-level skip access. The reader
+// parses the header + super_block_dir on open (verifying the trailing crc) and
+// eagerly decodes every window block into owned WindowMeta rows (the prelude is
+// small relative to the postings). It does not retain the input.
+class FrqPreludeReader {
+public:
+    // Parses + verifies the prelude. crc mismatch / truncation / inconsistent
+    // offsets-or-lengths / oversized counts => kCorruption.
+    static Status open(Slice prelude, FrqPreludeReader* out);
+
+    uint32_t n_windows() const { return static_cast<uint32_t>(windows_.size()); }
+    uint32_t n_super_blocks() const { return n_super_; }
+    bool has_freq() const { return has_freq_; }
+    bool has_prx() const { return has_prx_; }
+
+    // Total on-disk byte length of the dd-block (== sum of dd_disk_len; the docs-only
+    // prefix after the prelude). 0 when there are no windows.
+    uint64_t dd_block_len() const { return dd_block_len_; }
+    // Total on-disk byte length of the freq-block (== sum of freq_disk_len). 0 when
+    // !has_freq or no windows.
+    uint64_t freq_block_len() const { return freq_block_len_; }
+
+    // Returns the absolute WindowMeta for window w. Out-of-range => InvalidArgument.
+    Status window(uint32_t w, WindowMeta* out) const;
+
+    // Locates the window covering docid via super-block binary search then window
+    // binary search. *found=false (with OK) when docid is past the term's last
+    // docid; otherwise *w is the index of the covering window (the first window
+    // whose absolute last_docid >= docid).
+    Status locate_window(uint32_t docid, bool* found, uint32_t* w) const;
+
+private:
+    bool has_freq_ = false;
+    bool has_prx_ = false;
+    uint32_t group_size_ = 1;
+    uint32_t n_super_ = 0;
+    uint64_t dd_block_len_ = 0;
+    uint64_t freq_block_len_ = 0;
+    // Absolute last docid at each super-block boundary (size n_super_).
+    std::vector<uint32_t> sb_last_docid_;
+    // All windows decoded with absolute fields, in term order (size N).
+    std::vector<WindowMeta> windows_;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/logical_index_directory.h b/be/src/snii/format/logical_index_directory.h
new file mode 100644
index 00000000000000..3cfddbd7227bb8
--- /dev/null
+++ b/be/src/snii/format/logical_index_directory.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// Container-level directory entry: maps a logical index identity (index_id, index_suffix)
+// to the physical location of its per-index meta block. Aligned with Doris key system
+// (see design spec "footer meta region" logical index directory). The reader issues a
+// single range read over [meta_off, meta_off + meta_len) to load that per-index meta.
+struct LogicalIndexRef {
+    uint64_t index_id = 0;    // logical index id (matches Doris InvertedIndexDescriptor key)
+    std::string index_suffix; // UTF-8 sub-index suffix; may be empty for the primary index
+    uint64_t meta_off = 0;    // absolute byte offset of the per-index meta block in the container
+    uint64_t meta_len = 0;    // byte length of the per-index meta block
+};
+
+// Logical index directory: (index_id, index_suffix) -> per-index meta block reference.
+//
+// on-disk layout (framed by SectionFramer with a unified type+len+crc32c wrapper):
+//   [u8 type=kLogicalIndexDirectory][varint64 payload_len][payload][fixed32 crc32c]
+//   payload = varint32 n_entries
+//     then n_entries x {
+//       varint64 index_id,
+//       varint32 suffix_len, suffix_bytes,
+//       varint64 per_index_meta_off,
+//       varint64 per_index_meta_len }
+// The section-level crc covers the whole directory, so no per-entry crc is stored
+// (the spec lists a per-entry crc32c as optional; it is folded into the framer crc here).
+class LogicalIndexDirectoryBuilder {
+public:
+    void add(const LogicalIndexRef& ref) { refs_.push_back(ref); }
+
+    // Encodes as a kLogicalIndexDirectory framed section (with embedded crc32c) and appends to sink.
+    void finish(ByteSink* sink) const;
+
+private:
+    std::vector<LogicalIndexRef> refs_;
+};
+
+// Reads and verifies a kLogicalIndexDirectory framed section; provides ordinal access and
+// (index_id, suffix) lookup. After parsing, all entries reside in the reader (entering the
+// searcher cache along with the rest of the tail meta region).
+class LogicalIndexDirectoryReader {
+public:
+    // Verifies the section crc and deserializes all entries.
+    // crc mismatch / truncation / trailing bytes / oversized counts -> kCorruption;
+    // wrong section type -> kInvalidArgument; null out -> kInvalidArgument.
+    static Status open(Slice framed, LogicalIndexDirectoryReader* out);
+
+    uint32_t size() const { return static_cast<uint32_t>(refs_.size()); }
+
+    // Returns the i-th entry in encounter order; i >= size -> kNotFound.
+    Status get(uint32_t i, LogicalIndexRef* out) const;
+
+    // Looks up the entry for (index_id, suffix). On match, *found=true and *out is populated;
+    // when absent, *found=false and *out is left untouched. Returns kInvalidArgument on null
+    // output pointers. The pair (index_id, suffix) is the unique key.
+    Status find(uint64_t index_id, std::string_view suffix, bool* found,
+                LogicalIndexRef* out) const;
+
+private:
+    std::vector<LogicalIndexRef> refs_;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/norms_pod.h b/be/src/snii/format/norms_pod.h
new file mode 100644
index 00000000000000..6580b1df2ffcc1
--- /dev/null
+++ b/be/src/snii/format/norms_pod.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// norms POD: per logical index / field stores 1-byte encoded doc length per doc,
+// used by BM25 length normalization (SniiStatsProvider::encoded_norm) for per-docid lookup.
+//
+// On-disk layout (the whole section is framed by SectionFramer, which adds a type+len+crc32c envelope):
+//   framer payload = [varint64 doc_count][bytes encoded_norm[doc_count]]
+//   framer envelope = [u8 type][varint64 payload_len][payload][fixed32 crc32c]
+// The encoding of encoded_norm (length -> 1B) is out of scope for this module; here we only handle raw byte storage and retrieval.
+class NormsPodWriter {
+public:
+    // Appends the encoded_norm for the next docid (docid is implicit, assigned in append order starting from 0).
+    void add(uint8_t encoded_norm) { norms_.push_back(encoded_norm); }
+
+    // Number of docs accumulated so far (i.e., the next docid to be assigned).
+    size_t count() const { return norms_.size(); }
+
+    // Writes [doc_count][bytes] framed by SectionFramer into sink (appends; does not clear sink).
+    void finish(ByteSink* sink) const;
+
+private:
+    std::vector<uint8_t> norms_;
+};
+
+// Read-only view: on open, verifies the framer CRC and checks that doc_count/payload length are consistent,
+// afterwards encoded_norm(docid) is O(1) direct indexing (zero-copy, borrows the underlying buffer).
+class NormsPodReader {
+public:
+    NormsPodReader() = default;
+
+    // Parses the entire section (including the framer envelope). Returns Corruption on CRC mismatch, truncation, or length inconsistency.
+    // On success, *out borrows the memory pointed to by framed; the caller must ensure its lifetime.
+    static Status open(Slice framed, NormsPodReader* out);
+
+    uint32_t doc_count() const { return doc_count_; }
+
+    // Precondition (hard contract): docid < doc_count(). Semantics match std::vector::operator[]:
+    // the caller is responsible for guaranteeing this (docid comes from trusted postings decoded internally by SNII). Asserts in debug builds;
+    // no check in Release (NDEBUG). Use try_encoded_norm when the docid is untrusted and needs validation.
+    uint8_t encoded_norm(uint32_t docid) const {
+        assert(docid < doc_count_);
+        return norms_[docid];
+    }
+
+    // Checked access: returns InvalidArgument if docid is out of range; never reads out-of-range memory.
+    Status try_encoded_norm(uint32_t docid, uint8_t* out) const {
+        if (docid >= doc_count_) return Status::InvalidArgument("norms: docid out of range");
+        *out = norms_[docid];
+        return Status::OK();
+    }
+
+private:
+    const uint8_t* norms_ = nullptr;
+    uint32_t doc_count_ = 0;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/null_bitmap.h b/be/src/snii/format/null_bitmap.h
new file mode 100644
index 00000000000000..efe5880a101f55
--- /dev/null
+++ b/be/src/snii/format/null_bitmap.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+// Forward-declare the CRoaring C++ bitmap so this header stays free of the
+// (large) roaring include; the concrete type is only needed in the .cpp.
+namespace roaring {
+class Roaring;
+} // namespace roaring
+
+namespace snii::format {
+
+// SectionFramer type byte for the null-bitmap POD. There is no dedicated
+// SectionType enum value yet, so we use a documented literal (0x20) outside the
+// currently allocated enum range (1..9) to avoid colliding with existing types.
+inline constexpr uint8_t kNullBitmapSectionType = 0x20;
+
+// NullBitmap POD: per logical index, a Roaring bitmap of null docids (docs whose
+// value is NULL / not indexed). It decouples per-doc NULL information from the
+// per-term dictionary / postings so NULL handling can pull only this side POD.
+//
+// On-disk layout (the whole section is framed by SectionFramer, which adds a
+// type + varint64 len + payload + fixed32 crc32c envelope):
+//   framer payload = [varint64 doc_count][varint64 roaring_size][roaring_bytes]
+//   roaring_bytes is the portable CRoaring serialization (Roaring::write).
+class NullBitmapWriter {
+public:
+    NullBitmapWriter();
+    ~NullBitmapWriter();
+
+    NullBitmapWriter(const NullBitmapWriter&) = delete;
+    NullBitmapWriter& operator=(const NullBitmapWriter&) = delete;
+
+    // Marks docid as NULL (adding the same docid twice is idempotent).
+    void add_null(uint32_t docid);
+
+    // Number of distinct null docids accumulated so far.
+    uint32_t null_count() const;
+
+    // Serializes [doc_count][roaring_size][roaring_bytes] framed by SectionFramer
+    // and appends it to sink (does not clear sink). doc_count is the total number
+    // of docs in the logical index (recorded so the reader can round-trip it).
+    void finish(uint32_t doc_count, ByteSink* sink) const;
+
+private:
+    std::unique_ptr<roaring::Roaring> bitmap_;
+};
+
+// Read-only view: on open, SectionFramer verifies the CRC and truncation; this
+// class then guards roaring_size against the remaining payload bytes before
+// deserializing the Roaring bitmap (anti-DoS), so a corrupt size cannot trigger
+// an oversized allocation/read. is_null() is then an O(1) membership test.
+class NullBitmapReader {
+public:
+    NullBitmapReader();
+    ~NullBitmapReader();
+
+    NullBitmapReader(const NullBitmapReader&) = delete;
+    NullBitmapReader& operator=(const NullBitmapReader&) = delete;
+    NullBitmapReader(NullBitmapReader&&) noexcept;
+    NullBitmapReader& operator=(NullBitmapReader&&) noexcept;
+
+    // Parses the entire section (framer envelope + payload). Returns Corruption on
+    // CRC mismatch, truncation, doc_count overflow, or an oversized roaring_size.
+    static Status open(Slice framed, NullBitmapReader* out);
+
+    // True iff docid was marked NULL. docids outside the null set (including those
+    // >= doc_count) return false.
+    bool is_null(uint32_t docid) const;
+
+    // Number of distinct null docids in the bitmap.
+    uint32_t null_count() const;
+
+    // Total doc count of the logical index, as recorded by the writer.
+    uint32_t doc_count() const { return doc_count_; }
+
+private:
+    std::unique_ptr<roaring::Roaring> bitmap_;
+    uint32_t doc_count_ = 0;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/per_index_meta.h b/be/src/snii/format/per_index_meta.h
new file mode 100644
index 00000000000000..1a89a8710fbd7a
--- /dev/null
+++ b/be/src/snii/format/per_index_meta.h
@@ -0,0 +1,150 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/format_constants.h"
+#include "snii/format/stats_block.h"
+
+// PerIndexMeta -- the per-logical-index metadata block that enters the searcher
+// cache. It COMPOSES already-built sub-sections (StatsBlock, SampledTermIndex,
+// DICT block directory, optional XFilter) plus the physical SectionRefs into a
+// single contiguous block. See design spec "Per-index meta block".
+//
+// On-disk layout:
+//   PerIndexMetaHeader (fixed prefix, self-checksummed):
+//     u16      meta_format_version (== kMetaFormatVersion), little-endian
+//     varint64 index_id
+//     varint32 suffix_len
+//     u8[]     suffix_bytes
+//     u32      flags (fixed32, little-endian)   # feature bits, e.g. kHasBsbf
+//     u32      crc32c (fixed32) over all preceding header bytes
+//   then framed sub-sections (each via SectionFramer, type+len+payload+crc32c):
+//     StatsBlock            (kStatsBlock, built here)
+//     SampledTermIndex      (kSampledTermIndex, embedded already-framed bytes)
+//     DICT block directory  (kDictBlockDirectory,embedded already-framed bytes)
+//     SectionRefs           (kSectionRefs, built here; carries the bsbf ref)
+//     (+ any extra raw framed sections appended by add_raw_section)
+//
+// Design choice: the SampledTermIndex / DICT block directory / XFilter
+// sub-sections are EMBEDDED as their producers' already-framed output (the raw
+// SectionFramer frame), not re-framed. This lets the reader hand the exact frame
+// Slice straight back to each sub-module's open() (which expects a full frame),
+// and reuses the framer instead of re-implementing sub-section parsing.
+namespace snii::format {
+
+// Physical reference to a contiguous region within the container. (0, 0) means
+// the region is absent (e.g. no norms POD for a non-scoring index). A present-
+// but-empty region (e.g. an all-INLINE index's posting_region) is (off, 0).
+struct RegionRef {
+    uint64_t offset = 0;
+    uint64_t length = 0;
+};
+
+// Physical references to the data sections / side PODs of one logical index.
+// Each RegionRef is encoded as varint64 offset followed by varint64 length, in
+// the field order below.
+//
+// posting_region is the single interleaved [prx][frq] posting region (it replaced
+// the former two separate frq_pod + prx_pod refs). Each pod_ref term writes its
+// prx span first then its frq span, contiguously, in term order; both
+// frq_off_delta and prx_off_delta now index into this one region. NO positions
+// capability is inferred from posting_region.length -- it is non-zero for any
+// docs-only index with a pod_ref term, and zero for an all-INLINE positional
+// index; capability lives in the header kHasPositions flag instead.
+struct SectionRefs {
+    RegionRef dict_region;
+    RegionRef posting_region; // interleaved [prx][frq] per term; was frq_pod + prx_pod
+    RegionRef norms;
+    RegionRef null_bitmap;
+    // Block-split bloom XFilter section ([28B header][bitset]); {0,0} when absent.
+    // A PHYSICAL section (not embedded in the resident meta) so a single 32-byte block
+    // can be probed on demand without loading the whole filter at open.
+    RegionRef bsbf;
+};
+
+// Builds a per-index meta block by composing already-built sub-sections.
+class PerIndexMetaBuilder {
+public:
+    // Header flags / feature bits.
+    static constexpr uint32_t kHasPositions = 1u << 0; // index is positions-capable (tier>=T2)
+    static constexpr uint32_t kHasBsbf = 1u << 1;      // block-split bloom XFilter (section ref)
+
+    PerIndexMetaBuilder(uint64_t index_id, std::string index_suffix, uint32_t flags);
+
+    void set_stats(const StatsBlock& stats);
+
+    // Raw output of SampledTermIndexBuilder::finish (a full kSampledTermIndex frame).
+    void set_sampled_term_index(Slice framed_bytes);
+
+    // Raw output of DictBlockDirectoryBuilder::finish (a full kDictBlockDirectory frame).
+    void set_dict_block_directory(Slice framed_bytes);
+
+    void set_section_refs(const SectionRefs& refs);
+
+    // Appends an arbitrary already-framed section verbatim. Used for forward-compat
+    // optional sections; the reader skips unrecognized types.
+    void add_raw_section(Slice framed_bytes);
+
+    // Serializes the header and all sub-sections into sink.
+    // sink == nullptr -> kInvalidArgument.
+    Status finish(ByteSink* sink) const;
+
+private:
+    uint64_t index_id_;
+    std::string index_suffix_;
+    uint32_t flags_;
+    StatsBlock stats_;
+    std::vector<uint8_t> sampled_term_index_;
+    std::vector<uint8_t> dict_block_directory_;
+    SectionRefs section_refs_;
+    std::vector<std::vector<uint8_t>> extra_sections_;
+};
+
+// Parses a per-index meta block: verifies the header crc, then walks the framed
+// sub-sections (each crc-verified by the framer), capturing the full frame Slice
+// of each known sub-section so callers can re-open it with the sub-module reader.
+// Unrecognized optional section types are skipped.
+class PerIndexMetaReader {
+public:
+    PerIndexMetaReader() = default;
+
+    // block == the full per-index meta block bytes; out must be non-null.
+    // Header crc mismatch / truncation / a sub-section crc mismatch -> kCorruption;
+    // missing a required sub-section -> kCorruption; out == nullptr -> kInvalidArgument.
+    static Status open(Slice block, PerIndexMetaReader* out);
+
+    uint64_t index_id() const { return index_id_; }
+    const std::string& index_suffix() const { return index_suffix_; }
+    uint32_t flags() const { return flags_; }
+
+    const StatsBlock& stats() const { return stats_; }
+    const SectionRefs& section_refs() const { return section_refs_; }
+
+    // Full kSampledTermIndex frame Slice, ready for SampledTermIndexReader::open.
+    Slice sampled_term_index_bytes() const { return sampled_term_index_; }
+    // Full kDictBlockDirectory frame Slice, ready for DictBlockDirectoryReader::open.
+    Slice dict_block_directory_bytes() const { return dict_block_directory_; }
+
+    // Block-split bloom XFilter: present iff a non-empty bsbf section ref exists.
+    bool has_bsbf() const { return section_refs_.bsbf.length > 0; }
+
+    // Positions capability, read from the persisted header flag (NOT from any region
+    // length). True iff the index was built as docs-positions(+scoring) (tier>=T2).
+    bool has_positions() const { return (flags_ & PerIndexMetaBuilder::kHasPositions) != 0; }
+
+private:
+    uint64_t index_id_ = 0;
+    std::string index_suffix_;
+    uint32_t flags_ = 0;
+    StatsBlock stats_;
+    SectionRefs section_refs_;
+    Slice sampled_term_index_;
+    Slice dict_block_directory_;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/prx_pod.h b/be/src/snii/format/prx_pod.h
new file mode 100644
index 00000000000000..50c8536acb4cfe
--- /dev/null
+++ b/be/src/snii/format/prx_pod.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+
+// .prx position window (PrxPod): stores term position information for several
+// docs within one window.
+//
+// Single-window on-disk byte layout (see docs/design SNII "prx design"):
+//   u8    codec       # PrxCodec: 0=raw / 1=zstd / 2=pfor (bit7 cont-reserved)
+//   VInt  uncomp_len  # payload length (raw/pfor: on-disk payload bytes; zstd: plaintext)
+//   VInt  comp_len    # present only when codec==zstd
+//   u32   crc32c      # covers header (codec..comp_len) + payload bytes
+//   payload           # raw: varint plaintext; zstd: compressed; pfor: bit-packed
+//
+// raw/zstd plaintext payload (self-describing per-doc boundaries):
+//   VInt doc_count
+//   per doc: VInt pos_count, followed by pos_count position deltas (VInt)
+//   positions within a doc are ascending, stored as deltas (first absolute).
+//
+// pfor payload (default build codec; no entropy coding):
+//   VInt doc_count
+//   VInt total_pos              # sum of all pos_counts
+//   per doc: VInt pos_count
+//   PFOR_runs(position_deltas)  # total_pos deltas, kFrqBaseUnit per run,
+//                               # flat doc order (first per doc
+//                               # absolute)
+//
+// Multi-byte fixed-length fields are little-endian; variable-length integers
+// reuse snii/encoding/varint. crc32c checksum at window tail detects
+// corruption.
+namespace snii::format {
+
+// Build a .prx window and append it to sink.
+// per_doc_positions[d] is the position list for the d-th doc within this
+// window; must be ascending (duplicates allowed).
+// zstd_level_or_negative_for_auto:
+//   <0 → auto: use ZSTD (default level) when payload is large enough, otherwise raw.
+//   0  → force raw (no compression).
+//   >0 → force ZSTD with the given level.
+// Non-ascending positions within a doc return InvalidArgument.
+Status build_prx_window(std::span<const std::vector<uint32_t>> per_doc_positions,
+                        int zstd_level_or_negative_for_auto, ByteSink* sink);
+
+// Vector convenience overload (forwards a span view over the window's per-doc
+// lists; the writer can pass a slice of its flat positions WITHOUT deep-copying
+// the inner vectors into a fresh std::vector<std::vector<uint32_t>> per
+// window).
+inline Status build_prx_window(const std::vector<std::vector<uint32_t>>& per_doc_positions,
+                               int zstd_level_or_negative_for_auto, ByteSink* sink) {
+    return build_prx_window(std::span<const std::vector<uint32_t>>(per_doc_positions),
+                            zstd_level_or_negative_for_auto, sink);
+}
+
+// FLAT-positions builder: byte-identical output to build_prx_window above, but
+// reads the window's positions from a single flat span partitioned per-doc by
+// `freqs` (doc d owns the next freqs[d] entries; freqs.size() == doc count and
+// sum(freqs) == positions_flat.size()). Lets the writer pass a subspan of the
+// term's flat positions/freqs with NO vector-of-vectors materialization.
+Status build_prx_window_flat(std::span<const uint32_t> positions_flat,
+                             std::span<const uint32_t> freqs, int zstd_level_or_negative_for_auto,
+                             ByteSink* sink);
+
+// Read and verify a .prx window from source, reconstructing the per-doc
+// position list. CRC mismatch / invalid codec / truncation / decompression
+// failure all return a non-OK Status.
+Status read_prx_window(ByteSource* source, std::vector<std::vector<uint32_t>>* per_doc_positions);
+
+// CSR variant of read_prx_window: decodes ALL docs' positions into one flat
+// buffer `pos_flat` with per-doc offsets `pos_off` (size doc_count+1,
+// pos_off[0]==0), so doc d's positions are pos_flat[pos_off[d] ..
+// pos_off[d+1]). Avoids the per-doc std::vector allocation of read_prx_window
+// -- both output vectors are flat uint32 buffers whose capacity a caller can
+// retain (clear()) across windows/queries.
+Status read_prx_window_csr(ByteSource* source, std::vector<uint32_t>* pos_flat,
+                           std::vector<uint32_t>* pos_off);
+
+// Selective CSR variant: decodes positions only for the requested local doc
+// ordinals within this PRX window. `doc_ordinals` must be strictly ascending.
+// The output uses the same CSR shape, but has doc_ordinals.size()+1 offsets.
+Status read_prx_window_csr_selective(ByteSource* source, std::span<const uint32_t> doc_ordinals,
+                                     std::vector<uint32_t>* pos_flat,
+                                     std::vector<uint32_t>* pos_off);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/sampled_term_index.h b/be/src/snii/format/sampled_term_index.h
new file mode 100644
index 00000000000000..b4348dd74eccd9
--- /dev/null
+++ b/be/src/snii/format/sampled_term_index.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/format_constants.h"
+
+// SampledTermIndex -- resident metadata for locating a query term to a candidate DICT block.
+//
+// Sampling granularity is per DICT block (not a fixed term count): each time the writer produces a DICT block,
+// it writes the block's first_term into this index. Size grows proportionally to block count. At read time it is
+// loaded into the searcher cache together with SniiLogicalIndexReader. See design spec "Sampled Term Index".
+//
+// On-disk layout (framed by SectionFramer, uniform type+len+crc32c):
+//   [u8 type=kSampledTermIndex][varint64 payload_len][payload][fixed32 crc32c]
+//   payload =
+//     n_blocks     varint32
+//     min_term     len(varint32) + bytes   # == sample_terms[0], omitted when n_blocks=0
+//     max_term     len(varint32) + bytes   # == sample_terms[n-1], omitted when n_blocks=0
+//     sample_terms[n_blocks]:              # first_term of each block, in ascending order
+//       prefix_len varint32                # shared prefix length with the previous sample_term
+//       suffix_len varint32
+//       suffix     u8[suffix_len]
+//
+// Term bytes are compared as unsigned byte order (UTF-8 friendly, binary-safe). Front coding reuses
+// the same prefix/suffix primitives as DictEntry; do not reimplement.
+namespace snii::format {
+
+// Builder: appends the first_term of each DICT block in block ordinal order (must be strictly ascending),
+// and serializes the entire set into a single kSampledTermIndex framed section on finish.
+class SampledTermIndexBuilder {
+public:
+    // Appends the first_term of the next DICT block. Call order determines block ordinal order.
+    void add_block_first_term(std::string_view first_term);
+
+    // Serializes and appends to sink. An empty collection (no blocks) is valid; n_blocks=0.
+    void finish(ByteSink* sink);
+
+private:
+    std::vector<std::string> first_terms_;
+};
+
+// Reader: verifies the checksum and materializes all sample_terms on open; subsequent locate calls are pure in-memory binary search.
+class SampledTermIndexReader {
+public:
+    SampledTermIndexReader() = default;
+
+    // Parses a kSampledTermIndex framed section.
+    // CRC mismatch / truncation / field overrun → kCorruption; type != kSampledTermIndex → kInvalidArgument.
+    static Status open(Slice section, SampledTermIndexReader* out);
+
+    // Binary-search locate: returns the block ordinal of the last sample_term <= target.
+    // target < min_term or target > max_term (including empty index) → *maybe_present=false (out of range, term is definitely absent).
+    // Otherwise *maybe_present=true and *block_ordinal is the ordinal of the matching block.
+    Status locate(std::string_view target, bool* maybe_present, uint32_t* block_ordinal) const;
+
+    uint32_t n_blocks() const { return static_cast<uint32_t>(sample_terms_.size()); }
+
+private:
+    std::vector<std::string> sample_terms_;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/stats_block.h b/be/src/snii/format/stats_block.h
new file mode 100644
index 00000000000000..20ef0c6613f85d
--- /dev/null
+++ b/be/src/snii/format/stats_block.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cstdint>
+
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/section_framer.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+// Statistics block within the per-index meta block. Carries only the counting stats
+// needed for query planning and BM25; section location info is stored separately in SectionRefs (see design spec "Per-index meta block").
+//
+// On-disk layout (framed by SectionFramer with unified type+len+crc32c):
+//   [u8 type=kStatsBlock][varint64 payload_len][payload][fixed32 crc32c]
+//   payload = varint64{ doc_count, indexed_doc_count, term_count,
+//                       sum_total_term_freq, null_count }
+// For field semantics see design spec "Scoring statistics design".
+struct StatsBlock {
+    uint64_t doc_count = 0;           // total doc count at segment level (including unindexed/NULL)
+    uint64_t indexed_doc_count = 0;   // number of docs actually indexed (denominator for avgdl)
+    uint64_t term_count = 0;          // number of unique terms in this index
+    uint64_t sum_total_term_freq = 0; // total token count across all indexed docs
+    uint64_t null_count = 0;          // number of NULL / not-indexed docs
+};
+
+// Encodes into a kStatsBlock framed section (with built-in crc32c checksum) and appends to sink.
+void encode_stats_block(const StatsBlock& sb, ByteSink* sink);
+
+// Reads and verifies a kStatsBlock framed section from src, populates out.
+// CRC mismatch / truncation → kCorruption; type is not kStatsBlock → kInvalidArgument.
+Status decode_stats_block(ByteSource* src, StatsBlock* out);
+
+} // namespace snii::format
diff --git a/be/src/snii/format/tail_meta_region.h b/be/src/snii/format/tail_meta_region.h
new file mode 100644
index 00000000000000..21fd737e55cf30
--- /dev/null
+++ b/be/src/snii/format/tail_meta_region.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+#include "snii/format/logical_index_directory.h"
+
+namespace snii::format {
+
+// TailMetaRegion: the container's tail metadata region, located via the fixed
+// tail pointer and read in one range. It bundles the per-logical-index meta
+// blocks and the logical index directory so a reader can, after a single read,
+// map (index_id, index_suffix) -> per-index meta block. See spec "footer meta
+// region".
+//
+// On-disk layout (offsets are relative to the region start; the region is read
+// whole into memory, so internal refs need not be file-absolute):
+//   TailMetaHeader:
+//     u32 meta_format_version  (== kMetaFormatVersion)
+//     u32 flags
+//     u64 meta_region_len      (== total region byte length)
+//     u32 n_logical_indexes
+//     u64 directory_offset     (offset of the logical index directory in-region)
+//     u64 directory_length
+//     u32 header_crc32c        (covers the header fields above)
+//   [per-index meta block #0][per-index meta block #1]...  (opaque payloads)
+//   [logical index directory]                              (framed via LogicalIndexDirectory)
+//   u32 meta_region_checksum                               (crc32c over everything before it)
+class TailMetaRegionBuilder {
+public:
+    // Adds a per-index meta block (already serialized by PerIndexMetaBuilder) keyed
+    // by (index_id, index_suffix). Bytes are copied.
+    void add_index(uint64_t index_id, std::string index_suffix, Slice per_index_meta_bytes);
+
+    // Serializes the whole region and appends it to sink.
+    void finish(ByteSink* sink) const;
+
+private:
+    struct Entry {
+        uint64_t index_id;
+        std::string suffix;
+        std::vector<uint8_t> bytes;
+    };
+    std::vector<Entry> entries_;
+};
+
+class TailMetaRegionReader {
+public:
+    TailMetaRegionReader() = default;
+
+    // Parses and validates the region (header crc + region checksum + directory).
+    // region must outlive this reader (find() returns sub-views of it).
+    static Status open(Slice region, TailMetaRegionReader* out);
+
+    uint32_t n_logical_indexes() const { return n_; }
+    const LogicalIndexDirectoryReader& directory() const { return dir_; }
+
+    // Locates the per-index meta block bytes for (index_id, suffix). On match,
+    // *found=true and *per_index_meta_bytes views into the region; else *found=false.
+    Status find(uint64_t index_id, std::string_view suffix, bool* found,
+                Slice* per_index_meta_bytes) const;
+
+private:
+    Slice region_;
+    LogicalIndexDirectoryReader dir_;
+    uint32_t n_ = 0;
+};
+
+} // namespace snii::format
diff --git a/be/src/snii/format/tail_pointer.h b/be/src/snii/format/tail_pointer.h
new file mode 100644
index 00000000000000..655635bf071fb8
--- /dev/null
+++ b/be/src/snii/format/tail_pointer.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/encoding/byte_sink.h"
+
+namespace snii::format {
+
+// Fixed-size entry written at the very end of a segment's .idx file. It lets a
+// reader locate the tail meta region with a single read of the trailing
+// tail_pointer_size() bytes (see design spec "fixed tail pointer").
+//
+// On-disk layout (all multi-byte fields little-endian, FIXED total size so the
+// reader can read exactly the last tail_pointer_size() bytes):
+//   [u32 magic = kTailMagic]
+//   [u16 format_version = kFormatVersion]
+//   [u64 meta_region_offset]
+//   [u64 meta_region_length]
+//   [u64 hot_off]                    (offset of the hot region [hot_off, EOF);
+//                                     0 if absent)
+//   [u32 meta_region_checksum]
+//   [u32 bootstrap_header_checksum]
+//   [u8  tail_pointer_size]          (== tail_pointer_size())
+//   [u32 tail_checksum]              (crc32c over all preceding tail-pointer bytes)
+//
+// The fixed layout deliberately does NOT use the SectionFramer (which is
+// variable-length): a footer needs a constant trailing size the reader knows up
+// front.
+struct TailPointer {
+    uint64_t meta_region_offset = 0;
+    uint64_t meta_region_length = 0;
+    uint64_t hot_off = 0;
+    uint32_t meta_region_checksum = 0;
+    uint32_t bootstrap_header_checksum = 0;
+};
+
+// Constant on-disk size of the tail pointer, so the reader knows how many
+// trailing bytes to read.
+size_t tail_pointer_size();
+
+// Appends the fixed-layout tail-pointer bytes (magic / version / fields / size /
+// tail_checksum) to sink. Returns Internal if the encoded size would not fit the
+// fixed-size contract (a programming error, never expected at runtime).
+Status encode_tail_pointer(const TailPointer& tp, ByteSink* sink);
+
+// Parses the trailing tail-pointer bytes. last_bytes must be exactly
+// tail_pointer_size() bytes long. Verifies magic and tail_checksum, then fills
+// out with the parsed fields. Wrong magic / checksum mismatch / wrong length ->
+// Corruption.
+Status decode_tail_pointer(Slice last_bytes, TailPointer* out); + +} // namespace snii::format diff --git a/be/src/snii/io/batch_range_fetcher.h b/be/src/snii/io/batch_range_fetcher.h new file mode 100644 index 00000000000000..c9fc7bd083558e --- /dev/null +++ b/be/src/snii/io/batch_range_fetcher.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" + +namespace snii::io { + +// Collects the byte ranges a query plan needs, coalesces overlapping/adjacent +// ranges into physical reads, and fetches them in a single batch (one serial +// I/O round on a MeteredFileReader). Callers retrieve each requested range by +// the handle returned from add(). This is the SNII read path's batching layer: +// it front-loads range planning so reads are issued concurrently rather than +// cursor-by-cursor. +class BatchRangeFetcher { +public: + // coalesce_gap: requests separated by a gap <= this many bytes are merged into + // one physical read (reads a few extra bytes to save a request). 0 merges only + // overlapping/adjacent ranges. + explicit BatchRangeFetcher(FileReader* reader, uint64_t coalesce_gap = 0); + + // Registers a desired range; returns a handle usable with get() after fetch(). + size_t add(uint64_t offset, uint64_t len); + + // Coalesces and issues one batched read; fills internal buffers. + Status fetch(); + + // Bytes for handle h (valid only after a successful fetch(), until clear()). 
+ Slice get(size_t h) const; + + size_t pending() const { return reqs_.size(); } + void clear(); + +private: + struct Req { + uint64_t offset; + uint64_t len; + size_t len_size = 0; // validated size_t length after successful fetch() + size_t phys_idx = 0; // index into phys_ after fetch + size_t sub_offset = 0; // byte offset of this req within its physical read + }; + + FileReader* reader_; + uint64_t coalesce_gap_; + std::vector reqs_; + std::vector> phys_; // physical read buffers after fetch +}; + +} // namespace snii::io diff --git a/be/src/snii/io/file_reader.h b/be/src/snii/io/file_reader.h new file mode 100644 index 00000000000000..b8aae0c9957d1a --- /dev/null +++ b/be/src/snii/io/file_reader.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/io_metrics.h" + +namespace snii::io { + +// One logical read request (offset, length). +struct Range { + uint64_t offset = 0; + size_t len = 0; +}; + +// The single physical-read primitive (a BE-internal read_at). All higher layers +// route reads through this so I/O can be accounted and backed by local files or +// object storage interchangeably. +class FileReader { +public: + virtual ~FileReader() = default; + + // Reads exactly len bytes starting at offset into *out (which is resized to + // len). Reading past EOF is an error (Corruption/IoError). + virtual Status read_at(uint64_t offset, size_t len, std::vector* out) = 0; + + // Reads a batch of ranges that may be served concurrently. The default is a + // sequential loop; backends that model concurrency (MeteredFileReader) or + // perform real parallel fetches (object storage) override this. 
+ virtual Status read_batch(const std::vector& ranges, + std::vector>* outs) { + outs->resize(ranges.size()); + for (size_t i = 0; i < ranges.size(); ++i) { + SNII_RETURN_IF_ERROR(read_at(ranges[i].offset, ranges[i].len, &(*outs)[i])); + } + return Status::OK(); + } + + // Total size of the underlying object in bytes. + virtual uint64_t size() const = 0; + + // Optional live metrics. Readers that do not account I/O return nullptr. + virtual const IoMetrics* io_metrics() const { return nullptr; } +}; + +} // namespace snii::io diff --git a/be/src/snii/io/file_writer.h b/be/src/snii/io/file_writer.h new file mode 100644 index 00000000000000..a216898423c209 --- /dev/null +++ b/be/src/snii/io/file_writer.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" + +namespace snii::io { + +// Append-only writer (no seek-back), so the format can be produced in a single +// streaming pass compatible with S3FileWriter / StreamSinkFileWriter / packed +// writer. All container bytes are written front-to-back; back-references are +// resolved by writing metadata last. +class FileWriter { +public: + virtual ~FileWriter() = default; + + virtual Status append(Slice data) = 0; + virtual Status finalize() = 0; + virtual uint64_t bytes_written() const = 0; +}; + +} // namespace snii::io diff --git a/be/src/snii/io/io_metrics.h b/be/src/snii/io/io_metrics.h new file mode 100644 index 00000000000000..27e4d21bb0c2f8 --- /dev/null +++ b/be/src/snii/io/io_metrics.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +namespace snii::io { + +// Object-storage access metrics collected at FileReader boundaries. 
+struct IoMetrics { + uint64_t read_at_calls = 0; // BE-internal logical read requests issued + uint64_t serial_rounds = 0; // dependent serial I/O rounds + uint64_t range_gets = 0; // remote range GETs after cache coalescing + uint64_t remote_bytes = 0; // bytes fetched from remote + uint64_t total_request_bytes = 0; // sum of requested lengths before cache +}; + +// Returns the field-by-field difference (after - before) of two metric snapshots. The +// counters are unsigned 64-bit, so a field wraps around instead of going negative when +// `before` exceeds `after` for that field. +inline IoMetrics delta(const IoMetrics& after, const IoMetrics& before) { + IoMetrics out; + out.read_at_calls = after.read_at_calls - before.read_at_calls; + out.serial_rounds = after.serial_rounds - before.serial_rounds; + out.range_gets = after.range_gets - before.range_gets; + out.remote_bytes = after.remote_bytes - before.remote_bytes; + out.total_request_bytes = after.total_request_bytes - before.total_request_bytes; + return out; +} + +} // namespace snii::io diff --git a/be/src/snii/io/local_file.h b/be/src/snii/io/local_file.h new file mode 100644 index 00000000000000..a67477750c2be3 --- /dev/null +++ b/be/src/snii/io/local_file.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" + +namespace snii::io { + +// Local-filesystem FileReader. Uses pread for positional, thread-safe reads +// (so concurrent batch fetches do not contend on a shared file offset). +class LocalFileReader : public FileReader { +public: + LocalFileReader() = default; + ~LocalFileReader() override; + + LocalFileReader(const LocalFileReader&) = delete; + LocalFileReader& operator=(const LocalFileReader&) = delete; + + Status open(const std::string& path); + Status read_at(uint64_t offset, size_t len, std::vector* out) override; + uint64_t size() const override { return size_; } + +private: + int fd_ = -1; + uint64_t size_ = 0; +}; + +// Local-filesystem append-only FileWriter. 
Appends accumulate in a fixed +// userspace buffer and are flushed to the fd in large chunks, collapsing the +// many tiny per-append ::write() syscalls of the build path (e.g. ~53k writes +// averaging ~683 B each) into a handful of big writes. The produced file is +// byte-identical to the unbuffered path; only the syscall count drops. +class LocalFileWriter : public FileWriter { +public: + LocalFileWriter() = default; + ~LocalFileWriter() override; + + LocalFileWriter(const LocalFileWriter&) = delete; + LocalFileWriter& operator=(const LocalFileWriter&) = delete; + + Status open(const std::string& path); + Status append(Slice data) override; + Status finalize() override; + uint64_t bytes_written() const override { return bytes_written_; } + +private: + // Userspace write buffer size. 256 KiB amortizes the write() syscall cost over + // many appends while keeping transient RAM negligible vs the index sections. + static constexpr size_t kBufCapacity = 256u * 1024; + + // Flushes the userspace buffer to the fd with a robust partial-write loop. + Status flush_buffer(); + // Writes a raw byte span straight to the fd (used for spans larger than the + // buffer, bypassing a needless copy). + Status write_all(const uint8_t* data, size_t len); + + int fd_ = -1; + uint64_t bytes_written_ = 0; + std::vector buf_; +}; + +} // namespace snii::io diff --git a/be/src/snii/io/metered_file_reader.h b/be/src/snii/io/metered_file_reader.h new file mode 100644 index 00000000000000..41fed3eb7ac49a --- /dev/null +++ b/be/src/snii/io/metered_file_reader.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include + +#include "snii/io/file_reader.h" +#include "snii/io/io_metrics.h" + +namespace snii::io { + +// A FileReader decorator that models an object-storage FileCache: reads are +// aligned to fixed (default 1MiB) blocks; only not-yet-resident blocks become +// remote range GETs (adjacent misses are coalesced). 
It is the single shared +// "yardstick" through which both single blocking reads and batched concurrent +// reads are measured. +// +// - read_at(): a single blocking read. Any cache miss => +1 serial round +// (the cursor must wait for bytes before the next offset is known). +// - read_batch(): all ranges submitted concurrently => the whole batch is at +// most one serial round (+1 iff any range misses). +class MeteredFileReader : public FileReader { +public: + explicit MeteredFileReader(FileReader* inner, size_t block_size = (1u << 20)); + + Status read_at(uint64_t offset, size_t len, std::vector* out) override; + Status read_batch(const std::vector& ranges, + std::vector>* outs) override; + uint64_t size() const override { return inner_->size(); } + + const IoMetrics& metrics() const { return metrics_; } + const IoMetrics* io_metrics() const override { return &metrics_; } + // Clears counters AND the resident block set, modelling a cold (cache-empty) query. + void reset_metrics(); + +private: + Status validate_range(uint64_t offset, size_t len) const; + + // Accounts the cache effect of touching [offset, offset+len): records misses, + // coalesced GETs, and remote bytes. Returns true iff at least one block missed. + bool account_blocks(uint64_t offset, size_t len); + + FileReader* inner_; + size_t block_size_; + std::unordered_set resident_; + IoMetrics metrics_; +}; + +} // namespace snii::io diff --git a/be/src/snii/io/s3_object_store.h b/be/src/snii/io/s3_object_store.h new file mode 100644 index 00000000000000..2cf2270d751bb6 --- /dev/null +++ b/be/src/snii/io/s3_object_store.h @@ -0,0 +1,122 @@ +#pragma once + +// S3 / OSS object-storage backend for snii::io. +// +// ISOLATION: the ENTIRE body of this header (and its .cpp) is guarded by +// SNII_WITH_S3. When the option is OFF the translation unit compiles to nothing +// and pulls in NO aws-sdk headers, so core stays free of any aws dependency by +// default. 
Only when CMake is configured with -DSNII_WITH_S3=ON is the macro +// defined and aws linked. +#ifdef SNII_WITH_S3 + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" + +// Forward declarations only -- aws types are pimpl'd in the .cpp so that this +// header never leaks aws-sdk includes to its consumers. +namespace Aws::S3 { +class S3Client; +} // namespace Aws::S3 + +namespace snii::io { + +// Connection / addressing parameters for an S3-compatible endpoint (tested +// against Aliyun OSS, which requires virtual-hosted addressing). +struct S3Config { + std::string endpoint; // e.g. "oss-cn-hongkong.aliyuncs.com" + std::string region; // e.g. "cn-hongkong" + std::string bucket; // e.g. "doris-community-test" + std::string prefix; // object key prefix (no trailing slash required) + std::string ak; // access key id + std::string sk; // secret access key + long connect_timeout_ms = 10000; + long request_timeout_ms = 180000; + long http_request_timeout_ms = 180000; +}; + +// Process-wide aws InitAPI / ShutdownAPI lifecycle guard. +// +// aws-sdk-cpp requires Aws::InitAPI to be called exactly once before any client +// is used and Aws::ShutdownAPI once at teardown. Construct a single +// AwsApiGuard (e.g. on the stack of main, or as a static) that lives for the +// whole duration during which S3FileReader / S3FileWriter are used. The guard is +// reference counted, so nested guards are safe; the underlying InitAPI runs only +// for the first live instance and ShutdownAPI when the last one is destroyed. +class AwsApiGuard { +public: + AwsApiGuard(); + ~AwsApiGuard(); + + AwsApiGuard(const AwsApiGuard&) = delete; + AwsApiGuard& operator=(const AwsApiGuard&) = delete; +}; + +// Read-only FileReader backed by an S3/OSS object. Range reads use a ranged +// GetObject; size() is the object length cached from a HeadObject at open(). 
+class S3FileReader : public FileReader { +public: + S3FileReader() = default; + ~S3FileReader() override; + + S3FileReader(const S3FileReader&) = delete; + S3FileReader& operator=(const S3FileReader&) = delete; + S3FileReader(S3FileReader&&) noexcept; + S3FileReader& operator=(S3FileReader&&) noexcept; + + // Opens the object (prefix + "/" + key) and caches its size via HeadObject. + static Status open(const S3Config& cfg, const std::string& key, S3FileReader* out); + + Status read_at(uint64_t offset, size_t len, std::vector* out) override; + // Concurrent batch: issues the ranges' GetObjects in parallel (bounded), so a + // planned read round costs ~one round-trip instead of the sum of all GETs. + Status read_batch(const std::vector& ranges, + std::vector>* outs) override; + uint64_t size() const override { return size_; } + +private: + std::shared_ptr client_; + std::string bucket_; + std::string object_key_; // full key (prefix + "/" + key) + uint64_t size_ = 0; +}; + +// Append-only FileWriter backed by an S3/OSS object. Appends are buffered in +// memory; finalize() flushes the whole buffer in a single PutObject. Multipart +// upload is a future optimization. +class S3FileWriter : public FileWriter { +public: + S3FileWriter() = default; + ~S3FileWriter() override; + + S3FileWriter(const S3FileWriter&) = delete; + S3FileWriter& operator=(const S3FileWriter&) = delete; + S3FileWriter(S3FileWriter&&) noexcept; + S3FileWriter& operator=(S3FileWriter&&) noexcept; + + // Opens a writer targeting object (prefix + "/" + key). 
+ Status open(const S3Config& cfg, const std::string& key); + + Status append(Slice data) override; + Status finalize() override; + uint64_t bytes_written() const override { return bytes_written_; } + +private: + std::shared_ptr client_; + std::string bucket_; + std::string object_key_; // full key (prefix + "/" + key) + std::vector buffer_; + uint64_t bytes_written_ = 0; + bool finalized_ = false; +}; + +} // namespace snii::io + +#endif // SNII_WITH_S3 diff --git a/be/src/snii/query/bm25_scorer.h b/be/src/snii/query/bm25_scorer.h new file mode 100644 index 00000000000000..85df67d3f5e1be --- /dev/null +++ b/be/src/snii/query/bm25_scorer.h @@ -0,0 +1,63 @@ +#pragma once + +#include + +// Bm25Scorer -- classic Okapi BM25 relevance scoring over SNII native stats. +// +// Per query term, idf is precomputed once from the collection statistics: +// idf = log(1 + (N - df + 0.5) / (df + 0.5)) +// where N = indexed doc count and df = the term's document frequency. The +// per-document contribution of a term then is: +// score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl)) +// where tf is the in-doc term frequency, dl the document length decoded from the +// 1-byte encoded norm, and avgdl the average document length. +// +// Norm encode/decode (DOCUMENTED CONTRACT): the writer stores doc length as a +// byte-quantized value floor-clamped to [1, 255]; decode is the identity map +// back to a double length. encode_norm(len) = clamp(len, 1, 255); +// decode_norm(b) = (b == 0 ? 1.0 : (double)b). This keeps short docs (len <= 255) +// exact and saturates longer docs at 255, matching the reference oracle. +namespace snii::query { + +// BM25 free parameters. Defaults are the classic Lucene/Elasticsearch values. +struct Bm25Params { + double k1 = 1.2; + double b = 0.75; +}; + +// Decodes a 1-byte encoded norm into a document length. byte 0 maps to 1.0 to +// avoid a zero-length divisor; otherwise it is the byte value itself. 
+double decode_norm(uint8_t encoded); + +// Encodes a document length into a 1-byte norm (clamped to [1, 255]). Provided +// so writers and test oracles share one quantization. +uint8_t encode_norm(uint64_t doc_length); + +// Per-term scoring context: the precomputed idf and the term's df. Built once per +// query term, then reused for every candidate document of that term. +class ScorerContext { +public: + // Builds the context from collection size n (indexed doc count) and the term's + // document frequency df. avgdl and params are supplied per score call. + static ScorerContext make(uint64_t n, uint64_t df); + + double idf() const { return idf_; } + uint64_t df() const { return df_; } + + // Scores one document occurrence: tf is the in-doc term frequency, encoded_norm + // the doc's 1-byte length norm, avgdl the collection average length. + double score(uint32_t tf, uint8_t encoded_norm, double avgdl, const Bm25Params& params) const; + + // Upper bound on score() over any document, given a window's maximum tf and the + // shortest doc length in the window (smallest dl maximizes the score). Used by + // the WAND-style block-max pruner. max_freq is the window's max tf; min_norm is + // the smallest encoded norm (=> smallest dl => largest score). + double max_score(uint32_t max_freq, uint8_t min_norm, double avgdl, + const Bm25Params& params) const; + +private: + double idf_ = 0.0; + uint64_t df_ = 0; +}; + +} // namespace snii::query diff --git a/be/src/snii/query/boolean_query.h b/be/src/snii/query/boolean_query.h new file mode 100644 index 00000000000000..f9cba6485eb37c --- /dev/null +++ b/be/src/snii/query/boolean_query.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// boolean_or -- MATCH_ANY semantics: return the sorted docid set containing at +// least one query term. 
An empty `terms` list or all-absent terms produce an empty +// result. Duplicate input terms are ignored semantically and do not duplicate +// output docids. +namespace snii::query { + +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, DocIdSink* sink); + +// boolean_and -- MATCH_ALL semantics: sorted docid set of docs containing EVERY +// term, no positional constraint. Valid on docs-only indexes. An empty `terms` list or +// any absent term -> empty result. +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); + +} // namespace snii::query diff --git a/be/src/snii/query/docid_sink.h b/be/src/snii/query/docid_sink.h new file mode 100644 index 00000000000000..de08d24b9ce164 --- /dev/null +++ b/be/src/snii/query/docid_sink.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" + +namespace snii::query { + +// Bulk docid handoff for query operators. Each span is sorted ascending; callers +// that need a single vector can use VectorDocIdSink. 
+class DocIdSink { +public: + virtual ~DocIdSink() = default; + virtual Status append_sorted(std::span docids) = 0; +}; + +class VectorDocIdSink final : public DocIdSink { +public: + explicit VectorDocIdSink(std::vector& docids) : docids_(docids) {} + + Status append_sorted(std::span docids) override { + docids_.insert(docids_.end(), docids.begin(), docids.end()); + return Status::OK(); + } + +private: + std::vector& docids_; +}; + +} // namespace snii::query diff --git a/be/src/snii/query/internal/docid_conjunction.h b/be/src/snii/query/internal/docid_conjunction.h new file mode 100644 index 00000000000000..f97ac781a2c364 --- /dev/null +++ b/be/src/snii/query/internal/docid_conjunction.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/dict_entry.h" +#include "snii/format/frq_prelude.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +struct ResolvedQueryTerm { + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; +}; + +struct TermPlan { + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + uint32_t df = 0; + size_t order = 0; + size_t frq_handle = 0; + size_t prx_handle = 0; + size_t prelude_handle = 0; + bool pod_ref = false; + bool windowed = false; + snii::format::FrqPreludeReader prelude; +}; + +struct DocidChunk { + std::vector docids; + std::vector prx_doc_ordinals; + bool windowed = false; + uint32_t window = 0; +}; + +struct DocidSource { + std::vector chunks; +}; + +Status resolve_query_term(const snii::reader::LogicalIndexReader& idx, const std::string& term, + ResolvedQueryTerm* resolved, bool* found); + +Status plan_terms(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, snii::io::BatchRangeFetcher* fetcher, + std::vector* plans, bool* all_present, bool need_positions); + +Status 
plan_resolved_terms(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, + snii::io::BatchRangeFetcher* fetcher, std::vector* plans, + bool need_positions); + +Status open_preludes(const snii::io::BatchRangeFetcher& fetcher, std::vector* plans, + bool need_positions); + +Status inline_dd_region(const snii::format::DictEntry& entry, Slice* out); + +Status build_docid_only_conjunction(const snii::reader::LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates); + +Status build_docid_only_conjunction(const snii::reader::LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates, + std::vector* sources); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/docid_posting_reader.h b/be/src/snii/query/internal/docid_posting_reader.h new file mode 100644 index 00000000000000..cf297d4082ccd9 --- /dev/null +++ b/be/src/snii/query/internal/docid_posting_reader.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/format/dict_entry.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +struct ResolvedDocidPosting { + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; +}; + +// Decodes the docid-only posting for a resolved term. The caller owns term +// lookup and can batch/plan lookups independently; this module owns only the +// three posting encodings (inline, slim pod_ref, windowed pod_ref). +Status read_docid_posting(const snii::reader::LogicalIndexReader& idx, + const snii::format::DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, std::vector* docids); + +// Batch counterpart for multi-term docid-only operators. Windowed terms share one +// prelude fetch round and one docid fetch round, so OR-style operators pay by +// stage rather than by term. 
+Status read_docid_postings_batched(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, + std::vector>* docids); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/docid_set_ops.h b/be/src/snii/query/internal/docid_set_ops.h new file mode 100644 index 00000000000000..8aae88b90fa974 --- /dev/null +++ b/be/src/snii/query/internal/docid_set_ops.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace snii::query::internal { + +std::vector intersect_sorted(const std::vector& a, + const std::vector& b); + +void union_sorted_into(std::vector* acc, const std::vector& next); + +std::vector union_sorted_many(const std::vector>& lists); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/docid_union.h b/be/src/snii/query/internal/docid_union.h new file mode 100644 index 00000000000000..89c53f103d2343 --- /dev/null +++ b/be/src/snii/query/internal/docid_union.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +// Reads already-resolved docid postings in planned batches, merges them as a +// sorted deduplicated union, then emits one bulk span to the sink. 
+Status build_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, + std::vector* out); + +Status emit_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, DocIdSink* sink); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/position_math.h b/be/src/snii/query/internal/position_math.h new file mode 100644 index 00000000000000..04e964a67b6e7e --- /dev/null +++ b/be/src/snii/query/internal/position_math.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include +#include + +namespace snii::query::internal { + +// Fills *out with the consecutive offsets 0, 1, ..., count - 1, discarding its previous +// contents. Returns false and leaves *out untouched when count is at or above the +// std::numeric_limits maximum tested by the guard; returns true otherwise. +// NOTE(review): the limit is presumably the uint32_t maximum, matching the uint32_t +// offsets generated here -- confirm. +inline bool build_position_offsets(size_t count, std::vector* out) { + if (count >= std::numeric_limits::max()) { + return false; + } + out->clear(); + out->reserve(count); + uint32_t offset = 0; + while (out->size() < count) { + out->push_back(offset); + ++offset; + } + return true; +} + +// Overflow-checked addition: stores start + offset in *out and returns true, or returns +// false without writing *out when the sum would exceed the std::numeric_limits maximum +// tested by the guard. +inline bool add_position_offset(uint32_t start, uint32_t offset, uint32_t* out) { + if (start > std::numeric_limits::max() - offset) return false; + *out = start + offset; + return true; +} + +} // namespace snii::query::internal diff --git a/be/src/snii/query/internal/term_expansion.h b/be/src/snii/query/internal/term_expansion.h new file mode 100644 index 00000000000000..3b9753b4df267e --- /dev/null +++ b/be/src/snii/query/internal/term_expansion.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/reader/logical_index_reader.h" + +namespace snii::query::internal { + +using TermMatcher = std::function; + +// Enumerates dictionary terms from `enum_prefix`, filters them with `matches`, +// and emits the sorted docid union for matching entries. PrefixHit carries the +// DictEntry and block bases, so callers avoid a second lookup per expanded term. 
+Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx, + std::string_view enum_prefix, const TermMatcher& matches, + DocIdSink* sink); + +} // namespace snii::query::internal diff --git a/be/src/snii/query/phrase_query.h b/be/src/snii/query/phrase_query.h new file mode 100644 index 00000000000000..bcafc9dcb67516 --- /dev/null +++ b/be/src/snii/query/phrase_query.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// phrase_query -- MATCH_PHRASE: return the sorted docid set in which the terms +// occur consecutively (for some position p, every term k appears at position p+k in +// the same doc). It first builds the docid conjunction with docs-only posting +// reads, then fetches PRX only for chunks that can contain final candidates: +// 1. read preludes / docs-only posting ranges and intersect per-term docids; +// 2. fetch retained PRX chunks and stream positions for survivors; +// 3. for each surviving doc, check that some position p exists with +// term[0]@p, term[1]@p+1, ... term[n-1]@p+(n-1). +// An empty term list -> empty result. Any term absent -> empty result. +namespace snii::query { + +Status phrase_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status phrase_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); + +// phrase_prefix_query -- MATCH_PHRASE_PREFIX: the last item in `terms` is a +// term prefix and preceding items are exact terms. For example {"quick", "bro"} +// matches "quick brown" and "quick bronze". An empty term list -> empty result. 
+Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids); +Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile); + +} // namespace snii::query diff --git a/be/src/snii/query/prefix_query.h b/be/src/snii/query/prefix_query.h new file mode 100644 index 00000000000000..e7937733396797 --- /dev/null +++ b/be/src/snii/query/prefix_query.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// prefix_query -- MATCH_PREFIX semantics: enumerate dictionary terms with the +// requested prefix, then return the sorted docid set containing any enumerated +// term. Empty prefix enumerates all terms. No matching terms -> empty result. +namespace snii::query { + +Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, + std::vector* docids); +Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, + std::vector* docids, QueryProfile* profile); +Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, + DocIdSink* sink); + +} // namespace snii::query diff --git a/be/src/snii/query/query_profile.h b/be/src/snii/query/query_profile.h new file mode 100644 index 00000000000000..a4988f6a80c8d1 --- /dev/null +++ b/be/src/snii/query/query_profile.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +#include "snii/io/io_metrics.h" + +namespace snii::io { +class FileReader; +} + +namespace snii::query { + +struct QueryProfile { + uint64_t elapsed_ns = 0; + bool has_io_metrics = false; + snii::io::IoMetrics io_before; + snii::io::IoMetrics io_after; + snii::io::IoMetrics io_delta; +}; + +class QueryProfileScope { +public: + QueryProfileScope(snii::io::FileReader* 
reader, QueryProfile* profile); + ~QueryProfileScope(); + QueryProfileScope(const QueryProfileScope&) = delete; + QueryProfileScope& operator=(const QueryProfileScope&) = delete; + + void finish(); + +private: + snii::io::FileReader* reader_ = nullptr; + QueryProfile* profile_ = nullptr; + std::chrono::steady_clock::time_point start_; + bool finished_ = false; +}; + +} // namespace snii::query diff --git a/be/src/snii/query/regexp_query.h b/be/src/snii/query/regexp_query.h new file mode 100644 index 00000000000000..801dec8f2c677d --- /dev/null +++ b/be/src/snii/query/regexp_query.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// regexp_query -- MATCH_REGEXP semantics over dictionary terms. The pattern is +// evaluated with std::regex_match, so it must match the whole term. Matching +// terms are executed as a sorted deduplicated docid union. +namespace snii::query { + +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids); +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids, QueryProfile* profile); +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* sink); + +} // namespace snii::query diff --git a/be/src/snii/query/scoring_query.h b/be/src/snii/query/scoring_query.h new file mode 100644 index 00000000000000..dc2ea75f0751e7 --- /dev/null +++ b/be/src/snii/query/scoring_query.h @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/bm25_scorer.h" +#include "snii/reader/logical_index_reader.h" +#include "snii/stats/snii_stats_provider.h" + +// scoring_query -- top-K BM25 scored retrieval over one logical index for one or +// more query terms. 
All entry points below produce IDENTICAL rankings; the two basic ones are: +// - scoring_query_exhaustive(): scores every candidate document (the baseline +// correctness oracle). +// - scoring_query_wand(): a block-max / WAND-style optimization that uses the +// per-window max_freq / max_norm columns from the frq_prelude to bound each +// window's best possible score and SKIP windows that cannot enter the +// current top-K. A window without block-max stats (slim/inline entries or a +// missing prelude) is never pruned, so the result still equals the +// exhaustive ranking. +// +// Results are sorted by score descending; ties are broken by ascending docid so +// the ordering is deterministic and all paths compare equal. +namespace snii::query { + +// One scored hit. +struct ScoredDoc { + uint32_t docid = 0; + double score = 0.0; +}; + +// Exhaustive baseline: score every doc that contains any query term, return the +// top-k by score. params controls k1/b. Unknown terms are skipped. +Status scoring_query_exhaustive(const snii::reader::LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out); + +// WAND-style block-max pruning. MUST return the same top-k as the exhaustive +// path. Windows whose block-max upper bound cannot beat the current k-th score +// are skipped; windows lacking block-max stats are scored fully. +Status scoring_query_wand(const snii::reader::LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out); + +// SELECTIVE-FETCH block-max WAND (design spec section 5, "Phase C"). 
Same WAND / +// theta / >= tie machinery as scoring_query_wand, but it DEFERS the .frq window +// fetch: for each windowed term it first reads ONLY the frq_prelude (block-max +// columns), then fetches a term's .frq window lazily and at most once -- and ONLY +// when the running block-max bound proves a doc in that window can still reach the +// top-K (bound >= theta). A window the bound rules out is never fetched. The +// result (top-K docids AND scores, INCLUDING ties) is byte-identical to +// scoring_query_exhaustive / scoring_query_wand; only the bytes read differ. +// Slim/inline terms (no prelude) are fetched fully, exactly as in scoring_query_wand. +Status scoring_query_wand_selective(const snii::reader::LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out); + +} // namespace snii::query diff --git a/be/src/snii/query/term_query.h b/be/src/snii/query/term_query.h new file mode 100644 index 00000000000000..959e5a0ad20a7b --- /dev/null +++ b/be/src/snii/query/term_query.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// term_query -- the simplest SNII query: return the sorted docid set that +// contains term. It runs the term lookup on the logical index, then issues a +// single batched .frq range read (one serial round) to decode the postings. +// Absent term -> empty result (OK status). 
+namespace snii::query { + +Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, + std::vector* docids); +Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, + std::vector* docids, QueryProfile* profile); + +} // namespace snii::query diff --git a/be/src/snii/query/wildcard_query.h b/be/src/snii/query/wildcard_query.h new file mode 100644 index 00000000000000..de66450e3fda69 --- /dev/null +++ b/be/src/snii/query/wildcard_query.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/query/docid_sink.h" +#include "snii/query/query_profile.h" +#include "snii/reader/logical_index_reader.h" + +// wildcard_query -- MATCH_WILDCARD semantics over dictionary terms. `*` matches +// any byte sequence, `?` matches one byte, and all other bytes match literally. +// Matching terms are executed as a sorted deduplicated docid union. +namespace snii::query { + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids); +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids, QueryProfile* profile); +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* sink); + +} // namespace snii::query diff --git a/be/src/snii/reader/logical_index_reader.h b/be/src/snii/reader/logical_index_reader.h new file mode 100644 index 00000000000000..64c87203daabd2 --- /dev/null +++ b/be/src/snii/reader/logical_index_reader.h @@ -0,0 +1,123 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/bsbf.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/per_index_meta.h" +#include 
"snii/format/sampled_term_index.h" +#include "snii/format/stats_block.h" +#include "snii/io/file_reader.h" + +// LogicalIndexReader -- read-side counterpart of LogicalIndexWriter for one +// logical index. It owns the resident per-index meta sub-readers (XFilter, +// SampledTermIndex, DICT block directory, StatsBlock, SectionRefs) parsed from +// the per-index meta block, and resolves a query term to its DictEntry through +// the documented lookup flow: +// XFilter (reject absent) -> SampledTermIndex (candidate block ordinal) -> +// DICT block directory (block range) -> resident small-DICT block or one +// range read of the DICT block -> DictBlockReader::find_term. +// +// lookup() also returns the block's frq_base/prx_base (captured by the +// DictBlockReader) so callers can resolve a pod_ref entry's absolute .frq/.prx +// offsets via the writer's contract. Both deltas index into the SAME +// interleaved posting region (prx_base == frq_base; the prx span precedes the +// frq span): +// abs_frq = posting_region.offset + frq_base + entry.frq_off_delta +// abs_prx = posting_region.offset + prx_base + entry.prx_off_delta +// +// The meta block bytes must outlive this reader (they are owned by the parent +// SniiSegmentReader's resident meta region). +namespace snii::reader { + +class LogicalIndexReader { +public: + LogicalIndexReader() = default; + + // Parses the per-index meta block and binds the reader to file_reader. + // file_reader / meta_block must outlive this reader. + static Status open(snii::io::FileReader* file_reader, snii::format::IndexTier tier, + bool has_positions, Slice meta_block, LogicalIndexReader* out); + + // Resolves term to a DictEntry. *found=false when the term is absent (XFilter + // rejection, out-of-range sample, or DICT-block miss). On a hit, *entry is + // filled and *frq_base / *prx_base carry the candidate block's bases. 
+ Status lookup(std::string_view term, bool* found, snii::format::DictEntry* entry, + uint64_t* frq_base, uint64_t* prx_base) const; + + // One enumerated term whose key has the requested prefix, with its DictEntry + // and the owning DICT block's frq/prx bases (for posting resolution). + struct PrefixHit { + std::string term; + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + }; + + // Ordered term enumeration: every term with `prefix`, in lexicographic order, + // by seeking the start DICT block via the SampledTermIndex and scanning + // forward across contiguous blocks until the terms pass the prefix range. + // Empty prefix enumerates all terms. This is the contiguous-DICT-block design + // the term-anchor layout was built for (MATCH_PHRASE_PREFIX / prefix / range + // queries). + Status prefix_terms(std::string_view prefix, std::vector* out) const; + + // Resolves a pod_ref entry's absolute .frq / .prx window byte range, + // validating the locator against the posting_region length (defends against + // corrupt entries: prelude_len > frq_len underflow, or off_delta+len past the + // region). Both windows resolve against the single posting_region. *abs_off + // is the absolute file offset of the window (after prelude); *len its byte + // length. 
+ Status resolve_frq_window(const snii::format::DictEntry& entry, uint64_t frq_base, + uint64_t* abs_off, uint64_t* len) const; + Status resolve_prx_window(const snii::format::DictEntry& entry, uint64_t prx_base, + uint64_t* abs_off, uint64_t* len) const; + + const snii::format::SectionRefs& section_refs() const { return meta_.section_refs(); } + const snii::format::StatsBlock& stats() const { return meta_.stats(); } + snii::format::IndexTier tier() const { return tier_; } + bool has_positions() const { return has_positions_; } + snii::io::FileReader* reader() const { return reader_; } + +private: + snii::io::FileReader* reader_ = nullptr; + snii::format::IndexTier tier_ = snii::format::IndexTier::kT1; + bool has_positions_ = false; + snii::format::PerIndexMetaReader meta_; + snii::format::SampledTermIndexReader sti_; + snii::format::DictBlockDirectoryReader dbd_; + snii::format::BsbfHeader bsbf_header_; // resident header (from section ref) + bool has_bsbf_ = false; + // L0 tiering: when the bsbf section is small (<= kBsbfResidentMaxBytes) its + // whole bitset is loaded here at open -> in-memory probe, no per-lookup + // round. Empty => L1 (on-demand single-block probe via bsbf_probe). + bool bsbf_resident_ = false; + std::vector bsbf_resident_bitset_; + + // Small DICT blocks are opened once with the index so exact lookups avoid an + // otherwise serial S3 round for the term dictionary. Empty means the + // dictionary exceeded the resident threshold and lookup/prefix enumeration + // read blocks on demand. Each DictBlockReader holds a Slice into the owning + // bytes. 
+ struct ResidentDictBlock { + std::vector bytes; + snii::format::DictBlockReader reader; + }; + struct OnDemandDictBlock { + std::vector bytes; + snii::format::DictBlockReader reader; + }; + Status load_resident_dict_blocks(); + Status dict_block_reader_for_ordinal(uint32_t ordinal, OnDemandDictBlock* on_demand, + const snii::format::DictBlockReader** out) const; + std::vector resident_dict_blocks_; +}; + +} // namespace snii::reader diff --git a/be/src/snii/reader/snii_segment_reader.h b/be/src/snii/reader/snii_segment_reader.h new file mode 100644 index 00000000000000..fc725889a03f94 --- /dev/null +++ b/be/src/snii/reader/snii_segment_reader.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/tail_meta_region.h" +#include "snii/io/file_reader.h" +#include "snii/reader/logical_index_reader.h" + +// SniiSegmentReader -- entry point for the SNII segment read path. It opens a +// single .idx container through a (possibly metered) io::FileReader and exposes +// its logical indexes. open() performs the minimal bootstrap reads: +// 1. the fixed bootstrap header (front of the file), +// 2. the fixed tail pointer (last tail_pointer_size() bytes), and +// 3. the tail meta region (one range read located via the tail pointer). +// The meta region bytes are held resident by the reader so per-index meta blocks +// (returned as sub-views) remain valid for the reader's lifetime. +// +// open_index() then materializes one LogicalIndexReader from the per-index meta +// block of a given (index_id, suffix); query functions operate on that reader. +namespace snii::reader { + +class SniiSegmentReader { +public: + SniiSegmentReader() = default; + + // Reads bootstrap header + tail pointer + tail meta region from reader. + // reader must outlive the returned SniiSegmentReader and every + // LogicalIndexReader opened from it. 
reader == nullptr / out == nullptr -> + // InvalidArgument; structural problems -> Corruption / Unsupported. + static Status open(snii::io::FileReader* reader, SniiSegmentReader* out); + + uint32_t n_logical_indexes() const { return region_reader_.n_logical_indexes(); } + + // Loads the per-index meta block for (index_id, suffix) and builds a + // LogicalIndexReader bound to the same FileReader. Absent index -> NotFound. + Status open_index(uint64_t index_id, std::string_view suffix, LogicalIndexReader* out) const; + + snii::io::FileReader* reader() const { return reader_; } + +private: + snii::io::FileReader* reader_ = nullptr; + std::vector meta_region_; // owned resident copy of the tail meta region + snii::format::TailMetaRegionReader region_reader_; +}; + +} // namespace snii::reader diff --git a/be/src/snii/reader/windowed_posting.h b/be/src/snii/reader/windowed_posting.h new file mode 100644 index 00000000000000..e02e6e2831e05b --- /dev/null +++ b/be/src/snii/reader/windowed_posting.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include + +#include "snii/common/slice.h" +#include "snii/common/status.h" +#include "snii/format/dict_entry.h" +#include "snii/format/frq_prelude.h" +#include "snii/reader/logical_index_reader.h" + +// WindowedPostingReader -- shared read-side decode of a windowed term's posting +// from its two-level frq_prelude + GROUPED dd-block / freq-block (design 1.6). +// +// A windowed pod_ref entry's .frq payload is laid out +// [prelude][dd-block][freq-block] +// where the dd-block concatenates every window's dd_region and the freq-block +// every window's freq_region. The docs-only prefix [prelude][dd-block] is ONE +// contiguous run. This helper: +// 1. range-fetches the prelude (prelude_len bytes) and parses the directory, +// 2. range-fetches the WHOLE dd-block in ONE contiguous range (and, for +// scoring, +// the whole freq-block in one more range), +// 3. 
decodes each window's dd region (and freq region) from the in-memory +// blocks +// via the prelude metadata (dd_off/dd_disk_len, freq_off/freq_disk_len), +// and concatenates the per-window docids / freqs / positions. +// +// The slim/inline single-window path is handled by the term/phrase/scoring +// callers directly; this helper is for enc=windowed entries only. +namespace snii::reader { + +// Coalesce gap (bytes) used when batch-fetching MULTIPLE dd sub-ranges of the +// SAME term (the phrase window-skip path): dd regions of one term are +// contiguous in the dd-block, so merging reads separated by <= this gap into +// one physical Range GET trades a little over-read for fewer remote GETs (the +// design's higher-priority metric). Only applied to same-term multi-window +// batches, never to cross-term. +inline constexpr uint64_t kSameTermCoalesceGap = 0; + +// Full decoded posting for one windowed term (docids ascending across windows). +struct DecodedPosting { + std::vector docids; + std::vector freqs; // aligned with docids + std::vector> positions; // aligned; empty when no prx +}; + +// Decodes the entire windowed posting. want_positions requires the index to +// have positions (and the entry to carry prx). want_freq selects whether the +// freq-block is fetched + decoded: when false ONLY the contiguous +// [prelude][dd-block] prefix is fetched (docid-only / phrase callers) and +// DecodedPosting.freqs stays empty; when true the freq-block is additionally +// fetched (scoring). Returns Corruption on any prelude/block inconsistency +// (doc-count mismatch, out-of-range offsets). 
+Status read_windowed_posting(const LogicalIndexReader& idx, const snii::format::DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, bool want_positions, + bool want_freq, DecodedPosting* out); + +// --- Sub-block (window) skipping helpers (shared with phrase / selective WAND) +// -- +// +// These expose the per-window dd/freq/prx addressing within the grouped blocks +// so the skip path can fetch ONLY the windows covering candidate docids (their +// dd sub-ranges within the dd-block, near-contiguous and coalesce-friendly) +// instead of the whole posting, without duplicating the offset arithmetic. + +// Absolute file byte ranges of one window's regions. dd is always valid; freq +// is valid only when want_freq; prx is valid only when want_positions (and +// has_prx). +struct WindowAbsRange { + uint64_t dd_off = 0; + uint64_t dd_len = 0; + uint64_t freq_off = 0; + uint64_t freq_len = 0; + uint64_t prx_off = 0; + uint64_t prx_len = 0; +}; + +// Fetches + parses the two-level prelude of a windowed entry (one batched +// read). +Status fetch_windowed_prelude(const LogicalIndexReader& idx, const snii::format::DictEntry& entry, + uint64_t frq_base, snii::format::FrqPreludeReader* prelude); + +// Computes the absolute file ranges of window w's dd region (and freq region +// when want_freq, and .prx window when want_positions), fully validated against +// the POD sections (anti-DoS: rejects out-of-range offsets and overflowing +// locators). 
+Status windowed_window_range(const LogicalIndexReader& idx, const snii::format::DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, + const snii::format::FrqPreludeReader& prelude, uint32_t w, + bool want_positions, bool want_freq, WindowAbsRange* out); + +// Decodes one window's docids (and per-doc positions when want_positions, and +// per-doc freqs when want_freq) from already-fetched byte slices: dd_region is +// the window's dd sub-slice; freq_region its freq sub-slice (ignored when +// !want_freq); prx_window its .prx bytes. The decoded docids are absolute +// (win_base applied). Returns Corruption on any doc-count mismatch between the +// prelude, dd/freq and prx. +Status decode_window_slices(const snii::format::WindowMeta& meta, Slice dd_region, + Slice freq_region, Slice prx_window, bool want_positions, + bool want_freq, std::vector* docids, + std::vector* freqs, + std::vector>* positions); + +} // namespace snii::reader diff --git a/be/src/snii/stats/snii_stats_provider.h b/be/src/snii/stats/snii_stats_provider.h new file mode 100644 index 00000000000000..12fdfa607bf0bd --- /dev/null +++ b/be/src/snii/stats/snii_stats_provider.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include + +#include "snii/common/status.h" +#include "snii/format/norms_pod.h" +#include "snii/reader/logical_index_reader.h" + +// SniiStatsProvider -- exposes the native SNII scoring statistics required by +// BM25, sourced directly from the on-disk structures of one logical index: +// - segment-level counts (doc_count, indexed_doc_count, sum_total_term_freq) +// from the StatsBlock embedded in the per-index meta block. +// - per-term df / ttf from the term's DictEntry (resolved through the reader's +// lookup flow). The LogicalIndexWriter stores ttf directly in ttf_delta for +// tier>=T2 entries, so total_term_freq returns entry.ttf_delta. 
+// - per-doc length normalization byte (encoded_norm) from the norms POD, +// range-read once at open via section_refs().norms and parsed with +// NormsPodReader. +// +// avgdl() = sum_total_term_freq / max(1, indexed_doc_count): the average document +// length used by BM25 length normalization. The provider performs no scoring; it +// only surfaces the statistics so snii::query::Bm25Scorer can combine them. +namespace snii::stats { + +class SniiStatsProvider { +public: + SniiStatsProvider() = default; + + // Binds to idx and materializes the norms POD (one range read) when the index + // carries scoring norms. idx must outlive this provider. A scoring index + // without a norms section, or a corrupt norms POD, returns a non-OK Status. + static Status open(const snii::reader::LogicalIndexReader* idx, SniiStatsProvider* out); + + // Segment-level counts (direct StatsBlock fields). + uint64_t doc_count() const { return doc_count_; } + uint64_t indexed_doc_count() const { return indexed_doc_count_; } + uint64_t sum_total_term_freq() const { return sum_total_term_freq_; } + + // Average document length: sum_total_term_freq / max(1, indexed_doc_count). + double avgdl() const; + + // Per-term document frequency. Absent term -> *df = 0 (OK status). + Status doc_freq(std::string_view term, uint64_t* df) const; + + // Per-term total term frequency (entry.ttf_delta, which stores the full ttf at tier>=T2). + // Absent term -> *ttf = 0 (OK status). + Status total_term_freq(std::string_view term, uint64_t* ttf) const; + + // 1-byte encoded doc-length norm for docid (raw byte from the norms POD). + // Out-of-range docid -> InvalidArgument; index without norms -> InvalidArgument. 
+ Status encoded_norm(uint32_t docid, uint8_t* out) const; + + bool has_norms() const { return has_norms_; } + +private: + const snii::reader::LogicalIndexReader* idx_ = nullptr; + uint64_t doc_count_ = 0; + uint64_t indexed_doc_count_ = 0; + uint64_t sum_total_term_freq_ = 0; + bool has_norms_ = false; + // Owned copy of the framed norms section bytes; norms_reader_ borrows from it. + std::vector norms_bytes_; + snii::format::NormsPodReader norms_reader_; +}; + +} // namespace snii::stats diff --git a/be/src/snii/version.h b/be/src/snii/version.h new file mode 100644 index 00000000000000..dd2bdef2af8e3e --- /dev/null +++ b/be/src/snii/version.h @@ -0,0 +1,4 @@ +#pragma once +#define SNII_VERSION_MAJOR 0 +#define SNII_VERSION_MINOR 1 +#define SNII_VERSION_STRING "0.1.0" diff --git a/be/src/snii/writer/compact_posting_pool.h b/be/src/snii/writer/compact_posting_pool.h new file mode 100644 index 00000000000000..ceeb150faffc4f --- /dev/null +++ b/be/src/snii/writer/compact_posting_pool.h @@ -0,0 +1,180 @@ +#pragma once + +#include +#include +#include + +namespace snii::writer { + +// SEGMENTED BYTE ARENA with per-term SLICED runs (a ByteBlockPool, after Lucene). +// +// WHY: the SPIMI accumulator's bulk memory is the per-term posting bytes. Backing +// each term with its own std::vector pays two taxes that dominate peak +// RSS at scale: (1) geometric-growth doubling slack (~1.17x of the live payload), +// and (2) a 24-32 B vector/struct header per term (hundreds of thousands of +// terms). This pool removes both: all term bytes live in a few large fixed-size +// blocks (so slack is ~one block, amortized to ~1.05x), and a term needs only two +// 32-bit cursors of live state (chain head for reads + write head for appends). +// +// HOW (slices): a term's bytes are not stored contiguously. They live in a chain +// of SLICES of geometrically growing payload capacity (the kSliceSizes schedule: +// 4, 8, 16, ... bytes of payload). 
Each slice is laid out as +// [ payload bytes ... ][ 4-byte forward pointer ] +// The forward pointer holds the absolute offset of the next slice's first payload +// byte (0 while the slice is still the tail of the chain). When a slice's payload +// region fills, the writer allocates a larger slice, stores its head into the old +// slice's 4 pointer bytes, and keeps appending. A reader walks the chain by +// reading payload bytes until a slice boundary, then following the pointer. +// +// Both writer and reader recompute each slice's capacity from the chain's slice +// INDEX (0, 1, 2, ...) via the deterministic schedule, so neither needs to store +// per-slice sizes. The writer carries the current slice's end offset in its +// SliceWriter handle; the reader recomputes capacities as it advances. +// +// Offsets are GLOBAL absolute byte indices into the logical concatenation of all +// blocks: offset = block_index * kBlockSize + byte_in_block. kBlockSize is a power +// of two, so offset -> (block, byte) is a shift/mask. +class CompactPostingPool { +public: + // Block size (power of two). 32 KiB blocks keep per-block tail waste tiny (it + // matters at the smaller 1M scale where the whole arena is only tens of MiB) and + // bound the outer vector header cost; at the 5M scale a few thousand + // blocks is still cheap. Empirically the lowest peak across both scales. + static constexpr uint32_t kBlockShift = 15; + static constexpr uint32_t kBlockSize = 1u << kBlockShift; // 32 KiB + static constexpr uint32_t kBlockMask = kBlockSize - 1; + + // Per-slice forward-pointer width (absolute uint32 next-slice offset). + static constexpr uint32_t kPtrBytes = 4; + + // Geometric slice payload-capacity schedule and the level transition. Level i + // slices hold kSliceSizes[i] payload bytes; on overflow the chain advances to + // kNextLevel[i] (capping at the largest level). 
A GENTLE (~1.5x) many-level + // schedule starting small minimizes the over-allocated final slice (the + // dominant arena overhead) while keeping the per-slice forward-pointer count + // bounded for high-df chains. + static constexpr int kLevelCount = 16; + + CompactPostingPool(); + + CompactPostingPool(const CompactPostingPool&) = delete; + CompactPostingPool& operator=(const CompactPostingPool&) = delete; + + // Payload capacity (bytes) of a fresh level-0 slice. Exposed for tests that need + // to fill exactly one slice without hardcoding the schedule. + static uint32_t kSliceSizes_level0(); + + // Payload capacity of the slice at `level`, and the level a chain advances to when + // that slice overflows. Exposed (like kSliceSizes_level0) so tests can simulate the + // arena's bump allocator exactly -- e.g. to construct an EXACT block-boundary fill -- + // without hardcoding the private schedule. `level` must be in [0, kLevelCount). + static uint32_t kSliceSize_at(int level); + static uint8_t kNextLevel_at(int level); + + // Live append handle for one term's chain. POD, 8 bytes: the absolute write + // cursor and the absolute end of the current slice's payload region. The chain's + // current slice LEVEL is kept by the caller (a uint8, packed alongside its other + // flags) so this handle stays 8 bytes -- shaving the per-term accumulator. `head` + // (the chain's first payload offset) is also stored by the CALLER (the read entry + // point); start_chain returns it. + struct SliceWriter { + uint32_t cur = 0; // next byte to write (absolute) + uint32_t slice_end = 0; // one-past-last payload byte of the current slice + }; + + // Begins a fresh chain, initializing `w` to its first (level-0) slice and + // *level to 0, and returns the chain head (absolute first payload offset). 
+ uint32_t start_chain(SliceWriter* w, uint8_t* level); + + // Appends one payload byte to the chain described by `w` / `*level`, growing the + // chain with a new linked slice (and advancing *level) when the current slice's + // payload region is exhausted. + void append_byte(SliceWriter* w, uint8_t* level, uint8_t value); + + // Total live payload bytes ever written across all chains (excludes slice + // forward-pointer overhead). Drives the spill-threshold estimate only. + uint64_t payload_bytes() const { return payload_bytes_; } + + // Bytes the arena currently occupies (block_count * kBlockSize). The pool + // addresses bytes with a uint32 offset (next_offset_), so the arena MUST stay + // below 4 GiB or alloc_run wraps and silently aliases block 0. The accumulator + // watches this to force a safety spill before the wrap; alloc_run also enforces it + // directly (throws std::overflow_error on a would-be wrap) so a direct user of the + // pool fails loudly rather than silently corrupting. + // Hard invariant: a single CompactPostingPool never exceeds UINT32_MAX bytes. + uint64_t arena_bytes() const { return static_cast(blocks_.size()) << kBlockShift; } + + // Releases ALL blocks back to the OS. Called after the accumulator is fully + // drained (or before a spill's next fill) so no input-side bytes stay resident. + void reset(); + + // ---- Reader ---------------------------------------------------------------- + // Forward cursor over one term's chain, yielding its payload bytes in write + // order by walking the slice forward pointers. + // + // CONTRACT of the `budget` ctor argument (single, unambiguous meaning): + // `budget` is an UPPER BOUND on the number of bytes this cursor may yield. It + // is NOT required to equal the exact payload length: passing the exact length + // is fine, and so is passing any value >= it (the production caller passes the + // chain's write-head offset, which always bounds the payload from above). 
The + // cursor is SELF-TERMINATING: once it walks off the last written byte it sees + // the tail slice's zero forward pointer and stops, regardless of how much + // budget remains. So an over-large budget can never make next() read past the + // chain (no aliasing of block 0, no off-chain access) -- the budget is purely a + // secondary cap. has_next() is therefore a reliable "more bytes remain" + // predicate for ANY budget >= the true length: it becomes false at the smaller + // of (budget exhausted, chain tail reached). + class Cursor { + public: + Cursor(const CompactPostingPool* pool, uint32_t head, uint64_t budget); + + // True while the cursor can still yield a REAL payload byte: the budget is not + // spent AND the cursor has not reached the chain tail. It peeks the tail forward + // pointer at a slice boundary so it never reports a phantom trailing byte, making + // has_next()/next() a safe loop for any budget >= the true payload length. + bool has_next() const; + // Yields the next payload byte. Returns 0 (and yields no more) once the chain + // tail is reached or the budget is spent -- never reads past the chain. + uint8_t next(); + + private: + const CompactPostingPool* pool_; + uint32_t cur_; // absolute read cursor + uint32_t slice_end_; // one-past-last payload byte of the current slice + uint32_t level_; // current slice level + uint64_t budget_; // remaining byte budget (upper bound on bytes to yield) + }; + + // Builds a cursor over the chain at `head`. `budget` is an UPPER BOUND on bytes to + // read (see Cursor's contract): the exact payload length or anything larger. The + // production caller passes the write-head offset, which always bounds the payload + // from above; the cursor self-terminates at the chain tail regardless. 
+ Cursor cursor(uint32_t head, uint64_t budget) const { return Cursor(this, head, budget); } + +private: + static const uint32_t kSliceSizes[kLevelCount]; + static const uint8_t kNextLevel[kLevelCount]; + + uint8_t* at(uint32_t off) { return &blocks_[off >> kBlockShift][off & kBlockMask]; } + const uint8_t* at(uint32_t off) const { return &blocks_[off >> kBlockShift][off & kBlockMask]; } + + // Reads/writes the 4-byte forward pointer at the END of a slice whose payload + // region ends at `slice_end` (pointer occupies [slice_end, slice_end+4)). + uint32_t read_ptr(uint32_t slice_end) const; + void write_ptr(uint32_t slice_end, uint32_t next_head); + + // Reserves `bytes` contiguous bytes from the arena tail (a fresh block if the + // current tail cannot hold them) and returns the first reserved absolute offset. + // `bytes` must be <= kBlockSize. + uint32_t alloc_run(uint32_t bytes); + + // Allocates a slice at `level` (payload region + 4 pointer bytes), zeroes its + // forward pointer, and returns the first payload offset; sets *slice_end. 
+ uint32_t alloc_slice(int level, uint32_t* slice_end); + + std::vector> blocks_; // fixed kBlockSize blocks + uint32_t next_offset_ = 0; // global bump pointer (absolute) into the tail block + uint64_t payload_bytes_ = 0; +}; + +} // namespace snii::writer diff --git a/be/src/snii/writer/logical_index_writer.h b/be/src/snii/writer/logical_index_writer.h new file mode 100644 index 00000000000000..03fbe7994918a7 --- /dev/null +++ b/be/src/snii/writer/logical_index_writer.h @@ -0,0 +1,238 @@ +#pragma once + +#include +#include +#include + +#include "snii/common/status.h" +#include "snii/encoding/byte_sink.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/per_index_meta.h" +#include "snii/format/sampled_term_index.h" +#include "snii/format/stats_block.h" +#include "snii/io/file_writer.h" +#include "snii/writer/memory_reporter.h" +#include "snii/writer/spillable_byte_buffer.h" +#include "snii/writer/spimi_term_buffer.h" + +// LogicalIndexWriter -- builds the per-logical-index section bytes (interleaved +// posting region + DICT block region) and the meta sub-sections (SampledTermIndex, +// DICT block directory, StatsBlock, XFilter) for ONE logical index. It owns the +// in-memory section bytes and the metadata needed by the container orchestrator +// (SniiCompoundWriter) to resolve absolute offsets and emit the per-index meta +// block. +// +// This module deliberately produces ONLY relative bytes/structures: it has no +// knowledge of the absolute file position where the sections will land. The +// orchestrator stitches the absolute offsets in afterward (append-only, no +// seek-back). See snii_compound_writer.h for the precise offset contract. +// +// POSTING REGION (single interleaved sink): the former separate .frq POD and .prx +// POD are merged into ONE posting region. 
For each pod_ref term, in term order, the +// writer appends its prx span FIRST then its frq span, contiguously: +// posting region = concat over pod_ref terms of [prx span][frq span]. +// The prx span is empty when !has_prx (docs-only / keyword tier). INLINE terms +// append NOTHING to the posting region. +// +// Per-term encoding policy (v1): +// df >= kSlimDfThreshold (512): WINDOWED pod_ref. The term's [prx windows] are +// appended to the posting region first, then its [prelude][dd-block][freq-block] +// frq span. The DictEntry records frq/prx off_delta+len relative to +// frq_base/prx_base (see below). +// df < kSlimDfThreshold: SLIM. The postings are encoded as a single .frq +// window (and .prx window). If the encoded .frq bytes are small +// (<= kDefaultInlineThreshold), they are stored INLINE inside the DictEntry +// (kind=inline); otherwise the term's [prx][frq] spans are appended to the +// posting region as a slim pod_ref (kind=pod_ref, enc=slim, no prelude). +// +// frq_base / prx_base convention (DOCUMENTED CONTRACT): +// For each DICT block, frq_base == prx_base == the running byte offset into THIS +// index's posting region at the moment the block opens (the posting-region size +// when the block's first POD-backed entry is appended). A windowed/slim pod_ref +// entry then sets frq_off_delta = (offset of its frq span within the posting +// region) - frq_base, so the reader computes the absolute file offset as +// section_refs.posting_region.offset + frq_base + frq_off_delta. +// prx_base / prx_off_delta follow the identical rule against the SAME region. +// Because [prx][frq] are written contiguously per term, a writer-side property +// holds when has_prx: frq_off_delta == prx_off_delta + prx_len. The reader does +// NOT rely on it -- each delta is resolved independently. +// Inline entries carry no off_delta (bytes live in the entry). +namespace snii::writer { + +// Inputs describing one logical index to be written. 
+struct SniiIndexInput {
+    // Identity of the logical index. LogicalIndexWriter exposes an id/suffix pair via
+    // index_id() / index_suffix() -- presumably copied from these two fields.
+    uint64_t index_id = 0;
+    std::string index_suffix;
+    // What is indexed for this logical index; defaults to docs + positions.
+    snii::format::IndexConfig config = snii::format::IndexConfig::kDocsPositions;
+    // Number of documents; also the expected length of `encoded_norms` (see below).
+    uint32_t doc_count = 0;
+    // Docids whose value is null -- presumably the source of the writer's null-bitmap
+    // section (TODO confirm against the .cpp).
+    std::vector null_docids;
+    // Per-doc 1-byte encoded norm (length doc_count); only consumed when the
+    // config has scoring. May be empty otherwise.
+    std::vector encoded_norms;
+    // Lexicographically sorted terms with ascending-docid postings. Used when
+    // `term_source` is null (callers that already hold a materialized vector,
+    // e.g. unit tests). The writer reads but does not retain these.
+    // NOTE(review): LogicalIndexWriter declares `terms_` and `encoded_norms_` as
+    // reference members, so "does not retain" presumably means "does not copy": this
+    // struct most likely has to outlive the writer's build() -- confirm.
+    std::vector terms;
+    // Optional streaming term source. When non-null, the writer DRAINS it via
+    // SpimiTermBuffer::for_each_term_sorted so that only one term's postings is
+    // materialized at a time (avoiding the full TermPostings vector and its
+    // second-copy peak). `terms` is ignored when this is set. The buffer is
+    // consumed (emptied) by build(); the caller must keep it alive until build()
+    // returns and must not reuse it afterwards.
+    SpimiTermBuffer* term_source = nullptr;
+    // Target DICT block size in bytes; a block is cut once its estimate reaches
+    // this. 0 uses kDefaultTargetDictBlockBytes. Smaller values yield more blocks
+    // (and a finer-grained sampled-term index).
+    uint32_t target_dict_block_bytes = 0;
+    // Optional writer-level build-RAM reporter (one per SniiCompoundWriter = one
+    // segment inverted index). When non-null, the dict buffer reports its REAL
+    // resident-byte deltas (positive on grow, negative on spill). The SPIMI side
+    // (arena + slot index) reports through the SAME reporter, injected directly at
+    // the term_source's construction by the caller. null in bench / unit tests -> no
+    // reporting. NEVER report live_bytes_ (a gated estimate); report
+    // arena_bytes()+slot_of_+dict ram_bytes_.
+    MemoryReporter* mem_reporter = nullptr;
+};
+
+// Builds and holds the section bytes + meta sub-sections for one logical index.
+class LogicalIndexWriter {
+public:
+    // NOTE(review): `terms_` and `encoded_norms_` below are reference members, so they
+    // presumably bind to fields of `in`; if so, `in` must stay alive until build()
+    // has returned -- confirm against the constructor definition.
+    explicit LogicalIndexWriter(const SniiIndexInput& in);
+
+    // Builds DICT blocks, the interleaved posting region, sampled-term index, dict
+    // directory, stats and bsbf. The posting region is written STRAIGHT into
+    // `posting_out` as terms are produced (no temp round-trip for the bulk); the
+    // orchestrator captures its absolute offset/length from posting_out->bytes_written()
+    // around this call. Must be called once before the accessors below. Returns
+    // InvalidArgument on a null sink or inconsistent input (e.g. norms/doc_count
+    // mismatch when scoring is enabled, or non-ascending docids).
+    Status build(snii::io::FileWriter* posting_out);
+
+    // DICT region byte length (relative; orchestrator decides its absolute offset). The
+    // DICT region (zstd-compressed blocks) is built into a tiered buffer during build()
+    // -- it must land contiguously AFTER the posting region (streamed concurrently), so
+    // it cannot stream directly. The buffer stays in RAM while small (spill-only build)
+    // and spills to a temp once it crosses the RAM cap (bounded peak RSS for a huge
+    // dict). Its bytes are emitted via stream_dict_region_into below. The posting region
+    // went straight to the output during build(), so it has no length accessor here --
+    // the orchestrator measures it directly. norms stays in RAM (1 byte/doc).
+    uint64_t dict_region_size() const { return dict_buf_.size(); }
+    // Section bytes held in RAM; the returned references are views into this writer.
+    const std::vector& norms_bytes() const { return norms_section_; }
+    const std::vector& null_bitmap_bytes() const { return null_bitmap_section_; }
+    // Block-split bloom XFilter blob ([28B header][bitset]); empty when no terms.
+    const std::vector& bsbf_bytes() const { return bsbf_bytes_; }
+    // Presence tests: a section counts as present exactly when its byte vector is
+    // non-empty.
+    bool has_bsbf() const { return !bsbf_bytes_.empty(); }
+    bool has_null_bitmap() const { return !null_bitmap_section_.empty(); }
+
+    // Streams the DICT region (RAM or spilled temp) into the append-only container
+    // after its posting region.
+    Status stream_dict_region_into(snii::io::FileWriter* out) const {
+        return dict_buf_.stream_into(out);
+    }
+
+    // Plain getters for the per-index flags / identity stored in the members below.
+    bool has_prx() const { return has_prx_; }
+    bool has_norms() const { return has_norms_; }
+    snii::format::IndexTier tier() const { return tier_; }
+    uint64_t index_id() const { return index_id_; }
+    const std::string& index_suffix() const { return index_suffix_; }
+
+    // Builds the per-index meta block bytes given the resolved ABSOLUTE section
+    // refs (filled by the orchestrator), appending them to out. The DICT block
+    // directory entries are rebased to absolute offsets using dict_region_offset.
+    Status finish_meta(const snii::format::SectionRefs& abs_refs, uint64_t dict_region_offset,
+                       ByteSink* out) const;
+
+private:
+    // One DICT block's directory record. The block's serialized bytes are appended to
+    // the in-RAM dict buffer as soon as the block is cut; only this compact summary
+    // (offset within the dict region + length + entry count + checksum) is kept to
+    // build the DICT block directory at finish_meta time. The absolute file offset is
+    // computed as dict_region_offset + rel_offset.
+    struct BlockRecord {
+        uint64_t rel_offset = 0; // byte offset of this block within the dict region
+        uint64_t length = 0;     // ON-DISK block length (compressed when flags&kZstd)
+        uint32_t n_entries = 0;
+        uint32_t checksum = 0;   // crc32c of the UNCOMPRESSED block bytes
+        uint8_t flags = 0;       // block_ref_flags::* (kZstd when block is compressed)
+        uint64_t uncomp_len = 0; // uncompressed block length (when flags&kZstd)
+        std::string first_term;
+    };
+
+    // Validates one term's shape (parallel lengths, strictly ascending docids).
+    Status validate_term(const TermPostings& tp) const;
+    // Iterates terms (from the streaming source or the materialized vector),
+    // splitting DICT blocks by target size and filling PODs + blocks_.
+    Status build_blocks();
+    // Per-term driver shared by both the streaming and materialized paths:
+    // validates the term, opens a block if needed, builds its DictEntry, and cuts
+    // the block once it reaches the target size. Mutates the running block state.
+    struct BlockState;
+    // `tp` is taken by mutable reference: the encode FREES the term's large flat
+    // arrays (docids/freqs/positions_flat) as soon as they are consumed, so the
+    // widest term's source does not co-exist with its encoded output at peak RSS.
+    Status process_term(TermPostings& tp, BlockState* st);
+    // Region-relative byte count of the posting bytes written so far (the offset basis
+    // for frq_base/prx_base + frq_off_delta/prx_off_delta). During build() the only
+    // writes to posting_out_ are this index's posting region, so the count is the
+    // output offset advanced since the region began.
+    // NOTE(review): dereferences posting_out_, which is nullptr until it is assigned
+    // (presumably at the start of build()); only valid while a build is in progress.
+    uint64_t posting_size() const { return posting_out_->bytes_written() - posting_off0_; }
+    // Builds one DictEntry (inline or pod_ref), growing the posting region as needed.
+    Status build_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                       snii::format::DictEntry* e);
+    // Builds a windowed (df >= kSlimDfThreshold) entry: multi-window + two-level
+    // prelude. The term's [prx span][frq span] is appended to the posting region.
+    Status build_windowed_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                                snii::format::DictEntry* e);
+    // Builds a slim (df < kSlimDfThreshold) entry: single window, inline or
+    // pod_ref, no prelude.
+    Status build_slim_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base,
+                            snii::format::DictEntry* e);
+    // Serializes the current open block, hands its bytes to the DICT staging buffer,
+    // and records a compact directory entry (no block bytes retained here).
+    // NOTE(review): this comment used to say "dict scratch file"; the only DICT
+    // staging member declared in this class is dict_buf_ (RAM first, temp file only
+    // after a spill), so that is presumably the destination -- confirm in the .cpp.
+    Status flush_block(snii::format::DictBlockBuilder* block, std::string first_term);
+
+    uint64_t index_id_;
+    std::string index_suffix_;
+    snii::format::IndexTier tier_;
+    bool has_prx_;
+    bool has_freq_; // tier >= T2: a freq region is encoded per window
+    bool has_norms_;
+    uint32_t doc_count_;
+    std::vector null_docids_;
+    const std::vector& terms_;     // materialized fallback (may be empty)
+    SpimiTermBuffer* term_source_; // streaming source (null => use terms_)
+    uint64_t term_count_ = 0;      // distinct terms actually consumed
+    const std::vector& encoded_norms_;
+
+    uint32_t target_dict_block_bytes_;
+    // The DICT region (zstd-compressed blocks) is staged here as blocks flush. It must
+    // land contiguously AFTER the posting region (which streams concurrently to the
+    // output), so it cannot stream directly; the orchestrator streams it into the
+    // container right after the posting region. It has NO independent local cap -- it
+    // spills to a temp via the writer's UNIFIED gate-2 cap (the MemoryReporter from
+    // SniiIndexInput, null off-Doris), the same single cap the SPIMI arena uses, so one
+    // threshold bounds the writer's total build RAM. The dict self-reports its ram_bytes_
+    // deltas; the SPIMI term_source self-reports its arena+slot deltas (its reporter is
+    // injected at the source's own construction by the caller).
+    SpillableByteBuffer dict_buf_;
+    // The interleaved [prx][frq] posting region streams STRAIGHT into the container
+    // output during build() -- no temp. posting_out_ is the container writer (borrowed
+    // for the duration of build); posting_off0_ is its absolute offset when this index's
+    // region began, so posting_size() = bytes_written() - posting_off0_.
+    snii::io::FileWriter* posting_out_ = nullptr;
+    uint64_t posting_off0_ = 0;
+    std::vector norms_section_;
+    std::vector null_bitmap_section_;
+
+    std::vector blocks_;
+    // One 8-byte XXH64 (seed 0) filter key per term, collected during the build pass
+    // so the whole-vocabulary string copy is never retained.
+    std::vector term_hashes_;
+    snii::format::StatsBlock stats_;
+    std::vector bsbf_bytes_; // serialized block-split bloom XFilter section
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/memory_reporter.h b/be/src/snii/writer/memory_reporter.h
new file mode 100644
index 00000000000000..e9352d43d18e61
--- /dev/null
+++ b/be/src/snii/writer/memory_reporter.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace snii::writer {
+
+// Per-WRITER accurate byte counter for build-time RAM (one per SniiCompoundWriter =
+// one per segment's inverted index). Modules report their own resident-byte deltas;
+// current_bytes() is that writer's accurate live usage. OBSERVE-ONLY -- SNII never
+// makes a flush decision from it (gate 1 belongs to Doris; gate 2 is the internal
+// threshold). consume_release mirrors the delta into Doris's LOAD MemTracker so the
+// inverted-index RAM is counted by MemTableMemoryLimiter's pressure decision; it is
+// null off-Doris (bench / unit tests), where only the local atomic is updated.
+class MemoryReporter {
+public:
+    using ConsumeReleaseFn = std::function; // null off-Doris
+    // cap_bytes is the UNIFIED gate-2 buffer cap for the WHOLE writer (e.g. Doris's
+    // 512 MiB inverted-index buffer config); 0 = unlimited. Every build buffer of this
+    // writer (SPIMI arena + dict) self-spills when over_cap() is true -- one threshold on
+    // the unified total, not a separate per-buffer threshold.
+    explicit MemoryReporter(ConsumeReleaseFn consume_release = nullptr, uint64_t cap_bytes = 0)
+            : consume_release_(std::move(consume_release)), cap_bytes_(cap_bytes) {}
+
+    // Non-copyable: buffers hold a pointer to their writer's single reporter.
+    MemoryReporter(const MemoryReporter&) = delete;
+    MemoryReporter& operator=(const MemoryReporter&) = delete;
+
+    // delta > 0 grows, delta < 0 shrinks/frees. Exactly one report per change site.
+    // The local counter is updated first, then the optional callback runs. The two
+    // steps are not one atomic unit, and the relaxed ordering makes the counter a
+    // statistic rather than a synchronization point.
+    void report(int64_t delta) {
+        current_.fetch_add(delta, std::memory_order_relaxed);
+        if (consume_release_) consume_release_(delta); // mirror into Doris load tracker
+    }
+
+    // Current sum of all reported deltas (relaxed load).
+    int64_t current_bytes() const { return current_.load(std::memory_order_relaxed); }
+
+    // True once the writer's UNIFIED total build RAM (arena + slot index + dict + ...)
+    // reaches the cap. The single gate-2 trigger shared by every buffer of the writer.
+    // A cap of 0 means "unlimited" and never trips.
+    // NOTE(review): cap_bytes_ is uint64_t but is cast for comparison with the signed
+    // current_bytes(). If the cast target is a signed 64-bit type, a cap above
+    // INT64_MAX would turn negative and over_cap() would be true whenever the counter
+    // is >= 0 -- confirm callers never pass such a cap (e.g. UINT64_MAX as "no cap").
+    bool over_cap() const {
+        return cap_bytes_ != 0 && current_bytes() >= static_cast(cap_bytes_);
+    }
+    uint64_t cap_bytes() const { return cap_bytes_; }
+
+private:
+    std::atomic current_ {0};          // running sum of reported deltas
+    ConsumeReleaseFn consume_release_; // optional mirror callback (may be empty)
+    uint64_t cap_bytes_ = 0;           // unified cap; 0 = unlimited
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/snii_compound_writer.h b/be/src/snii/writer/snii_compound_writer.h
new file mode 100644
index 00000000000000..bd3a7c454026ad
--- /dev/null
+++ b/be/src/snii/writer/snii_compound_writer.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include "snii/common/status.h"
+#include "snii/io/file_writer.h"
+#include "snii/writer/logical_index_writer.h"
+
+// SniiCompoundWriter -- orchestrates a single-segment SNII container for one or
+// more logical indexes, written front-to-back through an append-only
+// io::FileWriter (no seek-back). It resolves all back-references by writing the
+// tail meta region and the fixed tail pointer LAST.
+//
+// CONTAINER LAYOUT PRODUCED (this is the on-disk contract the reader matches):
+//   [bootstrap_header]            (kBootstrapHeaderSize bytes)
+//   for each logical index, in add order:
+//     [posting region]            interleaved [prx][frq] per pod_ref term, term order
+//                                 (prx span empty when !has_prx)
+//     [DICT blocks region]        concatenated DICT blocks, split by
+//                                 target_dict_block_bytes
+//   for each logical index, in add order:
+//     [norms POD]                 NormsPodWriter::finish (scoring only; else absent)
+//     [null bitmap POD]           NullBitmapWriter::finish (when nulls exist)
+//   [tail_meta_region]            one per_index_meta block per index + directory
+//   [tail_pointer]                encode_tail_pointer at EOF
+//
+// (The posting region is streamed BEFORE the DICT region per index: postings are
+// the large append-only term-ordered stream; the DICT region is the compact
+// compressed trailer.)
+//
+// OFFSET CONVENTIONS (ABSOLUTE file offsets unless stated otherwise):
+//   - SectionRefs in each per_index_meta record ABSOLUTE file offset+length of
+//     that index's posting_region, dict_region, norms. Absent regions are (0,0)
+//     (e.g. norms for a docs-positions index; null_bitmap is always (0,0) in v1).
+//     A present-but-empty posting_region (all-INLINE index) is (off, 0).
+//   - DictBlockDirectory entries record each DICT block's ABSOLUTE file offset +
+//     length.
+//   - A windowed/slim pod_ref entry's absolute .frq offset =
+//       section_refs.posting_region.offset + frq_base + frq_off_delta
+//     where frq_base is the posting-region-relative running offset captured at the
+//     block's open (see logical_index_writer.h). prx follows the identical rule
+//     against the SAME region (prx_base == frq_base).
+//   - tail_pointer.meta_region_offset/length point at the tail_meta_region;
+//     hot_off = 0 (no hot region in v1).
+namespace snii::writer {
+
+class SniiCompoundWriter {
+public:
+    // `out` is stored as a raw pointer (out_), i.e. borrowed -- presumably it must
+    // outlive this writer (TODO confirm).
+    explicit SniiCompoundWriter(snii::io::FileWriter* out);
+
+    // Buffers one logical index: builds its section bytes and meta sub-sections.
+    // The actual file writing happens in finish() (single front-to-back pass).
+    // NOTE(review): this looks stale. The placements_ comment below says each index's
+    // posting region streams in during add_logical_index, and
+    // LogicalIndexWriter::build() is documented as writing the posting region straight
+    // into the output. Presumably only the DICT region, norms, null bitmap and tail
+    // are deferred to finish() -- confirm and reword.
+    Status add_logical_index(const SniiIndexInput& in);
+
+    // Writes bootstrap header + all index sections + norms + tail meta region +
+    // tail pointer, then finalizes the underlying writer. May be called once.
+    Status finish();
+
+private:
+    // Absolute placement of one index's sections, resolved during finish().
+    // NOTE(review): null_off/null_len and bsbf_off/bsbf_len exist here, yet the
+    // header comment of this file says null_bitmap is always (0,0) in v1 and the
+    // layout lists no bsbf section -- one of the two is presumably out of date.
+    struct Placement {
+        uint64_t dict_off = 0;
+        uint64_t dict_len = 0;
+        uint64_t post_off = 0; // interleaved [prx][frq] posting region (was frq + prx)
+        uint64_t post_len = 0;
+        uint64_t norms_off = 0;
+        uint64_t norms_len = 0;
+        uint64_t null_off = 0;
+        uint64_t null_len = 0;
+        uint64_t bsbf_off = 0;
+        uint64_t bsbf_len = 0;
+    };
+
+    Status ensure_bootstrap();
+    Status write_bootstrap();
+    Status write_norms();
+    Status write_tail();
+    Status append(const std::vector& bytes);
+
+    snii::io::FileWriter* out_;
+    std::vector> indexes_;
+    // Per-index placement; post_off/post_len are filled as each index's posting region
+    // streams in during add_logical_index, the rest during finish(). The absolute write
+    // offset is out_->bytes_written() (the single source of truth -- no separate cursor).
+    std::vector placements_;
+    bool bootstrap_written_ = false;
+    bool finished_ = false;
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/spill_run_codec.h b/be/src/snii/writer/spill_run_codec.h
new file mode 100644
index 00000000000000..d79381aa67184f
--- /dev/null
+++ b/be/src/snii/writer/spill_run_codec.h
@@ -0,0 +1,181 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "snii/common/status.h"
+#include "snii/writer/spimi_term_buffer.h"
+
+namespace snii::writer {
+
+// On-disk SPIMI "run" codec for the spill / k-way-merge out-of-core build path.
+//
+// A RUN is a self-describing file holding a sequence of terms keyed by TERM-ID,
+// each followed by its postings, in this exact wire layout.
The file is produced +// and consumed by THIS module only (a private temp file -- the on-disk INDEX is +// unaffected), so the format is chosen for cheap I/O: docids, freqs and positions +// are ALL RAW fixed-width little-endian u32 BLOCKS (bulk memcpy on both ends, +// ~10x cheaper than per-value varint -- which cost ~1.5s of encode CPU over the +// 5M build's ~60M docids and compressed those streams poorly anyway). Decode +// still validates every length against the file size. +// +// run := record* (term-ids ordered by vocab string, +// strictly ascending within a run) +// record := +// VInt term_id (index into the shared vocabulary; the +// string is NOT stored -- smaller runs, +// no per-record string IO) +// VInt n_docs +// u32 docid * n_docs (RAW LE block, memcpy; ABSOLUTE ascending +// docids -- the merge concatenates across +// runs and re-deltas at index encode time) +// u32 freq * n_docs (RAW LE block, memcpy; each >= 1) +// VInt n_pos (== sum(freqs) when has_positions, else 0) +// u32 position * n_pos (RAW LE block, document-order, partitioned +// by freqs) +// +// Decode is fully STREAMED: a RunReader reads a small fixed buffer at a time and +// materializes only the CURRENT term's postings, never the whole run. The k-way +// merge keeps one heap slot per run (each holding only its current term-id + +// that term's postings), so peak memory is bounded by the widest single term +// summed across the runs that contain it -- not by total postings. The merge +// orders runs by the term-id's VOCAB STRING (resolved via the shared vocabulary) +// so the merged stream is lexicographic. + +// Writes a sorted sequence of terms (by id) to one run file. Term-ids must be +// handed to write_term in vocab-string ascending order (the spill caller sorts +// before spilling). RAII: the file is flushed and closed on close(); the partial +// file is left for the owning SpimiTermBuffer to delete on its temp-path list. 
+class RunWriter {
+public:
+    RunWriter() = default;
+    ~RunWriter();
+
+    // Non-copyable: the writer holds a raw file descriptor (fd_).
+    RunWriter(const RunWriter&) = delete;
+    RunWriter& operator=(const RunWriter&) = delete;
+
+    // Opens `path` for writing (truncating). Returns IoError on failure.
+    Status open(const std::string& path);
+
+    // Appends one term's postings under `term_id`. `tp.positions_flat` must be empty
+    // iff !has_positions (and otherwise hold sum(freqs) entries in doc order).
+    // Caller guarantees ascending docids and parallel docids/freqs lengths.
+    // NOTE(review): this class takes no has_positions flag, so the n_pos field is
+    // presumably derived from tp.positions_flat.size() -- confirm in the .cpp.
+    Status write_term(uint32_t term_id, const TermPostings& tp);
+
+    // Flushes the buffer and closes the file. Safe to call once; idempotent.
+    Status close();
+
+private:
+    // Writes out the staged bytes in buf_ (presumably to fd_).
+    Status flush();
+
+    int fd_ = -1;    // presumably -1 means "not open" (TODO confirm)
+    std::vector buf_; // staging buffer; flushed in fixed-size chunks
+};
+
+// Streamed reader over one run file. After open() the first term is loaded;
+// current()/current_id() expose it; advance() loads the next (or marks
+// exhausted). Only the current term's postings live in memory at a time. The
+// current record's `term` string is left EMPTY -- runs store only the id; the
+// owner resolves the string via the shared vocabulary.
+//
+// LAZY POSITIONS (peak-RSS optimization for the widest merged term): advance()
+// loads term_id / docids / freqs and the position-block COUNT, but does NOT read
+// the position bytes -- it leaves the decode window cursor parked at the start of
+// the position block. The owner then chooses, per term:
+//   * materialize_positions(): bulk-reads the block into current().positions_flat
+//     (the default; behaves exactly as the old eager reader).
+//   * stream_positions(dst, n): pulls the next n positions straight from the
+//     window in 64 KiB chunks, never materializing the whole block -- used by the
+//     k-way merge's wide-term position pump so the widest term's tens-of-MiB
+//     positions buffer is never resident.
+// advance() drains any positions left unread from the previous term before the
+// next record, so a partly-streamed (or skipped) term still lands at the right
+// record boundary. The yielded byte sequence is identical either way.
+class RunReader {
+public:
+    RunReader() = default;
+    ~RunReader();
+
+    // Non-copyable: the reader holds a raw file descriptor (fd_) and a decode window.
+    RunReader(const RunReader&) = delete;
+    RunReader& operator=(const RunReader&) = delete;
+
+    // Opens `path`, loading the first record (if any). has_positions must match
+    // the writer's setting so n_pos is interpreted consistently.
+    Status open(const std::string& path, bool has_positions);
+
+    // NOTE(review): exhausted_ defaults to false, so on a default-constructed reader
+    // exhausted() is false while current() is an empty TermPostings. Callers
+    // presumably must check open()'s status before reading.
+    bool exhausted() const { return exhausted_; }
+    const TermPostings& current() const { return current_; }
+    uint32_t current_id() const { return current_id_; }
+
+    // Number of positions in the current term's (lazily-loaded) position block.
+    uint64_t current_pos_count() const { return pos_count_; }
+    // True once the current term's positions have been materialized OR fully
+    // streamed (i.e. nothing remains to read before advance()). It is also true
+    // whenever pos_remaining_ is 0 for any other reason (e.g. its initial value).
+    bool positions_drained() const { return pos_remaining_ == 0; }
+
+    // Materializes the current term's position block into current().positions_flat
+    // (bulk read). Idempotent within a term: a no-op once positions are drained.
+    Status materialize_positions();
+    // Streams the next `n` positions of the current term into dst[0..n) directly
+    // from the decode window (64 KiB chunks topped up on demand). Caller must not
+    // request more than positions_remaining(); each call advances the cursor.
+    Status stream_positions(uint32_t* dst, size_t n);
+    uint64_t positions_remaining() const { return pos_remaining_; }
+
+    // Loads the next record into current(); sets exhausted() at end of file. Any
+    // positions of the current term left unread are skipped first.
+    Status advance();
+
+private:
+    size_t available() const;        // buffered bytes from pos_ to window end
+    Status fill();                   // tops up the decode window from disk
+    Status ensure(size_t n);         // guarantees >= n buffered bytes (or eof)
+    Status read_varint(uint64_t* v); // bounds-checked streamed varint
+    // Bulk-reads `count` RAW little-endian u32s from the window into `out` (resized
+    // to count). Bounds-checked against the run's true length (Corruption on EOF).
+    Status read_raw_u32(size_t count, std::vector* out);
+    // Streams `count` raw u32s from the window into dst (caller-owned, sized by the
+    // caller); shared by read_raw_u32 (into a vector) and stream_positions.
+    // NOTE(review): dst is a byte pointer while `count` is in u32 units, so dst
+    // presumably has to hold count * 4 bytes -- confirm.
+    Status pull_raw_u32(uint8_t* dst, size_t count);
+    // Drains (and discards) any remaining positions of the current term so the
+    // window cursor lands at the next record boundary.
+    Status skip_remaining_positions();
+
+    int fd_ = -1;
+    bool has_positions_ = false;
+    bool exhausted_ = false;
+    uint64_t file_size_ = 0;     // total run byte size (fstat at open); bounds lengths
+    std::vector window_;         // sliding decode window
+    size_t pos_ = 0;             // consumed offset within window_
+    bool eof_ = false;           // no more bytes on disk
+    uint32_t current_id_ = 0;    // current record's term-id
+    uint64_t pos_count_ = 0;     // current term's total position count (from n_pos)
+    uint64_t pos_remaining_ = 0; // positions still unread in the current block
+    TermPostings current_;
+};
+
+// K-way merges the given run files into a single term stream ordered by the
+// term-id's VOCAB STRING (lexicographic), invoking `fn` once per distinct
+// term-id with its postings concatenated across all runs that contain it (in
+// run order -> docids stay ascending) and its `term` resolved from `vocab`.
+// Only one merged term is materialized at a time. Returns IoError/Corruption on
+// bad run data. has_positions must match how the runs were written. `vocab`
+// maps term-id -> string and is borrowed.
+//
+// allow_stream_positions (peak-RSS optimization): when true (the streaming-writer
+// path), a WIDE merged term's positions are NOT materialized into positions_flat;
+// instead the TermPostings carries a pos_pump that streams positions in document
+// order straight from the run readers (which stay parked at this term's blocks
+// for the duration of the fn() call). `fn` MUST therefore consume each term
+// SYNCHRONOUSLY and must NOT retain the TermPostings past the call (the pump
+// references live readers freed when the merge advances). Callers that retain the
+// term (e.g. finalize_sorted) MUST pass false, so positions are always fully
+// materialized. The produced bytes are identical either way.
+//
+// Note the DEFAULT is true, i.e. the restrictive synchronous-consume contract
+// applies unless the caller opts out explicitly.
+Status MergeRuns(const std::vector& run_paths, const std::vector& vocab,
+                 bool has_positions, const std::function& fn,
+                 bool allow_stream_positions = true);
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/spillable_byte_buffer.h b/be/src/snii/writer/spillable_byte_buffer.h
new file mode 100644
index 00000000000000..0f5737e2bdd2f1
--- /dev/null
+++ b/be/src/snii/writer/spillable_byte_buffer.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "snii/common/slice.h"
+#include "snii/common/status.h"
+#include "snii/io/local_file.h"
+#include "snii/writer/memory_reporter.h"
+#include "snii/writer/temp_dir.h"
+
+namespace snii::writer {
+
+// A tiered append buffer for one build-time section. While resident it holds the
+// bytes as a CHAIN OF CHUNKS (one per append) rather than a single growing vector:
+// each append owns a right-sized allocation, so there is NO geometric-doubling
+// realloc transient and NO power-of-two capacity slack -- the resident cost is
+// exactly the bytes appended, for any section size.
Once the running size crosses
+// `cap_bytes` the buffer SPILLS to a temp file (resolve_temp_dir()) and routes later
+// appends there, so a huge section stays RSS-bounded at ~cap_bytes while a small one
+// is RAM-only (zero disk, spill-only build). append order/bytes are identical
+// wherever they land; stream_into() reproduces the section in order. RAII-removes the
+// temp. (cap_bytes == UINT64_MAX disables only the LOCAL cap: with a reporter
+// attached the buffer still spills once reporter->over_cap() is true, see over_cap().)
+class SpillableByteBuffer {
+public:
+    // `reporter` is an OPTIONAL writer-level build-RAM reporter (null off-Doris /
+    // unit tests). When non-null, every change to ram_bytes_ (the RESIDENT tier) is
+    // mirrored to it as a signed delta: a positive delta per RAM append, and a single
+    // negative delta == prior ram_bytes_ when the buffer spills (the resident chunks
+    // are dropped and the bytes move to disk, so they must NOT be counted as RSS).
+    // Spilled bytes live on disk and are never reported.
+    SpillableByteBuffer(uint64_t cap_bytes, std::string tag, MemoryReporter* reporter = nullptr)
+            : cap_bytes_(cap_bytes), tag_(std::move(tag)), reporter_(reporter) {}
+    ~SpillableByteBuffer() {
+        // Balance the reporter: on the common un-spilled path the resident ram_bytes_ was
+        // reported as positive on append but never released, so release it now (a missed
+        // negative would leak into Doris's MemTracker). After a spill, spill_to_disk()
+        // already reported the negative and ram_bytes_ no longer counts as resident.
+        if (reporter_ && !spilled_ && ram_bytes_ > 0) {
+            reporter_->report(-static_cast(ram_bytes_));
+        }
+        // The temp is removed by path here, before the temp_ member is destroyed; the
+        // result of std::remove is ignored (best effort).
+        if (!temp_path_.empty()) std::remove(temp_path_.c_str());
+    }
+    SpillableByteBuffer(const SpillableByteBuffer&) = delete;
+    SpillableByteBuffer& operator=(const SpillableByteBuffer&) = delete;
+
+    // Total bytes appended so far (the offset basis for callers recording sub-offsets).
+    uint64_t size() const { return spilled_ ?
spilled_bytes_ : ram_bytes_; }
+
+    // Copying append (the Slice bytes are copied into a fresh chunk). Once spilled, the
+    // bytes go straight to the temp and nothing is reported. An empty Slice adds no
+    // chunk but still evaluates the spill condition.
+    Status append(Slice bytes) {
+        if (spilled_) {
+            SNII_RETURN_IF_ERROR(temp_.append(bytes));
+            spilled_bytes_ += bytes.size();
+            return Status::OK();
+        }
+        if (!bytes.empty()) {
+            chunks_.emplace_back(bytes.data(), bytes.data() + bytes.size());
+            ram_bytes_ += bytes.size();
+            if (reporter_) reporter_->report(static_cast(bytes.size()));
+        }
+        if (over_cap()) return spill_to_disk();
+        return Status::OK();
+    }
+
+    // Move append: the section ADOPTS the caller's vector (no copy, no slack). The
+    // common dict path -- each flushed block is handed off by move. After a spill the
+    // vector is only read (written to the temp), not adopted.
+    Status append_move(std::vector&& v) {
+        if (spilled_) {
+            SNII_RETURN_IF_ERROR(temp_.append(Slice(v)));
+            spilled_bytes_ += v.size();
+            return Status::OK();
+        }
+        if (!v.empty()) {
+            // Size is read BEFORE the move below, while `v` still holds its bytes.
+            ram_bytes_ += v.size();
+            if (reporter_) reporter_->report(static_cast(v.size()));
+            chunks_.push_back(std::move(v));
+        }
+        if (over_cap()) return spill_to_disk();
+        return Status::OK();
+    }
+
+    // Must be called once after the last append, before stream_into(): flushes the temp
+    // (if spilled) so it can be read back. A no-op for a RAM-resident buffer.
+    Status seal() {
+        if (spilled_ && !sealed_) {
+            SNII_RETURN_IF_ERROR(temp_.finalize());
+            sealed_ = true;
+        }
+        return Status::OK();
+    }
+
+    // Streams the whole section (RAM chunks or sealed temp) into `out`, in append order.
+    // NOTE(review): sealed_ is not checked here. On a spilled buffer that was never
+    // seal()ed, the temp is read back without temp_.finalize() having run, so bytes
+    // still buffered in the writer may presumably be missing -- callers must seal().
+    Status stream_into(snii::io::FileWriter* out) const {
+        if (!spilled_) {
+            for (const auto& c : chunks_) {
+                if (!c.empty()) SNII_RETURN_IF_ERROR(out->append(Slice(c)));
+            }
+            return Status::OK();
+        }
+        snii::io::LocalFileReader r;
+        SNII_RETURN_IF_ERROR(r.open(temp_path_));
+        constexpr uint64_t kChunk = 1u << 20; // fixed copy window (no whole-section reload)
+        std::vector buf;
+        for (uint64_t off = 0; off < spilled_bytes_; off += kChunk) {
+            // The last window is clamped to the bytes that remain.
+            const uint64_t n = std::min(kChunk, spilled_bytes_ - off);
+            SNII_RETURN_IF_ERROR(r.read_at(off, n, &buf));
+            SNII_RETURN_IF_ERROR(out->append(Slice(buf)));
+        }
+        return Status::OK();
+    }
+
+    bool spilled() const { return spilled_; }
+
+private:
+    // Gate-2 spill condition (UNIFIED): spill when the writer's TOTAL build RAM crosses
+    // the one shared cap (reporter_->over_cap()), with the local cap_bytes_ kept only as
+    // a defensive per-buffer hard ceiling (e.g. when no reporter is attached).
+    // NOTE(review): unlike MemoryReporter, where a cap of 0 means "unlimited", a local
+    // cap_bytes_ of 0 makes `ram_bytes_ >= cap_bytes_` always true, so the buffer
+    // spills on its first append -- confirm no caller passes 0 expecting "no cap".
+    bool over_cap() const {
+        return (reporter_ != nullptr && reporter_->over_cap()) || ram_bytes_ >= cap_bytes_;
+    }
+    // Moves the resident chunks to a fresh temp file and switches to the spilled tier.
+    // The temp name combines the tag, the process id and this object's address.
+    // NOTE(review): on an early error return (open/append failure) temp_path_ stays
+    // set, spilled_ stays false and the chunks stay resident; a later append would
+    // re-enter here and re-open the same path -- confirm that is the intended recovery.
+    Status spill_to_disk() {
+        temp_path_ = resolve_temp_dir() + "/snii_" + tag_ + "_" + std::to_string(::getpid()) + "_" +
+                     std::to_string(reinterpret_cast(this)) + ".tmp";
+        SNII_RETURN_IF_ERROR(temp_.open(temp_path_));
+        for (const auto& c : chunks_) {
+            if (!c.empty()) SNII_RETURN_IF_ERROR(temp_.append(Slice(c)));
+        }
+        spilled_bytes_ = ram_bytes_;
+        // The resident tier is freed: report the full negative delta == prior ram_bytes_
+        // so the writer-level RAM counter (and Doris's LOAD tracker) no longer counts
+        // these bytes as RSS -- they now live on disk. This single negative balances the
+        // sum of all prior positive append deltas (net-zero RAM after spill).
+        if (reporter_) reporter_->report(-static_cast(ram_bytes_));
+        std::vector>().swap(chunks_); // reclaim the RAM immediately
+        spilled_ = true;
+        return Status::OK();
+    }
+
+    uint64_t cap_bytes_;
+    std::string tag_;
+    MemoryReporter* reporter_ = nullptr; // optional build-RAM reporter (null off-Doris)
+    std::vector> chunks_;                // resident tier: one chunk per append
+    uint64_t ram_bytes_ = 0;             // resident bytes; not reset after a spill
+    bool spilled_ = false;
+    bool sealed_ = false;
+    snii::io::LocalFileWriter temp_;
+    std::string temp_path_;
+    uint64_t spilled_bytes_ = 0;
+};
+
+} // namespace snii::writer
diff --git a/be/src/snii/writer/spimi_term_buffer.h b/be/src/snii/writer/spimi_term_buffer.h
new file mode 100644
index 00000000000000..d2b617ccfb4c69
--- /dev/null
+++ b/be/src/snii/writer/spimi_term_buffer.h
@@ -0,0 +1,362 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "snii/common/status.h"
+#include "snii/writer/compact_posting_pool.h"
+#include "snii/writer/memory_reporter.h"
+
+namespace snii::writer {
+
+// One term's posting list: docids ascending, with parallel freqs and (when
+// positions are enabled) a single FLAT positions buffer.
+//
+// positions_flat holds every position for the term in document order, partitioned
+// by freqs: doc i owns the next freqs[i] entries. This is the SAME layout the
+// accumulator stores natively, so no per-doc vector-of-vectors is ever built on
+// the build/merge hot path (that vector-of-vectors was the dominant peak-RSS
+// driver for high-df terms). doc_positions(i) returns a non-owning span view of
+// doc i's positions for consumers that want per-doc access (e.g. the prx window
+// builder, tests). positions_flat is empty when positions are disabled.
+struct TermPostings {
+    std::string term;
+    std::vector docids;
+    std::vector freqs;
+    std::vector positions_flat; // empty when positions disabled
+
+    // OPTIONAL streamed-positions source (peak-RSS optimization for very-high-df
+    // terms).
When set, positions_flat is left EMPTY and the writer pulls positions
+    // SEQUENTIALLY in document order via pos_pump(dst, n) -- filling `dst[0..n)` with
+    // the next n positions -- one window at a time, so the term's full flat positions
+    // buffer (tens of MiB for the widest term) is never materialized. The yielded
+    // bytes are byte-identical to building from positions_flat (same values, same
+    // order). pos_total is the total number of positions the pump will yield (==
+    // sum(freqs)); it lets the writer validate without a flat buffer. When pos_pump
+    // is null, positions come from positions_flat as before. Only the writer's prx
+    // builders consume this; all other consumers use positions_flat.
+    //
+    // OWNERSHIP CONTRACT (synchronous-consume-once): a streamed pos_pump captures
+    // references into the producer's stack and its parked run readers/arena, valid ONLY
+    // for the duration of the synchronous fn(TermPostings&&) call that delivered this
+    // TermPostings. The consumer MUST pull all positions inside fn() and MUST NOT store
+    // the TermPostings or invoke pos_pump after fn() returns. Callers that retain the
+    // TermPostings pass allow_stream_positions=false, which materializes positions into
+    // positions_flat instead (no pump). As a safety net, a deferred call to a streamed
+    // pump throws std::logic_error rather than dereferencing freed state.
+    std::function pos_pump;
+    uint64_t pos_total = 0;
+
+    // Index (counted in ELEMENTS of positions_flat, not bytes) of doc i's first
+    // position: the prefix sum freqs[0] + ... + freqs[doc_index - 1].
+    // O(doc_index) -- callers iterating all docs should track a running offset.
+    // A doc_index beyond freqs.size() is clamped, so the result is then the sum of
+    // all freqs instead of a read past the end of freqs.
+    size_t pos_offset(size_t doc_index) const {
+        const size_t limit = doc_index < freqs.size() ? doc_index : freqs.size();
+        size_t off = 0;
+        for (size_t i = 0; i < limit; ++i) off += freqs[i];
+        return off;
+    }
+    // Non-owning view of doc i's positions (length freqs[i]) into positions_flat.
+    std::span<const uint32_t> doc_positions(size_t doc_index) const {
+        // Positions can be absent (positions disabled, or streamed through pos_pump)
+        // and doc_index can be out of range; hand back an empty view instead of a span
+        // that points outside positions_flat.
+        if (doc_index >= freqs.size()) return {};
+        const size_t off = pos_offset(doc_index);
+        const size_t len = freqs[doc_index];
+        if (off > positions_flat.size() || len > positions_flat.size() - off) return {};
+        return std::span<const uint32_t>(positions_flat.data() + off, len);
+    }
+
+    // Rebuilds the per-doc position lists (for callers/tests wanting per-doc access)
+    // from positions_flat partitioned by freqs. O(total positions); allocates.
+    // Always returns freqs.size() lists. If positions_flat holds fewer entries than
+    // freqs calls for (it is empty when positions are disabled or streamed), the lists
+    // from the first uncovered doc onward are left empty rather than read past the end.
+    std::vector<std::vector<uint32_t>> positions_per_doc() const {
+        std::vector<std::vector<uint32_t>> out(freqs.size());
+        size_t off = 0; // invariant: off <= positions_flat.size()
+        for (size_t i = 0; i < freqs.size(); ++i) {
+            const size_t len = freqs[i];
+            if (len > positions_flat.size() - off) break;
+            out[i].assign(positions_flat.begin() + off, positions_flat.begin() + off + len);
+            off += len;
+        }
+        return out;
+    }
+
+    // Sets the flat positions from per-doc lists (convenience for tests / callers
+    // that produce per-doc positions). Does NOT touch freqs; the caller is expected
+    // to keep freqs[i] == per_doc[i].size() consistent (the writer validates this).
+    void set_positions_per_doc(const std::vector<std::vector<uint32_t>>& per_doc) {
+        positions_flat.clear();
+        for (const auto& d : per_doc)
+            positions_flat.insert(positions_flat.end(), d.begin(), d.end());
+    }
+};
+
+// In-memory SPIMI (Single-Pass In-Memory Indexing) accumulator for one logical
+// index. Records term occurrences and produces lexicographically sorted terms
+// with ascending-docid posting lists.
+//
+// TERM-ID ACCUMULATION (no per-token string work): tokens are accumulated by an
+// INTEGER term-id, not by hashing/constructing a std::string per token. The
+// caller supplies a VOCABULARY mapping term-id -> term string; the buffer keeps
+// a DENSE std::vector indexed by term-id, so the hot add_token path is a
+// vector index + a couple of pushes -- no hashing, no allocation per token. The
+// vocabulary is resolved to strings only once per distinct term at finalize.
+//
+// Two construction modes:
+//   * BORROWED vocab (the fast path): pass a non-null `vocab` that the caller
+//     owns and keeps alive; add_token(term_id, ...) indexes straight into it.
+// * OWNED vocab (compatibility): pass a null `vocab`; the string-keyed +// add_token(string_view, ...) interns each new term into an internal owned +// vocabulary (assigning ids in first-seen order) and forwards to the id +// path. Existing callers that feed strings keep working unchanged. +// +// SPILL / K-WAY MERGE (out-of-core, bounds input RAM): when a non-zero +// spill_threshold_bytes is set, the REAL resident accumulator size (the posting +// arena + the vocab-sized slot index, pool_.arena_bytes() + slot_of_.capacity()*4) +// is compared against the threshold as tokens arrive; once it crosses the +// threshold the buffer SORTS its current terms, +// writes a self-describing sorted RUN to a temp file, and CLEARS memory. Each +// run record is keyed by the TERM-ID (varint); the k-way merge orders runs by +// the id's VOCAB STRING so the merged stream stays lexicographic. Because +// tokens arrive in globally ascending docid order, a term that reappears in a +// later run only covers strictly-later docids, so concatenating its postings in +// run order during the final merge keeps docids ascending. for_each_term_sorted +// flushes the residual buffer as a final run, then k-way merges all runs +// materializing only ONE merged term at a time -> peak memory stays bounded by +// the threshold (plus the widest single term), NOT by total postings. With the +// default threshold 0 (unlimited) the path is exactly the in-memory behavior. +// +// Internal representation is a COMPACT TAGGED VARINT byte stream per term, held in +// a shared SEGMENTED ARENA (CompactPostingPool), NOT per-term uint32 vectors. Each +// term owns ONE arena chain holding a stream of per-TOKEN entries in arrival +// order: every token contributes varint((pos << 1) | new_doc_bit); when new_doc_bit +// is set, the token's doc differs from the previous one, so a zigzag-varint(docid - +// prev_docid) immediately follows. 
Frequencies are NOT stored -- a doc's freq is +// the count of consecutive same-doc tokens, recovered while decoding. This drops +// the entire freq stream and the second (positions) chain versus a freq/prox split, +// so the payload is ~3.4x smaller than raw uint32 docids/freqs/positions, and the +// shared arena removes per-vector doubling slack and per-term vector headers. Each +// append writes straight into the chain (no deferred per-doc flush): the only live +// per-term state is the current doc id (to detect a doc change) and the delta base. +// to_postings() decodes a term's chain back to the SAME flat TermPostings the +// writer consumes, so the produced .idx is BYTE-IDENTICAL. positions_flat stays +// empty (and pos is tagged as 0) when positions are disabled; freq still counts. +// +// Duplicate vocab strings: the vocab is assumed to map each id to a DISTINCT +// string (a dense vocabulary). If two ids share a string they sort adjacently +// but are emitted as two separate terms; callers must not rely on coalescing. +class SpimiTermBuffer { +public: + // BORROWED-vocab constructor: `vocab` maps term-id -> term string and is + // borrowed (NOT owned) -- the caller must keep it alive for the buffer's + // lifetime. add_token(term_id, ...) accumulates by id with no string work. + // spill_threshold_bytes is the gate-2 internal buffer cap (e.g. 512 MiB), + // sourced from config; == 0 means unlimited (pure in-memory, default). A + // positive value caps the REAL resident accumulator size (pool_.arena_bytes() + + // slot_of_.capacity()*4), triggering a spill when that crosses the cap -- NOT the + // old per-token estimate. + // `reporter` is the OPTIONAL writer-level build-RAM reporter (null off-Doris / + // unit tests). When non-null, the accumulator reports its REAL resident-byte + // deltas -- pool_.arena_bytes() + slot_of_.capacity()*4 -- positive on grow, + // negative on every reset/free, exactly once. 
NEVER reports live_bytes_ (a gated + // estimate that feeds only the spill threshold). + explicit SpimiTermBuffer(const std::vector* vocab, bool has_positions, + size_t spill_threshold_bytes = 0, MemoryReporter* reporter = nullptr); + + // OWNED-vocab (compatibility) constructor: no external vocab. The string-keyed + // add_token interns terms into an internal vocabulary on first occurrence. + explicit SpimiTermBuffer(bool has_positions, size_t spill_threshold_bytes = 0, + MemoryReporter* reporter = nullptr); + + ~SpimiTermBuffer(); + + SpimiTermBuffer(const SpimiTermBuffer&) = delete; + SpimiTermBuffer& operator=(const SpimiTermBuffer&) = delete; + + // Records one token by TERM-ID: term `term_id` occurs in `docid` at `pos`. + // `term_id` must be in [0, vocab_size). An out-of-range id latches an + // InvalidArgument into status() and is ignored. For a given term, docids are + // expected to arrive in non-decreasing order, and positions within a docid in + // ascending order; out-of-order docids (INCLUDING a REVISITED docid -- the same + // docid appearing again after a different one) are tolerated and reordered at + // finalize: SortByDocid stably sorts by docid and COALESCES same-docid groups + // (summing freqs, concatenating positions in document order), so the emitted + // postings have exactly ONE strictly-ascending entry per docid -- matching the + // k-way merge path and the writer's strictly-ascending precondition. + void add_token(uint32_t term_id, uint32_t docid, uint32_t pos); + + // Compatibility overload: records one token by TERM STRING. Valid ONLY on an + // OWNED-vocab buffer (constructed without an external vocab); interns `term` + // into the internal vocabulary on first occurrence, then forwards by id. Called + // on a BORROWED-vocab buffer it is REJECTED (latches InvalidArgument, token + // ignored) -- interning would grow the owned vocab out of step with the borrowed + // one and corrupt the build. 
It also allocates a std::string per call, so the + // hot path is the id overload; prefer that and reserve this for tests / legacy + // string-fed callers. + void add_token(std::string_view term, uint32_t docid, uint32_t pos); + + // Number of DISTINCT terms accumulated so far (touched ids still resident). + size_t unique_terms() const; + uint64_t total_tokens() const { return total_tokens_; } + bool has_positions() const { return has_positions_; } + + // OK unless an add_token validation error (out-of-range term-id, wrong vocab + // mode) was latched. for_each_term_sorted now returns its own I/O Status + // directly; callers that use add_token's latch-and-report pattern MUST check + // this after draining to surface input-side validation errors. + [[nodiscard]] Status status() const { return spill_status_; } + + // TEST-ONLY: number of spill run files written so far (== 0 in pure in-memory + // mode). Lets tests assert that a gate-2 spill actually fired once the REAL + // resident size crossed the configured cap. Not part of the production API. + size_t run_count_for_test() const { return run_paths_.size(); } + + // Materializes all terms sorted lexicographically; each term's docids are + // ascending. Convenience wrapper around for_each_term_sorted that keeps the + // whole result alive at once. Prefer for_each_term_sorted for low peak memory. + // MUST be called at most once: it drains internal state. A SECOND drain (a + // repeat call, or a finalize_sorted after a for_each_term_sorted, or vice versa) + // returns EMPTY and latches an error into status() rather than re-emitting. + std::vector finalize_sorted(); + + // Streams terms to `fn` in lexicographic order, building ONE transient + // TermPostings at a time and freeing that term's accumulated arrays before + // moving to the next. This keeps at most a single term's postings duplicated, + // avoiding the input+output coexistence peak. MUST be called at most once: it + // drains internal state. 
A SECOND drain invokes `fn` zero times and returns + // an Internal error (a re-merge of the still-present run files would otherwise + // re-emit every term). Returns non-OK on spill/merge I/O or corruption errors, + // or if a prior add_token latched a validation error into status(). + Status for_each_term_sorted(const std::function& fn); + +private: + // Compact per-term accumulator: ONE tagged-varint arena chain plus a few cursors. + // Every token is appended immediately (no deferred flush), so the only running + // state is the current doc id and the delta base. A sentinel chain head of + // kNoChain marks a term that has not started its chain yet (so an all-empty term + // costs no arena bytes). ntok bounds the decode loop and sizes reserves; there is + // no per-doc counter field (doc boundaries come from the new_doc_bit in the chain). + // Total ~36 B per live term. NOTE(review): the fields below add up to 23 B before + // padding -- confirm the ~36 B figure. + static constexpr uint32_t kNoChain = 0xFFFFFFFFu; + struct Term { + uint32_t head = kNoChain; // chain read entry point + CompactPostingPool::SliceWriter w; // append cursor for the chain (8 B) + uint32_t ntok = 0; // total tokens (entries) in the chain + uint32_t cur_docid = 0; // most-recent doc id: detects doc change AND + // is the zigzag delta base for the next doc + uint8_t level = 0; // current slice level of w (packed here, not in w) + bool started = false; // false until the first token is appended + bool sorted = true; // false if a docid arrived out of ascending order + }; + static_assert(sizeof(CompactPostingPool::SliceWriter) == 8, + "SliceWriter must stay 8 bytes to keep Term compact"); + + // The active vocabulary (term-id -> string): either the borrowed pointer or, + // in owned mode, &owned_vocab_. Always non-null after construction. + const std::vector& vocab() const { return *vocab_; } + + // Accumulates one already-validated token into the per-id Term.
+ void accumulate(uint32_t term_id, uint32_t docid, uint32_t pos); + + // Decodes `t`'s compact chain into a TermPostings (the exact docids/freqs/ + // positions the writer consumes), sorting by docid first if `t.sorted` is false. + // When `allow_stream_positions` is true (the in-memory drain path), a large + // sorted term's positions are provided via TermPostings::pos_pump instead of a + // materialized positions_flat (peak-RSS win). The spill path passes false so the + // run codec always sees a fully-materialized positions_flat. + TermPostings to_postings(std::string term, Term&& t, bool allow_stream_positions) const; + + // Returns the touched term-ids sorted by their vocab string (lexicographic). + // Sorts by a PRECOMPUTED integer string-rank (term-id -> lexicographic rank), + // not by full std::string compare: a single std::string sort over the whole + // vocabulary is amortized across every spill, so each spill's sort is an + // integer compare instead of paying a fresh O(touched * strcmp) on every spill. + std::vector sorted_ids() const; + // Builds string_rank_ (term-id -> lexicographic rank) once, lazily. Idempotent. + void ensure_string_rank() const; + // Streams the in-memory terms in sorted order, draining the slot pool (the + // in-memory single-pass path). When `allow_stream_positions` is true, large + // sorted terms stream positions via pos_pump (valid only because the callback + // consumes each term synchronously while the arena is still resident); callers + // that RETAIN the TermPostings past the drain (finalize_sorted) must pass false. + Status drain_sorted(const std::function& fn, bool allow_stream_positions); + // Spills the current buffer to a fresh sorted run file and clears memory. + Status spill_to_run(); + // Writes all current terms (sorted) to an already-open RunWriter, draining. + Status drain_to_writer(class RunWriter* w); + // REAL resident accumulator bytes: pool_.arena_bytes() + slot_of_.capacity()*4. 
+ // The single source of truth for both the gate-2 spill trigger and the spill + // space-precheck -- replaces the old gated live_bytes_ estimate. + uint64_t resident_bytes() const; + // Reports the signed change in REAL resident bytes (pool_.arena_bytes() + + // slot_of_.capacity()*4) to mem_reporter_ since the previous call, then caches the + // new total. Single-source diff: every grow/reset/free emits EXACTLY ONE delta + // (self-balancing -> impossible to double-count or miss a negative). No-op when + // mem_reporter_ is null. + void report_arena_delta(); + // Final k-way merge over the spilled runs (+ the residual flushed as a run). + // When `allow_stream_positions` is true (the streaming for_each path), a wide + // merged term streams positions via pos_pump (valid only because fn consumes + // synchronously while the run readers stay parked); callers that RETAIN the + // TermPostings past the merge (finalize_sorted) MUST pass false. + Status merge_runs(const std::function& fn, bool allow_stream_positions); + // Deletes every temp run file; called from the destructor (RAII cleanup). + void cleanup_runs(); + // Frees a drained term's accumulator (id leaves the touched set). + void release_term(uint32_t term_id); + + const std::vector* vocab_; // active vocab (borrowed or &owned_) + std::vector owned_vocab_; // owned mode: interned term strings + // Owned mode only: term string -> term-id, for interning on first occurrence. + std::unordered_map intern_; + + bool has_positions_; + size_t spill_threshold_bytes_; // 0 => unlimited (no spilling) + uint64_t total_tokens_ = 0; + + // POOLED accumulators (replaces a dense vocab-sized std::vector, which + // cost ~80 B per vocab id even for the ~empty majority -- the single largest + // input-phase memory line). slot_of_ is the only vocab-sized array: a 4 B index + // per id (0 == no live Term; otherwise slot index + 1). 
slots_ holds ONE Term + // per CURRENTLY-LIVE id, so its size tracks the live touched count, not the + // vocabulary. On first touch an id claims a slot (reusing a freed one from + // free_slots_ when available, else appending). release_term frees the slot back + // to the pool and clears slot_of_[id]. touched_ids_ lists every live id so + // finalize/spill iterate touched ids without scanning the whole vocabulary. + // An id is present iff slot_of_[id] != 0 (there is no separate present_ array). + // The hot add path is still a vector + // index + a couple of pushes: no hashing, no per-token allocation. + std::vector slot_of_; // vocab-sized: id -> slot index + 1 (0=empty) + std::vector slots_; // live Term pool (size ~ live touched count) + std::vector free_slots_; // recycled slot indices (drained terms) + std::vector touched_ids_; // every live id, for finalize/spill iteration + size_t live_term_count_ = 0; // present (non-drained) terms; == unique_terms() + + // Shared arena backing every live term's tagged-varint byte chain (ONE chain per + // term; there is no separate positions chain). Holds + // the bulk of the accumulator's memory in a few large blocks (no per-term vector + // headers, no per-vector doubling slack) -- the compact-RSS win. + CompactPostingPool pool_; + + // Optional writer-level build-RAM reporter (null off-Doris / unit tests) and the + // last resident-byte total it was told about. report_arena_delta() diffs the live + // total (arena_bytes() + slot_of_.capacity()*4) against reported_resident_. + MemoryReporter* mem_reporter_ = nullptr; + int64_t reported_resident_ = 0; + + // Returns the live Term for `term_id`, claiming a pool slot on first touch. + Term& term_slot(uint32_t term_id, bool* new_term); + + // Appends one byte / one varint to a term's tagged chain, lazily starting the + // chain on first use (so an untouched term costs no arena bytes).
+ void put_byte(Term* t, uint8_t b); + void put_varint(Term* t, uint64_t v); + + std::vector run_paths_; // spilled run temp files (deleted in dtor) + Status spill_status_; // first spill / range error, at finalize + bool drained_ = false; // set once finalize_sorted/for_each_term_sorted has run; + // a second drain would (spilled path) re-merge the run + // files and re-emit every term, or (in-memory path) emit + // nothing -- both wrong. Guard against the double-drain. + + // Lazily-built vocab-sized map: term-id -> its lexicographic rank among all + // vocab strings. Computed once (one full std::string sort of the vocabulary) + // on the first sorted_ids() call, then reused by every spill's id sort. mutable + // so the const sorted_ids() can fill it on demand. + mutable std::vector string_rank_; +}; + +} // namespace snii::writer diff --git a/be/src/snii/writer/temp_dir.h b/be/src/snii/writer/temp_dir.h new file mode 100644 index 00000000000000..36d51d578a5e2a --- /dev/null +++ b/be/src/snii/writer/temp_dir.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +#include +#include +#include + +namespace snii::writer { + +// Scratch directory for spill runs and section temp files. Resolution order: +// SNII_TEMP_DIR (explicit config) -> TMPDIR (POSIX default) -> /tmp (fallback). +// +// Point SNII_TEMP_DIR / TMPDIR at a REAL disk (SSD/NVMe). /tmp is often tmpfs (a +// RAM-backed filesystem) on modern systems, where spilling does NOT reduce RSS -- +// it just moves bytes from heap to tmpfs, defeating the purpose of spilling. +inline std::string resolve_temp_dir() { + for (const char* var : {"SNII_TEMP_DIR", "TMPDIR"}) { + const char* v = std::getenv(var); + if (v != nullptr && v[0] != '\0') { + std::string d(v); + while (d.size() > 1 && d.back() == '/') d.pop_back(); // strip trailing '/' + return d; + } + } + return "/tmp"; +} + +// Best-effort free bytes on the filesystem backing `dir`. 
Returns UINT64_MAX when +// statvfs fails, so a caller's space pre-check never false-positives on an +// unstattable path. CAVEATS: this is best-effort only -- it is subject to TOCTOU +// (free space can drop before/while the write runs), and on tmpfs it reports +// RAM-backed space (use the temp-dir config to avoid tmpfs in the first place). +inline uint64_t temp_dir_available_bytes(const std::string& dir) { + struct statvfs vfs; + if (::statvfs(dir.c_str(), &vfs) != 0) return UINT64_MAX; + return static_cast(vfs.f_bavail) * static_cast(vfs.f_frsize); +} + +} // namespace snii::writer diff --git a/be/src/storage/CMakeLists.txt b/be/src/storage/CMakeLists.txt index e7a82b486dbe63..3aee9b6a87bae2 100644 --- a/be/src/storage/CMakeLists.txt +++ b/be/src/storage/CMakeLists.txt @@ -28,6 +28,7 @@ file(GLOB_RECURSE SRC_FILES CONFIGURE_DEPENDS *.cpp) # files in the ann_index directory. They are compiled separately as a .a library # and linked by Storage. list(FILTER SRC_FILES EXCLUDE REGEX ".*/storage/index/ann/.*\\.cpp$") +list(FILTER SRC_FILES EXCLUDE REGEX ".*/storage/index/snii/core/src/io/s3_object_store\\.cpp$") if (ENABLE_VARIANT_NESTED_GROUP) list(REMOVE_ITEM SRC_FILES diff --git a/be/src/storage/index/index_file_reader.cpp b/be/src/storage/index/index_file_reader.cpp index 348e1399421e5a..bb43015fa966d4 100644 --- a/be/src/storage/index/index_file_reader.cpp +++ b/be/src/storage/index/index_file_reader.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/cast_set.h" #include "storage/index/inverted/inverted_index_compound_reader.h" #include "storage/index/inverted/inverted_index_fs_directory.h" #include "storage/tablet/tablet_schema.h" @@ -31,7 +32,9 @@ Status IndexFileReader::init(int32_t read_buffer_size, const io::IOContext* io_c std::unique_lock lock(_mutex); // Lock for writing if (!_inited) { _read_buffer_size = read_buffer_size; - if (_storage_format >= InvertedIndexStorageFormatPB::V2) { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + 
RETURN_IF_ERROR(_init_snii(io_ctx)); + } else if (_storage_format >= InvertedIndexStorageFormatPB::V2) { RETURN_IF_ERROR(_init_from(read_buffer_size, io_ctx)); } _inited = true; @@ -136,7 +139,42 @@ Status IndexFileReader::_init_from(int32_t read_buffer_size, const io::IOContext return Status::OK(); }
+// Opens the segment's single SNII index file and opens a SniiSegmentReader on top of
+// it. A missing or zero recorded index size is passed to the file system as -1.
+// If the SNII open fails, both reader members are reset so a failed init never
+// leaves a non-null, half-opened segment reader behind.
+Status IndexFileReader::_init_snii(const io::IOContext* /*io_ctx*/) {
+    auto index_file_full_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix);
+    int64_t file_size = -1;
+    if (_idx_file_info.has_index_size()) {
+        file_size = _idx_file_info.index_size();
+    }
+    file_size = file_size == 0 ? -1 : file_size;
+
+    io::FileReaderOptions opts;
+    opts.file_size = file_size;
+    opts.tablet_id = _tablet_id;
+    io::FileReaderSPtr reader;
+    RETURN_IF_ERROR(_fs->open_file(index_file_full_path, &reader, &opts));
+    _snii_file_reader = std::make_shared(std::move(reader));
+    _snii_segment_reader = std::make_unique();
+    auto open_status = snii_doris::to_doris_status(snii::reader::SniiSegmentReader::open(
+            _snii_file_reader.get(), _snii_segment_reader.get()));
+    if (!open_status.ok()) {
+        // open_snii_index() and index_file_exist() treat a non-null
+        // _snii_segment_reader as "opened"; drop the partial state before failing.
+        _snii_segment_reader.reset();
+        _snii_file_reader.reset();
+        return open_status;
+    }
+    return Status::OK();
+}
+
 Result IndexFileReader::get_all_directories() { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return ResultError(Status::Error( + "SNII format does not expose CLucene directories")); + } InvertedIndexDirectoryMap res; std::shared_lock lock(_mutex); // Lock for reading for (auto& [index, _] : _indices_entries) { @@ -155,6 +182,11 @@ Result> IndexFileReader:: int64_t index_id, const std::string& index_suffix, const io::IOContext* io_ctx) const { std::unique_ptr compound_reader; + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return ResultError(Status::Error( + "SNII format does not open CLucene compound readers")); + } + if (_storage_format == InvertedIndexStorageFormatPB::V1) { auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_id, index_suffix); @@ -229,6 +261,26 @@ Result> IndexFileReader:: return
compound_reader; } +Result> IndexFileReader::open_snii_index( + const TabletIndex* index_meta) const { + DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII); + std::shared_lock lock(_mutex); + if (_snii_segment_reader == nullptr) { + return ResultError(Status::Error( + "SNII index file {} is not opened", + InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix))); + } + auto logical_reader = std::make_unique(); + auto status = + _snii_segment_reader->open_index(cast_set(index_meta->index_id()), + index_meta->get_index_suffix(), logical_reader.get()); + auto doris_status = snii_doris::to_doris_status(status); + if (!doris_status.ok()) { + return ResultError(doris_status); + } + return logical_reader; +} + Result> IndexFileReader::open( const TabletIndex* index_meta, const io::IOContext* io_ctx) const { auto index_id = index_meta->index_id(); @@ -254,6 +306,29 @@ Status IndexFileReader::index_file_exist(const TabletIndex* index_meta, bool* re auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_meta->index_id(), index_meta->get_index_suffix()); return _fs->exists(index_file_path, res);
+    } else if (_storage_format == InvertedIndexStorageFormatPB::SNII) {
+        auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix);
+        RETURN_IF_ERROR(_fs->exists(index_file_path, res));
+        if (!*res) {
+            return Status::OK();
+        }
+        // init() sets _snii_segment_reader while holding the unique lock; read it under
+        // the shared lock, as open_snii_index() and the branch below do for their state.
+        std::shared_lock lock(_mutex); // Lock for reading
+        if (_snii_segment_reader == nullptr) {
+            return Status::OK();
+        }
+        auto logical_reader = std::make_unique();
+        auto status = _snii_segment_reader->open_index(cast_set(index_meta->index_id()),
+                                                       index_meta->get_index_suffix(),
+                                                       logical_reader.get());
+        if (status.code() == snii::StatusCode::kNotFound) {
+            *res = false;
+            return Status::OK();
+        }
+        RETURN_IF_ERROR(snii_doris::to_doris_status(status));
+        *res = true;
+        return Status::OK();
 } else { std::shared_lock lock(_mutex); // Lock for reading if (_stream == nullptr) { @@ -279,6 +348,11 @@ Status IndexFileReader::has_null(const TabletIndex*
index_meta, bool* res) const *res = true; return Status::OK(); } + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + auto logical_reader = DORIS_TRY(open_snii_index(index_meta)); + *res = logical_reader->section_refs().null_bitmap.length > 0; + return Status::OK(); + } std::shared_lock lock(_mutex); // Lock for reading if (_stream == nullptr) { return Status::Error( diff --git a/be/src/storage/index/index_file_reader.h b/be/src/storage/index/index_file_reader.h index fb4ec2b9a62fe3..896c8bd51745ff 100644 --- a/be/src/storage/index/index_file_reader.h +++ b/be/src/storage/index/index_file_reader.h @@ -33,8 +33,11 @@ #include "common/be_mock_util.h" #include "common/config.h" #include "io/fs/file_system.h" +#include "snii/reader/logical_index_reader.h" +#include "snii/reader/snii_segment_reader.h" #include "storage/index/index_file_writer.h" #include "storage/index/inverted/inverted_index_desc.h" +#include "storage/index/snii/snii_doris_adapter.h" namespace doris { class TabletIndex; @@ -60,7 +63,7 @@ class IndexFileReader { : _fs(std::move(fs)), _index_path_prefix(std::move(index_path_prefix)), _storage_format(storage_format), - _idx_file_info(idx_file_info), + _idx_file_info(std::move(idx_file_info)), _tablet_id(tablet_id) {} virtual ~IndexFileReader() = default; @@ -68,6 +71,8 @@ class IndexFileReader { const io::IOContext* io_ctx = nullptr); MOCK_FUNCTION Result> open( const TabletIndex* index_meta, const io::IOContext* io_ctx = nullptr) const; + Result> open_snii_index( + const TabletIndex* index_meta) const; void debug_file_entries(); std::string get_index_file_cache_key(const TabletIndex* index_meta) const; std::string get_index_file_path(const TabletIndex* index_meta) const; @@ -75,12 +80,19 @@ class IndexFileReader { Status has_null(const TabletIndex* index_meta, bool* res) const; Result get_all_directories(); // open file v2, init _stream - int64_t get_inverted_file_size() const { return _stream == nullptr ? 
0 : _stream->length(); } + int64_t get_inverted_file_size() const { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return _snii_file_reader == nullptr ? 0 : _snii_file_reader->size(); + } + return _stream == nullptr ? 0 : _stream->length(); + } const std::string& get_index_path_prefix() const { return _index_path_prefix; } + InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } friend IndexFileWriter; protected: Status _init_from(int32_t read_buffer_size, const io::IOContext* io_ctx); + Status _init_snii(const io::IOContext* io_ctx); Result> _open( int64_t index_id, const std::string& index_suffix, const io::IOContext* io_ctx = nullptr) const; @@ -88,6 +100,8 @@ class IndexFileReader { private: IndicesEntriesMap _indices_entries; std::unique_ptr _stream = nullptr; + std::shared_ptr _snii_file_reader; + std::unique_ptr _snii_segment_reader; const io::FileSystemSPtr _fs; std::string _index_path_prefix; int32_t _read_buffer_size = -1; @@ -99,4 +113,4 @@ class IndexFileReader { }; } // namespace segment_v2 -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/storage/index/index_file_writer.cpp b/be/src/storage/index/index_file_writer.cpp index afd09c84620bb5..96541efc436404 100644 --- a/be/src/storage/index/index_file_writer.cpp +++ b/be/src/storage/index/index_file_writer.cpp @@ -22,6 +22,7 @@ #include #include +#include "common/cast_set.h" #include "common/status.h" #include "io/fs/packed_file_writer.h" #include "io/fs/s3_file_writer.h" @@ -34,6 +35,7 @@ #include "storage/index/inverted/inverted_index_desc.h" #include "storage/index/inverted/inverted_index_fs_directory.h" #include "storage/index/inverted/inverted_index_reader.h" +#include "storage/index/snii/snii_doris_adapter.h" #include "storage/tablet/tablet_schema.h" namespace doris::segment_v2 { @@ -56,7 +58,7 @@ IndexFileWriter::IndexFileWriter(io::FileSystemSPtr fs, std::string index_path_p _tmp_dir = tmp_file_dir.native(); 
if (_storage_format == InvertedIndexStorageFormatPB::V1) { _index_storage_format = std::make_unique(this); - } else { + } else if (_storage_format != InvertedIndexStorageFormatPB::SNII) { _index_storage_format = std::make_unique(this); } } @@ -84,6 +86,10 @@ Status IndexFileWriter::_insert_directory_into_map(int64_t index_id, } Result> IndexFileWriter::open(const TabletIndex* index_meta) { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return ResultError(Status::Error( + "SNII format does not open CLucene directories")); + } auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( _tmp_dir, _rowset_id, _seg_id, index_meta->index_id(), index_meta->get_index_suffix()); auto dir = std::shared_ptr(DorisFSDirectoryFactory::getDirectory( @@ -97,6 +103,35 @@ Result> IndexFileWriter::open(const TabletInde return dir; } +Status IndexFileWriter::add_snii_index(const TabletIndex* index_meta, uint32_t doc_count, + std::vector null_docids, + snii::writer::SpimiTermBuffer* term_buffer, + snii::format::IndexConfig index_config) { + DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII); + DCHECK(index_meta != nullptr); + DCHECK(term_buffer != nullptr); + if (_idx_v2_writer == nullptr) { + return Status::Error( + "SNII index file writer is null for {}", _index_path_prefix); + } + if (_snii_file_writer == nullptr) { + _snii_file_writer = std::make_unique(_idx_v2_writer.get()); + _snii_compound_writer = + std::make_unique(_snii_file_writer.get()); + } + + snii::writer::SniiIndexInput input; + input.index_id = cast_set(index_meta->index_id()); + input.index_suffix = index_meta->get_index_suffix(); + input.config = index_config; + input.doc_count = doc_count; + input.null_docids = std::move(null_docids); + input.term_source = term_buffer; + RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->add_logical_index(input))); + ++_snii_index_count; + return Status::OK(); +} + Status IndexFileWriter::delete_index(const TabletIndex* 
index_meta) { DBUG_EXECUTE_IF("IndexFileWriter::delete_index_index_meta_nullptr", { index_meta = nullptr; }); if (!index_meta) { @@ -123,6 +158,9 @@ Status IndexFileWriter::delete_index(const TabletIndex* index_meta) { } Status IndexFileWriter::add_into_searcher_cache() { + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + return Status::OK(); + } auto index_file_reader = std::make_unique( _fs, _index_path_prefix, _storage_format, InvertedIndexFileInfo(), _tablet_id); auto st = index_file_reader->init(); @@ -196,6 +234,21 @@ Result> IndexFileWriter::_construct_index_ Status IndexFileWriter::begin_close() { DCHECK(!_closed) << debug_string(); _closed = true; + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + if (_snii_compound_writer == nullptr) { + if (_idx_v2_writer == nullptr) { + return Status::OK(); + } + _snii_file_writer = + std::make_unique(_idx_v2_writer.get()); + _snii_compound_writer = + std::make_unique(_snii_file_writer.get()); + } + RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->finish())); + _total_file_size = _idx_v2_writer == nullptr ? 
0 : _idx_v2_writer->bytes_appended(); + _file_info.set_index_size(_total_file_size); + return Status::OK(); + } if (_indices_dirs.empty()) { // An empty file must still be created even if there are no indexes to write if (dynamic_cast(_idx_v2_writer.get()) != nullptr || @@ -238,6 +291,12 @@ Status IndexFileWriter::begin_close() { Status IndexFileWriter::finish_close() { DCHECK(_closed) << debug_string(); + if (_storage_format == InvertedIndexStorageFormatPB::SNII) { + if (_idx_v2_writer != nullptr && _idx_v2_writer->state() != io::FileWriter::State::CLOSED) { + RETURN_IF_ERROR(_idx_v2_writer->close(false)); + } + return Status::OK(); + } if (_indices_dirs.empty()) { // An empty file must still be created even if there are no indexes to write if (dynamic_cast(_idx_v2_writer.get()) != nullptr || diff --git a/be/src/storage/index/index_file_writer.h b/be/src/storage/index/index_file_writer.h index a303de8b68c156..7cf02c686400ed 100644 --- a/be/src/storage/index/index_file_writer.h +++ b/be/src/storage/index/index_file_writer.h @@ -24,21 +24,33 @@ #include #include +#include #include "common/be_mock_util.h" #include "io/fs/file_system.h" #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" +#include "snii/format/format_constants.h" +#include "snii/writer/snii_compound_writer.h" #include "storage/index/index_storage_format.h" #include "storage/index/inverted/inverted_index_common.h" #include "storage/index/inverted/inverted_index_compound_reader.h" #include "storage/index/inverted/inverted_index_searcher.h" +#include "storage/index/snii/snii_doris_adapter.h" + +namespace snii::writer { +class SpimiTermBuffer; +class SniiCompoundWriter; +} // namespace snii::writer namespace doris { class TabletIndex; namespace segment_v2 { class DorisFSDirectory; +namespace snii_doris { +class DorisSniiFileWriter; +} // namespace snii_doris using InvertedIndexDirectoryMap = std::map, std::shared_ptr>; @@ -55,6 +67,10 @@ class IndexFileWriter { virtual ~IndexFileWriter() = 
default; MOCK_FUNCTION Result> open(const TabletIndex* index_meta); + Status add_snii_index(const TabletIndex* index_meta, uint32_t doc_count, + std::vector null_docids, + snii::writer::SpimiTermBuffer* term_buffer, + snii::format::IndexConfig config); Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); Status add_into_searcher_cache(); @@ -113,6 +129,9 @@ class IndexFileWriter { IndexStorageFormatPtr _index_storage_format; int64_t _tablet_id = -1; + std::unique_ptr _snii_file_writer; + std::unique_ptr _snii_compound_writer; + size_t _snii_index_count = 0; friend class IndexStorageFormatV1; friend class IndexStorageFormatV2; diff --git a/be/src/storage/index/index_writer.cpp b/be/src/storage/index/index_writer.cpp index 2325d280471337..6fb23c3c107e51 100644 --- a/be/src/storage/index/index_writer.cpp +++ b/be/src/storage/index/index_writer.cpp @@ -18,6 +18,7 @@ #include "common/exception.h" #include "storage/index/ann/ann_index_writer.h" #include "storage/index/inverted/inverted_index_writer.h" +#include "storage/index/snii/snii_index_writer.h" #include "storage/tablet/tablet_schema.h" #include "storage/types.h" @@ -80,6 +81,22 @@ Status IndexColumnWriter::create(const TabletColumn* column, } } + if (storage_format == InvertedIndexStorageFormatPB::SNII) { + if (!is_string_type(type)) { + return Status::Error( + "SNII inverted index storage format does not support BKD index type {}", + type); + } + *res = std::make_unique(index_file_writer, index_meta, + single_field); + auto st = (*res)->init(); + if (!st.ok()) { + (*res)->close_on_error(); + return st; + } + return Status::OK(); + } + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_unsupported_type_for_inverted_index", { type = FieldType::OLAP_FIELD_TYPE_JSONB; }) switch (type) { diff --git a/be/src/storage/index/inverted/inverted_index_reader.h b/be/src/storage/index/inverted/inverted_index_reader.h index 0e2f6a120d41e3..a2aa0533f2bf7b 100644 --- 
a/be/src/storage/index/inverted/inverted_index_reader.h +++ b/be/src/storage/index/inverted/inverted_index_reader.h @@ -230,9 +230,9 @@ class InvertedIndexReader : public IndexReader { const Field& query_value, InvertedIndexQueryType query_type, size_t* count) = 0; - Status read_null_bitmap(const IndexQueryContextPtr& context, - InvertedIndexQueryCacheHandle* cache_handle, - lucene::store::Directory* dir = nullptr); + virtual Status read_null_bitmap(const IndexQueryContextPtr& context, + InvertedIndexQueryCacheHandle* cache_handle, + lucene::store::Directory* dir = nullptr); virtual InvertedIndexReaderType type() = 0; @@ -335,7 +335,6 @@ class InvertedIndexVisitor : public lucene::util::bkd::bkd_reader::intersect_vis std::string query_min; std::string query_max; -public: InvertedIndexVisitor(const void* io_ctx, lucene::util::bkd::bkd_reader* r, roaring::Roaring* hits, bool only_count = false); ~InvertedIndexVisitor() override = default; diff --git a/be/src/storage/index/snii/core/src/common/status.cpp b/be/src/storage/index/snii/core/src/common/status.cpp new file mode 100644 index 00000000000000..d8f66b4a68cd98 --- /dev/null +++ b/be/src/storage/index/snii/core/src/common/status.cpp @@ -0,0 +1,24 @@ +#include "snii/common/status.h" + +#include +#include + +namespace snii { +namespace { + +// Name table in the same order as the StatusCode enum, to avoid a long switch chain in to_string. 
+constexpr std::array<const char*, 7> kCodeNames = {
+        "OK", "Corruption", "NotFound", "InvalidArgument", "IoError", "Unsupported", "Internal"};
+
+} // namespace
+
+// Renders "Name" or "Name: message"; a code outside the name table renders as "Unknown".
+std::string Status::to_string() const {
+    const auto idx = static_cast<size_t>(code_);
+    // Bounds-check so an unexpected code_ value can never read past kCodeNames.
+    std::string out = idx < kCodeNames.size() ? kCodeNames[idx] : "Unknown";
+    if (!message_.empty()) out.append(": ").append(message_);
+    return out;
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp b/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp
new file mode 100644
index 00000000000000..fc5c70d6b5569d
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/byte_sink.cpp
@@ -0,0 +1,39 @@
+#include "snii/encoding/byte_sink.h"
+
+#include "snii/encoding/varint.h"
+
+namespace snii {
+
+void ByteSink::put_fixed16(uint16_t v) {
+    for (int i = 0; i < 2; ++i) buf_.push_back(static_cast<uint8_t>(v >> (8 * i)));
+}
+
+void ByteSink::put_fixed32(uint32_t v) {
+    for (int i = 0; i < 4; ++i) buf_.push_back(static_cast<uint8_t>(v >> (8 * i)));
+}
+
+void ByteSink::put_fixed64(uint64_t v) {
+    for (int i = 0; i < 8; ++i) buf_.push_back(static_cast<uint8_t>(v >> (8 * i)));
+}
+
+void ByteSink::put_varint32(uint32_t v) {
+    uint8_t tmp[5];
+    size_t n = encode_varint32(v, tmp);
+    buf_.insert(buf_.end(), tmp, tmp + n);
+}
+
+void ByteSink::put_varint64(uint64_t v) {
+    uint8_t tmp[10];
+    size_t n = encode_varint64(v, tmp);
+    buf_.insert(buf_.end(), tmp, tmp + n);
+}
+
+void ByteSink::put_zigzag(int64_t v) {
+    put_varint64(zigzag_encode(v));
+}
+
+void ByteSink::put_bytes(Slice s) {
+    buf_.insert(buf_.end(), s.data(), s.data() + s.size());
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/byte_source.cpp b/be/src/storage/index/snii/core/src/encoding/byte_source.cpp
new file mode 100644
index 00000000000000..d75d4945ff7f9d
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/byte_source.cpp
@@ -0,0 +1,70 @@
+#include "snii/encoding/byte_source.h"
+
+#include "snii/encoding/varint.h"
+
+namespace snii {
+
+Status ByteSource::get_u8(uint8_t* v) {
+    if (remaining() < 1) return Status::Corruption("get_u8 overrun");
+    *v = s_[pos_++];
+    return Status::OK();
+}
+
+Status ByteSource::get_fixed16(uint16_t* v) {
+    if (remaining() < 2) return Status::Corruption("get_fixed16 overrun");
+    uint16_t r = 0;
+    for (int i = 0; i < 2; ++i) r |= static_cast<uint16_t>(s_[pos_ + i]) << (8 * i);
+    pos_ += 2;
+    *v = r;
+    return Status::OK();
+}
+
+Status ByteSource::get_fixed32(uint32_t* v) {
+    if (remaining() < 4) return Status::Corruption("get_fixed32 overrun");
+    uint32_t r = 0;
+    for (int i = 0; i < 4; ++i) r |= static_cast<uint32_t>(s_[pos_ + i]) << (8 * i);
+    pos_ += 4;
+    *v = r;
+    return Status::OK();
+}
+
+Status ByteSource::get_fixed64(uint64_t* v) {
+    if (remaining() < 8) return Status::Corruption("get_fixed64 overrun");
+    uint64_t r = 0;
+    for (int i = 0; i < 8; ++i) r |= static_cast<uint64_t>(s_[pos_ + i]) << (8 * i);
+    pos_ += 8;
+    *v = r;
+    return Status::OK();
+}
+
+Status ByteSource::get_varint64(uint64_t* v) {
+    const uint8_t* p = s_.data() + pos_;
+    const uint8_t* next = nullptr;
+    SNII_RETURN_IF_ERROR(decode_varint64(p, s_.data() + s_.size(), v, &next));
+    pos_ = static_cast<size_t>(next - s_.data());
+    return Status::OK();
+}
+
+Status ByteSource::get_varint32(uint32_t* v) {
+    uint64_t tmp = 0;
+    SNII_RETURN_IF_ERROR(get_varint64(&tmp));
+    if (tmp > 0xFFFFFFFFu) return Status::Corruption("varint32 overflow");
+    *v = static_cast<uint32_t>(tmp);
+    return Status::OK();
+}
+
+Status ByteSource::get_zigzag(int64_t* v) {
+    uint64_t tmp = 0;
+    SNII_RETURN_IF_ERROR(get_varint64(&tmp));
+    *v = zigzag_decode(tmp);
+    return Status::OK();
+}
+
+Status ByteSource::get_bytes(size_t n, Slice* out) {
+    if (remaining() < n) return Status::Corruption("get_bytes overrun");
+    *out = s_.subslice(pos_, n);
+    pos_ += n;
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/crc32c.cpp b/be/src/storage/index/snii/core/src/encoding/crc32c.cpp
new file mode 100644
index 00000000000000..811ef86a697152
---
/dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/crc32c.cpp
@@ -0,0 +1,115 @@
+#include "snii/encoding/crc32c.h"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define SNII_CRC32C_X86 1
+#include <cpuid.h>
+#include <nmmintrin.h> // _mm_crc32_u8/u32/u64 (SSE4.2)
+#endif
+
+namespace snii {
+namespace {
+
+// Bit-reflected Castagnoli polynomial (CRC32C / iSCSI).
+constexpr uint32_t kPoly = 0x82F63B78u;
+
+// Builds the slice-by-8 lookup tables. Column 0 is the classic byte table; each
+// successive column folds in one more byte of look-ahead, letting the inner loop
+// consume 8 bytes per iteration with 8 table reads + XORs instead of 8 dependent
+// shift/lookup steps. The checksum value is identical to the byte-at-a-time loop.
+// Evaluated at compile time (constexpr): the table is constant-initialized, so it is
+// already valid if a checksum is requested while other globals are still initializing.
+constexpr std::array<std::array<uint32_t, 256>, 8> make_slice8_table() {
+    std::array<std::array<uint32_t, 256>, 8> t {};
+    for (uint32_t i = 0; i < 256; ++i) {
+        uint32_t c = i;
+        for (int k = 0; k < 8; ++k) c = (c & 1) ? (kPoly ^ (c >> 1)) : (c >> 1);
+        t[0][i] = c;
+    }
+    for (uint32_t i = 0; i < 256; ++i) {
+        uint32_t c = t[0][i];
+        for (int s = 1; s < 8; ++s) {
+            c = t[0][c & 0xFF] ^ (c >> 8);
+            t[s][i] = c;
+        }
+    }
+    return t;
+}
+
+constexpr std::array<std::array<uint32_t, 256>, 8> kSlice8 = make_slice8_table();
+
+inline uint32_t load_le32(const uint8_t* p) {
+    return static_cast<uint32_t>(p[0]) | (static_cast<uint32_t>(p[1]) << 8) |
+           (static_cast<uint32_t>(p[2]) << 16) | (static_cast<uint32_t>(p[3]) << 24);
+}
+
+// Pure software slice-by-8 (used as the portable path and the hardware fallback).
+uint32_t crc32c_slice8(uint32_t crc, const uint8_t* p, size_t n) {
+    while (n >= 8) {
+        crc ^= load_le32(p);
+        const uint32_t hi = load_le32(p + 4);
+        crc = kSlice8[7][crc & 0xFF] ^ kSlice8[6][(crc >> 8) & 0xFF] ^
+              kSlice8[5][(crc >> 16) & 0xFF] ^ kSlice8[4][crc >> 24] ^ kSlice8[3][hi & 0xFF] ^
+              kSlice8[2][(hi >> 8) & 0xFF] ^ kSlice8[1][(hi >> 16) & 0xFF] ^ kSlice8[0][hi >> 24];
+        p += 8;
+        n -= 8;
+    }
+    while (n--) {
+        crc = kSlice8[0][(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+    }
+    return crc;
+}
+
+#if SNII_CRC32C_X86
+// Hardware CRC32C via the SSE4.2 crc32 instruction. The intrinsics operate on the
+// same bit-reflected Castagnoli polynomial as the tables, so the result is
+// byte-identical. This TU is compiled without -msse4.2, so gate the intrinsics
+// behind a function-level target attribute and a runtime CPUID check.
+__attribute__((target("sse4.2"))) uint32_t crc32c_hw(uint32_t crc, const uint8_t* p, size_t n) {
+    while (n >= 8) {
+        uint64_t v;
+        std::memcpy(&v, p, sizeof(v)); // unaligned-safe; x86 folds to a plain load
+        crc = static_cast<uint32_t>(_mm_crc32_u64(crc, v));
+        p += 8;
+        n -= 8;
+    }
+    if (n >= 4) {
+        crc = _mm_crc32_u32(crc, load_le32(p));
+        p += 4;
+        n -= 4;
+    }
+    while (n--) crc = _mm_crc32_u8(crc, *p++);
+    return crc;
+}
+
+bool detect_sse42() {
+    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
+    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
+    return (ecx & bit_SSE4_2) != 0;
+}
+
+// Dynamically initialized: if read before its initializer runs it is still false, which
+// only selects the (always valid) software path.
+const bool kHasSse42 = detect_sse42();
+#endif
+
+} // namespace
+
+uint32_t crc32c_extend(uint32_t crc, Slice data) {
+    const uint8_t* p = data.data();
+    const size_t n = data.size();
+    crc = ~crc;
+#if SNII_CRC32C_X86
+    if (kHasSse42) {
+        crc = crc32c_hw(crc, p, n);
+        return ~crc;
+    }
+#endif
+    crc = crc32c_slice8(crc, p, n);
+    return ~crc;
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/pfor.cpp b/be/src/storage/index/snii/core/src/encoding/pfor.cpp
new file mode 100644
index 00000000000000..19f6442185a556
--- /dev/null
+++
b/be/src/storage/index/snii/core/src/encoding/pfor.cpp
@@ -0,0 +1,202 @@
+#include "snii/encoding/pfor.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "snii/common/slice.h"
+
+namespace snii {
+namespace {
+
+// Widest bit width the encoder can emit: the packed values are uint32_t.
+constexpr uint8_t kMaxBitWidth = 32;
+
+// Unaligned little-endian 64-bit load from a raw byte pointer (single
+// instruction on x86; memcpy is the portable, UB-free spelling the compiler
+// folds to a mov).
+inline uint64_t load_u64_le(const uint8_t* p) {
+    uint64_t v;
+    std::memcpy(&v, p, sizeof(v));
+#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    v = __builtin_bswap64(v);
+#endif
+    return v;
+}
+
+uint8_t bits_for(uint32_t v) {
+    uint8_t b = 0;
+    while (v) {
+        ++b;
+        v >>= 1;
+    }
+    return b;
+}
+
+// Choose the bit_width that minimizes total bytes (packed + exceptions).
+// Exception cost estimated at ~6 bytes each.
+// A histogram of per-value widths is built once, so scanning the candidate
+// widths costs O(n + 32) instead of re-measuring every value for every width.
+uint8_t choose_width(const uint32_t* v, size_t n) {
+    size_t width_count[kMaxBitWidth + 1] = {};
+    uint8_t maxw = 0;
+    for (size_t i = 0; i < n; ++i) {
+        const uint8_t b = bits_for(v[i]);
+        ++width_count[b];
+        maxw = std::max(maxw, b);
+    }
+    uint8_t best = maxw;
+    size_t best_cost = SIZE_MAX;
+    size_t exc = n; // values needing more than w bits; shrinks as w grows
+    for (int w = 0; w <= maxw; ++w) {
+        exc -= width_count[w];
+        const size_t cost = (static_cast<size_t>(w) * n + 7) / 8 + exc * 6;
+        if (cost < best_cost) {
+            best_cost = cost;
+            best = static_cast<uint8_t>(w);
+        }
+    }
+    return best;
+}
+
+uint32_t low_mask(uint8_t w) {
+    return (w >= 32) ? 0xFFFFFFFFu : ((1u << w) - 1u);
+}
+
+// Rejects a bit width the encoder could never have produced (the input is untrusted).
+Status check_bit_width(uint8_t w) {
+    if (w > kMaxBitWidth) return Status::Corruption("pfor bit width out of range");
+    return Status::OK();
+}
+
+void bitpack(const uint32_t* v, size_t n, uint8_t w, ByteSink* out) {
+    if (w == 0) return;
+    uint64_t acc = 0;
+    int filled = 0;
+    for (size_t i = 0; i < n; ++i) {
+        acc |= static_cast<uint64_t>(v[i] & low_mask(w)) << filled;
+        filled += w;
+        while (filled >= 8) {
+            out->put_u8(static_cast<uint8_t>(acc));
+            acc >>= 8;
+            filled -= 8;
+        }
+    }
+    if (filled > 0) out->put_u8(static_cast<uint8_t>(acc));
+}
+
+Status bitunpack(ByteSource* src, size_t n, uint8_t w, uint32_t* out) {
+    if (w == 0) {
+        std::memset(out, 0, n * sizeof(uint32_t));
+        return Status::OK();
+    }
+    // Pull the whole packed run in ONE bounds-checked slice (#3: was one get_u8
+    // per byte -- a Status-returning call + bounds check each), then unpack
+    // straight from the contiguous buffer. Each value's w<=32 bits start at bit
+    // offset i*w and span at most ceil((7+32)/8)=5 bytes, so a single unaligned
+    // 64-bit load at byte (i*w)/8 always covers it: one load + shift + mask per
+    // value, branchless, no per-byte accumulator loop (#2). Measured fewest
+    // instructions and fewest cycles of the alternatives -- the dependency-free
+    // per-value form lets the core overlap the loads (the unaligned word reads
+    // all hit L1, the packed run being only KiB).
+    const size_t packed = (static_cast<size_t>(w) * n + 7) / 8;
+    Slice buf;
+    SNII_RETURN_IF_ERROR(src->get_bytes(packed, &buf));
+    const uint8_t* base = buf.data();
+    const uint64_t mask = low_mask(w);
+
+    // Fast path: values whose 8-byte load window stays inside the buffer
+    // (byte_off + 8
+    // <= packed). The final few are finished by the tail loop, which zero-pads
+    // past end.
+    size_t i = 0;
+    if (packed >= 8) {
+        const size_t last_safe_byte = packed - 8;
+        for (; i < n; ++i) {
+            const size_t bit_off = static_cast<size_t>(w) * i;
+            const size_t byte_off = bit_off >> 3;
+            if (byte_off > last_safe_byte) break;
+            out[i] = static_cast<uint32_t>((load_u64_le(base + byte_off) >> (bit_off & 7)) & mask);
+        }
+    }
+    for (; i < n; ++i) {
+        const size_t bit_off = static_cast<size_t>(w) * i;
+        const size_t byte_off = bit_off >> 3;
+        uint64_t word = 0;
+        for (size_t b = byte_off; b < packed && b < byte_off + 8; ++b) {
+            word |= static_cast<uint64_t>(base[b]) << ((b - byte_off) * 8);
+        }
+        out[i] = static_cast<uint32_t>((word >> (bit_off & 7)) & mask);
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+void pfor_encode(const uint32_t* values, size_t n, ByteSink* out) {
+    uint8_t w = choose_width(values, n);
+    std::vector<std::pair<uint32_t, uint32_t>> exc; // (index, full value)
+    std::vector<uint32_t> low(values, values + n);
+    for (size_t i = 0; i < n; ++i) {
+        if (bits_for(values[i]) > w) {
+            exc.emplace_back(static_cast<uint32_t>(i), values[i]);
+            low[i] = 0; // Write 0 as placeholder at exception position; true value
+                        // stored in exception table
+        }
+    }
+    out->put_u8(w);
+    out->put_varint32(static_cast<uint32_t>(exc.size()));
+    bitpack(low.data(), n, w, out);
+    uint32_t prev = 0;
+    for (const auto& e : exc) {
+        out->put_varint32(e.first - prev);
+        out->put_varint32(e.second);
+        prev = e.first;
+    }
+}
+
+Status pfor_decode(ByteSource* src, size_t n, uint32_t* out) {
+    uint8_t w = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&w));
+    SNII_RETURN_IF_ERROR(check_bit_width(w));
+    uint32_t n_exc = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&n_exc));
+    SNII_RETURN_IF_ERROR(bitunpack(src, n, w, out));
+    // 64-bit accumulator: summed 32-bit deltas must not wrap back into range.
+    uint64_t idx = 0;
+    for (uint32_t i = 0; i < n_exc; ++i) {
+        uint32_t d = 0;
+        uint32_t val = 0;
+        SNII_RETURN_IF_ERROR(src->get_varint32(&d));
+        SNII_RETURN_IF_ERROR(src->get_varint32(&val));
+        idx += d;
+        if (idx >= n) return Status::Corruption("pfor exception index out of range");
+        out[idx] = val;
+    }
+    return Status::OK();
+}
+
+Status pfor_skip(ByteSource* src, size_t n) {
+    uint8_t w = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&w));
+    SNII_RETURN_IF_ERROR(check_bit_width(w));
+    uint32_t n_exc = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint32(&n_exc));
+    const size_t packed = (static_cast<size_t>(w) * n + 7) / 8;
+    Slice unused;
+    SNII_RETURN_IF_ERROR(src->get_bytes(packed, &unused));
+    // Same validation as pfor_decode, so skipping and decoding accept the same inputs.
+    uint64_t idx = 0;
+    for (uint32_t i = 0; i < n_exc; ++i) {
+        uint32_t d = 0;
+        uint32_t val = 0;
+        SNII_RETURN_IF_ERROR(src->get_varint32(&d));
+        SNII_RETURN_IF_ERROR(src->get_varint32(&val));
+        idx += d;
+        if (idx >= n) return Status::Corruption("pfor exception index out of range");
+    }
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/section_framer.cpp b/be/src/storage/index/snii/core/src/encoding/section_framer.cpp
new file mode 100644
index 00000000000000..99d086c79e705c
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/section_framer.cpp
@@ -0,0 +1,37 @@
+#include "snii/encoding/section_framer.h"
+
+#include "snii/encoding/crc32c.h"
+
+namespace snii {
+
+void SectionFramer::write(ByteSink& sink, uint8_t section_type, Slice payload) {
+    // Assemble type+len+payload in a temporary sink, compute crc over the whole thing, then write it all out.
+    ByteSink framed;
+    framed.put_u8(section_type);
+    framed.put_varint64(payload.size());
+    framed.put_bytes(payload);
+    uint32_t crc = crc32c(framed.view());
+    sink.put_bytes(framed.view());
+    sink.put_fixed32(crc);
+}
+
+Status SectionFramer::read(ByteSource& src, FramedSection* out) {
+    size_t start = src.position();
+    uint8_t type;
+    SNII_RETURN_IF_ERROR(src.get_u8(&type));
+    uint64_t len;
+    SNII_RETURN_IF_ERROR(src.get_varint64(&len));
+    Slice payload;
+    SNII_RETURN_IF_ERROR(src.get_bytes(static_cast<size_t>(len), &payload));
+    size_t framed_len = src.position() - start;
+    uint32_t stored;
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&stored));
+    if (crc32c(src.slice_from(start, framed_len)) != stored) {
+        return Status::Corruption("section crc mismatch");
+    }
+    out->type = type;
+    out->payload = payload;
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/varint.cpp b/be/src/storage/index/snii/core/src/encoding/varint.cpp
new file mode 100644
index 00000000000000..12877f972cb089
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/varint.cpp
@@ -0,0 +1,56 @@
+#include "snii/encoding/varint.h"
+
+namespace snii {
+
+size_t varint_len(uint64_t v) {
+    size_t n = 1;
+    while (v >= 0x80) {
+        v >>= 7;
+        ++n;
+    }
+    return n;
+}
+
+size_t encode_varint64(uint64_t v, uint8_t* out) {
+    size_t i = 0;
+    while (v >= 0x80) {
+        out[i++] = static_cast<uint8_t>(v) | 0x80;
+        v >>= 7;
+    }
+    out[i++] = static_cast<uint8_t>(v);
+    return i;
+}
+
+size_t encode_varint32(uint32_t v, uint8_t* out) {
+    return encode_varint64(v, out);
+}
+
+Status decode_varint64(const uint8_t* p, const uint8_t* end, uint64_t* v, const uint8_t** next) {
+    uint64_t result = 0;
+    int shift = 0;
+    while (p < end) {
+        uint8_t b = *p++;
+        // The 10th byte supplies only bit 63; any higher payload bit would be shifted out
+        // silently, so reject it instead of returning a truncated value.
+        if (shift == 63 && (b & 0x7E) != 0) return Status::Corruption("varint64 overflow");
+        result |= static_cast<uint64_t>(b & 0x7F) << shift;
+        if ((b & 0x80) == 0) {
+            *v = result;
+            *next = p;
+            return Status::OK();
+        }
+        shift += 7;
+        if (shift >= 64) return Status::Corruption("varint64 overflow");
+    }
+    return Status::Corruption("varint truncated");
+}
+
+Status decode_varint32(const uint8_t* p, const uint8_t* end, uint32_t* v, const uint8_t** next) {
+    uint64_t tmp;
+    SNII_RETURN_IF_ERROR(decode_varint64(p, end, &tmp, next));
+    if (tmp > 0xFFFFFFFFu) return Status::Corruption("varint32 overflow");
+    *v = static_cast<uint32_t>(tmp);
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp b/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp
new file mode 100644
index 00000000000000..abb01981d63450
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/encoding/zstd_codec.cpp
@@ -0,0 +1,32 @@
+#include "snii/encoding/zstd_codec.h"
+
+#include <zstd.h>
+
+#include <string>
+
+namespace snii {
+
+Status zstd_compress(Slice input, int level, std::vector<uint8_t>* out) {
+    size_t bound = ZSTD_compressBound(input.size());
+    out->resize(bound);
+    size_t n = ZSTD_compress(out->data(), bound, input.data(), input.size(), level);
+    if (ZSTD_isError(n)) {
+        return Status::Internal(std::string("zstd compress: ") + ZSTD_getErrorName(n));
+    }
+    out->resize(n);
+    return Status::OK();
+}
+
+Status zstd_decompress(Slice input, size_t expected_uncomp_len, std::vector<uint8_t>* out) {
+    out->resize(expected_uncomp_len);
+    size_t n = ZSTD_decompress(out->data(), expected_uncomp_len, input.data(), input.size());
+    if (ZSTD_isError(n)) {
+        return Status::Corruption(std::string("zstd decompress: ") + ZSTD_getErrorName(n));
+    }
+    if (n != expected_uncomp_len) {
+        return Status::Corruption("zstd decompressed length mismatch");
+    }
+    return Status::OK();
+}
+
+} // namespace snii
diff --git a/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp b/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp
new file mode 100644
index 00000000000000..e65c4817d1c6dc
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/bootstrap_header.cpp
@@ -0,0 +1,96 @@
+#include "snii/format/bootstrap_header.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+
+namespace snii::format {
+
+namespace {
+
+// Number of bytes covered by header_checksum: everything except the trailing
+// crc32c.
+constexpr size_t kChecksumCoverage = kBootstrapHeaderSize - 4;
+
+// Writes all fixed fields except the trailing checksum. Field order is the
+// on-disk contract; reuse ByteSink fixed-width primitives, never hand-assemble
+// bytes.
+void encode_fields(const BootstrapHeader& header, ByteSink* sink) {
+    sink->put_fixed32(header.magic);
+    sink->put_fixed32((static_cast<uint32_t>(header.min_reader_version) << 16) |
+                      header.format_version);
+    sink->put_fixed32(header.flags);
+    sink->put_fixed32(kBootstrapHeaderSize); // header_length is always derived
+    sink->put_u8(header.tail_pointer_size);
+}
+
+} // namespace
+
+Status encode_bootstrap_header(const BootstrapHeader& header, ByteSink* sink) {
+    if (sink == nullptr) {
+        return Status::InvalidArgument("bootstrap_header: null sink");
+    }
+    ByteSink fields;
+    encode_fields(header, &fields);
+    const uint32_t checksum = crc32c(fields.view());
+    sink->put_bytes(fields.view());
+    sink->put_fixed32(checksum);
+    return Status::OK();
+}
+
+Status decode_bootstrap_header(Slice data, BootstrapHeader* out) {
+    if (out == nullptr) {
+        return Status::InvalidArgument("bootstrap_header: null out");
+    }
+    // Reject any size other than the exact fixed header: short input is
+    // truncation, longer input means stray trailing bytes the parser would
+    // otherwise ignore.
+    if (data.size() != kBootstrapHeaderSize) {
+        return Status::Corruption("bootstrap_header: wrong header size");
+    }
+
+    ByteSource src(data);
+    uint32_t magic = 0;
+    uint32_t version_pair = 0;
+    uint32_t flags = 0;
+    uint32_t header_length = 0;
+    uint8_t tail_pointer_size = 0;
+    uint32_t stored_checksum = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&magic));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&version_pair));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&flags));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&header_length));
+    SNII_RETURN_IF_ERROR(src.get_u8(&tail_pointer_size));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&stored_checksum));
+
+    if (magic != kContainerMagic) {
+        return Status::Corruption("bootstrap_header: bad container magic");
+    }
+    const uint32_t computed = crc32c(data.subslice(0, kChecksumCoverage));
+    if (computed != stored_checksum) {
+        return Status::Corruption("bootstrap_header: checksum mismatch");
+    }
+
+    const auto min_reader_version = static_cast<uint16_t>((version_pair >> 16) & 0xFFFFu);
+    const auto format_version = static_cast<uint16_t>(version_pair & 0xFFFFu);
+    if (format_version != kFormatVersion) {
+        return Status::Unsupported("bootstrap_header: unsupported container format_version");
+    }
+    if (min_reader_version > kFormatVersion) {
+        return Status::Unsupported("bootstrap_header: container requires a newer reader version");
+    }
+    // The encoder always writes kBootstrapHeaderSize here; any other value means the
+    // field does not describe the bytes that were just parsed.
+    if (header_length != kBootstrapHeaderSize) {
+        return Status::Corruption("bootstrap_header: header_length mismatch");
+    }
+
+    out->magic = magic;
+    out->format_version = format_version;
+    out->min_reader_version = min_reader_version;
+    out->flags = flags;
+    out->header_length = header_length;
+    out->tail_pointer_size = tail_pointer_size;
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/format/bsbf.cpp b/be/src/storage/index/snii/core/src/format/bsbf.cpp
new file mode 100644
index 00000000000000..adfe5e445c2dce
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/bsbf.cpp
@@ -0,0 +1,218 @@
+#include "snii/format/bsbf.h"
+
+#include <cmath>
+
+#include "snii/encoding/crc32c.h"
+
+#if defined(__x86_64__)
|| defined(_M_X64) +#include +#define SNII_BSBF_X86 1 +#endif + +#define XXH_INLINE_ALL +#include "xxhash.h" + +namespace snii::format { + +const uint32_t kBsbfSalt[kBsbfBitsSetPerBlock] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, + 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, + 0x9efc4947U, 0x5c6bfb31U}; + +namespace { + +void store_le32(uint8_t* p, uint32_t v) { + p[0] = static_cast(v); + p[1] = static_cast(v >> 8); + p[2] = static_cast(v >> 16); + p[3] = static_cast(v >> 24); +} +uint32_t load_le32(const uint8_t* p) { + return static_cast(p[0]) | (static_cast(p[1]) << 8) | + (static_cast(p[2]) << 16) | (static_cast(p[3]) << 24); +} + +bool cpu_has_avx2() { +#if defined(SNII_BSBF_X86) + static const bool v = __builtin_cpu_supports("avx2"); + return v; +#else + return false; +#endif +} + +// --- scalar kernels --- +inline void masks_scalar(uint32_t key, uint32_t m[8]) { + for (int i = 0; i < 8; ++i) m[i] = 1u << ((key * kBsbfSalt[i]) >> 27); +} +bool block_contains_scalar(uint64_t hash, const uint8_t* block) { + const uint32_t* w = reinterpret_cast(block); // LE + uint32_t m[8]; + masks_scalar(static_cast(hash), m); + for (int i = 0; i < 8; ++i) + if ((load_le32(reinterpret_cast(w + i)) & m[i]) != m[i]) return false; + return true; +} +void insert_scalar(uint32_t* words, uint32_t block, uint32_t key) { + uint32_t m[8]; + masks_scalar(key, m); + for (int i = 0; i < 8; ++i) words[block * 8 + i] |= m[i]; +} +bool find_scalar(const uint32_t* words, uint32_t block, uint32_t key) { + uint32_t m[8]; + masks_scalar(key, m); + for (int i = 0; i < 8; ++i) + if ((words[block * 8 + i] & m[i]) != m[i]) return false; + return true; +} + +#if defined(SNII_BSBF_X86) +// --- AVX2 kernels: a 256-bit block is one YMM register --- +__attribute__((target("avx2"))) __m256i mask_avx2(uint32_t key) { + const __m256i salt = + _mm256_setr_epi32(static_cast(kBsbfSalt[0]), static_cast(kBsbfSalt[1]), + static_cast(kBsbfSalt[2]), static_cast(kBsbfSalt[3]), + static_cast(kBsbfSalt[4]), 
static_cast(kBsbfSalt[5]), + static_cast(kBsbfSalt[6]), static_cast(kBsbfSalt[7])); + const __m256i prod = _mm256_mullo_epi32(_mm256_set1_epi32(static_cast(key)), salt); + const __m256i shifts = _mm256_srli_epi32(prod, 27); // top 5 bits -> 0..31 + return _mm256_sllv_epi32(_mm256_set1_epi32(1), shifts); +} +__attribute__((target("avx2"))) bool block_contains_avx2(uint64_t hash, const uint8_t* block) { + const __m256i m = mask_avx2(static_cast(hash)); + const __m256i b = _mm256_loadu_si256(reinterpret_cast(block)); + return _mm256_testc_si256(b, m) != 0; // (~b & m) == 0 -> b contains m +} +__attribute__((target("avx2"))) void insert_avx2(uint32_t* words, uint32_t block, uint32_t key) { + __m256i* p = reinterpret_cast<__m256i*>(words + block * 8); + _mm256_storeu_si256(p, _mm256_or_si256(_mm256_loadu_si256(p), mask_avx2(key))); +} +__attribute__((target("avx2"))) bool find_avx2(const uint32_t* words, uint32_t block, + uint32_t key) { + const __m256i m = mask_avx2(key); + const __m256i b = _mm256_loadu_si256(reinterpret_cast(words + block * 8)); + return _mm256_testc_si256(b, m) != 0; +} +#endif + +} // namespace + +uint64_t bsbf_hash(std::string_view term) { + return XXH64(term.data(), term.size(), /*seed=*/0); +} + +uint32_t bsbf_optimal_num_bytes(uint32_t ndv, double fpp) { + // Parquet OptimalNumOfBits, then >>3 for bytes. 
+ const double m = -8.0 * ndv / std::log(1 - std::pow(fpp, 1.0 / 8)); + uint32_t num_bits; + if (m < 0 || m > static_cast(kBsbfMaxBytes) * 8) { + num_bits = kBsbfMaxBytes << 3; + } else { + num_bits = static_cast(m); + } + if (num_bits < (kBsbfMinBytes << 3)) num_bits = kBsbfMinBytes << 3; + if (num_bits & (num_bits - 1)) { // next power of 2 + uint32_t p = 1; + while (p < num_bits) p <<= 1; + num_bits = p; + } + if (num_bits > (kBsbfMaxBytes << 3)) num_bits = kBsbfMaxBytes << 3; + return num_bits >> 3; +} + +bool bsbf_block_contains(uint64_t hash, const uint8_t block[kBsbfBytesPerBlock]) { +#if defined(SNII_BSBF_X86) + if (cpu_has_avx2()) return block_contains_avx2(hash, block); +#endif + return block_contains_scalar(hash, block); +} + +Status BsbfBuilder::create(uint32_t ndv, double fpp, BsbfBuilder* out) { + if (out == nullptr) return Status::InvalidArgument("bsbf: null out"); + if (!(fpp > 0.0 && fpp < 1.0)) return Status::InvalidArgument("bsbf: fpp out of (0,1)"); + if (ndv == 0) ndv = 1; + out->num_bytes_ = bsbf_optimal_num_bytes(ndv, fpp); + out->num_blocks_ = out->num_bytes_ / kBsbfBytesPerBlock; + out->ndv_ = ndv; + out->words_.assign(out->num_bytes_ / 4, 0u); + return Status::OK(); +} + +void BsbfBuilder::insert(uint64_t hash) { + const uint32_t block = bsbf_block_index(hash, num_blocks_); + const uint32_t key = static_cast(hash); +#if defined(SNII_BSBF_X86) + if (cpu_has_avx2()) { + insert_avx2(words_.data(), block, key); + return; + } +#endif + insert_scalar(words_.data(), block, key); +} + +bool BsbfBuilder::maybe_contains(uint64_t hash) const { + const uint32_t block = bsbf_block_index(hash, num_blocks_); + const uint32_t key = static_cast(hash); +#if defined(SNII_BSBF_X86) + if (cpu_has_avx2()) return find_avx2(words_.data(), block, key); +#endif + return find_scalar(words_.data(), block, key); +} + +Status BsbfBuilder::serialize(ByteSink* sink) const { + if (sink == nullptr) return Status::InvalidArgument("bsbf: null sink"); + if (num_bytes_ == 0) 
return Status::InvalidArgument("bsbf: not built"); + uint8_t hdr[kBsbfHeaderSize] = {0}; + hdr[0] = 'B'; + hdr[1] = 'S'; + hdr[2] = 'B'; + hdr[3] = 'F'; + hdr[4] = 1; // version + hdr[5] = 0; // hash strategy: XXH64 seed 0 + hdr[6] = 0; // index strategy: fastrange + hdr[7] = 0; // pad + store_le32(hdr + 8, num_bytes_); + store_le32(hdr + 12, num_blocks_); + store_le32(hdr + 16, ndv_); + store_le32(hdr + 20, crc32c(Slice(hdr, 20))); // header crc over [0,20) + const uint8_t* bits = reinterpret_cast(words_.data()); + store_le32(hdr + 24, crc32c(Slice(bits, num_bytes_))); // bitset crc + sink->put_bytes(Slice(hdr, kBsbfHeaderSize)); + sink->put_bytes(Slice(bits, num_bytes_)); // contiguous, uncompressed, LE + return Status::OK(); +} + +Status BsbfHeader::parse(Slice h, uint64_t section_base, BsbfHeader* out) { + if (out == nullptr) return Status::InvalidArgument("bsbf: null out"); + if (h.size() < kBsbfHeaderSize) return Status::Corruption("bsbf: short header"); + const uint8_t* p = h.data(); + if (p[0] != 'B' || p[1] != 'S' || p[2] != 'B' || p[3] != 'F') + return Status::Corruption("bsbf: bad magic"); + if (p[4] != 1) return Status::Corruption("bsbf: bad version"); + if (p[5] != 0) return Status::Corruption("bsbf: unsupported hash strategy"); + if (p[6] != 0) return Status::Corruption("bsbf: unsupported index strategy"); + if (crc32c(Slice(p, 20)) != load_le32(p + 20)) + return Status::Corruption("bsbf: header crc mismatch"); + const uint32_t nb = load_le32(p + 8); + const uint32_t nblk = load_le32(p + 12); + if (nb < kBsbfMinBytes || nb > kBsbfMaxBytes || (nb & (nb - 1)) != 0) + return Status::Corruption("bsbf: num_bytes out of range or not power of 2"); + if (nblk != nb / kBsbfBytesPerBlock) return Status::Corruption("bsbf: num_blocks mismatch"); + out->num_bytes = nb; + out->num_blocks = nblk; + out->bitset_crc = load_le32(p + 24); + out->bitset_base = section_base + kBsbfHeaderSize; + return Status::OK(); +} + +Status bsbf_probe(snii::io::FileReader* reader, 
const BsbfHeader& header, uint64_t hash, + bool* maybe_present) { + if (reader == nullptr || maybe_present == nullptr) + return Status::InvalidArgument("bsbf: null arg"); + std::vector blk; + SNII_RETURN_IF_ERROR(reader->read_at(header.block_offset(hash), kBsbfBytesPerBlock, &blk)); + if (blk.size() < kBsbfBytesPerBlock) return Status::Corruption("bsbf: short block read"); + *maybe_present = bsbf_block_contains(hash, blk.data()); + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/dict_block.cpp b/be/src/storage/index/snii/core/src/format/dict_block.cpp new file mode 100644 index 00000000000000..375414df96f264 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/dict_block.cpp @@ -0,0 +1,293 @@ +#include "snii/format/dict_block.h" + +#include + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/varint.h" + +namespace snii::format { + +namespace { + +constexpr size_t kFooterBytes = sizeof(uint32_t); // trailing crc32c +constexpr size_t kNAnchorsBytes = sizeof(uint32_t); // n_anchors u32 +constexpr size_t kAnchorOffBytes = sizeof(uint32_t); // per-anchor offset u32 + +// Estimate the encoded upper-bound byte size of one entry (no actual encoding; used by estimated_bytes). +// Take the maximum varint width of each variable-length field plus payload bytes to guarantee an upper bound. 
+size_t estimate_entry_bytes(const DictEntry& e) { + size_t body = 0; + body += varint_len(static_cast(e.term.size())); // prefix_len upper bound + body += varint_len(static_cast(e.term.size())); // suffix_len upper bound + body += e.term.size(); // suffix bytes upper bound + body += 1; // flags + body += 10; // df + ttf + max_freq upper bound + body += 10; // ttf_delta + body += 10; // max_freq + if (e.kind == DictEntryKind::kInline) { + body += 10 + e.frq_bytes.size(); + body += 10 + e.prx_bytes.size(); + } else { + body += 10 * 5; // frq_off/frq_len/prelude/prx_off/prx_len upper bound + } + return varint_len(static_cast(body)) + body; // entry_len + body +} + +} // namespace + +// ---- DictBlockBuilder ---- + +DictBlockBuilder::DictBlockBuilder(IndexTier tier, bool has_positions, uint64_t frq_base, + uint64_t prx_base, uint32_t anchor_interval) + : tier_(tier), + has_positions_(has_positions), + frq_base_(frq_base), + prx_base_(prx_base), + anchor_interval_(anchor_interval == 0 ? 1 : anchor_interval) {} + +void DictBlockBuilder::add_entry(const DictEntry& entry) { + if (is_anchor(n_entries_)) ++n_anchors_; + entries_est_ += estimate_entry_bytes(entry); + entries_.push_back(entry); + prev_term_ = entry.term; + ++n_entries_; +} + +size_t DictBlockBuilder::estimated_bytes() const { + size_t header = varint_len(static_cast(n_entries_)) + 2; // +ver +flags + header += varint_len(frq_base_); + if (has_positions_) header += varint_len(prx_base_); + const size_t anchors = n_anchors_ * kAnchorOffBytes + kNAnchorsBytes; + return header + entries_est_ + anchors + kFooterBytes; +} + +void DictBlockBuilder::finish(ByteSink* sink) const { + ByteSink body; // header + entries + anchor_offsets + n_anchors (crc covered region) + + // header. + body.put_varint64(static_cast(n_entries_)); + body.put_u8(kDictBlockFormatVer); + body.put_u8(has_positions_ ? 
dict_block_flags::kHasPositions : 0u); + body.put_varint64(frq_base_); + if (has_positions_) body.put_varint64(prx_base_); + + // entries: anchor entries use prev_term="" and record their byte offset within the block. + std::vector anchor_offsets; + anchor_offsets.reserve(n_anchors_); + std::string prev; + for (uint32_t i = 0; i < n_entries_; ++i) { + const bool anchor = is_anchor(i); + if (anchor) { + anchor_offsets.push_back(static_cast(body.size())); + } + const std::string_view prev_term = anchor ? std::string_view {} : std::string_view(prev); + encode_dict_entry(entries_[i], prev_term, tier_, &body); + prev = entries_[i].term; + } + + // anchor_offsets[] + n_anchors. + for (uint32_t off : anchor_offsets) body.put_fixed32(off); + body.put_fixed32(static_cast(anchor_offsets.size())); + + // Write the entire block (including crc footer) to sink. + sink->put_bytes(body.view()); + sink->put_fixed32(crc32c(body.view())); +} + +// ---- DictBlockReader ---- + +namespace { + +// Verify the block length is sufficient and validate the trailing crc; return a Slice of the covered region (excluding crc footer). +Status verify_crc(Slice block, Slice* covered) { + if (block.size() < kFooterBytes + kNAnchorsBytes) { + return Status::Corruption("dict_block: block too short to contain footer"); + } + const size_t covered_len = block.size() - kFooterBytes; + *covered = block.subslice(0, covered_len); + + ByteSource crc_src(block.subslice(covered_len, kFooterBytes)); + uint32_t stored = 0; + SNII_RETURN_IF_ERROR(crc_src.get_fixed32(&stored)); + if (crc32c(*covered) != stored) { + return Status::Corruption("dict_block: crc32c checksum mismatch"); + } + return Status::OK(); +} + +// Read and verify that block_flags is consistent with has_positions. 
+Status check_flags(uint8_t flags, bool has_positions) { + const bool flag_pos = (flags & dict_block_flags::kHasPositions) != 0; + if (flag_pos != has_positions) { + return Status::InvalidArgument("dict_block: has_positions inconsistent with block_flags"); + } + return Status::OK(); +} + +} // namespace + +Status DictBlockReader::open(Slice block, IndexTier tier, bool has_positions, + DictBlockReader* out) { + if (out == nullptr) return Status::InvalidArgument("dict_block: out is null"); + *out = DictBlockReader {}; + + Slice covered; + SNII_RETURN_IF_ERROR(verify_crc(block, &covered)); + out->block_ = covered; + out->tier_ = tier; + out->has_positions_ = has_positions; + + // header. + ByteSource src(covered); + uint64_t n_entries = 0; + SNII_RETURN_IF_ERROR(src.get_varint64(&n_entries)); + uint8_t ver = 0; + uint8_t flags = 0; + SNII_RETURN_IF_ERROR(src.get_u8(&ver)); + SNII_RETURN_IF_ERROR(src.get_u8(&flags)); + if (ver != kDictBlockFormatVer) { + return Status::Unsupported("dict_block: unsupported entry_format_ver"); + } + SNII_RETURN_IF_ERROR(check_flags(flags, has_positions)); + SNII_RETURN_IF_ERROR(src.get_varint64(&out->frq_base_)); + if (has_positions) SNII_RETURN_IF_ERROR(src.get_varint64(&out->prx_base_)); + + out->n_entries_ = static_cast(n_entries); + out->entries_begin_ = src.position(); + + // The anchor table is at the tail of covered: [... anchor_offsets[n] n_anchors(u32)]. 
+ if (covered.size() < kNAnchorsBytes) { + return Status::Corruption("dict_block: missing n_anchors"); + } + ByteSource na_src(covered.subslice(covered.size() - kNAnchorsBytes, kNAnchorsBytes)); + uint32_t n_anchors = 0; + SNII_RETURN_IF_ERROR(na_src.get_fixed32(&n_anchors)); + + const size_t anchor_table_bytes = static_cast(n_anchors) * kAnchorOffBytes; + if (covered.size() < kNAnchorsBytes + anchor_table_bytes || + out->entries_begin_ + anchor_table_bytes + kNAnchorsBytes > covered.size()) { + return Status::Corruption("dict_block: anchor table out of range"); + } + const size_t anchor_table_begin = covered.size() - kNAnchorsBytes - anchor_table_bytes; + + ByteSource at_src(covered.subslice(anchor_table_begin, anchor_table_bytes)); + out->anchor_offsets_.resize(n_anchors); + out->anchor_terms_.resize(n_anchors); + for (uint32_t i = 0; i < n_anchors; ++i) { + uint32_t off = 0; + SNII_RETURN_IF_ERROR(at_src.get_fixed32(&off)); + if (off >= anchor_table_begin) { + return Status::Corruption("dict_block: anchor offset out of range"); + } + // Anchor offsets must be strictly monotonically increasing, and the first anchor must be exactly the start of the entries region (entry 0 is always an anchor). + // Otherwise scan_from_anchor's segment-length computation seg_end-seg_begin would underflow as size_t and cause an out-of-range read, + // guarding against non-monotonic offset tables with a re-stamped crc (remote on-demand read / cache misalignment scenarios). + if (i == 0) { + if (off != out->entries_begin_) { + return Status::Corruption( + "dict_block: first anchor offset is not the start of entries"); + } + } else if (off <= out->anchor_offsets_[i - 1]) { + return Status::Corruption("dict_block: anchor offsets are not strictly increasing"); + } + out->anchor_offsets_[i] = off; + // Anchor entries are encoded with prev_term="" and can be decoded independently to retrieve their term. 
+ ByteSource e_src(covered.subslice(off, anchor_table_begin - off)); + DictEntry probe; + SNII_RETURN_IF_ERROR(decode_dict_entry(&e_src, std::string_view {}, tier, &probe)); + out->anchor_terms_[i] = std::move(probe.term); + } + return Status::OK(); +} + +bool DictBlockReader::locate_anchor(std::string_view target, size_t* anchor_idx) const { + if (anchor_terms_.empty()) return false; + if (target < std::string_view(anchor_terms_.front())) return false; + // The last anchor_term <= target. + size_t lo = 0; + size_t hi = anchor_terms_.size(); // open interval + while (lo + 1 < hi) { + const size_t mid = lo + (hi - lo) / 2; + if (std::string_view(anchor_terms_[mid]) <= target) { + lo = mid; + } else { + hi = mid; + } + } + *anchor_idx = lo; + return true; +} + +Status DictBlockReader::decode_all(std::vector* out) const { + if (out == nullptr) return Status::InvalidArgument("dict_block: out is null"); + out->clear(); + out->reserve(n_entries_); + for (size_t a = 0; a < anchor_offsets_.size(); ++a) { + const size_t seg_begin = anchor_offsets_[a]; + const bool is_last = a + 1 == anchor_offsets_.size(); + const size_t seg_end = is_last ? 
(block_.size() - kNAnchorsBytes - + anchor_offsets_.size() * kAnchorOffBytes) + : anchor_offsets_[a + 1]; + if (seg_end < seg_begin || seg_end > block_.size()) { + return Status::Corruption("dict_block: anchor segment range invalid"); + } + ByteSource src(block_.subslice(seg_begin, seg_end - seg_begin)); + std::string prev; // first entry of a segment is an anchor (prev_term="") + while (!src.eof()) { + DictEntry e; + SNII_RETURN_IF_ERROR(decode_dict_entry(&src, std::string_view(prev), tier_, &e)); + prev = e.term; + out->push_back(std::move(e)); + } + } + if (out->size() != n_entries_) { + return Status::Corruption("dict_block: decoded entry count mismatch"); + } + return Status::OK(); +} + +Status DictBlockReader::scan_from_anchor(size_t anchor_idx, std::string_view target, bool* found, + DictEntry* out) const { + // Byte range of this anchor segment: [anchor_offset, next anchor offset or anchor table start). + const size_t seg_begin = anchor_offsets_[anchor_idx]; + const bool is_last = anchor_idx + 1 == anchor_offsets_.size(); + const size_t seg_end = + is_last ? 
(block_.size() - kNAnchorsBytes - anchor_offsets_.size() * kAnchorOffBytes) + : anchor_offsets_[anchor_idx + 1]; + + // Fallback: open() has already verified anchor monotonicity; this additionally guards against seg_end block_.size()) { + return Status::Corruption("dict_block: anchor segment range invalid"); + } + ByteSource src(block_.subslice(seg_begin, seg_end - seg_begin)); + std::string prev; // the first entry in the segment is an anchor, prev_term="" + while (!src.eof()) { + DictEntry e; + SNII_RETURN_IF_ERROR(decode_dict_entry(&src, std::string_view(prev), tier_, &e)); + if (e.term == target) { + *found = true; + *out = std::move(e); + return Status::OK(); + } + if (std::string_view(e.term) > target) { + *found = false; // already past target; entries are sorted so it does not exist + return Status::OK(); + } + prev = std::move(e.term); + } + *found = false; + return Status::OK(); +} + +Status DictBlockReader::find_term(std::string_view target, bool* found, DictEntry* out) const { + if (found == nullptr || out == nullptr) { + return Status::InvalidArgument("dict_block: found / out is null"); + } + *found = false; + size_t anchor_idx = 0; + if (!locate_anchor(target, &anchor_idx)) return Status::OK(); + return scan_from_anchor(anchor_idx, target, found, out); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp b/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp new file mode 100644 index 00000000000000..05f73814c32d2d --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/dict_block_directory.cpp @@ -0,0 +1,89 @@ +#include "snii/format/dict_block_directory.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +namespace { + +// Each block_ref has a fixed field order; reuse ByteSink varint/fixed primitives — do not hand-craft bytes manually. 
+// uncomp_len trails only when the kZstd flag is set, so uncompressed-block +// directories keep their compact (v1-identical) per-ref byte layout. +void encode_ref(const BlockRef& ref, ByteSink* payload) { + payload->put_varint64(ref.offset); + payload->put_varint64(ref.length); + payload->put_varint32(ref.n_entries); + payload->put_u8(ref.flags); + payload->put_fixed32(ref.checksum); + if (ref.flags & block_ref_flags::kZstd) payload->put_varint64(ref.uncomp_len); +} + +Status decode_ref(ByteSource* ps, BlockRef* ref) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->offset)); + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->length)); + SNII_RETURN_IF_ERROR(ps->get_varint32(&ref->n_entries)); + SNII_RETURN_IF_ERROR(ps->get_u8(&ref->flags)); + SNII_RETURN_IF_ERROR(ps->get_fixed32(&ref->checksum)); + if (ref->flags & block_ref_flags::kZstd) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->uncomp_len)); + } + return Status::OK(); +} + +Status decode_payload(Slice payload, std::vector* refs) { + ByteSource ps(payload); + uint32_t n_blocks = 0; + SNII_RETURN_IF_ERROR(ps.get_varint32(&n_blocks)); + // Guard against a corrupted, inflated count from untrusted bytes: each BlockRef + // needs >= 8 bytes (flags u8 + checksum u32 + >= 1 byte for each of 3 varints), + // so cap before reserve to avoid a huge allocation. 
+ constexpr size_t kMinRefBytes = 8; + if (n_blocks > ps.remaining() / kMinRefBytes) { + return Status::Corruption("dict_block_directory: n_blocks exceeds payload capacity"); + } + refs->clear(); + refs->reserve(n_blocks); + for (uint32_t i = 0; i < n_blocks; ++i) { + BlockRef ref {}; + SNII_RETURN_IF_ERROR(decode_ref(&ps, &ref)); + refs->push_back(ref); + } + if (!ps.eof()) { + return Status::Corruption("dict_block_directory: trailing bytes in payload"); + } + return Status::OK(); +} + +} // namespace + +void DictBlockDirectoryBuilder::finish(ByteSink* sink) const { + ByteSink payload; + payload.put_varint32(static_cast(refs_.size())); + for (const auto& ref : refs_) { + encode_ref(ref, &payload); + } + SectionFramer::write(*sink, static_cast(SectionType::kDictBlockDirectory), + payload.view()); +} + +Status DictBlockDirectoryReader::open(Slice section, DictBlockDirectoryReader* out) { + ByteSource src(section); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + if (sec.type != static_cast(SectionType::kDictBlockDirectory)) { + return Status::InvalidArgument("dict_block_directory: unexpected section type"); + } + return decode_payload(sec.payload, &out->refs_); +} + +Status DictBlockDirectoryReader::get(uint32_t ordinal, BlockRef* out) const { + if (ordinal >= refs_.size()) { + return Status::NotFound("dict_block_directory: ordinal out of range"); + } + *out = refs_[ordinal]; + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/dict_entry.cpp b/be/src/storage/index/snii/core/src/format/dict_entry.cpp new file mode 100644 index 00000000000000..3b7a189e2c276b --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/dict_entry.cpp @@ -0,0 +1,293 @@ +#include "snii/format/dict_entry.h" + +#include + +#include "snii/common/slice.h" + +namespace snii::format { + +namespace { + +// Pure-function assembly / parsing of flags bits; avoids a long inline if-else +// chain. 
+uint8_t pack_flags(const DictEntry& e) { + uint8_t f = 0; + if (e.kind == DictEntryKind::kInline) f |= dict_flags::kKind; + if (e.enc == DictEntryEnc::kWindowed) f |= dict_flags::kEnc; + if (e.has_sb) f |= dict_flags::kHasSb; + // bit3 has_champion / bit4 offsets_ref are always 0 in v1. + return f; +} + +void apply_flags(uint8_t f, DictEntry* e) { + e->kind = (f & dict_flags::kKind) ? DictEntryKind::kInline : DictEntryKind::kPodRef; + e->enc = (f & dict_flags::kEnc) ? DictEntryEnc::kWindowed : DictEntryEnc::kSlim; + e->has_sb = (f & dict_flags::kHasSb) != 0; +} + +// Length of the longest common prefix between term and prev_term. +uint32_t common_prefix_len(std::string_view term, std::string_view prev) { + uint32_t n = 0; + const uint32_t lim = static_cast(std::min(term.size(), prev.size())); + while (n < lim && term[n] == prev[n]) ++n; + return n; +} + +bool tier_has_stats(IndexTier tier) { + return tier >= IndexTier::kT2; +} + +// ---- Encode entry body (excluding entry_len and trailing crc) ---- + +void write_term_key(const DictEntry& e, std::string_view prev, ByteSink* sink) { + const uint32_t prefix = common_prefix_len(e.term, prev); + const std::string_view suffix = std::string_view(e.term).substr(prefix); + sink->put_varint32(prefix); + sink->put_varint32(static_cast(suffix.size())); + sink->put_bytes(Slice(suffix)); +} + +void write_stats(const DictEntry& e, IndexTier tier, ByteSink* sink) { + sink->put_varint32(e.df); + if (!tier_has_stats(tier)) return; + sink->put_varint64(e.ttf_delta); + sink->put_varint64(e.max_freq); +} + +// Per-window codec mode byte shared by slim/inline single-window regions. +uint8_t pack_win_mode(const DictEntry& e) { + uint8_t mode = 0; + if (e.dd_meta.zstd) mode |= 1u << 0; // dd_zstd + if (e.freq_meta.zstd) mode |= 1u << 1; // freq_zstd + return mode; +} + +// Writes the slim/inline region codec metadata (dd always; freq when tier>=T2). 
+// store_crc=false (INLINE entries, format v2) omits the redundant per-region +// crc32c: the inline bytes already sit inside the dict block, whose own +// block-level crc32c covers them. POD-ref entries pass store_crc=true (their +// regions live in the separately-fetched .frq POD, uncovered by the block crc). +void write_region_meta(const DictEntry& e, IndexTier tier, bool store_crc, ByteSink* sink) { + sink->put_u8(pack_win_mode(e)); + sink->put_varint64(e.dd_meta.uncomp_len); + if (store_crc) sink->put_fixed32(e.dd_meta.crc); + if (!tier_has_stats(tier)) return; + sink->put_varint64(e.freq_meta.uncomp_len); + if (store_crc) sink->put_fixed32(e.freq_meta.crc); +} + +void write_pod_ref(const DictEntry& e, IndexTier tier, ByteSink* sink) { + sink->put_varint64(e.frq_off_delta); + sink->put_varint64(e.frq_len); + if (e.enc == DictEntryEnc::kWindowed) { + sink->put_varint64(e.prelude_len); + sink->put_varint64(e.frq_docs_len); + } else { + sink->put_varint64(e.frq_docs_len); // slim pod_ref: dd region on-disk length + // POD-ref regions live in the .frq POD (not covered by the block crc): keep + // crc. + write_region_meta(e, tier, /*store_crc=*/true, sink); + } + if (!tier_has_stats(tier)) return; + sink->put_varint64(e.prx_off_delta); + sink->put_varint64(e.prx_len); +} + +void write_inline(const DictEntry& e, IndexTier tier, ByteSink* sink) { + sink->put_varint64(static_cast(e.frq_bytes.size())); + sink->put_bytes(Slice(e.frq_bytes)); + sink->put_varint64(e.inline_dd_disk_len); + // INLINE bytes are covered by the dict block crc32c: omit the redundant + // per-region crc. 
+ write_region_meta(e, tier, /*store_crc=*/false, sink); + if (!tier_has_stats(tier)) return; + sink->put_varint64(static_cast(e.prx_bytes.size())); + sink->put_bytes(Slice(e.prx_bytes)); +} + +void write_body(const DictEntry& e, std::string_view prev, IndexTier tier, ByteSink* sink) { + write_term_key(e, prev, sink); + sink->put_u8(pack_flags(e)); + write_stats(e, tier, sink); + if (e.kind == DictEntryKind::kInline) { + write_inline(e, tier, sink); + } else { + write_pod_ref(e, tier, sink); + } +} + +// ---- Decode entry body ---- + +Status read_term_key(ByteSource* src, std::string_view prev, DictEntry* out) { + uint32_t prefix = 0; + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&prefix)); + SNII_RETURN_IF_ERROR(src->get_varint32(&suffix_len)); + if (prefix > prev.size()) { + return Status::Corruption("dict_entry: prefix_len exceeds prev_term length"); + } + Slice suffix; + SNII_RETURN_IF_ERROR(src->get_bytes(suffix_len, &suffix)); + out->term.assign(prev.substr(0, prefix)); + out->term.append(reinterpret_cast(suffix.data()), suffix.size()); + return Status::OK(); +} + +Status read_stats(ByteSource* src, IndexTier tier, DictEntry* out) { + SNII_RETURN_IF_ERROR(src->get_varint32(&out->df)); + if (!tier_has_stats(tier)) return Status::OK(); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->ttf_delta)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->max_freq)); + return Status::OK(); +} + +// Reads the slim/inline region codec metadata (mode/uncomp/[crc]) and fills the +// dd/freq region disk_len from the supplied total/split lengths. has_crc=false +// (INLINE entries, format v2) means no per-region crc was stored: the on-disk +// crc field is absent and region decode must skip crc verification (verify_crc= +// false) since the dict block's own crc32c already covers the inline bytes. 
+Status read_region_meta(ByteSource* src, IndexTier tier, bool has_crc, uint64_t dd_disk_len, + uint64_t freq_disk_len, DictEntry* out) { + uint8_t mode = 0; + SNII_RETURN_IF_ERROR(src->get_u8(&mode)); + if ((mode & ~0x3u) != 0) { + return Status::Corruption("dict_entry: unknown win_mode bits"); + } + out->dd_meta.zstd = (mode & (1u << 0)) != 0; + out->dd_meta.disk_len = dd_disk_len; + out->dd_meta.verify_crc = has_crc; + SNII_RETURN_IF_ERROR(src->get_varint64(&out->dd_meta.uncomp_len)); + if (has_crc) SNII_RETURN_IF_ERROR(src->get_fixed32(&out->dd_meta.crc)); + if (!tier_has_stats(tier)) { + if (mode & (1u << 1)) { + return Status::Corruption("dict_entry: freq mode set without freq tier"); + } + return Status::OK(); + } + out->freq_meta.zstd = (mode & (1u << 1)) != 0; + out->freq_meta.disk_len = freq_disk_len; + out->freq_meta.verify_crc = has_crc; + SNII_RETURN_IF_ERROR(src->get_varint64(&out->freq_meta.uncomp_len)); + if (has_crc) SNII_RETURN_IF_ERROR(src->get_fixed32(&out->freq_meta.crc)); + return Status::OK(); +} + +Status read_pod_ref(ByteSource* src, IndexTier tier, DictEntry* out) { + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_off_delta)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_len)); + if (out->enc == DictEntryEnc::kWindowed) { + SNII_RETURN_IF_ERROR(src->get_varint64(&out->prelude_len)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_docs_len)); + if (out->prelude_len == 0 || out->prelude_len > out->frq_docs_len || + out->frq_docs_len > out->frq_len) { + return Status::Corruption("dict_entry: invalid windowed docs prefix"); + } + } else { + SNII_RETURN_IF_ERROR(src->get_varint64(&out->frq_docs_len)); + if (out->frq_docs_len > out->frq_len) { + return Status::Corruption("dict_entry: frq_docs_len exceeds frq_len"); + } + SNII_RETURN_IF_ERROR(read_region_meta(src, tier, /*has_crc=*/true, out->frq_docs_len, + out->frq_len - out->frq_docs_len, out)); + } + if (!tier_has_stats(tier)) return Status::OK(); + 
SNII_RETURN_IF_ERROR(src->get_varint64(&out->prx_off_delta)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->prx_len)); + return Status::OK(); +} + +Status read_byte_blob(ByteSource* src, std::vector* out) { + uint64_t len = 0; + SNII_RETURN_IF_ERROR(src->get_varint64(&len)); + Slice bytes; + SNII_RETURN_IF_ERROR(src->get_bytes(static_cast(len), &bytes)); + out->assign(bytes.data(), bytes.data() + bytes.size()); + return Status::OK(); +} + +Status read_inline(ByteSource* src, IndexTier tier, DictEntry* out) { + SNII_RETURN_IF_ERROR(read_byte_blob(src, &out->frq_bytes)); + SNII_RETURN_IF_ERROR(src->get_varint64(&out->inline_dd_disk_len)); + if (out->inline_dd_disk_len > out->frq_bytes.size()) { + return Status::Corruption("dict_entry: inline_dd_disk_len exceeds frq_bytes"); + } + const uint64_t freq_disk_len = + static_cast(out->frq_bytes.size()) - out->inline_dd_disk_len; + // INLINE entries store no per-region crc (covered by the block crc): + // has_crc=false. + SNII_RETURN_IF_ERROR(read_region_meta(src, tier, /*has_crc=*/false, out->inline_dd_disk_len, + freq_disk_len, out)); + if (!tier_has_stats(tier)) return Status::OK(); + SNII_RETURN_IF_ERROR(read_byte_blob(src, &out->prx_bytes)); + return Status::OK(); +} + +Status read_locator(ByteSource* src, IndexTier tier, DictEntry* out) { + if (out->kind == DictEntryKind::kInline) return read_inline(src, tier, out); + return read_pod_ref(src, tier, out); +} + +// Read entry_len (= body length) and verify that src has enough remaining +// bytes. 
+Status read_entry_len(ByteSource* src, uint64_t* total) { + SNII_RETURN_IF_ERROR(src->get_varint64(total)); + if (*total > src->remaining()) { + return Status::Corruption("dict_entry: entry_len out of range"); + } + return Status::OK(); +} + +} // namespace + +Status encode_dict_entry(const DictEntry& entry, std::string_view prev_term, IndexTier tier, + ByteSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("dict_entry: sink is null"); + + // Serialize the body into a temporary buffer first to obtain the exact + // length, then write entry_len + body. CRC verification is done uniformly at + // the DICT block level (covering block header + all entries + anchor table); + // CRC is not repeated at the entry level, to keep slim/inline low-frequency + // terms maximally compact (spec §DICT block/§dict entry). + ByteSink body; + write_body(entry, prev_term, tier, &body); + sink->put_varint64(static_cast(body.size())); + sink->put_bytes(body.view()); + return Status::OK(); +} + +Status decode_dict_entry(ByteSource* src, std::string_view prev_term, IndexTier tier, + DictEntry* out) { + if (src == nullptr || out == nullptr) { + return Status::InvalidArgument("dict_entry: src / out is null"); + } + *out = DictEntry {}; + + uint64_t total = 0; + SNII_RETURN_IF_ERROR(read_entry_len(src, &total)); + const size_t body_start = src->position(); + + SNII_RETURN_IF_ERROR(read_term_key(src, prev_term, out)); + uint8_t flags = 0; + SNII_RETURN_IF_ERROR(src->get_u8(&flags)); + apply_flags(flags, out); + SNII_RETURN_IF_ERROR(read_stats(src, tier, out)); + SNII_RETURN_IF_ERROR(read_locator(src, tier, out)); + + // The body must consume exactly entry_len bytes; otherwise the structure is + // inconsistent with the tier. 
+ const size_t consumed = src->position() - body_start; + if (consumed != static_cast(total)) { + return Status::Corruption("dict_entry: body length does not match entry_len"); + } + return Status::OK(); +} + +Status skip_dict_entry(ByteSource* src) { + if (src == nullptr) return Status::InvalidArgument("dict_entry: src is null"); + uint64_t total = 0; + SNII_RETURN_IF_ERROR(read_entry_len(src, &total)); + Slice unused; + return src->get_bytes(static_cast(total), &unused); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/frq_pod.cpp b/be/src/storage/index/snii/core/src/format/frq_pod.cpp new file mode 100644 index 00000000000000..1dc28fb9eea696 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/frq_pod.cpp @@ -0,0 +1,196 @@ +#include "snii/format/frq_pod.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/pfor.h" +#include "snii/encoding/zstd_codec.h" +#include "snii/format/format_constants.h" + +namespace snii::format { +namespace { + +// Auto-compression threshold: use raw when a region is smaller than this byte +// count (zstd gain is negligible and metadata overhead is relatively large). +inline constexpr size_t kAutoZstdMinBytes = 512; +// Default zstd level for auto mode. +inline constexpr int kDefaultZstdLevel = 3; +// Maximum decompressed byte size for a single region. Guards against a +// corrupted uncomp_len read from S3 that inflated to a huge value: sanity-check +// before allocating/decompressing to avoid GB-scale allocations. Windows are +// 256-doc aligned and normally far smaller than this. +inline constexpr uint32_t kMaxRegionUncompBytes = 256u * 1024 * 1024; +// Maximum doc count per .frq window (guards against a corrupted n). Window +// baseline is 256, practical combined cap is 2048, so this is a loose but +// astronomically-large-number-blocking upper bound. 
+inline constexpr uint32_t kMaxWindowDocs = 1u << 24;
+
+// Encode a uint32 array into multiple PFOR runs, each of 256 (kFrqBaseUnit)
+// elements. n / run count is not written: the number of runs is derived from
+// total length n and kFrqBaseUnit, and the decoder computes it the same way.
+void encode_pfor_runs(std::span<const uint32_t> values, ByteSink* out) {
+    const size_t n = values.size();
+    for (size_t off = 0; off < n; off += kFrqBaseUnit) {
+        // The last run may be shorter than kFrqBaseUnit.
+        const size_t left = n - off;
+        const size_t run = (left < kFrqBaseUnit) ? left : kFrqBaseUnit;
+        pfor_encode(values.data() + off, run, out);
+    }
+}
+
+// Decode n uint32 values from source (multiple PFOR runs of 256 each).
+// `out` is resized to n up front; on error its contents are unspecified.
+Status decode_pfor_runs(ByteSource* src, size_t n, std::vector<uint32_t>* out) {
+    out->assign(n, 0);
+    for (size_t off = 0; off < n; off += kFrqBaseUnit) {
+        const size_t left = n - off;
+        const size_t run = (left < kFrqBaseUnit) ? left : kFrqBaseUnit;
+        SNII_RETURN_IF_ERROR(pfor_decode(src, run, out->data() + off));
+    }
+    return Status::OK();
+}
+
+// Verifies docids are ascending and the first entry is not below win_base.
+// Equal neighbours are accepted (only a strict decrease is rejected); an empty
+// span is valid.
+Status validate_docs(std::span<const uint32_t> docs, uint64_t win_base) {
+    if (docs.empty()) return Status::OK();
+    if (static_cast<uint64_t>(docs.front()) < win_base) {
+        return Status::InvalidArgument("frq: first docid below win_base");
+    }
+    for (size_t i = 1; i < docs.size(); ++i) {
+        if (docs[i] < docs[i - 1]) {
+            return Status::InvalidArgument("frq: docids must be ascending");
+        }
+    }
+    return Status::OK();
+}
+
+// Decision: given level and plaintext length, determine whether to compress.
+//   level == 0 -> always raw
+//   level  > 0 -> always zstd
+//   level  < 0 -> auto: zstd only when plain_len >= kAutoZstdMinBytes
+bool should_compress(int level, size_t plain_len) {
+    if (level == 0) return false;
+    if (level > 0) return true;
+    return plain_len >= kAutoZstdMinBytes;
+}
+
+// Encodes one region's plaintext into raw or zstd, appends the on-disk bytes to
+// out, and fills meta (mode/uncomp_len/disk_len/crc). The region carries no
+// header.
+Status emit_region(Slice plain, int level, ByteSink* out, FrqRegionMeta* meta) { + if (out == nullptr || meta == nullptr) { + return Status::InvalidArgument("frq: null region out"); + } + meta->uncomp_len = plain.size(); + std::vector disk; + if (should_compress(level, plain.size())) { + meta->zstd = true; + SNII_RETURN_IF_ERROR(zstd_compress(plain, level > 0 ? level : kDefaultZstdLevel, &disk)); + } else { + meta->zstd = false; + disk.assign(plain.data(), plain.data() + plain.size()); + } + meta->disk_len = static_cast(disk.size()); + meta->crc = crc32c(Slice(disk)); + out->put_bytes(Slice(disk)); + return Status::OK(); +} + +// Materializes a region's plaintext (raw borrows the view; zstd decompresses) +// and verifies its crc + slice length against meta. +Status open_region(Slice disk, const FrqRegionMeta& meta, std::vector* holder, + Slice* plain) { + if (disk.size() != static_cast(meta.disk_len)) { + return Status::Corruption("frq: region slice length mismatch"); + } + if (meta.uncomp_len > kMaxRegionUncompBytes) { + return Status::Corruption("frq: region uncomp_len exceeds sane cap"); + } + // Inline entries (verify_crc=false) carry no per-region crc: their on-disk + // bytes are covered by the enclosing dict block's block-level crc32c, so the + // region crc would be redundant. POD-ref regions keep their own crc check. 
+ if (meta.verify_crc && crc32c(disk) != meta.crc) { + return Status::Corruption("frq: region crc mismatch"); + } + if (!meta.zstd) { + if (meta.uncomp_len != meta.disk_len) { + return Status::Corruption("frq: raw region length inconsistent"); + } + *plain = disk; + return Status::OK(); + } + SNII_RETURN_IF_ERROR(zstd_decompress(disk, static_cast(meta.uncomp_len), holder)); + *plain = Slice(*holder); + return Status::OK(); +} + +} // namespace + +Status build_dd_region(std::span docids_ascending, uint64_t win_base, + int zstd_level_or_neg_for_auto, ByteSink* out, FrqRegionMeta* meta) { + if (out == nullptr || meta == nullptr) { + return Status::InvalidArgument("frq: null dd region out"); + } + SNII_RETURN_IF_ERROR(validate_docs(docids_ascending, win_base)); + ByteSink plain; // VInt n ++ PFOR_runs(doc_delta) + std::vector dd(docids_ascending.size()); + uint64_t prev = win_base; + for (size_t i = 0; i < docids_ascending.size(); ++i) { + dd[i] = static_cast(static_cast(docids_ascending[i]) - prev); + prev = docids_ascending[i]; + } + plain.put_varint32(static_cast(docids_ascending.size())); + encode_pfor_runs(dd, &plain); + return emit_region(plain.view(), zstd_level_or_neg_for_auto, out, meta); +} + +Status build_freq_region(std::span freqs, int zstd_level_or_neg_for_auto, + ByteSink* out, FrqRegionMeta* meta) { + if (out == nullptr || meta == nullptr) { + return Status::InvalidArgument("frq: null freq region out"); + } + ByteSink plain; + encode_pfor_runs(freqs, &plain); + return emit_region(plain.view(), zstd_level_or_neg_for_auto, out, meta); +} + +Status decode_dd_region(Slice dd_disk, const FrqRegionMeta& meta, uint64_t win_base, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("frq: null docids out"); + std::vector holder; + Slice plain; + SNII_RETURN_IF_ERROR(open_region(dd_disk, meta, &holder, &plain)); + ByteSource src(plain); + uint32_t n = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&n)); + if (n > kMaxWindowDocs) return 
Status::Corruption("frq: doc count exceeds sane cap"); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, n, docids)); + if (!src.eof()) { + return Status::Corruption("frq: trailing bytes after dd region payload"); + } + uint64_t cur = win_base; + for (uint32_t i = 0; i < n; ++i) { + cur += (*docids)[i]; + (*docids)[i] = static_cast(cur); + } + return Status::OK(); +} + +Status decode_freq_region(Slice freq_disk, const FrqRegionMeta& meta, size_t doc_count, + std::vector* freqs) { + if (freqs == nullptr) return Status::InvalidArgument("frq: null freqs out"); + std::vector holder; + Slice plain; + SNII_RETURN_IF_ERROR(open_region(freq_disk, meta, &holder, &plain)); + if (doc_count == 0) { + if (meta.uncomp_len != 0) { + return Status::Corruption("frq: empty freq region expected"); + } + freqs->clear(); + return Status::OK(); + } + ByteSource src(plain); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, freqs)); + if (!src.eof()) { + return Status::Corruption("frq: trailing bytes after freq region payload"); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/frq_prelude.cpp b/be/src/storage/index/snii/core/src/format/frq_prelude.cpp new file mode 100644 index 00000000000000..568fda00f2f854 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/frq_prelude.cpp @@ -0,0 +1,470 @@ +#include "snii/format/frq_prelude.h" + +#include +#include +#include + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" + +namespace snii::format { + +namespace { + +// Anti-DoS: a segment holds at most ~15M docs (>=1 doc/window), so 1<<24 +// windows is a generous ceiling that still prevents multi-GB allocations from a +// crafted N. (crc32c is not a MAC and cannot defend a re-stamped inflated count.) 
+constexpr uint64_t kMaxWindows = 1ull << 24;
+
+// Rounds a / b up. `b` must be non-zero (callers check group_size first).
+// Written without `a + b - 1` so a large untrusted `b` cannot wrap the sum.
+uint64_t ceil_div(uint64_t a, uint64_t b) {
+    return a / b + ((a % b != 0) ? 1 : 0);
+}
+
+// Packs the column-presence bits of the prelude header flags byte.
+uint8_t make_flags(const FrqPreludeColumns& cols) {
+    uint8_t flags = 0;
+    if (cols.has_freq) flags |= frq_prelude_flags::kHasFreq;
+    if (cols.has_prx) flags |= frq_prelude_flags::kHasPrx;
+    return flags;
+}
+
+// Packs the per-window codec bits; the freq bit is only emitted when the
+// prelude has a freq column.
+uint8_t make_win_mode(const WindowMeta& m, bool has_freq) {
+    uint8_t mode = 0;
+    if (m.dd_zstd) mode |= frq_win_mode::kDdZstd;
+    if (has_freq && m.freq_zstd) mode |= frq_win_mode::kFreqZstd;
+    return mode;
+}
+
+// *out = lhs + rhs, or Corruption(message) when the sum would overflow uint64.
+Status checked_add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) {
+    if (rhs > std::numeric_limits<uint64_t>::max() - lhs) {
+        return Status::Corruption(message);
+    }
+    *out = lhs + rhs;
+    return Status::OK();
+}
+
+// Narrows value to uint32, or returns Corruption(message) when it does not fit.
+Status checked_u32(uint64_t value, const char* message, uint32_t* out) {
+    if (value > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption(message);
+    }
+    *out = static_cast<uint32_t>(value);
+    return Status::OK();
+}
+
+// Checks that doc_count fits in the docid range a window can cover: the first
+// window spans [0, last_docid], every later one spans [win_base + 1, last_docid].
+Status validate_window_doc_count(bool first_window, uint64_t win_base, uint64_t last_docid,
+                                 uint64_t doc_count) {
+    uint64_t first_docid = 0;
+    if (!first_window) {
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                win_base, 1, "frq_prelude: window base exceeds docid range", &first_docid));
+    }
+    if (last_docid < first_docid) {
+        return Status::Corruption("frq_prelude: invalid window docid range");
+    }
+    const uint64_t width = last_docid - first_docid + 1;
+    if (doc_count > width) {
+        return Status::Corruption("frq_prelude: doc_count exceeds window width");
+    }
+    return Status::OK();
+}
+
+// Validates builder input: non-null sink, group_size>=1, sane count, and
+// strictly increasing absolute last_docid across windows. Equal neighbours are
+// rejected because the decoder (validate_window_doc_count) requires every
+// window after the first to end above the previous window's last docid; letting
+// them through would write a prelude that cannot be opened again.
+Status validate_input(const FrqPreludeColumns& cols, ByteSink* out) {
+    if (out == nullptr) return Status::InvalidArgument("frq_prelude: null sink");
+    if (cols.group_size == 0) {
+        return Status::InvalidArgument("frq_prelude: group_size must be >= 1");
+    }
+    if (cols.windows.size() > kMaxWindows) {
+        return Status::InvalidArgument("frq_prelude: window count exceeds cap");
+    }
+    for (size_t w = 1; w < cols.windows.size(); ++w) {
+        if (cols.windows[w].last_docid <= cols.windows[w - 1].last_docid) {
+            return Status::InvalidArgument("frq_prelude: last_docid not monotonic");
+        }
+    }
+    return Status::OK();
+}
+
+// Encodes one window row into a per-block sink. last_docid_delta is the row's
+// absolute last_docid minus prev_last (the previous window's absolute last).
+// Field order is fixed and must match decode_window_row.
+void encode_window_row(const WindowMeta& m, bool has_freq, bool has_prx, uint64_t prev_last,
+                       ByteSink* block) {
+    block->put_varint64(static_cast<uint64_t>(m.last_docid) - prev_last);
+    block->put_varint64(m.doc_count);
+    block->put_u8(make_win_mode(m, has_freq));
+    block->put_varint64(m.dd_off);
+    block->put_varint64(m.dd_disk_len);
+    block->put_varint64(m.dd_uncomp_len);
+    block->put_fixed32(m.crc_dd);
+    if (has_freq) {
+        block->put_varint64(m.freq_off);
+        block->put_varint64(m.freq_disk_len);
+        block->put_varint64(m.freq_uncomp_len);
+        block->put_fixed32(m.crc_freq);
+    }
+    if (has_prx) {
+        block->put_varint64(m.prx_off);
+        block->put_varint64(m.prx_len);
+    }
+    block->put_varint64(m.max_freq);
+    block->put_u8(m.max_norm);
+}
+
+// One super-block's serialized window block plus its directory fields.
+struct SuperBlock {
+    ByteSink block;
+    uint64_t last_docid = 0; // absolute last docid of this super-block's last window
+};
+
+// Builds every super-block's window block (row-encoded) and records the running
+// absolute last docid at each super-block boundary.
+std::vector encode_super_blocks(const FrqPreludeColumns& cols) { + const uint32_t g = cols.group_size; + const size_t n = cols.windows.size(); + std::vector blocks; + blocks.reserve(static_cast(ceil_div(n, g))); + uint64_t prev_last = 0; // previous window's absolute last docid (chains across blocks) + for (size_t start = 0; start < n; start += g) { + const size_t end = std::min(n, start + g); + SuperBlock sb; + for (size_t w = start; w < end; ++w) { + encode_window_row(cols.windows[w], cols.has_freq, cols.has_prx, prev_last, &sb.block); + prev_last = cols.windows[w].last_docid; + } + sb.last_docid = prev_last; + blocks.push_back(std::move(sb)); + } + return blocks; +} + +// Serializes the super_block_dir (one row per super-block) into dir_sink, using +// each block's byte length to compute its offset within the window_dir region. +void encode_super_block_dir(const std::vector& blocks, ByteSink* dir_sink) { + uint64_t prev_last = 0; + uint64_t block_off = 0; + for (const SuperBlock& sb : blocks) { + dir_sink->put_varint64(sb.last_docid - prev_last); + dir_sink->put_varint64(block_off); + dir_sink->put_varint64(sb.block.size()); + prev_last = sb.last_docid; + block_off += sb.block.size(); + } +} + +} // namespace + +Status build_frq_prelude(const FrqPreludeColumns& cols, ByteSink* out) { + SNII_RETURN_IF_ERROR(validate_input(cols, out)); + + const std::vector blocks = encode_super_blocks(cols); + ByteSink dir_sink; + encode_super_block_dir(blocks, &dir_sink); + + // covered = header + super_block_dir (the crc covers exactly this region). 
+ ByteSink covered; + covered.put_u8(make_flags(cols)); + covered.put_varint64(cols.windows.size()); + covered.put_varint64(cols.group_size); + covered.put_varint64(blocks.size()); + covered.put_varint64(dir_sink.size()); + covered.put_bytes(dir_sink.view()); + + out->put_bytes(covered.view()); + out->put_fixed32(crc32c(covered.view())); + for (const SuperBlock& sb : blocks) out->put_bytes(sb.block.view()); + return Status::OK(); +} + +namespace { + +// Decoded header fields shared between parse phases. +struct Header { + bool has_freq = false; + bool has_prx = false; + uint64_t n = 0; + uint64_t group_size = 0; + uint64_t n_super = 0; + uint64_t sbdir_len = 0; +}; + +// Verifies the trailing crc covers [start of buffer .. end of super_block_dir]. +// covered_len = header bytes (up to and including sbdir_len) + sbdir_len. +Status verify_covered_crc(Slice prelude, size_t header_end, uint64_t sbdir_len) { + const size_t covered = header_end + static_cast(sbdir_len); + if (covered + sizeof(uint32_t) > prelude.size()) { + return Status::Corruption("frq_prelude: buffer too short for crc region"); + } + uint32_t stored = 0; + ByteSource crc_src(prelude.subslice(covered, sizeof(uint32_t))); + SNII_RETURN_IF_ERROR(crc_src.get_fixed32(&stored)); + if (crc32c(prelude.subslice(0, covered)) != stored) { + return Status::Corruption("frq_prelude: crc32c mismatch"); + } + return Status::OK(); +} + +// Parses + validates the header (counts capped before any later reserve). 
+Status parse_header(ByteSource* src, Header* h) {
+    uint8_t flags = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&flags));
+    h->has_freq = (flags & frq_prelude_flags::kHasFreq) != 0;
+    h->has_prx = (flags & frq_prelude_flags::kHasPrx) != 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->n));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->group_size));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->n_super));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&h->sbdir_len));
+    if (h->n > kMaxWindows || h->n_super > kMaxWindows) {
+        return Status::Corruption("frq_prelude: window count exceeds sane cap");
+    }
+    if (h->group_size == 0) {
+        return Status::Corruption("frq_prelude: group_size is zero");
+    }
+    // The builder iterates with a 32-bit group size and the reader narrows this
+    // field when it stores it, so a larger value can only come from corruption.
+    // Rejecting it here keeps it from being truncated later.
+    if (h->group_size > std::numeric_limits<uint32_t>::max()) {
+        return Status::Corruption("frq_prelude: group_size exceeds u32");
+    }
+    if (h->n_super != ceil_div(h->n, h->group_size)) {
+        return Status::Corruption("frq_prelude: n_super inconsistent with N/G");
+    }
+    return Status::OK();
+}
+
+// One super-block directory row.
+struct SbDirRow {
+    uint64_t last_docid = 0; // absolute last docid of the super-block
+    uint64_t block_off = 0;  // byte offset of its window block within the window region
+    uint64_t block_len = 0;  // byte length of its window block
+};
+
+// Decodes the super_block_dir region into absolute-last-docid rows, validating
+// monotonic last docids and contiguous, in-bounds block offsets.
+Status decode_super_block_dir(Slice dir, const Header& h, std::vector<SbDirRow>* rows,
+                              uint64_t* window_region_len) {
+    // Every row is three varints, i.e. at least three bytes. Bounding n_super by
+    // the bytes actually present keeps a crafted header from forcing a large
+    // reserve before the first row is even read.
+    constexpr size_t kMinDirRowBytes = 3;
+    if (h.n_super > dir.size() / kMinDirRowBytes) {
+        return Status::Corruption("frq_prelude: n_super exceeds super-block dir capacity");
+    }
+    ByteSource src(dir);
+    rows->clear();
+    rows->reserve(static_cast<size_t>(h.n_super));
+    uint64_t prev_last = 0;
+    uint64_t expect_off = 0;
+    for (uint64_t s = 0; s < h.n_super; ++s) {
+        SbDirRow r;
+        uint64_t ldd = 0;
+        SNII_RETURN_IF_ERROR(src.get_varint64(&ldd));
+        SNII_RETURN_IF_ERROR(src.get_varint64(&r.block_off));
+        SNII_RETURN_IF_ERROR(src.get_varint64(&r.block_len));
+        // The checked add rules out wrap-around, so last docids cannot decrease.
+        SNII_RETURN_IF_ERROR(checked_add_u64(
+                prev_last, ldd, "frq_prelude: super-block last_docid overflow", &r.last_docid));
+        uint32_t checked_last = 0;
+        SNII_RETURN_IF_ERROR(checked_u32(
+                r.last_docid, "frq_prelude: super-block last_docid exceeds u32", &checked_last));
+        // Blocks must tile the window region back to back.
+        if (r.block_off != expect_off) {
+            return Status::Corruption("frq_prelude: super-block dir inconsistent");
+        }
+        SNII_RETURN_IF_ERROR(checked_add_u64(expect_off, r.block_len,
+                                             "frq_prelude: super-block dir inconsistent",
+                                             &expect_off));
+        prev_last = r.last_docid;
+        rows->push_back(r);
+    }
+    if (!src.eof()) {
+        return Status::Corruption("frq_prelude: super-block dir has trailing bytes");
+    }
+    *window_region_len = expect_off;
+    return Status::OK();
+}
+
+// Validates a per-window codec mode byte against the known bits.
+Status check_win_mode(uint8_t mode, bool has_freq) {
+    if ((mode & ~frq_win_mode::kKnownBits) != 0) {
+        return Status::Corruption("frq_prelude: unknown win_mode bits");
+    }
+    if (!has_freq && (mode & frq_win_mode::kFreqZstd) != 0) {
+        return Status::Corruption("frq_prelude: freq mode set without has_freq");
+    }
+    return Status::OK();
+}
+
+// Decodes one window row, advancing prev_last to this window's absolute last.
+Status decode_window_row(ByteSource* src, bool has_freq, bool has_prx, bool first_window,
+                         uint64_t* prev_last, WindowMeta* m) {
+    uint64_t ldd = 0, doc_count = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&ldd));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&doc_count));
+    uint8_t mode = 0;
+    SNII_RETURN_IF_ERROR(src->get_u8(&mode));
+    SNII_RETURN_IF_ERROR(check_win_mode(mode, has_freq));
+    m->dd_zstd = (mode & frq_win_mode::kDdZstd) != 0;
+    m->freq_zstd = has_freq && (mode & frq_win_mode::kFreqZstd) != 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_off));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_disk_len));
+    SNII_RETURN_IF_ERROR(src->get_varint64(&m->dd_uncomp_len));
+    SNII_RETURN_IF_ERROR(src->get_fixed32(&m->crc_dd));
+    if (has_freq) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_off));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_disk_len));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->freq_uncomp_len));
+        SNII_RETURN_IF_ERROR(src->get_fixed32(&m->crc_freq));
+    }
+    if (has_prx) {
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->prx_off));
+        SNII_RETURN_IF_ERROR(src->get_varint64(&m->prx_len));
+    }
+    uint64_t max_freq = 0;
+    SNII_RETURN_IF_ERROR(src->get_varint64(&max_freq));
+    SNII_RETURN_IF_ERROR(src->get_u8(&m->max_norm));
+
+    // All wire fields are read; now validate the 64-bit values before they are
+    // narrowed into the 32-bit WindowMeta members.
+    uint64_t last_docid = 0;
+    SNII_RETURN_IF_ERROR(checked_add_u64(*prev_last, ldd, "frq_prelude: window last_docid overflow",
+                                         &last_docid));
+    SNII_RETURN_IF_ERROR(
+            validate_window_doc_count(first_window, *prev_last, last_docid, doc_count));
+    m->win_base = *prev_last;
+    SNII_RETURN_IF_ERROR(
+            checked_u32(last_docid, "frq_prelude: window last_docid exceeds u32", &m->last_docid));
+    SNII_RETURN_IF_ERROR(
+            checked_u32(doc_count, "frq_prelude: window doc_count exceeds u32", &m->doc_count));
+    SNII_RETURN_IF_ERROR(
+            checked_u32(max_freq, "frq_prelude: window max_freq exceeds u32", &m->max_freq));
+    *prev_last = last_docid;
+    return Status::OK();
+}
+
+// Decodes one super-block's window block (<=G rows) into the global window list,
+// seeding win_base from prev_last and re-checking the recorded sb last docid.
+Status decode_one_block(Slice block, const Header& h, uint64_t sb_last_docid, size_t row_count,
+                        uint64_t* prev_last, std::vector<WindowMeta>* windows) {
+    ByteSource src(block);
+    for (size_t i = 0; i < row_count; ++i) {
+        WindowMeta m;
+        // Only the very first window of the prelude is the "first window".
+        SNII_RETURN_IF_ERROR(
+                decode_window_row(&src, h.has_freq, h.has_prx, windows->empty(), prev_last, &m));
+        windows->push_back(m);
+    }
+    if (!src.eof()) {
+        return Status::Corruption("frq_prelude: window block has trailing bytes");
+    }
+    if (*prev_last != sb_last_docid) {
+        return Status::Corruption("frq_prelude: window block last docid mismatch");
+    }
+    return Status::OK();
+}
+
+// Decodes all window blocks pointed to by the super_block_dir.
+Status decode_all_blocks(Slice window_region, const Header& h, const std::vector<SbDirRow>& dir,
+                         std::vector<WindowMeta>* windows) {
+    // Smallest possible row: 6 one-byte varints (last_docid delta, doc_count,
+    // dd_off, dd_disk_len, dd_uncomp_len, max_freq) + mode byte + max_norm byte
+    // + fixed32 crc_dd. Bounding N by the bytes present keeps a crafted header
+    // from forcing a huge reserve.
+    constexpr size_t kMinWindowRowBytes = 12;
+    if (h.n > window_region.size() / kMinWindowRowBytes) {
+        return Status::Corruption("frq_prelude: window count exceeds window region capacity");
+    }
+    windows->clear();
+    windows->reserve(static_cast<size_t>(h.n));
+    uint64_t prev_last = 0;
+    for (size_t s = 0; s < dir.size(); ++s) {
+        const SbDirRow& r = dir[s];
+        // Subtraction form: cannot wrap, unlike block_off + block_len.
+        if (r.block_off > window_region.size() ||
+            r.block_len > window_region.size() - r.block_off) {
+            return Status::Corruption("frq_prelude: window block out of region");
+        }
+        const uint64_t already = static_cast<uint64_t>(windows->size());
+        // Every super-block holds group_size rows except possibly the last one.
+        const uint64_t rows = std::min(h.group_size, h.n - already);
+        Slice block = window_region.subslice(static_cast<size_t>(r.block_off),
+                                             static_cast<size_t>(r.block_len));
+        SNII_RETURN_IF_ERROR(decode_one_block(block, h, r.last_docid, static_cast<size_t>(rows),
+                                              &prev_last, windows));
+    }
+    if (windows->size() != h.n) {
+        return Status::Corruption("frq_prelude: decoded window count mismatch");
+    }
+    return Status::OK();
+}
+
+// Validates the dd/freq region locators tile the dd-block / freq-block contiguously
+// (each region starts where the previous one ended) and returns the block lengths.
+// Contiguity makes the docs-only prefix one solid run and bounds the read range.
+Status validate_region_layout(const Header& h, const std::vector& windows, + uint64_t* dd_block_len, uint64_t* freq_block_len) { + uint64_t dd_expect = 0; + uint64_t freq_expect = 0; + for (const WindowMeta& m : windows) { + if (m.dd_off != dd_expect) { + return Status::Corruption("frq_prelude: dd region not contiguous"); + } + if (m.dd_disk_len > m.dd_uncomp_len && !m.dd_zstd) { + return Status::Corruption("frq_prelude: raw dd region length inconsistent"); + } + if (dd_expect + m.dd_disk_len < dd_expect) { + return Status::Corruption("frq_prelude: dd block length overflow"); + } + dd_expect += m.dd_disk_len; + if (h.has_freq) { + if (m.freq_off != freq_expect) { + return Status::Corruption("frq_prelude: freq region not contiguous"); + } + if (freq_expect + m.freq_disk_len < freq_expect) { + return Status::Corruption("frq_prelude: freq block length overflow"); + } + freq_expect += m.freq_disk_len; + } + } + *dd_block_len = dd_expect; + *freq_block_len = freq_expect; + return Status::OK(); +} + +} // namespace + +Status FrqPreludeReader::open(Slice prelude, FrqPreludeReader* out) { + ByteSource src(prelude); + Header h; + SNII_RETURN_IF_ERROR(parse_header(&src, &h)); + const size_t header_end = src.position(); + SNII_RETURN_IF_ERROR(verify_covered_crc(prelude, header_end, h.sbdir_len)); + + if (header_end + static_cast(h.sbdir_len) > prelude.size()) { + return Status::Corruption("frq_prelude: sbdir_len past buffer"); + } + Slice dir = prelude.subslice(header_end, static_cast(h.sbdir_len)); + std::vector rows; + uint64_t window_region_len = 0; + SNII_RETURN_IF_ERROR(decode_super_block_dir(dir, h, &rows, &window_region_len)); + + const size_t region_start = header_end + static_cast(h.sbdir_len) + sizeof(uint32_t); + if (region_start + static_cast(window_region_len) > prelude.size()) { + return Status::Corruption("frq_prelude: window region past buffer"); + } + Slice window_region = prelude.subslice(region_start, static_cast(window_region_len)); + + out->has_freq_ = 
h.has_freq; + out->has_prx_ = h.has_prx; + out->group_size_ = static_cast(h.group_size); + out->n_super_ = static_cast(h.n_super); + out->sb_last_docid_.clear(); + out->sb_last_docid_.reserve(rows.size()); + for (const SbDirRow& r : rows) out->sb_last_docid_.push_back(r.last_docid); + SNII_RETURN_IF_ERROR(decode_all_blocks(window_region, h, rows, &out->windows_)); + return validate_region_layout(h, out->windows_, &out->dd_block_len_, &out->freq_block_len_); +} + +Status FrqPreludeReader::window(uint32_t w, WindowMeta* out) const { + if (out == nullptr) return Status::InvalidArgument("frq_prelude: null window out"); + if (w >= windows_.size()) { + return Status::InvalidArgument("frq_prelude: window index out of range"); + } + *out = windows_[w]; + return Status::OK(); +} + +Status FrqPreludeReader::locate_window(uint32_t docid, bool* found, uint32_t* w) const { + if (found == nullptr || w == nullptr) { + return Status::InvalidArgument("frq_prelude: null locate out"); + } + *found = false; + if (windows_.empty()) return Status::OK(); + if (docid > windows_.back().last_docid) return Status::OK(); + + // Level 1: first super-block whose absolute last docid >= docid. + const auto sb_it = std::lower_bound(sb_last_docid_.begin(), sb_last_docid_.end(), + static_cast(docid)); + const size_t sb = static_cast(sb_it - sb_last_docid_.begin()); + // Level 2: window binary search within [sb*G, min((sb+1)*G, N)). + const size_t lo = sb * group_size_; + const size_t hi = std::min(lo + group_size_, windows_.size()); + for (size_t i = lo; i < hi; ++i) { + if (docid <= windows_[i].last_docid) { + *found = true; + *w = static_cast(i); + return Status::OK(); + } + } + return Status::OK(); // unreachable when invariants hold; defensive miss. 
+} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp b/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp new file mode 100644 index 00000000000000..27ca75b8f6b9ec --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/logical_index_directory.cpp @@ -0,0 +1,116 @@ +#include "snii/format/logical_index_directory.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +namespace { + +// Minimum payload bytes any entry can occupy: index_id (>=1) + suffix_len (>=1, value 0) + +// meta_off (>=1) + meta_len (>=1). Used as an anti-DoS lower bound before reserving. +constexpr size_t kMinEntryBytes = 4; + +// Encode one directory entry. Fixed field order; reuse ByteSink varint/bytes primitives. +void encode_entry(const LogicalIndexRef& ref, ByteSink* payload) { + payload->put_varint64(ref.index_id); + payload->put_varint32(static_cast(ref.index_suffix.size())); + payload->put_bytes(Slice(std::string_view(ref.index_suffix))); + payload->put_varint64(ref.meta_off); + payload->put_varint64(ref.meta_len); +} + +// Decode one directory entry, validating suffix_len against the remaining payload before copying. +Status decode_entry(ByteSource* ps, LogicalIndexRef* ref) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->index_id)); + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(ps->get_varint32(&suffix_len)); + // Anti-DoS: reject a suffix_len that cannot fit in the remaining bytes before allocating. 
+ if (suffix_len > ps->remaining()) { + return Status::Corruption("logical_index_directory: suffix_len exceeds payload"); + } + Slice suffix; + SNII_RETURN_IF_ERROR(ps->get_bytes(suffix_len, &suffix)); + ref->index_suffix.assign(reinterpret_cast(suffix.data()), suffix.size()); + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->meta_off)); + SNII_RETURN_IF_ERROR(ps->get_varint64(&ref->meta_len)); + return Status::OK(); +} + +Status decode_payload(Slice payload, std::vector* refs) { + ByteSource ps(payload); + uint32_t n_entries = 0; + SNII_RETURN_IF_ERROR(ps.get_varint32(&n_entries)); + // Anti-DoS: cap n_entries against the remaining payload before reserving, so a corrupted + // inflated count cannot trigger a huge allocation. + if (n_entries > ps.remaining() / kMinEntryBytes) { + return Status::Corruption("logical_index_directory: n_entries exceeds payload capacity"); + } + refs->clear(); + refs->reserve(n_entries); + for (uint32_t i = 0; i < n_entries; ++i) { + LogicalIndexRef ref {}; + SNII_RETURN_IF_ERROR(decode_entry(&ps, &ref)); + refs->push_back(std::move(ref)); + } + if (!ps.eof()) { + return Status::Corruption("logical_index_directory: trailing bytes in payload"); + } + return Status::OK(); +} + +} // namespace + +void LogicalIndexDirectoryBuilder::finish(ByteSink* sink) const { + ByteSink payload; + payload.put_varint32(static_cast(refs_.size())); + for (const auto& ref : refs_) { + encode_entry(ref, &payload); + } + SectionFramer::write(*sink, static_cast(SectionType::kLogicalIndexDirectory), + payload.view()); +} + +Status LogicalIndexDirectoryReader::open(Slice framed, LogicalIndexDirectoryReader* out) { + if (out == nullptr) { + return Status::InvalidArgument("logical_index_directory: out is null"); + } + ByteSource src(framed); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + if (sec.type != static_cast(SectionType::kLogicalIndexDirectory)) { + return Status::InvalidArgument("logical_index_directory: unexpected section 
type"); + } + return decode_payload(sec.payload, &out->refs_); +} + +Status LogicalIndexDirectoryReader::get(uint32_t i, LogicalIndexRef* out) const { + if (out == nullptr) { + return Status::InvalidArgument("logical_index_directory: out is null"); + } + if (i >= refs_.size()) { + return Status::NotFound("logical_index_directory: index out of range"); + } + *out = refs_[i]; + return Status::OK(); +} + +Status LogicalIndexDirectoryReader::find(uint64_t index_id, std::string_view suffix, bool* found, + LogicalIndexRef* out) const { + if (found == nullptr || out == nullptr) { + return Status::InvalidArgument("logical_index_directory: output pointer is null"); + } + *found = false; + for (const auto& ref : refs_) { + if (ref.index_id != index_id || std::string_view(ref.index_suffix) != suffix) { + continue; + } + *out = ref; + *found = true; + return Status::OK(); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/norms_pod.cpp b/be/src/storage/index/snii/core/src/format/norms_pod.cpp new file mode 100644 index 00000000000000..a6f80c03b1ebcd --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/norms_pod.cpp @@ -0,0 +1,46 @@ +#include "snii/format/norms_pod.h" + +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" +#include "snii/format/format_constants.h" + +namespace snii::format { + +void NormsPodWriter::finish(ByteSink* sink) const { + // Build inner payload: [varint64 doc_count][raw norm bytes]. + ByteSink payload; + payload.put_varint64(norms_.size()); + payload.put_bytes(Slice(norms_)); + // Delegate outer framing to SectionFramer to append type+len+crc32c, avoiding manual checksum assembly. 
+ SectionFramer::write(*sink, static_cast(SectionType::kStatsBlock), payload.view()); +} + +Status NormsPodReader::open(Slice framed, NormsPodReader* out) { + // framer handles CRC verify, truncation detection, and payload slicing. + ByteSource src(framed); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + + // Parse inner payload: [varint64 doc_count][bytes]. + ByteSource payload(sec.payload); + uint64_t doc_count = 0; + SNII_RETURN_IF_ERROR(payload.get_varint64(&doc_count)); + if (doc_count > std::numeric_limits::max()) { + return Status::Corruption("norms POD doc_count overflows uint32"); + } + // doc_count must exactly equal the remaining byte count (1 byte per doc). + if (payload.remaining() != doc_count) { + return Status::Corruption("norms POD length mismatch"); + } + + Slice bytes; + SNII_RETURN_IF_ERROR(payload.get_bytes(static_cast(doc_count), &bytes)); + out->doc_count_ = static_cast(doc_count); + // NOTE(review): norms_ is taken from the Slice filled by get_bytes, + // presumably a view into `framed`; if so `framed` must outlive the reader. + out->norms_ = bytes.data(); + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/null_bitmap.cpp b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp new file mode 100644 index 00000000000000..2ca7be630fe06d --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp @@ -0,0 +1,99 @@ +#include "snii/format/null_bitmap.h" + +#include +#include + +#include "roaring/roaring.h" +#include "roaring/roaring.hh" +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" + +namespace snii::format { + +NullBitmapWriter::NullBitmapWriter() : bitmap_(std::make_unique()) {} + +NullBitmapWriter::~NullBitmapWriter() = default; + +void NullBitmapWriter::add_null(uint32_t docid) { + bitmap_->add(docid); +} + +uint32_t NullBitmapWriter::null_count() const { + return static_cast(bitmap_->cardinality()); +} + +void NullBitmapWriter::finish(uint32_t doc_count, ByteSink* sink) const { + // Serialize the Roaring
bitmap to its portable on-disk form. + const size_t roaring_size = bitmap_->getSizeInBytes(); + std::vector roaring_buf(roaring_size); + bitmap_->write(roaring_buf.data()); + + // Build inner payload: [varint64 doc_count][varint64 roaring_size][bytes]. + ByteSink payload; + payload.put_varint64(doc_count); + payload.put_varint64(roaring_size); + payload.put_bytes(Slice(reinterpret_cast(roaring_buf.data()), roaring_size)); + + // Delegate the type + len + crc32c envelope to SectionFramer. + SectionFramer::write(*sink, kNullBitmapSectionType, payload.view()); +} + +NullBitmapReader::NullBitmapReader() : bitmap_(std::make_unique()) {} + +NullBitmapReader::~NullBitmapReader() = default; + +NullBitmapReader::NullBitmapReader(NullBitmapReader&&) noexcept = default; +NullBitmapReader& NullBitmapReader::operator=(NullBitmapReader&&) noexcept = default; + +// Opens a framed null-bitmap section: verifies the frame, bounds-checks the +// declared sizes, validates the roaring container, then loads it into *out. +Status NullBitmapReader::open(Slice framed, NullBitmapReader* out) { + // SectionFramer handles CRC verification, truncation detection, and payload + // slicing. + ByteSource src(framed); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + + // Parse inner payload: [varint64 doc_count][varint64 roaring_size][bytes]. + ByteSource payload(sec.payload); + uint64_t doc_count = 0; + SNII_RETURN_IF_ERROR(payload.get_varint64(&doc_count)); + if (doc_count > std::numeric_limits::max()) { + return Status::Corruption("null bitmap doc_count overflows uint32"); + } + + uint64_t roaring_size = 0; + SNII_RETURN_IF_ERROR(payload.get_varint64(&roaring_size)); + // Anti-DoS: the declared roaring_size must not exceed the bytes actually + // present, otherwise readSafe could be told to walk past the payload. + // NOTE(review): this is an upper bound only; payload bytes left over after + // the roaring container are not rejected anywhere in this function. + if (roaring_size > payload.remaining()) { + return Status::Corruption("null bitmap roaring_size exceeds payload"); + } + + Slice roaring_bytes; + SNII_RETURN_IF_ERROR(payload.get_bytes(static_cast(roaring_size), &roaring_bytes)); + + // Validate the Roaring container BEFORE deserializing.
A CRC-valid frame can + // still carry malformed roaring bytes; Roaring::readSafe / read would then hit + // CRoaring's terminate-or-throw path (NULL -> ROARING_TERMINATE). The safe, + // non-throwing C probe returns the exact byte count a valid container would + // consume, or 0 on malformed/insufficient input. + const char* rb = reinterpret_cast(roaring_bytes.data()); + const size_t probed = + roaring_bitmap_portable_deserialize_size(rb, static_cast(roaring_size)); + if (probed == 0 || probed != static_cast(roaring_size)) { + return Status::Corruption("null bitmap: malformed roaring container"); + } + *out->bitmap_ = roaring::Roaring::readSafe(rb, static_cast(roaring_size)); + out->doc_count_ = static_cast(doc_count); + return Status::OK(); +} + +bool NullBitmapReader::is_null(uint32_t docid) const { + return bitmap_->contains(docid); +} + +uint32_t NullBitmapReader::null_count() const { + return static_cast(bitmap_->cardinality()); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/per_index_meta.cpp b/be/src/storage/index/snii/core/src/format/per_index_meta.cpp new file mode 100644 index 00000000000000..31bb6e42445404 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/per_index_meta.cpp @@ -0,0 +1,191 @@ +#include "snii/format/per_index_meta.h" + +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/section_framer.h" + +namespace snii::format { + +namespace { + +// Upper bound on index_suffix length read from untrusted bytes, capped before +// allocation to avoid a DoS-inflated reserve. A logical index suffix is a short +// column/field name; 64 KiB is far beyond any legitimate value.
+constexpr uint32_t kMaxSuffixLen = 64u * 1024u; + +void encode_region(const RegionRef& r, ByteSink* payload) { + payload->put_varint64(r.offset); + payload->put_varint64(r.length); +} + +Status decode_region(ByteSource* ps, RegionRef* r) { + SNII_RETURN_IF_ERROR(ps->get_varint64(&r->offset)); + SNII_RETURN_IF_ERROR(ps->get_varint64(&r->length)); + return Status::OK(); +} + +// SectionRefs payload: five RegionRefs in fixed order, each as varint64 pair. +// Order: dict_region, posting_region, norms, null_bitmap, bsbf. +void encode_section_refs(const SectionRefs& refs, ByteSink* sink) { + ByteSink payload; + encode_region(refs.dict_region, &payload); + encode_region(refs.posting_region, &payload); + encode_region(refs.norms, &payload); + encode_region(refs.null_bitmap, &payload); + encode_region(refs.bsbf, &payload); + SectionFramer::write(*sink, static_cast(SectionType::kSectionRefs), payload.view()); +} + +// Decodes the five RegionRefs in the fixed order above and rejects any bytes +// left over in `payload`. Fields of *out decoded before a failure keep their +// new values. +Status decode_section_refs(Slice payload, SectionRefs* out) { + ByteSource ps(payload); + SNII_RETURN_IF_ERROR(decode_region(&ps, &out->dict_region)); + SNII_RETURN_IF_ERROR(decode_region(&ps, &out->posting_region)); + SNII_RETURN_IF_ERROR(decode_region(&ps, &out->norms)); + SNII_RETURN_IF_ERROR(decode_region(&ps, &out->null_bitmap)); + SNII_RETURN_IF_ERROR(decode_region(&ps, &out->bsbf)); + if (!ps.eof()) { + return Status::Corruption("per_index_meta: trailing bytes in section_refs"); + } + return Status::OK(); +} + +// Writes the self-checksummed header prefix. Layout matches the class comment.
+void encode_header(uint64_t index_id, const std::string& suffix, uint32_t flags, ByteSink* sink) { + ByteSink head; + head.put_fixed16(kMetaFormatVersion); + head.put_varint64(index_id); + head.put_varint32(static_cast(suffix.size())); + head.put_bytes(Slice(suffix)); + head.put_fixed32(flags); + uint32_t crc = crc32c(head.view()); + sink->put_bytes(head.view()); + sink->put_fixed32(crc); +} + +// Parses and crc-verifies the header prefix, advancing src past the crc field. +// The crc is checked over block.subslice(start, covered), where start is src's +// position on entry and covered runs through the flags field, so `src` is +// assumed to have been constructed over `block`. *index_id and *flags are +// written while parsing; *suffix is assigned only after the crc matches. +Status decode_header(Slice block, ByteSource* src, uint64_t* index_id, std::string* suffix, + uint32_t* flags) { + size_t start = src->position(); + uint16_t version = 0; + SNII_RETURN_IF_ERROR(src->get_fixed16(&version)); + if (version != kMetaFormatVersion) { + return Status::Corruption("per_index_meta: unsupported meta_format_version"); + } + SNII_RETURN_IF_ERROR(src->get_varint64(index_id)); + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&suffix_len)); + if (suffix_len > kMaxSuffixLen || suffix_len > src->remaining()) { + return Status::Corruption("per_index_meta: suffix_len exceeds bounds"); + } + Slice suffix_view; + SNII_RETURN_IF_ERROR(src->get_bytes(suffix_len, &suffix_view)); + SNII_RETURN_IF_ERROR(src->get_fixed32(flags)); + size_t covered = src->position() - start; + uint32_t stored = 0; + SNII_RETURN_IF_ERROR(src->get_fixed32(&stored)); + if (crc32c(block.subslice(start, covered)) != stored) { + return Status::Corruption("per_index_meta: header crc mismatch"); + } + suffix->assign(reinterpret_cast(suffix_view.data()), suffix_view.size()); + return Status::OK(); +} + +// Reads one framed section, returning both its type and the FULL frame Slice +// (type+len+payload+crc) so it can be re-opened by a sub-module reader. The +// framer itself crc-verifies the frame.
+Status read_frame(Slice block, ByteSource* src, uint8_t* type, Slice* frame) { + size_t start = src->position(); + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(*src, &sec)); + *type = sec.type; + *frame = block.subslice(start, src->position() - start); + return Status::OK(); +} + +// Routes an optional sub-section frame to its slot: kSampledTermIndex is stored +// in *sampled and kDictBlockDirectory in *dict. Any other section type is +// intentionally ignored (forward compatibility: skip unknown optional +// sections); nothing is returned to signal that case. +void dispatch_frame(uint8_t type, Slice frame, Slice* sampled, Slice* dict) { + if (type == static_cast(SectionType::kSampledTermIndex)) { + *sampled = frame; + } else if (type == static_cast(SectionType::kDictBlockDirectory)) { + *dict = frame; + } +} + +} // namespace + +PerIndexMetaBuilder::PerIndexMetaBuilder(uint64_t index_id, std::string index_suffix, + uint32_t flags) + : index_id_(index_id), index_suffix_(std::move(index_suffix)), flags_(flags) {} + +void PerIndexMetaBuilder::set_stats(const StatsBlock& stats) { + stats_ = stats; +} + +void PerIndexMetaBuilder::set_sampled_term_index(Slice framed_bytes) { + sampled_term_index_.assign(framed_bytes.data(), framed_bytes.data() + framed_bytes.size()); +} + +void PerIndexMetaBuilder::set_dict_block_directory(Slice framed_bytes) { + dict_block_directory_.assign(framed_bytes.data(), framed_bytes.data() + framed_bytes.size()); +} + +void PerIndexMetaBuilder::set_section_refs(const SectionRefs& refs) { + section_refs_ = refs; +} + +void PerIndexMetaBuilder::add_raw_section(Slice framed_bytes) { + extra_sections_.emplace_back(framed_bytes.data(), framed_bytes.data() + framed_bytes.size()); +} + +Status PerIndexMetaBuilder::finish(ByteSink* sink) const { + if (sink == nullptr) { + return Status::InvalidArgument("per_index_meta: null sink"); + } + encode_header(index_id_, index_suffix_, flags_, sink); +
encode_stats_block(stats_, sink); + sink->put_bytes(Slice(sampled_term_index_)); + sink->put_bytes(Slice(dict_block_directory_)); + encode_section_refs(section_refs_, sink); + for (const auto& extra : extra_sections_) { + sink->put_bytes(Slice(extra)); + } + return Status::OK(); +} + +Status PerIndexMetaReader::open(Slice block, PerIndexMetaReader* out) { + if (out == nullptr) { + return Status::InvalidArgument("per_index_meta: null reader"); + } + ByteSource src(block); + SNII_RETURN_IF_ERROR( + decode_header(block, &src, &out->index_id_, &out->index_suffix_, &out->flags_)); + bool have_stats = false; + bool have_refs = false; + while (!src.eof()) { + uint8_t type = 0; + Slice frame; + SNII_RETURN_IF_ERROR(read_frame(block, &src, &type, &frame)); + if (type == static_cast(SectionType::kStatsBlock)) { + ByteSource fs(frame); + SNII_RETURN_IF_ERROR(decode_stats_block(&fs, &out->stats_)); + have_stats = true; + } else if (type == static_cast(SectionType::kSectionRefs)) { + FramedSection sec; + ByteSource fs(frame); + SNII_RETURN_IF_ERROR(SectionFramer::read(fs, &sec)); + SNII_RETURN_IF_ERROR(decode_section_refs(sec.payload, &out->section_refs_)); + have_refs = true; + } else { + dispatch_frame(type, frame, &out->sampled_term_index_, &out->dict_block_directory_); + } + } + if (!have_stats || !have_refs) { + return Status::Corruption("per_index_meta: missing required sub-section"); + } + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/prx_pod.cpp b/be/src/storage/index/snii/core/src/format/prx_pod.cpp new file mode 100644 index 00000000000000..a4a21f9056abfc --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/prx_pod.cpp @@ -0,0 +1,627 @@ +#include "snii/format/prx_pod.h" + +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/pfor.h" +#include "snii/encoding/zstd_codec.h"
+#include "snii/format/format_constants.h" + +namespace snii::format { +namespace { + +// Auto-compression threshold: use raw when payload is smaller than this (zstd +// gain is negligible and metadata overhead is relatively large). +// In this file only should_compress() reads it, and only for a negative (auto) +// level. +inline constexpr size_t kAutoZstdMinBytes = 512; +// Default zstd level in auto mode. +inline constexpr int kDefaultZstdLevel = 3; +// Maximum decompressed byte size for a single .prx window. Guards against a +// corrupted uncomp_len read from S3 inflated to a huge value: sanity-check +// before allocating/decompressing to avoid GB-scale allocations. Windows are +// 256-doc aligned and normally far below this limit. +inline constexpr uint32_t kMaxWindowUncompBytes = 256u * 1024 * 1024; +// Anti-DoS cap on position count decoded from a single window before +// allocation. +inline constexpr uint32_t kMaxWindowPositions = 1u << 26; // 64M positions/window +// Anti-DoS cap on doc count decoded from a single window before allocation. A +// corrupt doc_count is otherwise fed straight to assign()/reserve() -> +// bad_alloc. +inline constexpr uint32_t kMaxWindowDocs = 1u << 24; // 16M docs/window + +// Writer-side precondition for the FLAT builders: the per-doc partition `freqs` +// must address exactly the positions present in `flat`. If sum(freqs) overruns +// flat.size() a (positions_flat, freqs) mismatch would index flat[off+i] past +// the span end -- an out-of-bounds read on caller-supplied data. Reject it as +// InvalidArgument BEFORE any indexing so the bug surfaces as a clean Status, +// never UB. (sum < size leaves trailing positions unused, which is also a +// writer bug, so we require exact equality.) Uint64 accumulation cannot +// overflow for uint32 freqs.
+Status check_flat_partition(std::span flat, std::span freqs) { + uint64_t sum = 0; + for (uint32_t fc : freqs) sum += fc; + if (sum != flat.size()) { + return Status::InvalidArgument("prx: sum(freqs) does not match positions_flat size"); + } + return Status::OK(); +} + +// Encode per-doc position lists into a self-describing plain payload (doc_count +// + per-doc delta stream). +// Per doc: varint pos_count, then the first position as-is and every later one +// as a delta from its predecessor. Equal neighbours pass the ordering check +// (only pos < prev is rejected) and encode as a zero delta. On an ordering +// error the bytes already appended to `out` are left in place. +Status encode_payload(std::span> per_doc, ByteSink* out) { + out->put_varint32(static_cast(per_doc.size())); + for (const auto& doc : per_doc) { + out->put_varint32(static_cast(doc.size())); + uint32_t prev = 0; + for (size_t i = 0; i < doc.size(); ++i) { + uint32_t pos = doc[i]; + if (i > 0 && pos < prev) { + return Status::InvalidArgument("prx: positions within a doc must be ascending"); + } + out->put_varint32(i == 0 ? pos : pos - prev); + prev = pos; + } + } + return Status::OK(); +} + +// FLAT-positions encoder: identical wire output to encode_payload above, but +// reads positions from a single flat span partitioned per-doc by `freqs` (doc d +// owns the next freqs[d] entries). This avoids materializing a +// vector-of-vectors for the window; freqs.size() is the doc count and +// sum(freqs) == flat.size(). +Status encode_payload_flat(std::span flat, std::span freqs, + ByteSink* out) { + SNII_RETURN_IF_ERROR(check_flat_partition(flat, freqs)); + out->put_varint32(static_cast(freqs.size())); + size_t off = 0; + for (uint32_t fc : freqs) { + out->put_varint32(fc); + uint32_t prev = 0; + for (uint32_t i = 0; i < fc; ++i) { + const uint32_t pos = flat[off + i]; + if (i > 0 && pos < prev) { + return Status::InvalidArgument("prx: positions within a doc must be ascending"); + } + out->put_varint32(i == 0 ? pos : pos - prev); + prev = pos; + } + off += fc; + } + return Status::OK(); +} + +// Encode a uint32 array into PFOR runs of kFrqBaseUnit (256) elements each. The +// run count is derived by the decoder from the total length, so it is not +// stored.
+void encode_pfor_runs(std::span values, ByteSink* out) { + const size_t n = values.size(); + for (size_t off = 0; off < n; off += kFrqBaseUnit) { + const size_t run = (n - off < kFrqBaseUnit) ? (n - off) : kFrqBaseUnit; + pfor_encode(values.data() + off, run, out); + } +} + +// Decode n uint32 values (multiple PFOR runs of kFrqBaseUnit each) into out. +// out is resized to n (zero-filled) up front, so callers must bound n first. +Status decode_pfor_runs(ByteSource* src, size_t n, std::vector* out) { + out->assign(n, 0); + for (size_t off = 0; off < n; off += kFrqBaseUnit) { + const size_t run = (n - off < kFrqBaseUnit) ? (n - off) : kFrqBaseUnit; + SNII_RETURN_IF_ERROR(pfor_decode(src, run, out->data() + off)); + } + return Status::OK(); +} + +// PFOR window payload (self-describing; no entropy coding): +// VInt doc_count +// VInt total_pos # sum of all pos_counts +// PFOR_runs(pos_counts) # doc_count values (bit-packed; mostly 1 -> ~1 +// bit) PFOR_runs(position_deltas) # total_pos deltas, flat across docs (first +// per +// # doc absolute, rest delta-within-doc) +// Bit-packing the per-doc pos_counts (vs one varint each) is the size win: in a +// uniform corpus most docs have freq 1, so the count column packs to ~1 +// bit/doc. Builds the payload from a flat positions span partitioned per-doc by +// `freqs`. +Status encode_pfor_payload_flat(std::span flat, std::span freqs, + ByteSink* out) { + SNII_RETURN_IF_ERROR(check_flat_partition(flat, freqs)); + out->put_varint32(static_cast(freqs.size())); + out->put_varint32(static_cast(flat.size())); + encode_pfor_runs(freqs, out); + std::vector deltas; + deltas.reserve(flat.size()); + size_t off = 0; + for (uint32_t fc : freqs) { + uint32_t prev = 0; + for (uint32_t i = 0; i < fc; ++i) { + const uint32_t pos = flat[off + i]; + if (i > 0 && pos < prev) { + return Status::InvalidArgument("prx: positions within a doc must be ascending"); + } + deltas.push_back(i == 0 ?
pos : pos - prev); + prev = pos; + } + off += fc; + } + encode_pfor_runs(deltas, out); + return Status::OK(); +} + +// Builds the PFOR payload from per-doc lists (delegates through a flat view). +Status encode_pfor_payload(std::span> per_doc, ByteSink* out) { + std::vector flat, freqs; + freqs.reserve(per_doc.size()); + for (const auto& doc : per_doc) { + freqs.push_back(static_cast(doc.size())); + flat.insert(flat.end(), doc.begin(), doc.end()); + } + return encode_pfor_payload_flat(flat, freqs, out); +} + +// Decode per-doc position lists from a PFOR payload. +// doc_count and total_pos are capped before decode_pfor_runs allocates, and the +// decoded pos_counts must sum to total_pos before any delta is indexed. The +// trailing-bytes check runs last, after *out has already been filled. +Status decode_pfor_payload(Slice plain, std::vector>* out) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + std::vector pos_counts; + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); + uint64_t sum = 0; + for (uint32_t d = 0; d < doc_count; ++d) sum += pos_counts[d]; + if (sum != total_pos) { + return Status::Corruption("prx: pos_count sum mismatch"); + } + std::vector deltas; + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, &deltas)); + out->clear(); + out->reserve(doc_count); + size_t off = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + std::vector doc; + doc.reserve(pos_counts[d]); + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_counts[d]; ++i) { + prev = (i == 0) ? deltas[off + i] : prev + deltas[off + i]; + doc.push_back(prev); + } + off += pos_counts[d]; + out->push_back(std::move(doc)); + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after pfor payload"); + return Status::OK(); +} + +// Writes a PFOR window: codec=pfor, payload, crc(header+payload).
+void write_pfor(Slice payload, ByteSink* sink) { + ByteSink framed; + framed.put_u8(static_cast(PrxCodec::kPfor)); + framed.put_varint32(static_cast(payload.size())); + framed.put_bytes(payload); + sink->put_bytes(framed.view()); + sink->put_fixed32(crc32c(framed.view())); +} + +// Decode per-doc position lists from a plain payload. +// Every count read from the (untrusted) payload is bounded before it sizes an +// allocation: doc_count by kMaxWindowDocs, the running position total by +// kMaxWindowPositions (same cap as decode_payload_csr), and each pos_count by +// the bytes still unread. +Status decode_payload(Slice plain, std::vector>* out) { + ByteSource src(plain); + uint32_t doc_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + out->clear(); + out->reserve(doc_count); + uint64_t total_pos = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + uint32_t pos_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count)); + // Anti-DoS: pos_count is untrusted and is about to size a reserve(). Each + // position costs at least one varint byte, so it can never legitimately + // exceed the bytes left; the cumulative cap mirrors decode_payload_csr. + total_pos += pos_count; + if (total_pos > kMaxWindowPositions || pos_count > src.remaining()) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + std::vector doc; + doc.reserve(pos_count); + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t delta = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&delta)); + prev = (i == 0) ? delta : prev + delta; + doc.push_back(prev); + } + out->push_back(std::move(doc)); + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload"); + return Status::OK(); +} + +// CSR decode of a PFOR payload: all docs' positions into one flat buffer + +// per-doc offsets, with NO per-doc std::vector allocation. `pos_off` has +// doc_count+1 entries (pos_off[0]==0); doc d's positions are +// pos_flat[pos_off[d] .. pos_off[d+1]).
+Status decode_pfor_payload_csr(Slice plain, std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + pos_off->clear(); + pos_off->reserve(static_cast(doc_count) + 1); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, pos_off)); + uint64_t sum = 0; + for (uint32_t d = 0; d < doc_count; ++d) sum += (*pos_off)[d]; + if (sum != total_pos) return Status::Corruption("prx: pos_count sum mismatch"); + pos_flat->reserve(total_pos); + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, pos_flat)); + size_t off = 0; + uint32_t next_off = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + const uint32_t pos_count = (*pos_off)[d]; + (*pos_off)[d] = next_off; + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t& value = (*pos_flat)[off + i]; + prev = (i == 0) ?
value : prev + value; + value = prev; + } + off += pos_count; + next_off += pos_count; + } + pos_off->push_back(next_off); + if (!src.eof()) return Status::Corruption("prx: trailing bytes after pfor payload"); + return Status::OK(); +} + +Status validate_doc_ordinals(std::span doc_ordinals, uint32_t doc_count) { + uint32_t prev = 0; + for (size_t i = 0; i < doc_ordinals.size(); ++i) { + const uint32_t doc = doc_ordinals[i]; + if (doc >= doc_count) { + return Status::Corruption("prx: selected doc ordinal out of range"); + } + if (i != 0 && doc <= prev) { + return Status::InvalidArgument("prx: selected doc ordinals must be strictly ascending"); + } + prev = doc; + } + return Status::OK(); +} + +Status decode_pfor_payload_csr_selective(Slice plain, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); + + std::vector pos_counts; + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); + uint64_t sum = 0; + for (uint32_t d = 0; d < doc_count; ++d) sum += pos_counts[d]; + if (sum != total_pos) return Status::Corruption("prx: pos_count sum mismatch"); + + pos_flat->clear(); + pos_off->clear(); + pos_off->reserve(doc_ordinals.size() + 1); + pos_off->push_back(0); + + struct SelectedRange { + uint32_t begin = 0; + uint32_t end = 0; + uint32_t out_begin = 0; + }; + std::vector selected; + selected.reserve(doc_ordinals.size()); + uint32_t delta_begin = 0; + size_t next_doc = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + const uint32_t count = pos_counts[d]; + if (next_doc <
doc_ordinals.size() && doc_ordinals[next_doc] == d) { + const uint32_t out_begin = static_cast(pos_flat->size()); + selected.push_back(SelectedRange {delta_begin, delta_begin + count, out_begin}); + pos_flat->resize(pos_flat->size() + count); + pos_off->push_back(static_cast(pos_flat->size())); + ++next_doc; + } + delta_begin += count; + } + + std::vector run_buf; + size_t range_idx = 0; + for (uint32_t run_begin = 0; run_begin < total_pos; run_begin += kFrqBaseUnit) { + const uint32_t run_len = std::min(kFrqBaseUnit, total_pos - run_begin); + const uint32_t run_end = run_begin + run_len; + while (range_idx < selected.size() && selected[range_idx].end <= run_begin) { + ++range_idx; + } + if (range_idx == selected.size() || selected[range_idx].begin >= run_end) { + SNII_RETURN_IF_ERROR(pfor_skip(&src, run_len)); + continue; + } + + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, run_len, &run_buf)); + for (size_t ri = range_idx; ri < selected.size() && selected[ri].begin < run_end; ++ri) { + const SelectedRange& range = selected[ri]; + const uint32_t copy_begin = std::max(range.begin, run_begin); + const uint32_t copy_end = std::min(range.end, run_end); + const uint32_t dst_begin = range.out_begin + copy_begin - range.begin; + for (uint32_t off = copy_begin; off < copy_end; ++off) { + (*pos_flat)[dst_begin + off - copy_begin] = run_buf[off - run_begin]; + } + } + } + + for (size_t i = 0; i < doc_ordinals.size(); ++i) { + uint32_t prev = 0; + for (uint32_t off = (*pos_off)[i]; off < (*pos_off)[i + 1]; ++off) { + uint32_t& value = (*pos_flat)[off]; + prev = (off == (*pos_off)[i]) ? value : prev + value; + value = prev; + } + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after pfor payload"); + return Status::OK(); +} + +// CSR decode of a plain (raw) payload. See decode_pfor_payload_csr. +// The plain form stores no total, so the running position count is checked +// against kMaxWindowPositions as each doc's pos_count is read.
+Status decode_payload_csr(Slice plain, std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + pos_flat->clear(); + pos_off->clear(); + pos_off->reserve(static_cast(doc_count) + 1); + pos_off->push_back(0); + uint64_t total_pos = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + uint32_t pos_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count)); + total_pos += pos_count; + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t delta = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&delta)); + prev = (i == 0) ? delta : prev + delta; + pos_flat->push_back(prev); + } + pos_off->push_back(static_cast(pos_flat->size())); + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload"); + return Status::OK(); +} + +Status decode_payload_csr_selective(Slice plain, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); + pos_flat->clear(); + pos_off->clear(); + pos_off->reserve(doc_ordinals.size() + 1); + pos_off->push_back(0); + size_t next_doc = 0; + uint64_t total_pos = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + uint32_t pos_count = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&pos_count)); + total_pos += pos_count; + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + const bool selected = next_doc < doc_ordinals.size() && doc_ordinals[next_doc] ==
d; + uint32_t prev = 0; + for (uint32_t i = 0; i < pos_count; ++i) { + uint32_t delta = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&delta)); + if (!selected) continue; + prev = (i == 0) ? delta : prev + delta; + pos_flat->push_back(prev); + } + if (selected) { + pos_off->push_back(static_cast(pos_flat->size())); + ++next_doc; + } + } + if (!src.eof()) return Status::Corruption("prx: trailing bytes after payload"); + return Status::OK(); +} + +// Decision: given level and plain length, determine whether to compress. +bool should_compress(int level, size_t plain_len) { + if (level == 0) return false; // force raw + if (level > 0) return true; // force zstd + return plain_len >= kAutoZstdMinBytes; // auto +} + +// Write a raw window: codec=raw, uncomp_len, payload, crc(header+payload). +void write_raw(Slice plain, ByteSink* sink) { + ByteSink framed; + framed.put_u8(static_cast(PrxCodec::kRaw)); + framed.put_varint32(static_cast(plain.size())); + framed.put_bytes(plain); + sink->put_bytes(framed.view()); + sink->put_fixed32(crc32c(framed.view())); +} + +// Write a zstd window: codec=zstd, uncomp_len, comp_len, payload, +// crc(header+payload). A non-positive level falls back to kDefaultZstdLevel. +Status write_zstd(Slice plain, int level, ByteSink* sink) { + std::vector comp; + SNII_RETURN_IF_ERROR(zstd_compress(plain, level > 0 ? level : kDefaultZstdLevel, &comp)); + ByteSink framed; + framed.put_u8(static_cast(PrxCodec::kZstd)); + framed.put_varint32(static_cast(plain.size())); + framed.put_varint32(static_cast(comp.size())); + framed.put_bytes(Slice(comp)); + sink->put_bytes(framed.view()); + sink->put_fixed32(crc32c(framed.view())); + return Status::OK(); +} + +// Read header + payload, verify crc in retrospect, and return the payload view +// and uncomp_len to the caller.
+Status read_framed(ByteSource* src, uint8_t* codec, uint32_t* uncomp_len, Slice* payload) { + size_t start = src->position(); + SNII_RETURN_IF_ERROR(src->get_u8(codec)); + if (*codec != static_cast(PrxCodec::kRaw) && + *codec != static_cast(PrxCodec::kZstd) && + *codec != static_cast(PrxCodec::kPfor)) { + return Status::Corruption("prx: unknown codec"); + } + SNII_RETURN_IF_ERROR(src->get_varint32(uncomp_len)); + if (*uncomp_len > kMaxWindowUncompBytes) { + return Status::Corruption("prx: uncomp_len exceeds sane window cap"); + } + size_t payload_len = *uncomp_len; + if (*codec == static_cast(PrxCodec::kZstd)) { + uint32_t comp_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&comp_len)); + payload_len = comp_len; + } + SNII_RETURN_IF_ERROR(src->get_bytes(payload_len, payload)); + size_t framed_len = src->position() - start; + uint32_t stored = 0; + SNII_RETURN_IF_ERROR(src->get_fixed32(&stored)); + if (crc32c(src->slice_from(start, framed_len)) != stored) { + return Status::Corruption("prx: window crc mismatch"); + } + return Status::OK(); +} + +} // namespace + +Status build_prx_window(std::span> per_doc_positions, + int zstd_level_or_negative_for_auto, ByteSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("prx: null sink"); + // Forced legacy codecs (level 0 = raw varint, level > 0 = zstd) are kept so + // the test/legacy paths still exercise them; the auto path (< 0) now emits + // PFOR bit-packed deltas -- no entropy coding, far cheaper build CPU than + // zstd-3. + // NOTE(review): because negative levels never reach should_compress() from + // here, its auto branch (and kAutoZstdMinBytes) looks unused by this path.
+ if (zstd_level_or_negative_for_auto >= 0) { + ByteSink plain; + SNII_RETURN_IF_ERROR(encode_payload(per_doc_positions, &plain)); + Slice plain_view = plain.view(); + if (!should_compress(zstd_level_or_negative_for_auto, plain_view.size())) { + write_raw(plain_view, sink); + return Status::OK(); + } + return write_zstd(plain_view, zstd_level_or_negative_for_auto, sink); + } + ByteSink payload; + SNII_RETURN_IF_ERROR(encode_pfor_payload(per_doc_positions, &payload)); + write_pfor(payload.view(), sink); + return Status::OK(); +} + +Status build_prx_window_flat(std::span positions_flat, + std::span freqs, int zstd_level_or_negative_for_auto, + ByteSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("prx: null sink"); + if (zstd_level_or_negative_for_auto >= 0) { + ByteSink plain; + SNII_RETURN_IF_ERROR(encode_payload_flat(positions_flat, freqs, &plain)); + Slice plain_view = plain.view(); + if (!should_compress(zstd_level_or_negative_for_auto, plain_view.size())) { + write_raw(plain_view, sink); + return Status::OK(); + } + return write_zstd(plain_view, zstd_level_or_negative_for_auto, sink); + } + ByteSink payload; + SNII_RETURN_IF_ERROR(encode_pfor_payload_flat(positions_flat, freqs, &payload)); + write_pfor(payload.view(), sink); + return Status::OK(); +} + +Status read_prx_window(ByteSource* source, std::vector>* per_doc_positions) { + if (source == nullptr || per_doc_positions == nullptr) { + return Status::InvalidArgument("prx: null arg"); + } + uint8_t codec = 0; + uint32_t uncomp_len = 0; + Slice payload; + SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload)); + if (codec == static_cast(PrxCodec::kPfor)) { + return decode_pfor_payload(payload, per_doc_positions); + } + if (codec == static_cast(PrxCodec::kRaw)) { + return decode_payload(payload, per_doc_positions); + } + std::vector plain; + SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &plain)); + return decode_payload(Slice(plain), per_doc_positions); +} +
+Status read_prx_window_csr(ByteSource* source, std::vector* pos_flat, + std::vector* pos_off) { + if (source == nullptr || pos_flat == nullptr || pos_off == nullptr) { + return Status::InvalidArgument("prx: null arg"); + } + uint8_t codec = 0; + uint32_t uncomp_len = 0; + Slice payload; + SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload)); + if (codec == static_cast(PrxCodec::kPfor)) { + return decode_pfor_payload_csr(payload, pos_flat, pos_off); + } + if (codec == static_cast(PrxCodec::kRaw)) { + return decode_payload_csr(payload, pos_flat, pos_off); + } + std::vector plain; + SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &plain)); + return decode_payload_csr(Slice(plain), pos_flat, pos_off); +} + +Status read_prx_window_csr_selective(ByteSource* source, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + if (source == nullptr || pos_flat == nullptr || pos_off == nullptr) { + return Status::InvalidArgument("prx: null arg"); + } + uint8_t codec = 0; + uint32_t uncomp_len = 0; + Slice payload; + SNII_RETURN_IF_ERROR(read_framed(source, &codec, &uncomp_len, &payload)); + if (codec == static_cast(PrxCodec::kPfor)) { + return decode_pfor_payload_csr_selective(payload, doc_ordinals, pos_flat, pos_off); + } + if (codec == static_cast(PrxCodec::kRaw)) { + return decode_payload_csr_selective(payload, doc_ordinals, pos_flat, pos_off); + } + std::vector plain; + SNII_RETURN_IF_ERROR(zstd_decompress(payload, uncomp_len, &plain)); + return decode_payload_csr_selective(Slice(plain), doc_ordinals, pos_flat, pos_off); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp b/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp new file mode 100644 index 00000000000000..1f7790e3aac84e --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/sampled_term_index.cpp @@ -0,0 +1,154 @@ +#include "snii/format/sampled_term_index.h" + +#include + +#include
"snii/encoding/byte_source.h" +#include "snii/encoding/section_framer.h" + +namespace snii::format { + +namespace { + +// Longest common prefix length of term and prev (front coding primitive, consistent with dict_entry). +uint32_t common_prefix_len(std::string_view term, std::string_view prev) { + uint32_t n = 0; + const uint32_t lim = static_cast(std::min(term.size(), prev.size())); + while (n < lim && term[n] == prev[n]) ++n; + return n; +} + +// Write a front-coded term key (prefix_len + suffix_len + suffix). +void write_term_key(std::string_view term, std::string_view prev, ByteSink* sink) { + const uint32_t prefix = common_prefix_len(term, prev); + const std::string_view suffix = term.substr(prefix); + sink->put_varint32(prefix); + sink->put_varint32(static_cast(suffix.size())); + sink->put_bytes(Slice(suffix)); +} + +// Read a front-coded term key and reconstruct it into out from prev + suffix. +// prefix is validated against prev.size() before any of prev is copied; on any +// error return *out has not been modified. +Status read_term_key(ByteSource* src, std::string_view prev, std::string* out) { + uint32_t prefix = 0; + uint32_t suffix_len = 0; + SNII_RETURN_IF_ERROR(src->get_varint32(&prefix)); + SNII_RETURN_IF_ERROR(src->get_varint32(&suffix_len)); + if (prefix > prev.size()) { + return Status::Corruption("sampled_term_index: prefix_len exceeds prev_term length"); + } + Slice suffix; + SNII_RETURN_IF_ERROR(src->get_bytes(suffix_len, &suffix)); + out->assign(prev.substr(0, prefix)); + out->append(reinterpret_cast(suffix.data()), suffix.size()); + return Status::OK(); +} + +} // namespace + +void SampledTermIndexBuilder::add_block_first_term(std::string_view first_term) { + first_terms_.emplace_back(first_term); +} + +void SampledTermIndexBuilder::finish(ByteSink* sink) { + ByteSink payload; + payload.put_varint32(static_cast(first_terms_.size())); + // min_term / max_term are written only when non-empty (== first/last sample_term).
+ if (!first_terms_.empty()) { + write_term_key(first_terms_.front(), std::string_view {}, &payload); + write_term_key(first_terms_.back(), std::string_view {}, &payload); + std::string_view prev {}; + for (const auto& t : first_terms_) { + write_term_key(t, prev, &payload); + prev = t; + } + } + SectionFramer::write(*sink, static_cast(SectionType::kSampledTermIndex), + payload.view()); +} + +namespace { + +// Parse n_blocks, min/max (not used directly; consumed for checksum alignment), and all sample_terms from payload. +Status parse_payload(Slice payload, std::vector* terms) { + ByteSource src(payload); + uint32_t n_blocks = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&n_blocks)); + if (n_blocks == 0) { + if (!src.eof()) { + return Status::Corruption("sampled_term_index: empty index contains trailing bytes"); + } + terms->clear(); + return Status::OK(); + } + + // min_term / max_term (do not drive binary search directly; must be consumed to verify structural alignment). + std::string min_term; + std::string max_term; + SNII_RETURN_IF_ERROR(read_term_key(&src, std::string_view {}, &min_term)); + SNII_RETURN_IF_ERROR(read_term_key(&src, std::string_view {}, &max_term)); + + std::vector out; + out.reserve(n_blocks); + std::string prev; + for (uint32_t i = 0; i < n_blocks; ++i) { + std::string term; + SNII_RETURN_IF_ERROR(read_term_key(&src, prev, &term)); + prev = term; + out.push_back(std::move(term)); + } + if (!src.eof()) { + return Status::Corruption("sampled_term_index: payload contains trailing bytes"); + } + if (out.front() != min_term || out.back() != max_term) { + return Status::Corruption("sampled_term_index: min/max inconsistent with sample_terms"); + } + *terms = std::move(out); + return Status::OK(); +} + +} // namespace + +Status SampledTermIndexReader::open(Slice section, SampledTermIndexReader* out) { + if (out == nullptr) { + return Status::InvalidArgument("sampled_term_index: out is null"); + } + ByteSource src(section); + FramedSection sec; + 
SNII_RETURN_IF_ERROR(SectionFramer::read(src, &sec)); + if (sec.type != static_cast(SectionType::kSampledTermIndex)) { + return Status::InvalidArgument("sampled_term_index: not a kSampledTermIndex section"); + } + *out = SampledTermIndexReader {}; + return parse_payload(sec.payload, &out->sample_terms_); +} + +Status SampledTermIndexReader::locate(std::string_view target, bool* maybe_present, + uint32_t* block_ordinal) const { + if (maybe_present == nullptr || block_ordinal == nullptr) { + return Status::InvalidArgument("sampled_term_index: output pointer is null"); + } + *maybe_present = false; + *block_ordinal = 0; + if (sample_terms_.empty()) { + return Status::OK(); // empty index: always out of range. + } + // target < min_term (first block's first term) -> before the first block, so it + // cannot exist in any block. NOTE: a target GREATER than the last sample term is + // NOT out of range -- sample_terms_ holds each block's FIRST term, so the LAST + // block can contain terms greater than its first term. Such a target routes to + // the last block (upper_bound -> end()), where find_term confirms presence. + if (target < std::string_view(sample_terms_.front())) { + return Status::OK(); + } + // Last sample_term <= target: step back one position after upper_bound. For a + // target past every sample term, upper_bound returns end() and idx = n-1 (the + // last block), which is correct. + auto it = std::upper_bound( + sample_terms_.begin(), sample_terms_.end(), target, + [](std::string_view t, const std::string& s) { return t < std::string_view(s); }); + const auto idx = (it - sample_terms_.begin()) - 1; // it > begin (< min excluded). 
+ *maybe_present = true; + *block_ordinal = static_cast(idx); + return Status::OK(); +} + +} // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/format/stats_block.cpp b/be/src/storage/index/snii/core/src/format/stats_block.cpp new file mode 100644 index 00000000000000..527f4f98d43d79 --- /dev/null +++ b/be/src/storage/index/snii/core/src/format/stats_block.cpp @@ -0,0 +1,46 @@ +#include "snii/format/stats_block.h" + +namespace snii::format { + +namespace { + +// Field order within payload is fixed; reuse ByteSink varint primitives — do not hand-assemble bytes. +void encode_payload(const StatsBlock& sb, ByteSink* payload) { + payload->put_varint64(sb.doc_count); + payload->put_varint64(sb.indexed_doc_count); + payload->put_varint64(sb.term_count); + payload->put_varint64(sb.sum_total_term_freq); + payload->put_varint64(sb.null_count); +} + +Status decode_payload(Slice payload, StatsBlock* out) { + ByteSource ps(payload); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->doc_count)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->indexed_doc_count)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->term_count)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->sum_total_term_freq)); + SNII_RETURN_IF_ERROR(ps.get_varint64(&out->null_count)); + if (!ps.eof()) { + return Status::Corruption("stats_block: trailing bytes in payload"); + } + return Status::OK(); +} + +} // namespace + +void encode_stats_block(const StatsBlock& sb, ByteSink* sink) { + ByteSink payload; + encode_payload(sb, &payload); + SectionFramer::write(*sink, static_cast(SectionType::kStatsBlock), payload.view()); +} + +Status decode_stats_block(ByteSource* src, StatsBlock* out) { + FramedSection sec; + SNII_RETURN_IF_ERROR(SectionFramer::read(*src, &sec)); + if (sec.type != static_cast(SectionType::kStatsBlock)) { + return Status::InvalidArgument("stats_block: unexpected section type"); + } + return decode_payload(sec.payload, out); +} + +} // namespace snii::format diff --git 
a/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp b/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp
new file mode 100644
index 00000000000000..ed781c4d82e667
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/tail_meta_region.cpp
@@ -0,0 +1,129 @@
+#include "snii/format/tail_meta_region.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+namespace {
+
+// Header field bytes (before header_crc): u32 ver + u32 flags + u64 meta_region_len
+// + u32 n + u64 directory_offset + u64 directory_length.
+constexpr size_t kHeaderFields = 4 + 4 + 8 + 4 + 8 + 8; // 36
+constexpr size_t kHeaderSize = kHeaderFields + 4;       // + header_crc32c
+constexpr size_t kRegionChecksumSize = 4;
+
+} // namespace
+
+// Queues one per-index meta block; the bytes are copied, so the caller's slice
+// does not need to outlive the builder.
+void TailMetaRegionBuilder::add_index(uint64_t index_id, std::string index_suffix,
+                                      Slice per_index_meta_bytes) {
+    Entry e;
+    e.index_id = index_id;
+    e.suffix = std::move(index_suffix);
+    e.bytes.assign(per_index_meta_bytes.data(),
+                   per_index_meta_bytes.data() + per_index_meta_bytes.size());
+    entries_.push_back(std::move(e));
+}
+
+// Emits the region as: header fields, header_crc32c, per-index meta blocks,
+// directory, meta_region_checksum (crc32c over everything before it).
+void TailMetaRegionBuilder::finish(ByteSink* sink) const {
+    // Lay out per-index meta blocks right after the header; build the directory
+    // with each block's in-region offset/length.
+    LogicalIndexDirectoryBuilder dir;
+    uint64_t offset = kHeaderSize;
+    for (const Entry& e : entries_) {
+        LogicalIndexRef ref;
+        ref.index_id = e.index_id;
+        ref.index_suffix = e.suffix;
+        ref.meta_off = offset;
+        ref.meta_len = e.bytes.size();
+        dir.add(ref);
+        offset += e.bytes.size();
+    }
+    const uint64_t directory_offset = offset;
+    ByteSink dir_bytes;
+    dir.finish(&dir_bytes);
+    const uint64_t directory_length = dir_bytes.size();
+    const uint64_t meta_region_len = directory_offset + directory_length + kRegionChecksumSize;
+
+    ByteSink fields;
+    fields.put_fixed32(kMetaFormatVersion);
+    fields.put_fixed32(0); // flags
+    fields.put_fixed64(meta_region_len);
+    fields.put_fixed32(static_cast(entries_.size()));
+    fields.put_fixed64(directory_offset);
+    fields.put_fixed64(directory_length);
+
+    ByteSink region;
+    region.put_bytes(fields.view());
+    region.put_fixed32(crc32c(fields.view())); // header_crc32c
+    for (const Entry& e : entries_) region.put_bytes(Slice(e.bytes));
+    region.put_bytes(dir_bytes.view());
+    region.put_fixed32(crc32c(region.view())); // meta_region_checksum
+
+    sink->put_bytes(region.view());
+}
+
+// Validates `region` (trailing checksum, header crc, version, declared length,
+// directory range and entry count) and opens its directory into *out.
+// *out keeps a view of `region`; it does not copy the bytes.
+Status TailMetaRegionReader::open(Slice region, TailMetaRegionReader* out) {
+    if (out == nullptr) return Status::InvalidArgument("tail_meta_region: null out");
+    if (region.size() < kHeaderSize + kRegionChecksumSize) {
+        return Status::Corruption("tail_meta_region: region too short");
+    }
+
+    // Verify the trailing region checksum.
+    const size_t covered = region.size() - kRegionChecksumSize;
+    ByteSource cs(region.subslice(covered, kRegionChecksumSize));
+    uint32_t region_crc = 0;
+    SNII_RETURN_IF_ERROR(cs.get_fixed32(&region_crc));
+    if (crc32c(region.subslice(0, covered)) != region_crc) {
+        return Status::Corruption("tail_meta_region: meta_region_checksum mismatch");
+    }
+
+    // Parse + verify the header.
+    ByteSource hs(region.subslice(0, kHeaderFields));
+    uint32_t ver = 0, flags = 0, n = 0;
+    uint64_t meta_region_len = 0, directory_offset = 0, directory_length = 0;
+    SNII_RETURN_IF_ERROR(hs.get_fixed32(&ver));
+    SNII_RETURN_IF_ERROR(hs.get_fixed32(&flags));
+    SNII_RETURN_IF_ERROR(hs.get_fixed64(&meta_region_len));
+    SNII_RETURN_IF_ERROR(hs.get_fixed32(&n));
+    SNII_RETURN_IF_ERROR(hs.get_fixed64(&directory_offset));
+    SNII_RETURN_IF_ERROR(hs.get_fixed64(&directory_length));
+    ByteSource hc(region.subslice(kHeaderFields, 4));
+    uint32_t header_crc = 0;
+    SNII_RETURN_IF_ERROR(hc.get_fixed32(&header_crc));
+    if (crc32c(region.subslice(0, kHeaderFields)) != header_crc) {
+        return Status::Corruption("tail_meta_region: header crc mismatch");
+    }
+    if (ver != kMetaFormatVersion) {
+        return Status::Unsupported("tail_meta_region: unsupported meta_format_version");
+    }
+    if (meta_region_len != region.size()) {
+        return Status::Corruption("tail_meta_region: declared length mismatch");
+    }
+    // Non-wrapping range check: both values come from the file, so their sum can
+    // overflow uint64 and slip past an `offset + length > size` comparison.
+    if (directory_offset < kHeaderSize || directory_offset > region.size() ||
+        directory_length > region.size() - directory_offset) {
+        return Status::Corruption("tail_meta_region: directory out of range");
+    }
+
+    SNII_RETURN_IF_ERROR(LogicalIndexDirectoryReader::open(
+            region.subslice(directory_offset, directory_length), &out->dir_));
+    if (out->dir_.size() != n) {
+        return Status::Corruption("tail_meta_region: directory size mismatch");
+    }
+    out->region_ = region;
+    out->n_ = n;
+    return Status::OK();
+}
+
+// Looks up (index_id, suffix) in the directory. On a hit, *per_index_meta_bytes
+// is set to a view into the region; on a miss only *found is meaningful.
+// Returns Corruption when the directory entry points outside the region.
+Status TailMetaRegionReader::find(uint64_t index_id, std::string_view suffix, bool* found,
+                                  Slice* per_index_meta_bytes) const {
+    if (found == nullptr || per_index_meta_bytes == nullptr) {
+        return Status::InvalidArgument("tail_meta_region: null output");
+    }
+    LogicalIndexRef ref;
+    SNII_RETURN_IF_ERROR(dir_.find(index_id, suffix, found, &ref));
+    if (!*found) return Status::OK();
+    // Same non-wrapping form as in open(): meta_off + meta_len may overflow.
+    if (ref.meta_off > region_.size() || ref.meta_len > region_.size() - ref.meta_off) {
+        return Status::Corruption("tail_meta_region: meta block out of range");
+    }
+    *per_index_meta_bytes = region_.subslice(ref.meta_off, ref.meta_len);
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git
a/be/src/storage/index/snii/core/src/format/tail_pointer.cpp b/be/src/storage/index/snii/core/src/format/tail_pointer.cpp
new file mode 100644
index 00000000000000..bc17f5652d4f82
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/format/tail_pointer.cpp
@@ -0,0 +1,95 @@
+#include "snii/format/tail_pointer.h"
+
+#include "snii/encoding/byte_source.h"
+#include "snii/encoding/crc32c.h"
+#include "snii/format/format_constants.h"
+
+namespace snii::format {
+
+namespace {
+
+// Byte widths of every fixed field, used to derive the constant on-disk size:
+// u32 magic + u16 version + 3*u64 + 2*u32 + u8 size + u32 tail_checksum.
+constexpr size_t kMagicBytes = 4;
+constexpr size_t kVersionBytes = 2;
+constexpr size_t kU64Bytes = 8;
+constexpr size_t kU32Bytes = 4;
+constexpr size_t kSizeByteBytes = 1;
+
+// Total encoded size: 4 + 2 + 3*8 + 2*4 + 1 + 4 = 43 bytes.
+constexpr size_t kFixedSize =
+        kMagicBytes + kVersionBytes + 3 * kU64Bytes + 2 * kU32Bytes + kSizeByteBytes + kU32Bytes;
+// tail_checksum is the trailing u32 and covers every byte before it.
+// That is the first 39 bytes (43 - 4).
+constexpr size_t kChecksumCoverage = kFixedSize - kU32Bytes;
+
+// Serializes the checksum-covered region in fixed field order into covered.
+void serialize_covered(const TailPointer& tp, ByteSink* covered) {
+    covered->put_fixed32(kTailMagic);
+    covered->put_fixed16(kFormatVersion);
+    covered->put_fixed64(tp.meta_region_offset);
+    covered->put_fixed64(tp.meta_region_length);
+    covered->put_fixed64(tp.hot_off);
+    covered->put_fixed32(tp.meta_region_checksum);
+    covered->put_fixed32(tp.bootstrap_header_checksum);
+    covered->put_u8(static_cast(kFixedSize));
+}
+
+} // namespace
+
+// Constant on-disk size of an encoded tail pointer, in bytes.
+size_t tail_pointer_size() {
+    return kFixedSize;
+}
+
+// Appends the covered fields followed by their crc32c to `sink`.
+// Returns InvalidArgument for a null sink and Internal if the covered region
+// does not have the expected fixed length.
+Status encode_tail_pointer(const TailPointer& tp, ByteSink* sink) {
+    if (sink == nullptr) return Status::InvalidArgument("tail_pointer: null sink");
+    ByteSink covered;
+    serialize_covered(tp, &covered);
+    if (covered.size() != kChecksumCoverage) {
+        return Status::Internal("tail_pointer: covered size mismatch");
+    }
+    const uint32_t tail_checksum = crc32c(covered.view());
+    sink->put_bytes(covered.view());
+    sink->put_fixed32(tail_checksum);
+    return Status::OK();
+}
+
+// Decodes a fixed-size tail pointer from `last_bytes` into *out.
+// *out is written only after every check has passed, so a failed decode leaves
+// it untouched. Returns InvalidArgument for a null out and Corruption for a
+// wrong size, bad magic, checksum mismatch or embedded-size mismatch.
+Status decode_tail_pointer(Slice last_bytes, TailPointer* out) {
+    if (out == nullptr) return Status::InvalidArgument("tail_pointer: null out");
+    // Anti-DoS / framing: the tail pointer is a fixed-size footer, so reject any
+    // input that is not exactly the fixed size before touching its contents.
+    if (last_bytes.size() != kFixedSize) {
+        return Status::Corruption("tail_pointer: input is not the fixed size");
+    }
+    ByteSource src(last_bytes);
+
+    // Magic is checked first so input that is not a tail pointer at all keeps
+    // its specific diagnostic.
+    uint32_t magic = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&magic));
+    if (magic != kTailMagic) {
+        return Status::Corruption("tail_pointer: bad magic");
+    }
+
+    // Verify the trailing tail_checksum over the covered region before parsing
+    // the remaining fields; a mismatch means any parsed field would be untrustworthy.
+    ByteSource checksum_src(last_bytes.subslice(kChecksumCoverage, kU32Bytes));
+    uint32_t tail_checksum = 0;
+    SNII_RETURN_IF_ERROR(checksum_src.get_fixed32(&tail_checksum));
+    if (tail_checksum != crc32c(last_bytes.subslice(0, kChecksumCoverage))) {
+        return Status::Corruption("tail_pointer: tail_checksum mismatch");
+    }
+
+    uint16_t format_version = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed16(&format_version));
+    (void)format_version; // Read to advance the cursor; version policy lives in
+                          // the bootstrap header, not here.
+
+    // Parse into locals so *out is never left half-populated.
+    uint64_t meta_region_offset = 0;
+    uint64_t meta_region_length = 0;
+    uint64_t hot_off = 0;
+    uint32_t meta_region_checksum = 0;
+    uint32_t bootstrap_header_checksum = 0;
+    SNII_RETURN_IF_ERROR(src.get_fixed64(&meta_region_offset));
+    SNII_RETURN_IF_ERROR(src.get_fixed64(&meta_region_length));
+    SNII_RETURN_IF_ERROR(src.get_fixed64(&hot_off));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&meta_region_checksum));
+    SNII_RETURN_IF_ERROR(src.get_fixed32(&bootstrap_header_checksum));
+
+    uint8_t on_disk_size = 0;
+    SNII_RETURN_IF_ERROR(src.get_u8(&on_disk_size));
+    if (on_disk_size != kFixedSize) {
+        return Status::Corruption("tail_pointer: embedded size mismatch");
+    }
+
+    out->meta_region_offset = meta_region_offset;
+    out->meta_region_length = meta_region_length;
+    out->hot_off = hot_off;
+    out->meta_region_checksum = meta_region_checksum;
+    out->bootstrap_header_checksum = bootstrap_header_checksum;
+    return Status::OK();
+}
+
+} // namespace snii::format
diff --git a/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp b/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp
new file mode 100644
index 00000000000000..1292f8d4f09c2e
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/batch_range_fetcher.cpp
@@ -0,0 +1,81 @@
+#include "snii/io/batch_range_fetcher.h"
+
+#include
+#include
+
+namespace snii::io {
+namespace {
+
+// Computes offset + len into *out, failing instead of wrapping around.
+Status checked_end(uint64_t offset, uint64_t len, uint64_t* out) {
+    if (len > std::numeric_limits::max() - offset) {
+        return Status::Corruption("batch_range_fetcher: range end overflow");
+    }
+    *out = offset + len;
+    return Status::OK();
+}
+
+// Narrows a 64-bit length to size_t, failing when it does not fit.
+Status checked_size(uint64_t len, size_t* out) {
+    if (len > static_cast(std::numeric_limits::max())) {
+        return Status::Corruption("batch_range_fetcher: physical range too large");
+    }
+    *out = static_cast(len);
+    return Status::OK();
+}
+
+} // namespace
+
+BatchRangeFetcher::BatchRangeFetcher(FileReader* reader, uint64_t coalesce_gap)
+        : reader_(reader), coalesce_gap_(coalesce_gap) {}
+
+// Registers a byte range and returns its handle (the request's index), to be
+// passed to get() after fetch().
+size_t BatchRangeFetcher::add(uint64_t offset, uint64_t len) {
+    reqs_.push_back(Req {offset, len});
+    return reqs_.size() - 1;
+}
+
+// Drops all registered requests and fetched buffers.
+void BatchRangeFetcher::clear() {
+    reqs_.clear();
+    phys_.clear();
+}
+
+// Sorts the registered requests by offset, merges those that overlap or lie
+// within coalesce_gap_ bytes of the running segment, and reads the merged
+// segments with a single read_batch call.
+Status BatchRangeFetcher::fetch() {
+    if (reader_ == nullptr) return Status::InvalidArgument("batch_range_fetcher: null reader");
+    phys_.clear();
+    if (reqs_.empty()) return Status::OK();
+
+    std::vector order(reqs_.size());
+    for (size_t i = 0; i < order.size(); ++i) order[i] = i;
+    std::sort(order.begin(), order.end(),
+              [&](size_t a, size_t b) { return reqs_[a].offset < reqs_[b].offset; });
+
+    // Sweep in offset order, merging requests into physical segments.
+    std::vector segs;
+    uint64_t cur_start = 0;
+    uint64_t cur_end = 0;
+    for (size_t k = 0; k < order.size(); ++k) {
+        Req& r = reqs_[order[k]];
+        uint64_t r_end = 0;
+        SNII_RETURN_IF_ERROR(checked_end(r.offset, r.len, &r_end));
+        SNII_RETURN_IF_ERROR(checked_size(r.len, &r.len_size));
+        const bool disjoint = r.offset > cur_end && r.offset - cur_end > coalesce_gap_;
+        if (segs.empty() || disjoint) {
+            segs.push_back(Range {r.offset, 0}); // length finalized below
+            cur_start = r.offset;
+            cur_end = r_end;
+        } else {
+            cur_end = std::max(cur_end, r_end);
+        }
+        r.phys_idx = segs.size() - 1;
+        SNII_RETURN_IF_ERROR(checked_size(r.offset - cur_start, &r.sub_offset));
+        SNII_RETURN_IF_ERROR(checked_size(cur_end - cur_start, &segs.back().len));
+    }
+
+    return reader_->read_batch(segs, &phys_);
+}
+
+// Returns the bytes of request `h` as a view into its fetched segment.
+// No bounds checking is done: `h` must come from add() and fetch() must have
+// succeeded since then.
+Slice BatchRangeFetcher::get(size_t h) const {
+    const Req& r = reqs_[h];
+    const std::vector& buf = phys_[r.phys_idx];
+    return Slice(buf.data() + r.sub_offset, r.len_size);
+}
+
+} // namespace snii::io
diff --git a/be/src/storage/index/snii/core/src/io/local_file.cpp b/be/src/storage/index/snii/core/src/io/local_file.cpp
new file mode 100644
index 00000000000000..af64664fe6ad30
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/local_file.cpp
@@ -0,0 +1,113 @@
+#include "snii/io/local_file.h"
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace snii::io {
+namespace {
+
+// Formats "<what>: <strerror(errno)>"; call it before anything can change errno.
+std::string errno_msg(const char* what) {
+    return std::string(what) + ": " + std::strerror(errno);
+}
+
+} // namespace
+
+LocalFileReader::~LocalFileReader() {
+    if (fd_ >= 0) ::close(fd_);
+}
+
+// Opens `path` read-only and caches its size.
+// On any failure the reader is left unopened (fd_ < 0).
+Status LocalFileReader::open(const std::string& path) {
+    // Re-opening: release the previous descriptor instead of leaking it.
+    if (fd_ >= 0) {
+        ::close(fd_);
+        fd_ = -1;
+    }
+    fd_ = ::open(path.c_str(), O_RDONLY);
+    if (fd_ < 0) return Status::IoError(errno_msg("open"));
+    struct stat st;
+    if (::fstat(fd_, &st) != 0) {
+        // Build the message before close() can overwrite errno, then drop the
+        // descriptor so a failed open() does not look like an opened file.
+        const Status err = Status::IoError(errno_msg("fstat"));
+        ::close(fd_);
+        fd_ = -1;
+        return err;
+    }
+    size_ = static_cast(st.st_size);
+    return Status::OK();
+}
+
+// Reads exactly `len` bytes at `offset` into *out (resized to `len`).
+// Returns Corruption for a range past the cached file size or a short read.
+Status LocalFileReader::read_at(uint64_t offset, size_t len, std::vector* out) {
+    if (fd_ < 0) return Status::IoError("read_at on unopened file");
+    if (out == nullptr) return Status::InvalidArgument("read_at: null out");
+    // Non-wrapping bounds check (offset+len could overflow uint64 on a corrupt arg).
+    if (offset > size_ || len > size_ - offset) {
+        return Status::Corruption("read_at past end of file");
+    }
+    out->resize(len);
+    size_t done = 0;
+    while (done < len) {
+        ssize_t n = ::pread(fd_, out->data() + done, len - done, static_cast(offset + done));
+        if (n < 0) {
+            if (errno == EINTR) continue;
+            return Status::IoError(errno_msg("pread"));
+        }
+        if (n == 0) return Status::Corruption("pread returned 0 before len");
+        done += static_cast(n);
+    }
+    return Status::OK();
+}
+
+LocalFileWriter::~LocalFileWriter() {
+    if (fd_ >= 0) ::close(fd_); // best-effort: dtor cannot surface a flush error
+}
+
+// Creates or truncates `path` for writing and reserves the append buffer.
+Status LocalFileWriter::open(const std::string& path) {
+    fd_ = ::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    if (fd_ < 0) return Status::IoError(errno_msg("open"));
+    buf_.reserve(kBufCapacity);
+    return Status::OK();
+}
+
+// Writes all `len` bytes to the fd, retrying on EINTR and on short writes.
+Status LocalFileWriter::write_all(const uint8_t* data, size_t len) {
+    size_t done = 0;
+    while (done < len) {
+        ssize_t n = ::write(fd_, data + done, len - done);
+        if (n < 0) {
+            if (errno == EINTR) continue;
+            return Status::IoError(errno_msg("write"));
+        }
+        // A zero-byte write makes no progress; fail instead of looping forever.
+        if (n == 0) return Status::IoError("write returned 0 before len");
+        done += static_cast(n);
+    }
+    return Status::OK();
+}
+
+// Writes out and empties the pending buffer; a no-op when it is empty.
+Status LocalFileWriter::flush_buffer() {
+    if (buf_.empty()) return Status::OK();
+    SNII_RETURN_IF_ERROR(write_all(buf_.data(), buf_.size()));
+    buf_.clear();
+    return Status::OK();
+}
+
+// Appends `data`, buffering spans smaller than kBufCapacity.
+Status LocalFileWriter::append(Slice data) {
+    if (fd_ < 0) return Status::IoError("append on unopened file");
+    const size_t len = data.size();
+    if (len == 0) return Status::OK();
+    // Spans larger than the buffer go straight to the fd (after flushing pending
+    // bytes) to avoid a pointless copy and an oversized buffer.
+    if (len >= kBufCapacity) {
+        SNII_RETURN_IF_ERROR(flush_buffer());
+        SNII_RETURN_IF_ERROR(write_all(data.data(), len));
+        bytes_written_ += len;
+        return Status::OK();
+    }
+    if (buf_.size() + len > kBufCapacity) SNII_RETURN_IF_ERROR(flush_buffer());
+    buf_.insert(buf_.end(), data.data(), data.data() + len);
+    bytes_written_ += len;
+    return Status::OK();
+}
+
+// Flushes the buffer, fsyncs and closes the file.
+// fd_ is reset once close() has been attempted, whether or not it succeeded.
+Status LocalFileWriter::finalize() {
+    if (fd_ < 0) return Status::IoError("finalize on unopened file");
+    SNII_RETURN_IF_ERROR(flush_buffer());
+    if (::fsync(fd_) != 0) return Status::IoError(errno_msg("fsync"));
+    if (::close(fd_) != 0) {
+        fd_ = -1;
+        return Status::IoError(errno_msg("close"));
+    }
+    fd_ = -1;
+    return Status::OK();
+}
+
+} // namespace snii::io
diff --git a/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp b/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp
new file mode 100644
index 00000000000000..a643d8eca5aa3f
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/metered_file_reader.cpp
@@ -0,0 +1,117 @@
+#include "snii/io/metered_file_reader.h"
+
+#include
+
+namespace snii::io {
+namespace {
+
+// Inclusive [first, last] block ids touched by a validated [offset, offset+len).
+// Empty len touches no block (callers guard len==0 before calling this).
+void block_range(uint64_t offset, size_t len, size_t block_size, uint64_t* first, uint64_t* last) {
+    // Index of the final byte in the span; len is non-zero by the caller's guard.
+    const uint64_t final_byte = offset + len - 1;
+    *first = offset / block_size;
+    *last = final_byte / block_size;
+}
+
+} // namespace
+
+MeteredFileReader::MeteredFileReader(FileReader* inner, size_t block_size)
+        : inner_(inner), block_size_(block_size) {}
+
+// Forgets which blocks are resident and zeroes every counter.
+void MeteredFileReader::reset_metrics() {
+    metrics_ = IoMetrics {};
+    resident_.clear();
+}
+
+// Checks the reader configuration, then that [offset, offset+len) lies inside
+// the inner file. The bounds test uses a subtraction so offset + len is never
+// formed and cannot wrap.
+Status MeteredFileReader::validate_range(uint64_t offset, size_t len) const {
+    if (inner_ == nullptr) {
+        return Status::InvalidArgument("metered: null inner reader");
+    }
+    if (block_size_ == 0) {
+        return Status::InvalidArgument("metered: zero block size");
+    }
+    const uint64_t file_size = inner_->size();
+    const bool in_bounds = offset <= file_size && len <= file_size - offset;
+    if (!in_bounds) {
+        return Status::Corruption("metered: read range past end");
+    }
+    return Status::OK();
+}
+
+// Accounts the FileCache effect of touching [offset, offset+len): newly missed
+// blocks become coalesced remote GETs and remote bytes. Returns true iff any
+// block missed. (Each maximal run of consecutive missed blocks counts as one GET;
+// a span interrupted by already-resident blocks yields more than one run.)
+bool MeteredFileReader::account_blocks(uint64_t offset, size_t len) {
+    if (len == 0) return false;
+    uint64_t first = 0, last = 0;
+    block_range(offset, len, block_size_, &first, &last);
+
+    bool any_miss = false;
+    bool in_run = false; // currently inside a contiguous run of missing blocks
+    const uint64_t total = inner_->size();
+    for (uint64_t b = first; b <= last; ++b) {
+        if (resident_.count(b)) {
+            in_run = false;
+            continue;
+        }
+        resident_.insert(b);
+        any_miss = true;
+        const uint64_t block_start = b * block_size_;
+        // The last block of the file may be shorter than block_size_.
+        metrics_.remote_bytes += std::min(block_size_, total - block_start);
+        if (!in_run) {
+            ++metrics_.range_gets; // start of a new coalesced GET
+            in_run = true;
+        }
+    }
+    return any_miss;
+}
+
+// Validates and accounts a single read, then forwards it to the inner reader.
+// The blocks are marked resident before the inner read runs.
+Status MeteredFileReader::read_at(uint64_t offset, size_t len, std::vector* out) {
+    if (out == nullptr) return Status::InvalidArgument("metered: null out");
+    SNII_RETURN_IF_ERROR(validate_range(offset, len));
+    ++metrics_.read_at_calls;
+    metrics_.total_request_bytes += len;
+    // A single blocking read: any miss forces one serial round (the next offset is
+    // not known until these bytes return).
+    if (account_blocks(offset, len)) ++metrics_.serial_rounds;
+    return inner_->read_at(offset, len, out);
+}
+
+// Validates every range, accounts the union of touched blocks as at most one
+// serial round, then forwards the batch to the inner reader.
+// Returns InvalidArgument for a null `outs` or a null inner reader.
+Status MeteredFileReader::read_batch(const std::vector& ranges,
+                                     std::vector>* outs) {
+    if (outs == nullptr) return Status::InvalidArgument("metered: null batch out");
+    // Checked up front rather than only inside validate_range(): an empty batch
+    // never reaches validate_range(), yet inner_ is dereferenced below regardless.
+    if (inner_ == nullptr) return Status::InvalidArgument("metered: null inner reader");
+    for (const Range& r : ranges) {
+        SNII_RETURN_IF_ERROR(validate_range(r.offset, r.len));
+    }
+
+    // Gather the union of touched blocks so coalescing spans the whole batch, and
+    // the entire batch counts as at most one serial round.
+    std::vector blocks;
+    for (const Range& r : ranges) {
+        metrics_.total_request_bytes += r.len;
+        if (r.len == 0) continue;
+        uint64_t first = 0, last = 0;
+        block_range(r.offset, r.len, block_size_, &first, &last);
+        for (uint64_t b = first; b <= last; ++b) blocks.push_back(b);
+    }
+    metrics_.read_at_calls += ranges.size();
+
+    std::sort(blocks.begin(), blocks.end());
+    blocks.erase(std::unique(blocks.begin(), blocks.end()), blocks.end());
+
+    bool any_miss = false;
+    const uint64_t total = inner_->size();
+    uint64_t prev_miss = 0;
+    bool have_prev = false;
+    for (uint64_t b : blocks) {
+        if (resident_.count(b)) continue;
+        resident_.insert(b);
+        any_miss = true;
+        metrics_.remote_bytes += std::min(block_size_, total - b * block_size_);
+        if (!have_prev || b != prev_miss + 1) ++metrics_.range_gets; // new run
+        prev_miss = b;
+        have_prev = true;
+    }
+    if (any_miss) ++metrics_.serial_rounds;
+
+    // Delegate the actual byte fetch to the inner reader's batch path, so a backend
+    // that fetches a batch concurrently (e.g. S3FileReader) realizes the planned
+    // round as parallel GETs (matching the single serial round accounted above).
+    return inner_->read_batch(ranges, outs);
+}
+
+} // namespace snii::io
diff --git a/be/src/storage/index/snii/core/src/io/s3_object_store.cpp b/be/src/storage/index/snii/core/src/io/s3_object_store.cpp
new file mode 100644
index 00000000000000..6be72027ebe263
--- /dev/null
+++ b/be/src/storage/index/snii/core/src/io/s3_object_store.cpp
@@ -0,0 +1,217 @@
+#include "snii/io/s3_object_store.h"
+
+// The whole implementation is compiled only when the S3 backend is enabled.
+// Without SNII_WITH_S3 this file is an empty translation unit and pulls in no
+// aws-sdk headers, keeping core aws-free by default.
+#ifdef SNII_WITH_S3
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace snii::io {
+namespace {
+
+// Process-wide state behind AwsApiGuard: a mutex-protected count of live
+// guards plus the options handed to Aws::InitAPI / Aws::ShutdownAPI.
+std::mutex g_api_mu;
+int g_api_refcount = 0;
+Aws::SDKOptions g_api_options;
+
+// Takes one reference; the first reference initializes the AWS SDK.
+void api_acquire() {
+    std::lock_guard lock(g_api_mu);
+    const bool first_user = (g_api_refcount == 0);
+    if (first_user) {
+        Aws::InitAPI(g_api_options);
+    }
+    ++g_api_refcount;
+}
+
+// Drops one reference; the last reference shuts the AWS SDK down. A release
+// with no outstanding reference is ignored.
+void api_release() {
+    std::lock_guard lock(g_api_mu);
+    if (g_api_refcount <= 0) {
+        return;
+    }
+    --g_api_refcount;
+    if (g_api_refcount == 0) {
+        Aws::ShutdownAPI(g_api_options);
+    }
+}
+
+// Creates the S3 client for an OSS-compatible endpoint described by `cfg`.
+// Virtual-hosted addressing is mandatory because OSS rejects path-style
+// requests (SecondLevelDomainForbidden); payload signing is turned off (Never).
+std::shared_ptr make_client(const S3Config& cfg) {
+    Aws::Client::ClientConfigurationInitValues init_values;
+    init_values.shouldDisableIMDS = true;
+    Aws::Client::ClientConfiguration aws_cfg(init_values);
+    aws_cfg.endpointOverride = Aws::String(cfg.endpoint.c_str());
+    aws_cfg.region = Aws::String(cfg.region.c_str());
+    aws_cfg.connectTimeoutMs = cfg.connect_timeout_ms;
+    aws_cfg.requestTimeoutMs = cfg.request_timeout_ms;
+    aws_cfg.httpRequestTimeoutMs = cfg.http_request_timeout_ms;
+    Aws::Auth::AWSCredentials credentials(Aws::String(cfg.ak.c_str()),
+                                          Aws::String(cfg.sk.c_str()));
+    return std::make_shared(
+            credentials, aws_cfg, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
+            /*useVirtualAddressing=*/true);
+}
+
+// Joins prefix and key with a single '/'; an empty prefix yields the bare key.
+std::string join_key(const std::string& prefix, const std::string& key) {
+    return prefix.empty() ? key : prefix + "/" + key;
+}
+
+} // namespace
+
+AwsApiGuard::AwsApiGuard() {
+    api_acquire();
+}
+AwsApiGuard::~AwsApiGuard() {
+    api_release();
+}
+
+// ---------------------------------------------------------------------------
+// S3FileReader +// --------------------------------------------------------------------------- + +S3FileReader::~S3FileReader() = default; + +S3FileReader::S3FileReader(S3FileReader&&) noexcept = default; +S3FileReader& S3FileReader::operator=(S3FileReader&&) noexcept = default; + +Status S3FileReader::open(const S3Config& cfg, const std::string& key, S3FileReader* out) { + if (out == nullptr) return Status::InvalidArgument("S3FileReader::open: null out"); + out->client_ = make_client(cfg); + out->bucket_ = cfg.bucket; + out->object_key_ = join_key(cfg.prefix, key); + + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(Aws::String(out->bucket_.c_str())); + req.SetKey(Aws::String(out->object_key_.c_str())); + auto outcome = out->client_->HeadObject(req); + if (!outcome.IsSuccess()) { + return Status::IoError("HeadObject(" + out->object_key_ + + "): " + outcome.GetError().GetMessage().c_str()); + } + out->size_ = static_cast(outcome.GetResult().GetContentLength()); + return Status::OK(); +} + +Status S3FileReader::read_at(uint64_t offset, size_t len, std::vector* out) { + if (client_ == nullptr) return Status::IoError("read_at on unopened S3 object"); + if (out == nullptr) return Status::InvalidArgument("read_at: null out"); + // Non-wrapping bounds check (offset+len could overflow uint64 on a corrupt arg). 
+ if (offset > size_ || len > size_ - offset) { + return Status::Corruption("read_at past end of object"); + } + out->resize(len); + if (len == 0) return Status::OK(); + + Aws::S3::Model::GetObjectRequest req; + req.SetBucket(Aws::String(bucket_.c_str())); + req.SetKey(Aws::String(object_key_.c_str())); + std::ostringstream range; + range << "bytes=" << offset << "-" << (offset + len - 1); + req.SetRange(Aws::String(range.str().c_str())); + + auto outcome = client_->GetObject(req); + if (!outcome.IsSuccess()) { + return Status::IoError("GetObject(" + object_key_ + + "): " + outcome.GetError().GetMessage().c_str()); + } + auto& body = outcome.GetResult().GetBody(); + body.read(reinterpret_cast(out->data()), static_cast(len)); + const std::streamsize got = body.gcount(); + if (static_cast(got) != len) { + return Status::Corruption("GetObject returned fewer bytes than requested"); + } + return Status::OK(); +} + +Status S3FileReader::read_batch(const std::vector& ranges, + std::vector>* outs) { + if (outs == nullptr) return Status::InvalidArgument("read_batch: null outs"); + outs->resize(ranges.size()); + if (ranges.empty()) return Status::OK(); + // Issue GETs concurrently in bounded waves; aws S3Client is safe for parallel + // requests and each range writes a distinct output buffer. 
+ constexpr size_t kMaxConcurrent = 16; + Status first_err; + for (size_t base = 0; base < ranges.size(); base += kMaxConcurrent) { + const size_t end = std::min(base + kMaxConcurrent, ranges.size()); + std::vector> futs; + for (size_t i = base; i < end; ++i) { + futs.push_back(std::async(std::launch::async, [this, &ranges, outs, i]() { + return read_at(ranges[i].offset, ranges[i].len, &(*outs)[i]); + })); + } + for (auto& f : futs) { + const Status s = f.get(); + if (!s.ok() && first_err.ok()) first_err = s; + } + } + return first_err; +} + +// --------------------------------------------------------------------------- +// S3FileWriter +// --------------------------------------------------------------------------- + +S3FileWriter::~S3FileWriter() = default; + +S3FileWriter::S3FileWriter(S3FileWriter&&) noexcept = default; +S3FileWriter& S3FileWriter::operator=(S3FileWriter&&) noexcept = default; + +Status S3FileWriter::open(const S3Config& cfg, const std::string& key) { + client_ = make_client(cfg); + bucket_ = cfg.bucket; + object_key_ = join_key(cfg.prefix, key); + buffer_.clear(); + bytes_written_ = 0; + finalized_ = false; + return Status::OK(); +} + +Status S3FileWriter::append(Slice data) { + if (client_ == nullptr) return Status::IoError("append on unopened S3 writer"); + if (finalized_) return Status::IoError("append after finalize"); + buffer_.insert(buffer_.end(), data.data(), data.data() + data.size()); + bytes_written_ += data.size(); + return Status::OK(); +} + +Status S3FileWriter::finalize() { + if (client_ == nullptr) return Status::IoError("finalize on unopened S3 writer"); + if (finalized_) return Status::IoError("finalize called twice"); + + Aws::S3::Model::PutObjectRequest req; + req.SetBucket(Aws::String(bucket_.c_str())); + req.SetKey(Aws::String(object_key_.c_str())); + auto stream = Aws::MakeShared("S3FileWriter"); + stream->write(reinterpret_cast(buffer_.data()), + static_cast(buffer_.size())); + req.SetBody(stream); + 
req.SetContentLength(static_cast(buffer_.size())); + + auto outcome = client_->PutObject(req); + if (!outcome.IsSuccess()) { + return Status::IoError("PutObject(" + object_key_ + + "): " + outcome.GetError().GetMessage().c_str()); + } + finalized_ = true; + return Status::OK(); +} + +} // namespace snii::io + +#endif // SNII_WITH_S3 diff --git a/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp b/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp new file mode 100644 index 00000000000000..4987d788e6ed7d --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/bm25_scorer.cpp @@ -0,0 +1,42 @@ +#include "snii/query/bm25_scorer.h" + +#include +#include + +namespace snii::query { + +double decode_norm(uint8_t encoded) { + return encoded == 0 ? 1.0 : static_cast(encoded); +} + +uint8_t encode_norm(uint64_t doc_length) { + const uint64_t clamped = std::clamp(doc_length, 1, 255); + return static_cast(clamped); +} + +ScorerContext ScorerContext::make(uint64_t n, uint64_t df) { + ScorerContext ctx; + ctx.df_ = df; + const double nn = static_cast(n); + const double dff = static_cast(df); + // idf = log(1 + (N - df + 0.5) / (df + 0.5)); always positive for df <= N. + ctx.idf_ = std::log(1.0 + (nn - dff + 0.5) / (dff + 0.5)); + return ctx; +} + +double ScorerContext::score(uint32_t tf, uint8_t encoded_norm, double avgdl, + const Bm25Params& params) const { + const double dl = decode_norm(encoded_norm); + const double tff = static_cast(tf); + const double denom = tff + params.k1 * (1.0 - params.b + params.b * dl / avgdl); + return idf_ * (tff * (params.k1 + 1.0)) / denom; +} + +double ScorerContext::max_score(uint32_t max_freq, uint8_t min_norm, double avgdl, + const Bm25Params& params) const { + // The score grows monotonically with tf and shrinks with dl, so the per-window + // upper bound uses the window's largest tf and smallest dl (min encoded norm). 
+ return score(max_freq, min_norm, avgdl, params); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/boolean_query.cpp b/be/src/storage/index/snii/core/src/query/boolean_query.cpp new file mode 100644 index 00000000000000..e4befe6e316b4a --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/boolean_query.cpp @@ -0,0 +1,99 @@ +#include "snii/query/boolean_query.h" + +#include +#include +#include +#include + +#include "snii/format/dict_entry.h" +#include "snii/query/docid_sink.h" +#include "snii/query/internal/docid_conjunction.h" +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/query/internal/docid_union.h" + +namespace snii::query { + +namespace { + +std::vector unique_terms(const std::vector& terms) { + std::vector out; + out.reserve(terms.size()); + for (const std::string& term : terms) out.emplace_back(term); + std::sort(out.begin(), out.end()); + out.erase(std::unique(out.begin(), out.end()), out.end()); + return out; +} + +Status resolve_or_postings(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, + std::vector* postings) { + postings->clear(); + for (std::string_view term : unique_terms(terms)) { + bool found = false; + snii::format::DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + SNII_RETURN_IF_ERROR(idx.lookup(term, &found, &entry, &frq_base, &prx_base)); + if (!found) continue; + + postings->push_back({std::move(entry), frq_base, prx_base}); + } + return Status::OK(); +} + +} // namespace + +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("boolean_or: null out"); + docids->clear(); + if (terms.empty()) return Status::OK(); + + std::vector postings; + SNII_RETURN_IF_ERROR(resolve_or_postings(idx, terms, &postings)); + return internal::build_docid_union(idx, postings, docids); +} + +Status boolean_or(const 
snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return boolean_or(idx, terms, docids); +} + +Status boolean_or(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("boolean_or: null sink"); + if (terms.empty()) return Status::OK(); + + std::vector postings; + SNII_RETURN_IF_ERROR(resolve_or_postings(idx, terms, &postings)); + return internal::emit_docid_union(idx, postings, sink); +} + +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("boolean_and: null out"); + docids->clear(); + if (terms.empty()) return Status::OK(); + + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + bool all_present = false; + SNII_RETURN_IF_ERROR(internal::plan_terms(idx, terms, &round1, &plans, &all_present, + /*need_positions=*/false)); + if (!all_present) return Status::OK(); + if (round1.pending() > 0) SNII_RETURN_IF_ERROR(round1.fetch()); + SNII_RETURN_IF_ERROR(internal::open_preludes(round1, &plans, + /*need_positions=*/false)); + return internal::build_docid_only_conjunction(idx, round1, plans, docids); +} + +Status boolean_and(const snii::reader::LogicalIndexReader& idx, + const std::vector& terms, std::vector* docids, + QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return boolean_and(idx, terms, docids); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp new file mode 100644 index 00000000000000..a2477eaf576682 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -0,0 +1,518 @@ +#include "snii/query/internal/docid_conjunction.h" + 
+#include +#include +#include + +#include "snii/format/frq_pod.h" +#include "snii/query/internal/docid_set_ops.h" +#include "snii/reader/windowed_posting.h" + +namespace snii::query::internal { + +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeReader; +using snii::format::WindowMeta; +using snii::reader::LogicalIndexReader; + +namespace { + +Status slim_frq_docs_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) { + if (entry.frq_docs_len > win_len) { + return Status::Corruption("docid_conjunction: slim frq_docs_len exceeds frq window"); + } + *out = entry.frq_docs_len > 0 ? entry.frq_docs_len : win_len; + return Status::OK(); +} + +Status add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) { + if (rhs > std::numeric_limits::max() - lhs) { + return Status::Corruption(message); + } + *out = lhs + rhs; + return Status::OK(); +} + +Status posting_abs_offset(const LogicalIndexReader& idx, uint64_t base, uint64_t delta, + const char* message, uint64_t* out) { + uint64_t with_base = 0; + SNII_RETURN_IF_ERROR( + add_u64(idx.section_refs().posting_region.offset, base, message, &with_base)); + return add_u64(with_base, delta, message, out); +} + +Status configure_term_plan(const LogicalIndexReader& idx, bool need_positions, + snii::io::BatchRangeFetcher* fetcher, TermPlan* p) { + p->df = p->entry.df; + p->pod_ref = (p->entry.kind == DictEntryKind::kPodRef); + p->windowed = p->pod_ref && p->entry.enc == DictEntryEnc::kWindowed; + if (p->windowed) { + uint64_t prelude_abs = 0; + SNII_RETURN_IF_ERROR(posting_abs_offset(idx, p->frq_base, p->entry.frq_off_delta, + "docid_conjunction: prelude offset overflow", + &prelude_abs)); + p->prelude_handle = fetcher->add(prelude_abs, p->entry.prelude_len); + } else if (p->pod_ref) { + uint64_t foff = 0; + uint64_t flen = 0; + uint64_t poff = 0; + uint64_t plen = 0; + SNII_RETURN_IF_ERROR(idx.resolve_frq_window(p->entry, 
p->frq_base, &foff, &flen)); + uint64_t frq_fetch = flen; + SNII_RETURN_IF_ERROR(slim_frq_docs_len(p->entry, flen, &frq_fetch)); + p->frq_handle = fetcher->add(foff, frq_fetch); + if (need_positions) { + SNII_RETURN_IF_ERROR(idx.resolve_prx_window(p->entry, p->prx_base, &poff, &plen)); + p->prx_handle = fetcher->add(poff, plen); + } + } + return Status::OK(); +} + +std::vector all_windows(const FrqPreludeReader& prelude) { + std::vector ws(prelude.n_windows()); + for (uint32_t i = 0; i < prelude.n_windows(); ++i) ws[i] = i; + return ws; +} + +std::vector ascending_df_order(const std::vector& plans) { + std::vector order(plans.size()); + for (size_t i = 0; i < plans.size(); ++i) order[i] = i; + std::sort(order.begin(), order.end(), + [&](size_t a, size_t b) { return plans[a].df < plans[b].df; }); + return order; +} + +Status first_docid_in_window(const WindowMeta& meta, uint32_t window_ordinal, uint32_t* first) { + if (window_ordinal == 0) { + *first = 0; + return Status::OK(); + } + if (meta.win_base >= std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: window base exceeds docid range"); + } + *first = static_cast(meta.win_base + 1); + if (*first > meta.last_docid) { + return Status::Corruption("docid_conjunction: invalid window docid range"); + } + return Status::OK(); +} + +Status is_dense_full_window(const WindowMeta& meta, uint32_t window_ordinal, bool* full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, window_ordinal, &first)); + const uint64_t width = static_cast(meta.last_docid) - first + 1; + *full = meta.doc_count == width; + return Status::OK(); +} + +Status append_docid_range(uint32_t first, uint32_t last, std::vector* out) { + if (last < first) { + return Status::Corruption("docid_conjunction: invalid dense docid range"); + } + const uint64_t count64 = static_cast(last) - first + 1; + if (count64 > static_cast(std::numeric_limits::max() - out->size())) { + return 
Status::Corruption("docid_conjunction: dense docid range too large"); + } + out->reserve(out->size() + static_cast(count64)); + uint32_t docid = first; + while (true) { + out->push_back(docid); + if (docid == last) break; + ++docid; + } + return Status::OK(); +} + +Status append_docid_ordinal(size_t ordinal, std::vector* out) { + if (ordinal > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: doc ordinal exceeds u32"); + } + out->push_back(static_cast(ordinal)); + return Status::OK(); +} + +void append_candidate_range(const std::vector& candidates, uint32_t first, uint32_t last, + std::vector* out) { + const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); + const auto end = std::upper_bound(begin, candidates.end(), last); + out->insert(out->end(), begin, end); +} + +Status append_candidate_range_with_ordinals(const std::vector& candidates, uint32_t first, + uint32_t last, std::vector* out, + DocidChunk* chunk) { + const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); + const auto end = std::upper_bound(begin, candidates.end(), last); + chunk->docids.reserve(static_cast(end - begin)); + chunk->prx_doc_ordinals.reserve(static_cast(end - begin)); + for (auto it = begin; it != end; ++it) { + out->push_back(*it); + chunk->docids.push_back(*it); + SNII_RETURN_IF_ERROR(append_docid_ordinal( + static_cast(*it) - static_cast(first), &chunk->prx_doc_ordinals)); + } + return Status::OK(); +} + +size_t log2_ceil(size_t n) { + if (n <= 1) return 1; + --n; + size_t bits = 0; + while (n != 0) { + ++bits; + n >>= 1; + } + return bits; +} + +void intersect_window_candidates(const std::vector& candidates, + const std::vector& term_docids, uint32_t first, + uint32_t last, std::vector* out) { + const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); + const auto end = std::upper_bound(begin, candidates.end(), last); + const size_t candidate_count = static_cast(end - begin); + 
if (candidate_count == 0 || term_docids.empty()) return; + + const uint64_t width = static_cast(last) - first + 1; + const uint64_t missing_count = term_docids.size() <= width ? width - term_docids.size() : width; + if (term_docids.size() <= width && missing_count != 0 && missing_count * 8 <= width && + missing_count < candidate_count) { + std::vector missing; + missing.reserve(static_cast(missing_count)); + uint32_t expect = first; + for (uint32_t docid : term_docids) { + while (expect < docid) { + missing.push_back(expect); + ++expect; + } + if (docid < std::numeric_limits::max()) expect = docid + 1; + } + while (expect <= last) { + missing.push_back(expect); + if (expect == std::numeric_limits::max()) break; + ++expect; + } + size_t miss = 0; + for (auto it = begin; it != end; ++it) { + while (miss < missing.size() && missing[miss] < *it) ++miss; + if (miss == missing.size() || missing[miss] != *it) out->push_back(*it); + } + return; + } + + const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1; + if (candidate_count < term_docids.size() / probes_per_candidate) { + for (auto it = begin; it != end; ++it) { + if (std::binary_search(term_docids.begin(), term_docids.end(), *it)) { + out->push_back(*it); + } + } + return; + } + std::set_intersection(begin, end, term_docids.begin(), term_docids.end(), + std::back_inserter(*out)); +} + +Status intersect_window_candidates_with_ordinals(const std::vector& candidates, + const std::vector& term_docids, + uint32_t first, uint32_t last, + std::vector* out, DocidChunk* chunk) { + const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); + const auto end = std::upper_bound(begin, candidates.end(), last); + if (begin == end || term_docids.empty()) return Status::OK(); + + chunk->docids.reserve(static_cast(end - begin)); + chunk->prx_doc_ordinals.reserve(static_cast(end - begin)); + size_t doc_index = 0; + for (auto it = begin; it != end; ++it) { + while (doc_index < term_docids.size() && 
term_docids[doc_index] < *it) { + ++doc_index; + } + if (doc_index == term_docids.size()) break; + if (term_docids[doc_index] != *it) continue; + out->push_back(*it); + chunk->docids.push_back(*it); + SNII_RETURN_IF_ERROR(append_docid_ordinal(doc_index, &chunk->prx_doc_ordinals)); + ++doc_index; + } + return Status::OK(); +} + +Status select_covering_windows(const FrqPreludeReader& prelude, + const std::vector& candidates, + std::vector* windows) { + std::vector sel; + uint32_t last = UINT32_MAX; + for (uint32_t d : candidates) { + bool found = false; + uint32_t w = 0; + SNII_RETURN_IF_ERROR(prelude.locate_window(d, &found, &w)); + if (!found) continue; + if (w != last) { + sel.push_back(w); + last = w; + } + } + *windows = std::move(sel); + return Status::OK(); +} + +bool should_scan_all_windows(const LogicalIndexReader& idx, const TermPlan& p, + size_t candidate_count) { + const size_t window_count = p.prelude.n_windows(); + if (candidate_count > window_count * 64) return true; + + const uint64_t doc_count = idx.stats().doc_count; + const bool near_full = doc_count != 0 && static_cast(p.df) * 10 >= doc_count * 9; + return near_full && candidate_count > window_count * 4; +} + +Status decode_flat_docids_only(const snii::io::BatchRangeFetcher& round1, const TermPlan& p, + std::vector* docids) { + Slice dd; + if (p.pod_ref) { + dd = round1.get(p.frq_handle); + } else { + SNII_RETURN_IF_ERROR(inline_dd_region(p.entry, &dd)); + } + return snii::format::decode_dd_region(dd, p.entry.dd_meta, /*win_base=*/0, docids); +} + +Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPlan& p, + const std::vector& windows, + const std::vector* candidates, + std::vector* out, DocidSource* source) { + struct FetchedWindow { + uint32_t ordinal = 0; + WindowMeta meta; + size_t handle = 0; + }; + + snii::io::BatchRangeFetcher fetcher(idx.reader(), snii::reader::kSameTermCoalesceGap); + std::vector fetched; + fetched.reserve(windows.size()); + 
out->reserve(candidates == nullptr ? p.entry.df : candidates->size()); + for (uint32_t w : windows) { + WindowMeta meta; + SNII_RETURN_IF_ERROR(p.prelude.window(w, &meta)); + bool dense_full = false; + SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full)); + if (dense_full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first)); + if (source != nullptr) { + DocidChunk chunk; + chunk.windowed = true; + chunk.window = w; + if (candidates == nullptr) { + SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, &chunk.docids)); + } else { + SNII_RETURN_IF_ERROR(append_candidate_range_with_ordinals( + *candidates, first, meta.last_docid, out, &chunk)); + } + source->chunks.push_back(std::move(chunk)); + } + if (candidates == nullptr) { + SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, out)); + } else if (source == nullptr) { + append_candidate_range(*candidates, first, meta.last_docid, out); + } + continue; + } + + snii::reader::WindowAbsRange range; + SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( + idx, p.entry, p.frq_base, p.prx_base, p.prelude, w, + /*want_positions=*/false, /*want_freq=*/false, &range)); + FetchedWindow f; + f.ordinal = w; + f.meta = meta; + f.handle = fetcher.add(range.dd_off, range.dd_len); + fetched.push_back(f); + } + if (fetcher.pending() > 0) SNII_RETURN_IF_ERROR(fetcher.fetch()); + + std::vector docs; + std::vector freqs; + std::vector> positions; + for (const FetchedWindow& f : fetched) { + docs.clear(); + freqs.clear(); + positions.clear(); + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + f.meta, fetcher.get(f.handle), Slice(), Slice(), + /*want_positions=*/false, /*want_freq=*/false, &docs, &freqs, &positions)); + if (source != nullptr) { + DocidChunk chunk; + chunk.windowed = true; + chunk.window = f.ordinal; + if (candidates == nullptr) { + chunk.docids = docs; + source->chunks.push_back(std::move(chunk)); + } else { + uint32_t first = 0; + 
SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); + SNII_RETURN_IF_ERROR(intersect_window_candidates_with_ordinals( + *candidates, docs, first, f.meta.last_docid, out, &chunk)); + if (!chunk.docids.empty()) source->chunks.push_back(std::move(chunk)); + } + } + if (candidates == nullptr) { + out->insert(out->end(), docs.begin(), docs.end()); + continue; + } + if (source != nullptr) continue; + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); + intersect_window_candidates(*candidates, docs, first, f.meta.last_docid, out); + } + return Status::OK(); +} + +Status collect_docids_only(const LogicalIndexReader& idx, const snii::io::BatchRangeFetcher& round1, + const TermPlan& p, const std::vector* candidates, + std::vector* out, DocidSource* source) { + if (p.windowed) { + std::vector windows; + if (candidates == nullptr) { + windows = all_windows(p.prelude); + } else if (should_scan_all_windows(idx, p, candidates->size())) { + // Dense candidate sets cover most windows; for near-full terms this also + // avoids thousands-to-millions of locate_window probes with no byte win. 
+ windows = all_windows(p.prelude); + } else { + SNII_RETURN_IF_ERROR(select_covering_windows(p.prelude, *candidates, &windows)); + } + return collect_windowed_docids_only(idx, p, windows, candidates, out, source); + } + + std::vector term_docids; + SNII_RETURN_IF_ERROR(decode_flat_docids_only(round1, p, &term_docids)); + if (source != nullptr) { + DocidChunk chunk; + if (candidates == nullptr) { + chunk.docids = term_docids; + } else if (!term_docids.empty()) { + SNII_RETURN_IF_ERROR(intersect_window_candidates_with_ordinals( + *candidates, term_docids, term_docids.front(), term_docids.back(), out, + &chunk)); + } + if (candidates == nullptr || !chunk.docids.empty()) + source->chunks.push_back(std::move(chunk)); + } + if (candidates == nullptr) { + *out = std::move(term_docids); + return Status::OK(); + } + if (source != nullptr) return Status::OK(); + *out = intersect_sorted(*candidates, term_docids); + return Status::OK(); +} + +Status build_docid_only_conjunction_impl(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates, + std::vector* sources) { + if (sources != nullptr) sources->assign(plans.size(), DocidSource {}); + const std::vector order = ascending_df_order(plans); + for (size_t k = 0; k < order.size(); ++k) { + const size_t ti = order[k]; + std::vector next; + DocidSource* source = sources == nullptr ? nullptr : &(*sources)[ti]; + SNII_RETURN_IF_ERROR(collect_docids_only(idx, round1, plans[ti], + k == 0 ? 
nullptr : candidates, &next, source)); + *candidates = std::move(next); + if (candidates->empty()) return Status::OK(); + } + return Status::OK(); +} + +} // namespace + +Status resolve_query_term(const LogicalIndexReader& idx, const std::string& term, + ResolvedQueryTerm* resolved, bool* found) { + *found = false; + SNII_RETURN_IF_ERROR( + idx.lookup(term, found, &resolved->entry, &resolved->frq_base, &resolved->prx_base)); + return Status::OK(); +} + +Status plan_terms(const LogicalIndexReader& idx, const std::vector& terms, + snii::io::BatchRangeFetcher* fetcher, std::vector* plans, + bool* all_present, bool need_positions) { + *all_present = true; + plans->resize(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + ResolvedQueryTerm resolved; + bool found = false; + SNII_RETURN_IF_ERROR(resolve_query_term(idx, terms[i], &resolved, &found)); + if (!found) { + *all_present = false; + return Status::OK(); + } + TermPlan& p = (*plans)[i]; + p.order = i; + p.entry = std::move(resolved.entry); + p.frq_base = resolved.frq_base; + p.prx_base = resolved.prx_base; + SNII_RETURN_IF_ERROR(configure_term_plan(idx, need_positions, fetcher, &p)); + } + return Status::OK(); +} + +Status plan_resolved_terms(const LogicalIndexReader& idx, + const std::vector& terms, + snii::io::BatchRangeFetcher* fetcher, std::vector* plans, + bool need_positions) { + plans->resize(terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + TermPlan& p = (*plans)[i]; + p.order = i; + p.entry = terms[i].entry; + p.frq_base = terms[i].frq_base; + p.prx_base = terms[i].prx_base; + SNII_RETURN_IF_ERROR(configure_term_plan(idx, need_positions, fetcher, &p)); + } + return Status::OK(); +} + +Status open_preludes(const snii::io::BatchRangeFetcher& fetcher, std::vector* plans, + bool need_positions) { + for (TermPlan& p : *plans) { + if (!p.windowed) continue; + SNII_RETURN_IF_ERROR(FrqPreludeReader::open(fetcher.get(p.prelude_handle), &p.prelude)); + if (need_positions && 
!p.prelude.has_prx()) { + return Status::Corruption("docid_conjunction: windowed prelude has no positions"); + } + } + return Status::OK(); +} + +Status inline_dd_region(const DictEntry& entry, Slice* out) { + if (entry.dd_meta.disk_len > entry.frq_bytes.size()) { + return Status::Corruption("docid_conjunction: inline dd region exceeds frq bytes"); + } + *out = Slice(entry.frq_bytes.data(), static_cast(entry.dd_meta.disk_len)); + return Status::OK(); +} + +Status build_docid_only_conjunction(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates) { + return build_docid_only_conjunction_impl(idx, round1, plans, candidates, nullptr); +} + +Status build_docid_only_conjunction(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, + std::vector* candidates, + std::vector* sources) { + return build_docid_only_conjunction_impl(idx, round1, plans, candidates, sources); +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp new file mode 100644 index 00000000000000..18a487b31bac01 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp @@ -0,0 +1,222 @@ +#include "snii/query/internal/docid_posting_reader.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/format/dict_entry.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/reader/windowed_posting.h" + +namespace snii::query::internal { + +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeReader; +using snii::format::WindowMeta; +using snii::reader::LogicalIndexReader; + +namespace { + +Status decode_flat_docs(const DictEntry& entry, Slice dd_region, 
std::vector* docids) { + return snii::format::decode_dd_region(dd_region, entry.dd_meta, + /*win_base=*/0, docids); +} + +Status decode_inline_docs(const DictEntry& entry, std::vector* docids) { + if (entry.dd_meta.disk_len > entry.frq_bytes.size()) { + return Status::Corruption("docid_posting_reader: inline dd region exceeds frq bytes"); + } + return decode_flat_docs( + entry, Slice(entry.frq_bytes.data(), static_cast(entry.dd_meta.disk_len)), + docids); +} + +Status slim_docs_fetch_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) { + if (entry.frq_docs_len > win_len) { + return Status::Corruption("docid_posting_reader: slim frq_docs_len exceeds frq window"); + } + *out = entry.frq_docs_len > 0 ? entry.frq_docs_len : win_len; + return Status::OK(); +} + +Status add_u64(uint64_t lhs, uint64_t rhs, const char* message, uint64_t* out) { + if (rhs > std::numeric_limits::max() - lhs) { + return Status::Corruption(message); + } + *out = lhs + rhs; + return Status::OK(); +} + +Status prelude_abs(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + uint64_t* out) { + uint64_t with_base = 0; + SNII_RETURN_IF_ERROR(add_u64(idx.section_refs().posting_region.offset, frq_base, + "docid_posting_reader: prelude offset overflow", &with_base)); + return add_u64(with_base, entry.frq_off_delta, "docid_posting_reader: prelude offset overflow", + out); +} + +Status validate_windowed_docs_prefix(const DictEntry& entry) { + if (entry.prelude_len == 0) { + return Status::Corruption("docid_posting_reader: windowed entry has no prelude"); + } + if (entry.prelude_len > entry.frq_docs_len) { + return Status::Corruption("docid_posting_reader: prelude_len exceeds docs prefix"); + } + if (entry.frq_docs_len > entry.frq_len) { + return Status::Corruption("docid_posting_reader: docs prefix exceeds frq_len"); + } + return Status::OK(); +} + +struct FlatPlan { + size_t out_index = 0; + const DictEntry* entry = nullptr; + size_t handle = 0; +}; + +struct WindowPlan 
{ + size_t out_index = 0; + const ResolvedDocidPosting* posting = nullptr; + size_t prefix_handle = 0; +}; + +Status plan_flat_docs(const LogicalIndexReader& idx, const ResolvedDocidPosting& posting, + snii::io::BatchRangeFetcher* fetcher, FlatPlan* plan) { + uint64_t win_abs = 0; + uint64_t win_len = 0; + SNII_RETURN_IF_ERROR( + idx.resolve_frq_window(posting.entry, posting.frq_base, &win_abs, &win_len)); + uint64_t docs_len = 0; + SNII_RETURN_IF_ERROR(slim_docs_fetch_len(posting.entry, win_len, &docs_len)); + plan->handle = fetcher->add(win_abs, docs_len); + return Status::OK(); +} + +Status plan_window_prefix(const LogicalIndexReader& idx, WindowPlan* plan, + snii::io::BatchRangeFetcher* fetcher) { + const ResolvedDocidPosting& posting = *plan->posting; + SNII_RETURN_IF_ERROR(validate_windowed_docs_prefix(posting.entry)); + uint64_t abs = 0; + SNII_RETURN_IF_ERROR(prelude_abs(idx, posting.entry, posting.frq_base, &abs)); + plan->prefix_handle = fetcher->add(abs, posting.entry.frq_docs_len); + return Status::OK(); +} + +Status window_dd_slice(Slice dd_block, const WindowMeta& meta, Slice* out) { + if (meta.dd_off > dd_block.size() || meta.dd_disk_len > dd_block.size() - meta.dd_off) { + return Status::Corruption("docid_posting_reader: window dd range out of prefix"); + } + *out = dd_block.subslice(static_cast(meta.dd_off), + static_cast(meta.dd_disk_len)); + return Status::OK(); +} + +Status decode_flat_plan(const snii::io::BatchRangeFetcher& fetcher, const FlatPlan& plan, + std::vector* out) { + return decode_flat_docs(*plan.entry, fetcher.get(plan.handle), out); +} + +Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, + std::vector* out) { + const DictEntry& entry = plan.posting->entry; + const Slice prefix = fetcher.get(plan.prefix_handle); + if (entry.prelude_len > prefix.size()) { + return Status::Corruption("docid_posting_reader: short docs prefix"); + } + const size_t prelude_len = 
static_cast(entry.prelude_len); + FrqPreludeReader prelude; + SNII_RETURN_IF_ERROR(FrqPreludeReader::open(prefix.subslice(0, prelude_len), &prelude)); + const uint64_t dd_block_len = prelude.dd_block_len(); + if (dd_block_len > static_cast(std::numeric_limits::max()) - prelude_len) { + return Status::Corruption("docid_posting_reader: docs prefix length overflow"); + } + const size_t expected_prefix_len = prelude_len + static_cast(dd_block_len); + if (prefix.size() != expected_prefix_len) { + return Status::Corruption("docid_posting_reader: docs prefix length mismatch"); + } + const Slice dd_block = prefix.subslice(prelude_len, prefix.size() - prelude_len); + for (uint32_t w = 0; w < prelude.n_windows(); ++w) { + WindowMeta meta; + Slice dd_region; + SNII_RETURN_IF_ERROR(prelude.window(w, &meta)); + SNII_RETURN_IF_ERROR(window_dd_slice(dd_block, meta, &dd_region)); + std::vector docs; + std::vector freqs; + std::vector> positions; + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + meta, dd_region, Slice(), Slice(), /*want_positions=*/false, + /*want_freq=*/false, &docs, &freqs, &positions)); + out->insert(out->end(), docs.begin(), docs.end()); + } + return Status::OK(); +} + +} // namespace + +Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, std::vector* docids) { + if (docids == nullptr) { + return Status::InvalidArgument("docid_posting_reader: null out"); + } + std::vector> batched; + SNII_RETURN_IF_ERROR(read_docid_postings_batched( + idx, {ResolvedDocidPosting {entry, frq_base, prx_base}}, &batched)); + *docids = std::move(batched.front()); + return Status::OK(); +} + +Status read_docid_postings_batched(const LogicalIndexReader& idx, + const std::vector& postings, + std::vector>* docids) { + if (docids == nullptr) { + return Status::InvalidArgument("docid_posting_reader: null batched out"); + } + docids->clear(); + docids->resize(postings.size()); + + std::vector flat_plans; + 
std::vector window_plans; + snii::io::BatchRangeFetcher docs_fetcher(idx.reader()); + + for (size_t i = 0; i < postings.size(); ++i) { + const ResolvedDocidPosting& posting = postings[i]; + if (posting.entry.kind == DictEntryKind::kInline) { + SNII_RETURN_IF_ERROR(decode_inline_docs(posting.entry, &(*docids)[i])); + continue; + } + if (posting.entry.enc == DictEntryEnc::kWindowed) { + WindowPlan plan; + plan.out_index = i; + plan.posting = &posting; + SNII_RETURN_IF_ERROR(plan_window_prefix(idx, &plan, &docs_fetcher)); + window_plans.push_back(std::move(plan)); + continue; + } + FlatPlan plan; + plan.out_index = i; + plan.entry = &posting.entry; + flat_plans.push_back(plan); + } + + for (FlatPlan& plan : flat_plans) { + const ResolvedDocidPosting& posting = postings[plan.out_index]; + SNII_RETURN_IF_ERROR(plan_flat_docs(idx, posting, &docs_fetcher, &plan)); + } + if (docs_fetcher.pending() > 0) SNII_RETURN_IF_ERROR(docs_fetcher.fetch()); + + for (const FlatPlan& plan : flat_plans) { + SNII_RETURN_IF_ERROR(decode_flat_plan(docs_fetcher, plan, &(*docids)[plan.out_index])); + } + for (const WindowPlan& plan : window_plans) { + SNII_RETURN_IF_ERROR( + decode_window_prefix_plan(docs_fetcher, plan, &(*docids)[plan.out_index])); + } + return Status::OK(); +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp b/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp new file mode 100644 index 00000000000000..88b748e49e80b1 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_set_ops.cpp @@ -0,0 +1,105 @@ +#include "snii/query/internal/docid_set_ops.h" + +#include +#include +#include +#include + +namespace snii::query::internal { + +std::vector intersect_sorted(const std::vector& a, + const std::vector& b) { + std::vector out; + out.reserve(std::min(a.size(), b.size())); + std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(out)); + return out; +} + +void 
union_sorted_into(std::vector* acc, const std::vector& next) { + std::vector merged; + merged.reserve(acc->size() + next.size()); + std::set_union(acc->begin(), acc->end(), next.begin(), next.end(), std::back_inserter(merged)); + *acc = std::move(merged); +} + +std::vector union_sorted_many(const std::vector>& lists) { + constexpr size_t kLinearFanInMax = 8; + struct Cursor { + uint32_t docid = 0; + size_t list = 0; + size_t offset = 0; + }; + struct GreaterDocId { + bool operator()(const Cursor& a, const Cursor& b) const { return a.docid > b.docid; } + }; + + size_t non_empty = 0; + size_t largest = 0; + std::priority_queue, GreaterDocId> heap; + for (size_t i = 0; i < lists.size(); ++i) { + if (lists[i].empty()) continue; + ++non_empty; + largest = std::max(largest, lists[i].size()); + heap.push(Cursor {lists[i][0], i, 0}); + } + if (non_empty == 0) return {}; + if (non_empty == 1) { + for (const std::vector& docs : lists) { + if (!docs.empty()) return docs; + } + } + + if (non_empty <= kLinearFanInMax) { + std::vector offsets(lists.size(), 0); + std::vector out; + out.reserve(largest); + bool has_last = false; + uint32_t last = 0; + for (;;) { + bool found = false; + uint32_t next = 0; + for (size_t i = 0; i < lists.size(); ++i) { + if (offsets[i] >= lists[i].size()) continue; + const uint32_t docid = lists[i][offsets[i]]; + if (!found || docid < next) { + found = true; + next = docid; + } + } + if (!found) break; + if (!has_last || next != last) { + out.push_back(next); + last = next; + has_last = true; + } + for (size_t i = 0; i < lists.size(); ++i) { + while (offsets[i] < lists[i].size() && lists[i][offsets[i]] == next) { + ++offsets[i]; + } + } + } + return out; + } + + std::vector out; + out.reserve(largest); + bool has_last = false; + uint32_t last = 0; + while (!heap.empty()) { + const Cursor cur = heap.top(); + heap.pop(); + if (!has_last || cur.docid != last) { + out.push_back(cur.docid); + last = cur.docid; + has_last = true; + } + const size_t 
next_offset = cur.offset + 1; + const std::vector& docs = lists[cur.list]; + if (next_offset < docs.size()) { + heap.push(Cursor {docs[next_offset], cur.list, next_offset}); + } + } + return out; +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/docid_union.cpp b/be/src/storage/index/snii/core/src/query/docid_union.cpp new file mode 100644 index 00000000000000..da4665a63d1280 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/docid_union.cpp @@ -0,0 +1,31 @@ +#include "snii/query/internal/docid_union.h" + +#include + +#include "snii/query/internal/docid_set_ops.h" + +namespace snii::query::internal { + +Status build_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, + std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("docid_union: null out"); + out->clear(); + if (postings.empty()) return Status::OK(); + + std::vector> docs_by_posting; + SNII_RETURN_IF_ERROR(read_docid_postings_batched(idx, postings, &docs_by_posting)); + *out = union_sorted_many(docs_by_posting); + return Status::OK(); +} + +Status emit_docid_union(const snii::reader::LogicalIndexReader& idx, + const std::vector& postings, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("docid_union: null sink"); + std::vector acc; + SNII_RETURN_IF_ERROR(build_docid_union(idx, postings, &acc)); + if (acc.empty()) return Status::OK(); + return sink->append_sorted(acc); +} + +} // namespace snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp new file mode 100644 index 00000000000000..a86a620a014992 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -0,0 +1,644 @@ +#include "snii/query/phrase_query.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" 
+#include "snii/format/dict_entry.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/format/prx_pod.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/query/internal/docid_conjunction.h" +#include "snii/query/internal/docid_set_ops.h" +#include "snii/query/internal/position_math.h" +#include "snii/query/prefix_query.h" +#include "snii/query/term_query.h" +#include "snii/reader/windowed_posting.h" + +// phrase_query implements MATCH_PHRASE with WINDOW (sub-block) SKIPPING for +// high-df windowed terms (design spec section 6.2): +// 1. Resolve every term; reject if any is absent. +// 2. Batch-read each windowed term's prelude + each slim/inline term's full +// docid posting in one round; open the two-level prelude readers. +// 3. Pick the DRIVER = smallest-df term; materialize it fully -> the initial +// candidate docid set. +// 4. For every other term in ascending-df order, narrow the candidate set: +// - slim/inline: intersect with its (already decoded) full posting. +// - windowed: locate_window() the CURRENT candidates -> the SET of +// windows covering them; batch-fetch ONLY those windows' +// .frq docid regions; keep candidates present in some +// covering window. A high-df term thus reads +// O(candidates) windows instead of its whole O(df) +// posting. +// 5. Fetch PRX only for retained chunks and run the positional phrase check +// (term[0]@p, term[1]@p+1, ...) on the survivors. +// The result is identical to a full-read intersection; only the bytes read for +// high-df windowed terms shrink. 
+namespace snii::query { + +using snii::query::internal::DocidChunk; +using snii::query::internal::DocidSource; +using snii::query::internal::ResolvedQueryTerm; +using snii::query::internal::TermPlan; +using snii::reader::LogicalIndexReader; + +namespace { + +struct ExpectedTailPositions { + uint32_t docid = 0; + std::vector positions; +}; + +// One decoded chunk of a term's posting: a windowed term's covering window, or +// a slim/inline term's single posting. `docids` is decoded in the conjunction +// phase (and reused by the streaming cursor -- the dd region is decoded exactly +// once); `prx` is the on-disk positions bytes, decoded lazily by the cursor +// (once per chunk) during phrase verification. +struct PosChunk { + std::vector docids; // ascending, absolute + // Empty means the chunk keeps every PRX doc in on-disk order. Non-empty means + // `docids[i]` corresponds to on-disk local document ordinal + // `prx_doc_ordinals[i]`, allowing PRX decode to skip positions for docs that + // were removed by the docid-only conjunction. + std::vector prx_doc_ordinals; + Slice prx; // .prx window bytes (reference fetcher/round1/entry) + bool windowed = false; + uint32_t window = 0; +}; + +// A term's retained posting as an ordered list of chunks (windowed: covering +// windows in docid order; slim/inline: one). The referenced prx bytes live in +// `round1` / the per-term fetchers kept alive in PhraseExecutionState::owners +// for the whole query, so the cursor can decode positions during verification. 
+struct PosSource { + std::vector chunks; +}; + +struct PhraseExecutionState { + std::vector srcs; + std::vector> owners; + std::vector candidates; +}; + +struct PhraseTermMapping { + std::vector unique_terms; + std::vector phrase_plan_index; +}; + +PhraseTermMapping BuildPhraseTermMapping(const std::vector& terms) { + PhraseTermMapping mapping; + mapping.phrase_plan_index.reserve(terms.size()); + for (const std::string& term : terms) { + auto it = std::find(mapping.unique_terms.begin(), mapping.unique_terms.end(), term); + if (it == mapping.unique_terms.end()) { + mapping.phrase_plan_index.push_back(mapping.unique_terms.size()); + mapping.unique_terms.push_back(term); + continue; + } + mapping.phrase_plan_index.push_back(static_cast(it - mapping.unique_terms.begin())); + } + return mapping; +} + +Status append_prx_doc_ordinal(size_t ordinal, std::vector* out) { + if (ordinal > std::numeric_limits::max()) { + return Status::Corruption("phrase_query: prx doc ordinal exceeds u32"); + } + out->push_back(static_cast(ordinal)); + return Status::OK(); +} + +Status SelectCandidateDocsForPrx(std::vector* docids, + std::vector* prx_doc_ordinals, + const std::vector& candidates, PosChunk* chunk) { + chunk->docids.clear(); + chunk->prx_doc_ordinals.clear(); + if (docids->empty() || candidates.empty()) return Status::OK(); + if (!prx_doc_ordinals->empty() && prx_doc_ordinals->size() != docids->size()) { + return Status::Corruption("phrase_query: prx ordinal/docid count mismatch"); + } + + std::vector selected_docids; + std::vector selected_ordinals; + selected_docids.reserve(std::min(docids->size(), candidates.size())); + selected_ordinals.reserve(selected_docids.capacity()); + + size_t candidate_index = 0; + for (size_t doc_index = 0; doc_index < docids->size() && candidate_index < candidates.size(); + ++doc_index) { + const uint32_t docid = (*docids)[doc_index]; + while (candidate_index < candidates.size() && candidates[candidate_index] < docid) { + ++candidate_index; + } + 
if (candidate_index == candidates.size()) break; + if (candidates[candidate_index] != docid) continue; + + selected_docids.push_back(docid); + if (prx_doc_ordinals->empty()) { + SNII_RETURN_IF_ERROR(append_prx_doc_ordinal(doc_index, &selected_ordinals)); + } else { + selected_ordinals.push_back((*prx_doc_ordinals)[doc_index]); + } + ++candidate_index; + } + + if (selected_docids.empty()) return Status::OK(); + if (selected_docids.size() == docids->size()) { + chunk->docids = std::move(*docids); + chunk->prx_doc_ordinals = std::move(*prx_doc_ordinals); + return Status::OK(); + } + chunk->docids = std::move(selected_docids); + chunk->prx_doc_ordinals = std::move(selected_ordinals); + return Status::OK(); +} + +Status BuildFlatPositionSource(const LogicalIndexReader& idx, + const snii::io::BatchRangeFetcher& round1, DocidSource* doc_source, + const TermPlan& p, const std::vector& candidates, + std::vector>* owners, + PosSource* src) { + PosChunk chunk; + std::vector docids; + std::vector prx_doc_ordinals; + if (!doc_source->chunks.empty()) { + docids = std::move(doc_source->chunks.front().docids); + prx_doc_ordinals = std::move(doc_source->chunks.front().prx_doc_ordinals); + } + if (p.pod_ref) { + uint64_t poff = 0; + uint64_t plen = 0; + SNII_RETURN_IF_ERROR(idx.resolve_prx_window(p.entry, p.prx_base, &poff, &plen)); + auto fetcher = std::make_unique(idx.reader()); + const size_t prx_handle = fetcher->add(poff, plen); + SNII_RETURN_IF_ERROR(fetcher->fetch()); + chunk.prx = fetcher->get(prx_handle); + owners->push_back(std::move(fetcher)); + } else { + chunk.prx = Slice(p.entry.prx_bytes); + } + if (docids.empty()) { + Slice dd; + if (p.pod_ref) { + dd = round1.get(p.frq_handle); + } else { + SNII_RETURN_IF_ERROR(internal::inline_dd_region(p.entry, &dd)); + } + SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd, p.entry.dd_meta, + /*win_base=*/0, &docids)); + } + SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx(&docids, &prx_doc_ordinals, candidates, &chunk)); + if 
(!chunk.docids.empty()) src->chunks.push_back(std::move(chunk)); + return Status::OK(); +} + +bool ChunkMayContainCandidate(const DocidChunk& chunk, const std::vector& candidates) { + if (chunk.docids.empty() || candidates.empty()) return false; + const auto it = std::lower_bound(candidates.begin(), candidates.end(), chunk.docids.front()); + return it != candidates.end() && *it <= chunk.docids.back(); +} + +Status DecodeWindowedPositionSource( + const LogicalIndexReader& idx, const TermPlan& p, DocidSource* doc_source, + const std::vector& candidates, + std::vector>* owners, PosSource* src) { + struct WindowFetch { + size_t chunk_index = 0; + size_t prx_handle = 0; + }; + + auto prx_fetcher = std::make_unique( + idx.reader(), snii::reader::kSameTermCoalesceGap); + std::vector fetched; + fetched.reserve(doc_source->chunks.size()); + for (size_t i = 0; i < doc_source->chunks.size(); ++i) { + DocidChunk& doc_chunk = doc_source->chunks[i]; + if (!ChunkMayContainCandidate(doc_chunk, candidates)) continue; + if (!doc_chunk.windowed) { + return Status::Corruption("phrase_query: expected windowed doc chunk"); + } + PosChunk chunk; + SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx( + &doc_chunk.docids, &doc_chunk.prx_doc_ordinals, candidates, &chunk)); + if (chunk.docids.empty()) continue; + + snii::reader::WindowAbsRange range; + SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( + idx, p.entry, p.frq_base, p.prx_base, p.prelude, doc_chunk.window, + /*want_positions=*/true, /*want_freq=*/false, &range)); + chunk.windowed = true; + chunk.window = doc_chunk.window; + WindowFetch f; + f.chunk_index = src->chunks.size(); + f.prx_handle = prx_fetcher->add(range.prx_off, range.prx_len); + fetched.push_back(f); + src->chunks.push_back(std::move(chunk)); + } + if (prx_fetcher->pending() > 0) SNII_RETURN_IF_ERROR(prx_fetcher->fetch()); + + for (const WindowFetch& f : fetched) { + src->chunks[f.chunk_index].prx = prx_fetcher->get(f.prx_handle); + } + if (!fetched.empty()) 
owners->push_back(std::move(prx_fetcher)); + return Status::OK(); +} + +Status BuildPositionSourcesForCandidates( + const LogicalIndexReader& idx, const snii::io::BatchRangeFetcher& round1, + const std::vector& plans, std::vector* doc_sources, + const std::vector& candidates, + std::vector>* owners, + std::vector* srcs) { + srcs->assign(plans.size(), PosSource {}); + for (size_t i = 0; i < plans.size(); ++i) { + const TermPlan& p = plans[i]; + if (p.windowed) { + SNII_RETURN_IF_ERROR(DecodeWindowedPositionSource(idx, p, &(*doc_sources)[i], + candidates, owners, &(*srcs)[i])); + continue; + } + SNII_RETURN_IF_ERROR(BuildFlatPositionSource(idx, round1, &(*doc_sources)[i], p, candidates, + owners, &(*srcs)[i])); + } + return Status::OK(); +} + +// Streaming position cursor over one term's retained chunks. It advances ONLY +// forward (callers seek ascending candidate docids), decodes each chunk's +// docids once (reused from the conjunction phase) and each chunk's positions at +// most once (lazily, into a flat CSR whose capacity is retained across chunks). +// No per-doc allocation, no per-candidate docid binary search: positions are +// addressed by the doc's local index within its chunk. This is the read-side +// dual of the windowed posting layout -- the S3-native batch fetch already +// pulled every needed chunk into memory; the cursor is pure in-memory column +// iteration. +class PostingCursor { +public: + void init(const PosSource* src) { + src_ = src; + ci_ = 0; + li_ = 0; + decoded_pos_chunk_ = kNoChunk; + } + + // Positions the cursor at `target` (guaranteed present: candidates are the + // intersection of exactly these chunks' docids). Monotonic forward advance. 
+    Status seek(uint32_t target) {
+        // Skip chunks that cannot hold `target` (empty, or ending below it). The
+        // local index restarts at 0 whenever the cursor enters a new chunk.
+        while (ci_ < src_->chunks.size() &&
+               (src_->chunks[ci_].docids.empty() || src_->chunks[ci_].docids.back() < target)) {
+            ++ci_;
+            li_ = 0;
+        }
+        if (ci_ >= src_->chunks.size()) {
+            return Status::Corruption("phrase_query: cursor exhausted before target docid");
+        }
+        const std::vector<uint32_t>& d = src_->chunks[ci_].docids;
+        while (li_ < d.size() && d[li_] < target) ++li_;
+        if (li_ >= d.size() || d[li_] != target) {
+            return Status::Corruption("phrase_query: candidate missing from posting chunk");
+        }
+        return Status::OK();
+    }
+
+    // [begin,end) of the current doc's positions, decoding the current chunk's
+    // .prx exactly once (cached). Must follow a seek that landed on a real doc.
+    // `out` receives {nullptr, nullptr} for a doc with no positions. Returns
+    // Corruption when the cursor is not on a doc, when the decoded offset table
+    // does not have one entry per doc plus a terminator, or when the doc's
+    // offsets are reversed or run past the decoded positions.
+    Status positions(std::pair<const uint32_t*, const uint32_t*>* out) {
+        if (ci_ >= src_->chunks.size() || li_ >= src_->chunks[ci_].docids.size()) {
+            return Status::Corruption("phrase_query: cursor positions out of range");
+        }
+        if (decoded_pos_chunk_ != ci_) {
+            ByteSource ps(src_->chunks[ci_].prx);
+            // No ordinals: every on-disk doc was kept, decode the whole window.
+            // Otherwise decode only the docs that survived the conjunction.
+            if (src_->chunks[ci_].prx_doc_ordinals.empty()) {
+                SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_));
+            } else {
+                SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr_selective(
+                        &ps, src_->chunks[ci_].prx_doc_ordinals, &pflat_, &poff_));
+            }
+            if (poff_.size() != src_->chunks[ci_].docids.size() + 1) {
+                return Status::Corruption("phrase_query: prx/dd doc-count mismatch");
+            }
+            decoded_pos_chunk_ = ci_;
+        }
+        const uint32_t begin = poff_[li_];
+        const uint32_t end = poff_[li_ + 1];
+        if (begin == end) {
+            *out = {nullptr, nullptr};
+            return Status::OK();
+        }
+        // Offsets come from on-disk data: reject a reversed range as well as one
+        // that overruns the buffer, since callers binary-search [first, second).
+        if (begin > end || end > pflat_.size()) {
+            return Status::Corruption("phrase_query: prx offset out of range");
+        }
+        *out = {pflat_.data() + begin, pflat_.data() + end};
+        return Status::OK();
+    }
+
+private:
+    static constexpr size_t kNoChunk = static_cast<size_t>(-1);
+    const PosSource* src_ = nullptr;
+    size_t ci_ = 0; // current chunk
+    size_t li_ = 0; // current local doc index within the chunk
+    size_t
decoded_pos_chunk_ = kNoChunk; // which chunk pflat_/poff_ currently hold + std::vector pflat_; // current chunk's flat positions (reused) + std::vector poff_; // current chunk's per-doc offsets (reused) +}; + +size_t AnchorPhrasePosition(const std::vector& plans, + const std::vector& phrase_plan_index) { + size_t anchor = 0; + uint32_t best_df = std::numeric_limits::max(); + for (size_t phrase_pos = 0; phrase_pos < phrase_plan_index.size(); ++phrase_pos) { + const TermPlan& plan = plans[phrase_plan_index[phrase_pos]]; + if (plan.df < best_df) { + best_df = plan.df; + anchor = phrase_pos; + } + } + return anchor; +} + +// Single streaming pass over the candidates: for each (ascending) candidate, +// advance every term's cursor to it, gather each term's positions IN PHRASE +// ORDER, and test the consecutive-phrase predicate (term[0]@p, term[1]@p+1, +// ...) with term-level short-circuit. Cursors decode each chunk's +// docids/positions exactly once and address positions by local index -- no +// per-candidate docid binary search, no full-candidate position +// materialization. Candidates are ascending so the emitted docids are already +// sorted. 
+Status EmitPhraseStreaming(const std::vector& plans, + const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, const std::vector& candidates, + std::vector* docids) { + std::vector cur(plans.size()); + for (size_t i = 0; i < plans.size(); ++i) cur[i].init(&srcs[i]); + + const size_t phrase_len = phrase_plan_index.size(); + std::vector> span(phrase_len); + const size_t anchor = AnchorPhrasePosition(plans, phrase_plan_index); + const uint32_t anchor_offset = position_offsets[anchor]; + for (uint32_t d : candidates) { + for (size_t i = 0; i < cur.size(); ++i) SNII_RETURN_IF_ERROR(cur[i].seek(d)); + for (size_t pp = 0; pp < phrase_len; ++pp) { + SNII_RETURN_IF_ERROR(cur[phrase_plan_index[pp]].positions(&span[pp])); + } + bool match = false; + for (const uint32_t* p = span[anchor].first; p != span[anchor].second; ++p) { + if (*p < anchor_offset) continue; + const uint32_t start = *p - anchor_offset; + bool ok = true; + for (size_t t = 0; t < phrase_len; ++t) { + if (t == anchor) continue; + uint32_t want = 0; + if (!internal::add_position_offset(start, position_offsets[t], &want)) { + ok = false; + break; + } + if (!std::binary_search(span[t].first, span[t].second, want)) { + ok = false; + break; + } + } + if (ok) { + match = true; + break; + } + } + if (match) docids->push_back(d); + } + return Status::OK(); +} + +Status BuildPhraseExecutionState(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1, + std::vector* plans, PhraseExecutionState* state) { + if (round1->pending() > 0) SNII_RETURN_IF_ERROR(round1->fetch()); + SNII_RETURN_IF_ERROR(internal::open_preludes(*round1, plans, + /*need_positions=*/true)); + + state->owners.clear(); + state->candidates.clear(); + std::vector doc_sources; + SNII_RETURN_IF_ERROR(internal::build_docid_only_conjunction(idx, *round1, *plans, + &state->candidates, &doc_sources)); + if (state->candidates.empty()) return Status::OK(); + 
SNII_RETURN_IF_ERROR(BuildPositionSourcesForCandidates( + idx, *round1, *plans, &doc_sources, state->candidates, &state->owners, &state->srcs)); + return Status::OK(); +} + +Status ExecutePhrasePlans(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1, + std::vector* plans, + const std::vector& phrase_plan_index, + std::vector* docids) { + PhraseExecutionState state; + SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, round1, plans, &state)); + if (state.candidates.empty()) return Status::OK(); + + std::vector position_offsets; + if (!internal::build_position_offsets(phrase_plan_index.size(), &position_offsets)) { + return Status::InvalidArgument("phrase_query: phrase length exceeds doc position range"); + } + return EmitPhraseStreaming(*plans, phrase_plan_index, position_offsets, state.srcs, + state.candidates, docids); +} + +Status CollectExpectedTailPositions(const std::vector& plans, + const std::vector& position_offsets, + std::vector& srcs, + const std::vector& candidates, + std::vector* out) { + const size_t n = plans.size(); + std::vector cur(n); + for (size_t i = 0; i < n; ++i) cur[i].init(&srcs[i]); + + std::vector ordered(n); + for (size_t i = 0; i < n; ++i) ordered[plans[i].order] = &cur[i]; + + std::vector> span(n); + for (uint32_t d : candidates) { + for (size_t i = 0; i < n; ++i) SNII_RETURN_IF_ERROR(cur[i].seek(d)); + for (size_t pp = 0; pp < n; ++pp) { + SNII_RETURN_IF_ERROR(ordered[pp]->positions(&span[pp])); + } + + ExpectedTailPositions match; + match.docid = d; + for (const uint32_t* p = span[0].first; p != span[0].second; ++p) { + const uint32_t start = *p; + bool ok = true; + for (size_t t = 1; t < n; ++t) { + uint32_t want = 0; + if (!internal::add_position_offset(start, position_offsets[t], &want)) { + ok = false; + break; + } + if (!std::binary_search(span[t].first, span[t].second, want)) { + ok = false; + break; + } + } + uint32_t tail_pos = 0; + if (ok && internal::add_position_offset(start, position_offsets[n], 
&tail_pos)) { + match.positions.push_back(tail_pos); + } + } + if (!match.positions.empty()) out->push_back(std::move(match)); + } + return Status::OK(); +} + +Status CollectExpectedTailPositions(const LogicalIndexReader& idx, + const std::vector& exact_terms, + std::vector* out) { + out->clear(); + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, exact_terms, &round1, &plans, + /*need_positions=*/false)); + + PhraseExecutionState state; + SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state)); + if (state.candidates.empty()) return Status::OK(); + std::vector position_offsets; + if (!internal::build_position_offsets(plans.size() + 1, &position_offsets)) { + return Status::InvalidArgument( + "phrase_prefix_query: phrase length exceeds doc position range"); + } + return CollectExpectedTailPositions(plans, position_offsets, state.srcs, state.candidates, out); +} + +bool contains_any_position(const std::vector& wanted, + std::pair actual) { + for (uint32_t pos : wanted) { + if (std::binary_search(actual.first, actual.second, pos)) return true; + } + return false; +} + +Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx, + const ResolvedQueryTerm& tail, + const std::vector& expected, + std::vector* out) { + if (expected.empty()) return Status::OK(); + + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, {tail}, &round1, &plans, + /*need_positions=*/false)); + + PhraseExecutionState state; + SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state)); + if (state.candidates.empty()) return Status::OK(); + + PostingCursor cursor; + cursor.init(&state.srcs[0]); + size_t ei = 0; + size_t ti = 0; + while (ei < expected.size() && ti < state.candidates.size()) { + const uint32_t want_doc = expected[ei].docid; + const uint32_t tail_doc = 
state.candidates[ti]; + if (want_doc < tail_doc) { + ++ei; + continue; + } + if (tail_doc < want_doc) { + ++ti; + continue; + } + + SNII_RETURN_IF_ERROR(cursor.seek(want_doc)); + std::pair actual; + SNII_RETURN_IF_ERROR(cursor.positions(&actual)); + if (contains_any_position(expected[ei].positions, actual)) out->push_back(want_doc); + ++ei; + ++ti; + } + return Status::OK(); +} + +} // namespace + +Status phrase_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("phrase_query: null out"); + docids->clear(); + if (terms.empty()) return Status::OK(); + if (terms.size() == 1) return term_query(idx, terms.front(), docids); + if (!idx.has_positions()) { + return Status::Unsupported("phrase_query: index has no positions"); + } + + // Round 1: preludes (windowed) + docid postings (slim/inline) batched + // together. Positions are fetched after the docid-only conjunction has + // produced final candidates, so phrase verification does not read PRX for + // windows later removed by the docid intersection. 
+ snii::io::BatchRangeFetcher round1(idx.reader()); + const PhraseTermMapping mapping = BuildPhraseTermMapping(terms); + std::vector plans; + bool all_present = false; + SNII_RETURN_IF_ERROR(internal::plan_terms(idx, mapping.unique_terms, &round1, &plans, + &all_present, + /*need_positions=*/false)); + if (!all_present) return Status::OK(); + return ExecutePhrasePlans(idx, &round1, &plans, mapping.phrase_plan_index, docids); +} + +Status phrase_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return phrase_query(idx, terms, docids); +} + +Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("phrase_prefix_query: null out"); + docids->clear(); + if (terms.empty()) return Status::OK(); + if (terms.size() == 1) return prefix_query(idx, terms.front(), docids); + if (!idx.has_positions()) { + return Status::Unsupported("phrase_prefix_query: index has no positions"); + } + + std::vector exact_terms; + exact_terms.reserve(terms.size() - 1); + for (size_t i = 0; i + 1 < terms.size(); ++i) { + ResolvedQueryTerm resolved; + bool found = false; + SNII_RETURN_IF_ERROR(internal::resolve_query_term(idx, terms[i], &resolved, &found)); + if (!found) return Status::OK(); + exact_terms.push_back(std::move(resolved)); + } + + std::vector tail_hits; + SNII_RETURN_IF_ERROR(idx.prefix_terms(terms.back(), &tail_hits)); + if (tail_hits.empty()) return Status::OK(); + + std::vector expected; + SNII_RETURN_IF_ERROR(CollectExpectedTailPositions(idx, exact_terms, &expected)); + if (expected.empty()) return Status::OK(); + + std::vector acc; + for (LogicalIndexReader::PrefixHit& hit : tail_hits) { + ResolvedQueryTerm tail {std::move(hit.entry), hit.frq_base, hit.prx_base}; + std::vector tail_docs; + SNII_RETURN_IF_ERROR( + 
CollectTailMatchesAtExpectedPositions(idx, tail, expected, &tail_docs)); + internal::union_sorted_into(&acc, tail_docs); + } + *docids = std::move(acc); + return Status::OK(); +} + +Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector& terms, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return phrase_prefix_query(idx, terms, docids); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/prefix_query.cpp b/be/src/storage/index/snii/core/src/query/prefix_query.cpp new file mode 100644 index 00000000000000..50d37cbbf38383 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/prefix_query.cpp @@ -0,0 +1,41 @@ +#include "snii/query/prefix_query.h" + +#include +#include + +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/query/internal/docid_union.h" + +namespace snii::query { + +using snii::reader::LogicalIndexReader; + +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("prefix_query: null out"); + docids->clear(); + VectorDocIdSink sink(*docids); + return prefix_query(idx, prefix, &sink); +} + +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return prefix_query(idx, prefix, docids); +} + +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("prefix_query: null sink"); + + std::vector hits; + SNII_RETURN_IF_ERROR(idx.prefix_terms(prefix, &hits)); + + std::vector postings; + postings.reserve(hits.size()); + for (LogicalIndexReader::PrefixHit& hit : hits) { + postings.push_back({std::move(hit.entry), hit.frq_base, hit.prx_base}); + } + return internal::emit_docid_union(idx, postings, sink); +} + 
+} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/query_profile.cpp b/be/src/storage/index/snii/core/src/query/query_profile.cpp new file mode 100644 index 00000000000000..9ecd333cb231ed --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/query_profile.cpp @@ -0,0 +1,46 @@ +#include "snii/query/query_profile.h" + +#include +#include + +#include "snii/io/file_reader.h" + +namespace snii::query { + +QueryProfileScope::QueryProfileScope(snii::io::FileReader* reader, QueryProfile* profile) + : reader_(reader), profile_(profile), start_(std::chrono::steady_clock::now()) { + if (profile_ == nullptr) return; + + *profile_ = QueryProfile {}; + if (reader_ == nullptr) return; + + const snii::io::IoMetrics* metrics = reader_->io_metrics(); + if (metrics == nullptr) return; + + profile_->has_io_metrics = true; + profile_->io_before = *metrics; +} + +QueryProfileScope::~QueryProfileScope() { + finish(); +} + +void QueryProfileScope::finish() { + if (profile_ == nullptr || finished_) return; + finished_ = true; + + const auto end = std::chrono::steady_clock::now(); + const auto elapsed = std::chrono::duration_cast(end - start_).count(); + profile_->elapsed_ns = std::max(1, static_cast(elapsed)); + + if (!profile_->has_io_metrics || reader_ == nullptr) return; + const snii::io::IoMetrics* metrics = reader_->io_metrics(); + if (metrics == nullptr) { + profile_->has_io_metrics = false; + return; + } + profile_->io_after = *metrics; + profile_->io_delta = snii::io::delta(profile_->io_after, profile_->io_before); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/regexp_query.cpp b/be/src/storage/index/snii/core/src/query/regexp_query.cpp new file mode 100644 index 00000000000000..2078654e85fbf7 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/regexp_query.cpp @@ -0,0 +1,82 @@ +#include "snii/query/regexp_query.h" + +#include +#include +#include +#include + +#include 
"snii/query/internal/term_expansion.h" + +namespace snii::query { + +namespace { + +bool is_regex_metachar(char c) { + switch (c) { + case '.': + case '^': + case '$': + case '|': + case '(': + case ')': + case '[': + case ']': + case '*': + case '+': + case '?': + case '{': + case '}': + case '\\': + return true; + default: + return false; + } +} + +std::string literal_prefix_for_regex(std::string_view pattern) { + std::string out; + size_t i = 0; + if (!pattern.empty() && pattern.front() == '^') i = 1; + for (; i < pattern.size(); ++i) { + const char c = pattern[i]; + if (is_regex_metachar(c)) break; + out.push_back(c); + } + return out; +} + +} // namespace + +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("regexp_query: null out"); + docids->clear(); + VectorDocIdSink sink(*docids); + return regexp_query(idx, pattern, &sink); +} + +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return regexp_query(idx, pattern, docids); +} + +Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("regexp_query: null sink"); + + std::regex re; + try { + re = std::regex(std::string(pattern)); + } catch (const std::regex_error& e) { + return Status::InvalidArgument(std::string("regexp_query: invalid regex: ") + e.what()); + } + + const std::string enum_prefix = literal_prefix_for_regex(pattern); + return internal::emit_expanded_docid_union( + idx, enum_prefix, + [&re](std::string_view term) { return std::regex_match(term.begin(), term.end(), re); }, + sink); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/scoring_query.cpp 
b/be/src/storage/index/snii/core/src/query/scoring_query.cpp new file mode 100644 index 00000000000000..4813b3560ca7d7 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/scoring_query.cpp @@ -0,0 +1,684 @@ +#include "snii/query/scoring_query.h" + +#include +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/io/batch_range_fetcher.h" +#include "snii/reader/windowed_posting.h" + +namespace snii::query { + +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeReader; +using snii::format::WindowMeta; +using snii::reader::LogicalIndexReader; + +namespace { + +// One scored posting for one term in one doc. +struct TermPosting { + uint32_t docid = 0; + double score = 0.0; +}; + +// One window's block-max upper bound and the docid range it covers. block_max is +// true when max_score came from the frq_prelude columns (vs the exact-score +// fallback); both are valid upper bounds, so it is informational only. +struct WindowBound { + uint32_t first_docid = 0; // inclusive + uint32_t last_docid = 0; // inclusive + double max_score = 0.0; // block-max upper bound for any doc in this window + bool block_max = false; +}; + +// All scored postings of one query term plus its block-max metadata. +struct TermCursor { + std::vector postings; // ascending docid, exact per-doc scores + std::vector windows; // ascending, covering all postings + size_t pos = 0; // DAAT cursor into postings +}; + +uint32_t CurrentDoc(const TermCursor& c) { + return c.pos < c.postings.size() ? c.postings[c.pos].docid + : std::numeric_limits::max(); +} + +// Reads one slim .frq window's bytes for a slim pod_ref/inline entry (prelude +// stripped). 
Windowed entries are handled separately via the prelude decode. +Status FetchSlimWindowBytes(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, std::vector* window_owned, Slice* window) { + if (entry.kind == DictEntryKind::kInline) { + *window = Slice(entry.frq_bytes); + return Status::OK(); + } + uint64_t win_abs = 0; + uint64_t win_len = 0; + SNII_RETURN_IF_ERROR(idx.resolve_frq_window(entry, frq_base, &win_abs, &win_len)); + snii::io::BatchRangeFetcher fetcher(idx.reader()); + const size_t h = fetcher.add(win_abs, win_len); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + Slice got = fetcher.get(h); + window_owned->assign(got.data(), got.data() + got.size()); + *window = Slice(*window_owned); + return Status::OK(); +} + +// Reads a windowed entry's frq_prelude (block-max columns live here). +Status FetchPrelude(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + FrqPreludeReader* out) { + const auto& region = idx.section_refs().posting_region; + const uint64_t prelude_abs = region.offset + frq_base + entry.frq_off_delta; + snii::io::BatchRangeFetcher fetcher(idx.reader()); + const size_t h = fetcher.add(prelude_abs, entry.prelude_len); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + return FrqPreludeReader::open(fetcher.get(h), out); +} + +// Builds per-window block-max bounds from a windowed entry's prelude. Each +// WindowMeta carries the window's max_freq / max_norm and its covered docid +// range (win_base+1 .. last_docid), so bounds come straight from the directory. +Status BuildWindowBounds(const FrqPreludeReader& prelude, const ScorerContext& ctx, double avgdl, + const Bm25Params& params, std::vector* windows) { + const uint32_t n = prelude.n_windows(); + for (uint32_t w = 0; w < n; ++w) { + WindowMeta m; + SNII_RETURN_IF_ERROR(prelude.window(w, &m)); + if (m.doc_count == 0) continue; + WindowBound wb; + wb.first_docid = static_cast(m.win_base) + (w == 0 ? 
0u : 1u); + wb.last_docid = m.last_docid; + wb.max_score = ctx.max_score(m.max_freq, m.max_norm, avgdl, params); + wb.block_max = true; + windows->push_back(wb); + } + return Status::OK(); +} + +// Fallback single window covering all postings, bounded by the exact max score +// (always a valid upper bound, so pruning stays correct). +void SingleWindowFallback(const std::vector& postings, + std::vector* windows) { + if (postings.empty()) return; + WindowBound wb; + wb.first_docid = postings.front().docid; + wb.last_docid = postings.back().docid; + wb.block_max = false; + for (const auto& p : postings) wb.max_score = std::max(wb.max_score, p.score); + windows->push_back(wb); +} + +// Computes exact per-doc BM25 scores from decoded (docid, freq) vectors. +Status ScoreDecoded(const snii::stats::SniiStatsProvider& stats, const ScorerContext& ctx, + const Bm25Params& params, const std::vector& docids, + const std::vector& freqs, std::vector* out) { + const double avgdl = stats.avgdl(); + out->reserve(docids.size()); + for (size_t i = 0; i < docids.size(); ++i) { + uint8_t norm = 0; + SNII_RETURN_IF_ERROR(stats.encoded_norm(docids[i], &norm)); + const uint32_t tf = i < freqs.size() ? freqs[i] : 1; + out->push_back({docids[i], ctx.score(tf, norm, avgdl, params)}); + } + return Status::OK(); +} + +// Decodes a slim/inline term's single .frq window ([dd_region][freq_region]) into +// docids/freqs using the entry's region metadata. 
+Status DecodeSlim(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base,
+                  std::vector* docids, std::vector* freqs) {
+    // `owned` backs `window` when the bytes had to be fetched; it must outlive
+    // every use of `window` below.
+    std::vector owned;
+    Slice window;
+    SNII_RETURN_IF_ERROR(FetchSlimWindowBytes(idx, entry, frq_base, &owned, &window));
+    const uint64_t dd_len = entry.dd_meta.disk_len;
+    // Validate the metadata against the bytes actually read before slicing.
+    if (dd_len > window.size()) {
+        return Status::Corruption("scoring_query: slim dd region exceeds window");
+    }
+    Slice dd_region = window.subslice(0, static_cast(dd_len));
+    SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd_region, entry.dd_meta,
+                                                        /*win_base=*/0, docids));
+    // Everything after the dd region is handed to the freq decoder.
+    Slice freq_region = window.subslice(static_cast(dd_len),
+                                        window.size() - static_cast(dd_len));
+    return snii::format::decode_freq_region(freq_region, entry.freq_meta, docids->size(), freqs);
+}
+
+// Builds the cursor for a windowed term: tiles all windows for exact scores and
+// reads the prelude once for true per-window block-max bounds.
+//
+// Postings are appended to cursor->postings and bounds to cursor->windows. A
+// failure to read or score the postings is returned to the caller; a failure to
+// fetch the prelude is not (see below).
+Status BuildWindowedCursor(const LogicalIndexReader& idx,
+                           const snii::stats::SniiStatsProvider& stats, const ScorerContext& ctx,
+                           const DictEntry& entry, uint64_t frq_base, uint64_t prx_base,
+                           const Bm25Params& params, TermCursor* cursor) {
+    snii::reader::DecodedPosting posting;
+    // Scoring needs freqs for BM25: fetch the FULL windows (want_freq=true).
+    SNII_RETURN_IF_ERROR(snii::reader::read_windowed_posting(idx, entry, frq_base, prx_base,
+                                                             /*want_positions=*/false,
+                                                             /*want_freq=*/true, &posting));
+    SNII_RETURN_IF_ERROR(
+            ScoreDecoded(stats, ctx, params, posting.docids, posting.freqs, &cursor->postings));
+    FrqPreludeReader prelude;
+    // A FetchPrelude error is dropped here and cursor->windows stays empty;
+    // BuildCursor then installs SingleWindowFallback. NOTE(review): this looks
+    // like a deliberate best-effort fallback, but it also hides IO/corruption
+    // errors on the prelude -- confirm that is intended.
+    if (FetchPrelude(idx, entry, frq_base, &prelude).ok()) {
+        SNII_RETURN_IF_ERROR(
+                BuildWindowBounds(prelude, ctx, stats.avgdl(), params, &cursor->windows));
+    }
+    return Status::OK();
+}
+
+// Builds the cursor for one term: postings with exact scores + window bounds.
+Status BuildCursor(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats,
+                   const std::string& term, const Bm25Params& params, bool* found,
+                   TermCursor* cursor) {
+    DictEntry entry;
+    uint64_t frq_base = 0;
+    uint64_t prx_base = 0;
+    SNII_RETURN_IF_ERROR(idx.lookup(term, found, &entry, &frq_base, &prx_base));
+    // Unknown term: not an error, the caller simply drops this cursor.
+    if (!*found) return Status::OK();
+
+    const ScorerContext ctx = ScorerContext::make(stats.indexed_doc_count(), entry.df);
+
+    const bool windowed =
+            entry.kind == DictEntryKind::kPodRef && entry.enc == DictEntryEnc::kWindowed;
+    if (windowed) {
+        SNII_RETURN_IF_ERROR(
+                BuildWindowedCursor(idx, stats, ctx, entry, frq_base, prx_base, params, cursor));
+    } else {
+        std::vector docids;
+        std::vector freqs;
+        SNII_RETURN_IF_ERROR(DecodeSlim(idx, entry, frq_base, &docids, &freqs));
+        SNII_RETURN_IF_ERROR(ScoreDecoded(stats, ctx, params, docids, freqs, &cursor->postings));
+    }
+    // No prelude-derived bounds (slim/inline term, or prelude unavailable): fall
+    // back to one window bounded by the exact max score.
+    if (cursor->windows.empty()) {
+        SingleWindowFallback(cursor->postings, &cursor->windows);
+    }
+    return Status::OK();
+}
+
+// Upper bound on this term's score for ANY of its docs with id >= docid: the
+// largest max_score among the window covering docid and every later window.
+// Returns 0 beyond the last window (the term cannot contribute).
+//
+// Why not just the covering window's block-max: the WAND loop uses this value as
+// the term's bound for all docs it is about to skip (everything before the
+// pivot doc, or everything remaining when no pivot is found). Those docs may
+// lie in LATER windows of this term, whose block-max can be higher than the
+// current window's. Bounding with the current window alone lets the loop skip
+// or terminate past docs that would have entered the top-K, so results diverge
+// from scoring_query_exhaustive. The suffix maximum is looser but sound.
+double TermBoundAt(const TermCursor& c, uint32_t docid) {
+    double bound = 0.0;
+    // Windows are ascending by last_docid, so walk from the back and stop at the
+    // first window that ends before docid; all earlier ones end before it too.
+    for (auto it = c.windows.rbegin(); it != c.windows.rend(); ++it) {
+        if (it->last_docid < docid) break;
+        bound = std::max(bound, it->max_score);
+    }
+    return bound;
+}
+
+// Min-heap keyed on score (smallest at top) maintaining the top-K.
+struct TopK { + explicit TopK(uint32_t k) : k_(k) {} + void offer(uint32_t docid, double score) { + if (heap_.size() < k_) { + heap_.push({score, docid}); + return; + } + if (heap_.empty()) return; + const Entry& worst = heap_.top(); // lowest score; ties: largest docid + const bool better = score > worst.first || (score == worst.first && docid < worst.second); + if (better) { + heap_.pop(); + heap_.push({score, docid}); + } + } + double threshold() const { return heap_.size() < k_ ? -1.0 : heap_.top().first; } + + using Entry = std::pair; + struct Cmp { + bool operator()(const Entry& a, const Entry& b) const { + if (a.first != b.first) return a.first > b.first; // min-score at top + return a.second < b.second; // for ties, largest docid at top (evictable) + } + }; + uint32_t k_; + std::priority_queue, Cmp> heap_; +}; + +void DrainSorted(TopK* topk, std::vector* out) { + std::vector all; + while (!topk->heap_.empty()) { + all.push_back({topk->heap_.top().second, topk->heap_.top().first}); + topk->heap_.pop(); + } + std::sort(all.begin(), all.end(), [](const ScoredDoc& a, const ScoredDoc& b) { + if (a.score != b.score) return a.score > b.score; + return a.docid < b.docid; + }); + *out = std::move(all); +} + +Status BuildCursors(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, const Bm25Params& params, + std::vector* cursors) { + for (const auto& term : terms) { + bool found = false; + TermCursor c; + SNII_RETURN_IF_ERROR(BuildCursor(idx, stats, term, params, &found, &c)); + if (found && !c.postings.empty()) cursors->push_back(std::move(c)); + } + return Status::OK(); +} + +} // namespace + +Status scoring_query_exhaustive(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("scoring_query: null out"); + out->clear(); + if (k == 0) return 
Status::OK(); + + std::vector cursors; + SNII_RETURN_IF_ERROR(BuildCursors(idx, stats, terms, params, &cursors)); + + std::unordered_map scores; + for (const auto& c : cursors) + for (const auto& p : c.postings) scores[p.docid] += p.score; + + std::vector all; + all.reserve(scores.size()); + for (const auto& [docid, score] : scores) all.push_back({docid, score}); + std::sort(all.begin(), all.end(), [](const ScoredDoc& a, const ScoredDoc& b) { + if (a.score != b.score) return a.score > b.score; + return a.docid < b.docid; + }); + if (all.size() > k) all.resize(k); + *out = std::move(all); + return Status::OK(); +} + +namespace { + +// --- Phase C: selective-fetch (lazy window) WAND ----------------------------- +// +// A LazyTermCursor knows its per-window block-max bounds + docid ranges from the +// frq_prelude WITHOUT fetching any .frq window. Each window's exact (docid,score) +// postings are decoded on first access and cached, so a window is fetched at most +// once and ONLY when the WAND control flow touches a posting in it. Combined with +// window-level SkipTo (advance past whole windows whose last_docid < target via +// the prelude, never fetching them), the offer sequence is byte-identical to the +// eager scoring_query_wand path -- only the bytes read differ. +// +// Soundness: a window is fetched only when LazyCurrentDoc/LazySkipTo land the +// cursor inside it, i.e. it covers a candidate the WAND pivot already proved can +// reach the running theta (bound >= theta). LazySkipTo jumps the cursor to the +// SAME posting (first docid >= target) the eager per-doc walk would, so pivots, +// alignments and offers are identical to the eager path; only windows the eager +// path read-through-but-never-offered-from are skipped. Windows whose block-max +// bound never reaches theta are never the pivot, so never fetched. + +// One query term's lazily-fetched scoring state. 
+struct LazyTermCursor { + const LogicalIndexReader* idx = nullptr; + const snii::stats::SniiStatsProvider* stats = nullptr; + ScorerContext ctx = ScorerContext::make(1, 1); + Bm25Params params; + DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + FrqPreludeReader prelude; + bool windowed = false; // false => slim/inline single block already materialized + + std::vector windows; // ascending; from prelude (or slim fallback) + std::vector postings; // sparse: only fetched windows are filled + std::vector win_start; // prefix offsets, size = windows.size()+1 + std::vector fetched; // size = windows.size() + size_t pos = 0; // virtual cursor over all windows' postings +}; + +// Total posting count across all windows (the virtual stream length). +uint32_t TotalPostings(const LazyTermCursor& c) { + return c.win_start.empty() ? 0 : c.win_start.back(); +} + +// Index of the window whose virtual range contains posting index p (p < total). +uint32_t WindowOf(const LazyTermCursor& c, uint32_t p) { + const auto it = std::upper_bound(c.win_start.begin(), c.win_start.end(), p); + return static_cast((it - c.win_start.begin()) - 1); +} + +// Fetches + decodes window w into the cursor's posting cache (idempotent). Only +// reached when the WAND proves window w can still contribute to the top-K. +Status MaterializeWindow(LazyTermCursor* c, uint32_t w) { + if (c->fetched[w]) return Status::OK(); + WindowMeta meta; + SNII_RETURN_IF_ERROR(c->prelude.window(w, &meta)); + snii::reader::WindowAbsRange r; + SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( + *c->idx, c->entry, c->frq_base, c->prx_base, c->prelude, w, + /*want_positions=*/false, /*want_freq=*/true, &r)); + // Scoring needs docids + freqs: fetch the window's dd sub-range AND freq sub-range. 
+ snii::io::BatchRangeFetcher fetcher(c->idx->reader(), snii::reader::kSameTermCoalesceGap); + const size_t dh = fetcher.add(r.dd_off, r.dd_len); + const size_t fh = fetcher.add(r.freq_off, r.freq_len); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + std::vector docids; + std::vector freqs; + std::vector> pos; + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + meta, fetcher.get(dh), fetcher.get(fh), Slice(), /*want_positions=*/false, + /*want_freq=*/true, &docids, &freqs, &pos)); + if (docids.size() != c->win_start[w + 1] - c->win_start[w]) { + return Status::Corruption("scoring_query: selective window doc-count drift"); + } + std::vector scored; + SNII_RETURN_IF_ERROR(ScoreDecoded(*c->stats, c->ctx, c->params, docids, freqs, &scored)); + std::copy(scored.begin(), scored.end(), c->postings.begin() + c->win_start[w]); + c->fetched[w] = 1; + return Status::OK(); +} + +// Current docid at the cursor, fetching the covering window if needed. Exhausted +// cursor -> UINT32_MAX. +Status LazyCurrentDoc(LazyTermCursor* c, uint32_t* docid) { + if (c->pos >= TotalPostings(*c)) { + *docid = std::numeric_limits::max(); + return Status::OK(); + } + const uint32_t w = WindowOf(*c, static_cast(c->pos)); + SNII_RETURN_IF_ERROR(MaterializeWindow(c, w)); + *docid = c->postings[c->pos].docid; + return Status::OK(); +} + +// Advances pos to the first posting with docid >= target, skipping ENTIRE windows +// whose last_docid < target WITHOUT fetching them (prelude-only), then fetching +// just the landing window. Lands on the same posting the eager per-doc walk would. 
+Status LazySkipTo(LazyTermCursor* c, uint32_t target) {
+    const uint32_t total = TotalPostings(*c);
+    // Phase 1: hop over whole windows using only their last_docid bound.
+    while (c->pos < total) {
+        const uint32_t w = WindowOf(*c, static_cast(c->pos));
+        if (c->windows[w].last_docid >= target) break;
+        c->pos = c->win_start[w + 1]; // skip this window entirely (no fetch)
+    }
+    // Ran off the end: the cursor is exhausted and nothing needs fetching.
+    if (c->pos >= total) return Status::OK();
+    // Phase 2: materialize the landing window and step to the first posting
+    // whose docid is >= target.
+    const uint32_t w = WindowOf(*c, static_cast(c->pos));
+    SNII_RETURN_IF_ERROR(MaterializeWindow(c, w));
+    while (c->pos < total && c->postings[c->pos].docid < target) ++c->pos;
+    return Status::OK();
+}
+
+// Initializes a lazy windowed cursor from the prelude alone: per-window block-max
+// bounds + ranges + cache slots, with NO .frq window fetched.
+//
+// NOTE(review): windows / win_start / fetched are indexed by position among the
+// NON-EMPTY prelude windows, while MaterializeWindow passes that same index to
+// prelude.window() and windowed_window_range(). The two numberings agree only
+// if the prelude never contains a window with doc_count == 0 -- confirm the
+// writer guarantees that, or keep an explicit filtered->prelude index map.
+Status BuildLazyWindowed(LazyTermCursor* c) {
+    SNII_RETURN_IF_ERROR(
+            snii::reader::fetch_windowed_prelude(*c->idx, c->entry, c->frq_base, &c->prelude));
+    SNII_RETURN_IF_ERROR(
+            BuildWindowBounds(c->prelude, c->ctx, c->stats->avgdl(), c->params, &c->windows));
+    // BuildWindowBounds keeps only non-empty windows, in window order. Build the
+    // matching prefix-sum of doc_counts over those same non-empty windows so the
+    // bound list, win_start and fetched stay 1:1.
+    const uint32_t nb = static_cast(c->windows.size());
+    c->win_start.assign(nb + 1, 0);
+    c->fetched.assign(nb, 0);
+    uint32_t bi = 0;  // next slot in the filtered (non-empty) window list
+    uint32_t acc = 0; // running posting count == start offset of the next window
+    for (uint32_t w = 0; w < c->prelude.n_windows() && bi < nb; ++w) {
+        WindowMeta meta;
+        SNII_RETURN_IF_ERROR(c->prelude.window(w, &meta));
+        if (meta.doc_count == 0) continue;
+        acc += meta.doc_count;
+        c->win_start[++bi] = acc;
+    }
+    // Reserve one default slot per posting; MaterializeWindow overwrites a
+    // window's slice of this array when that window is first fetched.
+    c->postings.assign(acc, TermPosting {});
+    return Status::OK();
+}
+
+// Initializes a slim/inline cursor: its single window is small, so fetch + score
+// it eagerly (exactly as the existing path). One bound covers all its postings.
+Status BuildLazySlim(LazyTermCursor* c) {
+    std::vector docids;
+    std::vector freqs;
+    SNII_RETURN_IF_ERROR(DecodeSlim(*c->idx, c->entry, c->frq_base, &docids, &freqs));
+    SNII_RETURN_IF_ERROR(ScoreDecoded(*c->stats, c->ctx, c->params, docids, freqs, &c->postings));
+    // Adds one bound covering all postings (adds nothing when postings is empty).
+    SingleWindowFallback(c->postings, &c->windows);
+    // A single virtual window spanning [0, postings.size()).
+    c->win_start = {0, static_cast(c->postings.size())};
+    c->fetched.assign(1, 1); // already materialized
+    return Status::OK();
+}
+
+// Builds a LazyTermCursor for one term: prelude-only for windowed terms (no .frq
+// fetched), fully-materialized single window for slim/inline (small).
+//
+// *found reports whether the term exists in the dictionary; when it does not,
+// OK is returned and only c->entry / c->frq_base may have been touched by the
+// lookup. The cursor keeps raw pointers to `idx` and `stats`, so both must
+// outlive it.
+Status BuildLazyCursor(const LogicalIndexReader& idx, const snii::stats::SniiStatsProvider& stats,
+                       const std::string& term, const Bm25Params& params, bool* found,
+                       LazyTermCursor* c) {
+    uint64_t prx_base = 0;
+    SNII_RETURN_IF_ERROR(idx.lookup(term, found, &c->entry, &c->frq_base, &prx_base));
+    if (!*found) return Status::OK();
+    c->idx = &idx;
+    c->stats = &stats;
+    c->params = params;
+    c->prx_base = prx_base;
+    c->ctx = ScorerContext::make(stats.indexed_doc_count(), c->entry.df);
+    // Same windowed test as the eager BuildCursor: a pod-ref entry with windowed
+    // encoding; everything else takes the slim/inline path.
+    c->windowed =
+            c->entry.kind == DictEntryKind::kPodRef && c->entry.enc == DictEntryEnc::kWindowed;
+    return c->windowed ? BuildLazyWindowed(c) : BuildLazySlim(c);
+}
+
+// Builds one lazy cursor per query term, appending to *cursors only the terms
+// that were found and have at least one posting. Stops at the first error.
+Status SelectiveBuildCursors(const LogicalIndexReader& idx,
+                             const snii::stats::SniiStatsProvider& stats,
+                             const std::vector& terms, const Bm25Params& params,
+                             std::vector* cursors) {
+    for (const auto& term : terms) {
+        bool found = false;
+        LazyTermCursor c;
+        SNII_RETURN_IF_ERROR(BuildLazyCursor(idx, stats, term, params, &found, &c));
+        if (found && TotalPostings(c) > 0) cursors->push_back(std::move(c));
+    }
+    return Status::OK();
+}
+
+// Block-max upper bound for a lazy cursor at docid: block_max of the window
+// covering docid (ascending, contiguous). Beyond the last window -> 0. Same
+// semantics as TermBoundAt over the eager cursor's window list.
+// NOTE: the bound returned here is the maximum max_score over the window
+// covering docid AND every later window, not the covering window alone. The
+// pivot search uses it as the term's bound for all docs it is about to skip
+// (everything before the pivot doc, or everything remaining when no pivot is
+// found); those docs can sit in later windows whose block-max is higher than
+// the current one's. Using only the covering window lets the loop skip or stop
+// before docs that belong in the top-K. The suffix maximum is looser (more
+// windows may be fetched) but sound. Returns 0 beyond the last window.
+double LazyTermBoundAt(const LazyTermCursor& c, uint32_t docid) {
+    double bound = 0.0;
+    // Windows are ascending by last_docid: walk from the back and stop at the
+    // first window that ends before docid.
+    for (auto it = c.windows.rbegin(); it != c.windows.rend(); ++it) {
+        if (it->last_docid < docid) break;
+        bound = std::max(bound, it->max_score);
+    }
+    return bound;
+}
+
+// Sorts cursors ascending by current docid (materializing each cursor's current
+// covering window), returning the smallest current docid via *front.
+// Exhausted cursors report UINT32_MAX and therefore sort last; *front is
+// UINT32_MAX when there are no cursors.
+Status SelectiveSortByDoc(std::vector* cursors, uint32_t* front) {
+    // Resolve every current docid first so the comparator never has to fetch.
+    std::vector cur(cursors->size());
+    for (size_t i = 0; i < cursors->size(); ++i) {
+        SNII_RETURN_IF_ERROR(LazyCurrentDoc(&(*cursors)[i], &cur[i]));
+    }
+    // Sort an index permutation, then move the cursors into that order once.
+    std::vector order(cursors->size());
+    for (size_t i = 0; i < order.size(); ++i) order[i] = i;
+    std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return cur[a] < cur[b]; });
+    std::vector sorted;
+    sorted.reserve(cursors->size());
+    for (size_t i : order) sorted.push_back(std::move((*cursors)[i]));
+    *cursors = std::move(sorted);
+    *front = order.empty() ? std::numeric_limits::max() : cur[order.front()];
+    return Status::OK();
+}
+
+// Finds the pivot term: the first cursor (current-docid order) at which the
+// accumulated block-max bound reaches theta. >= keeps boundary ties (matching the
+// exhaustive total order). *found=false when no remaining doc can beat theta.
+// *pivot and *pivot_doc are written only when *found is set to true.
+Status SelectivePivot(std::vector* cursors, double theta, size_t* pivot,
+                      uint32_t* pivot_doc, bool* found) {
+    double bound = 0.0;
+    *found = false;
+    for (size_t i = 0; i < cursors->size(); ++i) {
+        uint32_t d = 0;
+        SNII_RETURN_IF_ERROR(LazyCurrentDoc(&(*cursors)[i], &d));
+        // Cursors are sorted, so the first exhausted one ends the live prefix.
+        if (d == std::numeric_limits::max()) break;
+        bound += LazyTermBoundAt((*cursors)[i], d);
+        if (bound >= theta) {
+            *pivot = i;
+            *pivot_doc = d;
+            *found = true;
+            return Status::OK();
+        }
+    }
+    return Status::OK();
+}
+
+// Scores the aligned pivot doc exactly (summing all cursors AT pivot_doc) and
+// advances those cursors by one posting.
+Status SelectiveScorePivot(std::vector* cursors, uint32_t pivot_doc, TopK* topk) { + double doc_score = 0.0; + for (auto& c : *cursors) { + uint32_t d = 0; + SNII_RETURN_IF_ERROR(LazyCurrentDoc(&c, &d)); + if (d == pivot_doc) { + doc_score += c.postings[c.pos].score; // window already materialized + ++c.pos; + } + } + topk->offer(pivot_doc, doc_score); + return Status::OK(); +} + +// Advances the first lagging cursor (current doc < pivot_doc) up to pivot_doc. +Status SelectiveAdvanceLagging(std::vector* cursors, uint32_t pivot_doc) { + for (auto& c : *cursors) { + uint32_t d = 0; + SNII_RETURN_IF_ERROR(LazyCurrentDoc(&c, &d)); + if (d < pivot_doc) { + SNII_RETURN_IF_ERROR(LazySkipTo(&c, pivot_doc)); + return Status::OK(); + } + } + return Status::OK(); +} + +// One WAND iteration body: sort, pick pivot, then either score (aligned) or skip +// a lagging cursor forward. *done=true ends the loop. +Status SelectiveStep(std::vector* cursors, TopK* topk, bool* done) { + uint32_t front = 0; + SNII_RETURN_IF_ERROR(SelectiveSortByDoc(cursors, &front)); + if (cursors->empty() || front == std::numeric_limits::max()) { + *done = true; + return Status::OK(); + } + size_t pivot = 0; + uint32_t pivot_doc = 0; + bool found_pivot = false; + SNII_RETURN_IF_ERROR( + SelectivePivot(cursors, topk->threshold(), &pivot, &pivot_doc, &found_pivot)); + if (!found_pivot) { + *done = true; + return Status::OK(); + } + if (front == pivot_doc) { + return SelectiveScorePivot(cursors, pivot_doc, topk); + } + return SelectiveAdvanceLagging(cursors, pivot_doc); +} + +Status SelectiveWandLoop(std::vector* cursors, TopK* topk) { + bool done = false; + while (!done) { + SNII_RETURN_IF_ERROR(SelectiveStep(cursors, topk, &done)); + } + return Status::OK(); +} + +} // namespace + +Status scoring_query_wand_selective(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out) { + if (out == nullptr) 
return Status::InvalidArgument("scoring_query: null out"); + out->clear(); + if (k == 0) return Status::OK(); + + std::vector cursors; + SNII_RETURN_IF_ERROR(SelectiveBuildCursors(idx, stats, terms, params, &cursors)); + + TopK topk(k); + SNII_RETURN_IF_ERROR(SelectiveWandLoop(&cursors, &topk)); + DrainSorted(&topk, out); + return Status::OK(); +} + +Status scoring_query_wand(const LogicalIndexReader& idx, + const snii::stats::SniiStatsProvider& stats, + const std::vector& terms, uint32_t k, + const Bm25Params& params, std::vector* out) { + if (out == nullptr) return Status::InvalidArgument("scoring_query: null out"); + out->clear(); + if (k == 0) return Status::OK(); + + std::vector cursors; + SNII_RETURN_IF_ERROR(BuildCursors(idx, stats, terms, params, &cursors)); + + TopK topk(k); + // Document-at-a-time WAND with block-max bounds. + while (true) { + // Sort cursors by current docid (ascending; exhausted cursors sink). + std::sort(cursors.begin(), cursors.end(), [](const TermCursor& a, const TermCursor& b) { + return CurrentDoc(a) < CurrentDoc(b); + }); + if (cursors.empty() || + CurrentDoc(cursors.front()) == std::numeric_limits::max()) { + break; + } + + const double theta = topk.threshold(); + // Accumulate block-max upper bounds in docid order to find the pivot term. + double bound = 0.0; + size_t pivot = 0; + bool found_pivot = false; + for (size_t i = 0; i < cursors.size(); ++i) { + const uint32_t d = CurrentDoc(cursors[i]); + if (d == std::numeric_limits::max()) break; + bound += TermBoundAt(cursors[i], d); + // Use >= (not >) so a doc whose upper bound only TIES the K-th threshold is + // still explored and exact-scored: under the (score desc, docid asc) total + // order a tie can still evict the current K-th entry (smaller docid wins), + // exactly as the exhaustive path would. Strict > would wrongly prune ties. 
+ if (bound >= theta) { + pivot = i; + found_pivot = true; + break; + } + } + if (!found_pivot) break; // no doc can beat the threshold anymore. + + const uint32_t pivot_doc = CurrentDoc(cursors[pivot]); + if (CurrentDoc(cursors.front()) == pivot_doc) { + // All cursors at the pivot doc are aligned: score it exactly. + double doc_score = 0.0; + for (auto& c : cursors) { + if (CurrentDoc(c) == pivot_doc) { + doc_score += c.postings[c.pos].score; + ++c.pos; + } + } + topk.offer(pivot_doc, doc_score); + } else { + // Advance a lagging cursor toward pivot_doc (skip docs it cannot win on). + for (auto& c : cursors) { + if (CurrentDoc(c) < pivot_doc) { + while (c.pos < c.postings.size() && c.postings[c.pos].docid < pivot_doc) { + ++c.pos; + } + break; + } + } + } + } + DrainSorted(&topk, out); + return Status::OK(); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/term_expansion.cpp b/be/src/storage/index/snii/core/src/query/term_expansion.cpp new file mode 100644 index 00000000000000..4af0209bda9411 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/term_expansion.cpp @@ -0,0 +1,28 @@ +#include "snii/query/internal/term_expansion.h" + +#include +#include + +#include "snii/query/internal/docid_posting_reader.h" +#include "snii/query/internal/docid_union.h" + +namespace snii::query::internal { + +Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx, + std::string_view enum_prefix, const TermMatcher& matches, + DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("term_expansion: null sink"); + + std::vector hits; + SNII_RETURN_IF_ERROR(idx.prefix_terms(enum_prefix, &hits)); + + std::vector postings; + postings.reserve(hits.size()); + for (snii::reader::LogicalIndexReader::PrefixHit& hit : hits) { + if (!matches(hit.term)) continue; + postings.push_back({std::move(hit.entry), hit.frq_base, hit.prx_base}); + } + return emit_docid_union(idx, postings, sink); +} + +} // namespace 
snii::query::internal diff --git a/be/src/storage/index/snii/core/src/query/term_query.cpp b/be/src/storage/index/snii/core/src/query/term_query.cpp new file mode 100644 index 00000000000000..19b4e4138974d6 --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/term_query.cpp @@ -0,0 +1,33 @@ +#include "snii/query/term_query.h" + +#include + +#include "snii/format/dict_entry.h" +#include "snii/query/internal/docid_posting_reader.h" + +namespace snii::query { + +using snii::format::DictEntry; +using snii::reader::LogicalIndexReader; + +Status term_query(const LogicalIndexReader& idx, std::string_view term, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("term_query: null out"); + docids->clear(); + + bool found = false; + DictEntry entry; + uint64_t frq_base = 0; + uint64_t prx_base = 0; + SNII_RETURN_IF_ERROR(idx.lookup(term, &found, &entry, &frq_base, &prx_base)); + if (!found) return Status::OK(); + return internal::read_docid_posting(idx, entry, frq_base, prx_base, docids); +} + +Status term_query(const LogicalIndexReader& idx, std::string_view term, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return term_query(idx, term, docids); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/wildcard_query.cpp b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp new file mode 100644 index 00000000000000..3398f4bcdedabd --- /dev/null +++ b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp @@ -0,0 +1,71 @@ +#include "snii/query/wildcard_query.h" + +#include +#include +#include +#include +#include + +#include "snii/query/internal/term_expansion.h" + +namespace snii::query { + +namespace { + +std::string literal_prefix_for_wildcard(std::string_view pattern) { + std::string out; + for (char c : pattern) { + if (c == '*' || c == '?') break; + out.push_back(c); + } + return out; +} + +bool wildcard_match(std::string_view 
pattern, std::string_view text) { + std::vector prev(text.size() + 1, 0); + std::vector curr(text.size() + 1, 0); + prev[0] = 1; + + for (char p : pattern) { + std::fill(curr.begin(), curr.end(), 0); + if (p == '*') { + curr[0] = prev[0]; + for (size_t i = 1; i <= text.size(); ++i) { + curr[i] = prev[i] || curr[i - 1]; + } + } else { + for (size_t i = 1; i <= text.size(); ++i) { + curr[i] = prev[i - 1] && (p == '?' || p == text[i - 1]); + } + } + prev.swap(curr); + } + return prev[text.size()] != 0; +} + +} // namespace + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids) { + if (docids == nullptr) return Status::InvalidArgument("wildcard_query: null out"); + docids->clear(); + VectorDocIdSink sink(*docids); + return wildcard_query(idx, pattern, &sink); +} + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + std::vector* docids, QueryProfile* profile) { + QueryProfileScope profile_scope(idx.reader(), profile); + return wildcard_query(idx, pattern, docids); +} + +Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, + DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("wildcard_query: null sink"); + const std::string enum_prefix = literal_prefix_for_wildcard(pattern); + return internal::emit_expanded_docid_union( + idx, enum_prefix, + [pattern](std::string_view term) { return wildcard_match(pattern, term); }, sink); +} + +} // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp new file mode 100644 index 00000000000000..bb3cb6b684388b --- /dev/null +++ b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp @@ -0,0 +1,341 @@ +#include "snii/reader/logical_index_reader.h" + +#include +#include +#include +#include + +#include "snii/encoding/crc32c.h" +#include 
"snii/encoding/zstd_codec.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" + +namespace snii::reader { + +using snii::format::BlockRef; +using snii::format::bsbf_hash; +using snii::format::bsbf_probe; +using snii::format::DictBlockDirectoryReader; +using snii::format::DictBlockReader; +using snii::format::DictEntry; +using snii::format::IndexTier; +using snii::format::kBsbfBytesPerBlock; +using snii::format::kBsbfHeaderSize; +using snii::format::PerIndexMetaReader; +using snii::format::RegionRef; +using snii::format::SampledTermIndexReader; + +namespace { +constexpr uint64_t kMaxDictBlockUncompBytes = 256ull * 1024 * 1024; +constexpr uint64_t kDefaultDictResidentMaxBytes = 256ull * 1024; + +// L0/L1 tiering threshold (bytes). Defaults to kBsbfResidentMaxBytes; the env +// SNII_BSBF_RESIDENT_MAX overrides it for tuning and for exercising the +// on-demand L1 path in tests without a 250K-term corpus. Read fresh each open. +uint64_t bsbf_resident_max_bytes() { + const char* s = std::getenv("SNII_BSBF_RESIDENT_MAX"); + if (s != nullptr) { + char* end = nullptr; + const unsigned long long v = std::strtoull(s, &end, 10); + if (end != s) return v; + } + return snii::format::kBsbfResidentMaxBytes; +} + +uint64_t dict_resident_max_bytes() { + const char* s = std::getenv("SNII_DICT_RESIDENT_MAX"); + if (s != nullptr) { + char* end = nullptr; + const unsigned long long v = std::strtoull(s, &end, 10); + if (end != s) return v; + } + return kDefaultDictResidentMaxBytes; +} + +Status checked_size(uint64_t value, const char* error, size_t* out) { + if (value > std::numeric_limits::max()) { + return Status::Corruption(error); + } + *out = static_cast(value); + return Status::OK(); +} + +Status dict_block_memory_bytes(const BlockRef& ref, uint64_t* out) { + if ((ref.flags & snii::format::block_ref_flags::kZstd) == 0) { + *out = ref.length; + return Status::OK(); + } + if (ref.uncomp_len == 0 || ref.uncomp_len > kMaxDictBlockUncompBytes) { + 
return Status::Corruption("dict block: zstd uncomp_len out of range"); + } + *out = ref.uncomp_len; + return Status::OK(); +} + +Status read_dict_block_bytes(snii::io::FileReader* reader, const BlockRef& ref, + std::vector* out) { + size_t read_len = 0; + SNII_RETURN_IF_ERROR( + checked_size(ref.length, "dict block: on-disk length out of range", &read_len)); + + std::vector block_bytes; + SNII_RETURN_IF_ERROR(reader->read_at(ref.offset, read_len, &block_bytes)); + if (block_bytes.size() != read_len) { + return Status::Corruption("dict block: short read"); + } + + if ((ref.flags & snii::format::block_ref_flags::kZstd) == 0) { + *out = std::move(block_bytes); + return Status::OK(); + } + + uint64_t memory_bytes = 0; + SNII_RETURN_IF_ERROR(dict_block_memory_bytes(ref, &memory_bytes)); + size_t uncomp_len = 0; + SNII_RETURN_IF_ERROR( + checked_size(memory_bytes, "dict block: zstd length out of range", &uncomp_len)); + return snii::zstd_decompress(Slice(block_bytes), uncomp_len, out); +} + +Status open_dict_block(snii::io::FileReader* reader, const BlockRef& ref, IndexTier tier, + bool has_positions, std::vector* bytes, DictBlockReader* out) { + SNII_RETURN_IF_ERROR(read_dict_block_bytes(reader, ref, bytes)); + return DictBlockReader::open(Slice(*bytes), tier, has_positions, out); +} +} // namespace + +Status LogicalIndexReader::load_resident_dict_blocks() { + resident_dict_blocks_.clear(); + + const uint64_t max_bytes = dict_resident_max_bytes(); + if (max_bytes == 0 || dbd_.n_blocks() == 0) return Status::OK(); + + uint64_t total_bytes = 0; + for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) { + BlockRef ref {}; + SNII_RETURN_IF_ERROR(dbd_.get(ord, &ref)); + uint64_t block_bytes = 0; + SNII_RETURN_IF_ERROR(dict_block_memory_bytes(ref, &block_bytes)); + if (block_bytes > max_bytes - total_bytes) { + return Status::OK(); + } + total_bytes += block_bytes; + } + + resident_dict_blocks_.reserve(dbd_.n_blocks()); + for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) { + 
BlockRef ref {}; + SNII_RETURN_IF_ERROR(dbd_.get(ord, &ref)); + ResidentDictBlock block; + SNII_RETURN_IF_ERROR( + open_dict_block(reader_, ref, tier_, has_positions_, &block.bytes, &block.reader)); + resident_dict_blocks_.push_back(std::move(block)); + } + return Status::OK(); +} + +Status LogicalIndexReader::dict_block_reader_for_ordinal(uint32_t ordinal, + OnDemandDictBlock* on_demand, + const DictBlockReader** out) const { + if (!resident_dict_blocks_.empty()) { + if (resident_dict_blocks_.size() != dbd_.n_blocks() || + ordinal >= resident_dict_blocks_.size()) { + return Status::Corruption("logical_index: incomplete resident dict"); + } + *out = &resident_dict_blocks_[ordinal].reader; + return Status::OK(); + } + + BlockRef ref {}; + SNII_RETURN_IF_ERROR(dbd_.get(ordinal, &ref)); + SNII_RETURN_IF_ERROR(open_dict_block(reader_, ref, tier_, has_positions_, &on_demand->bytes, + &on_demand->reader)); + *out = &on_demand->reader; + return Status::OK(); +} + +Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tier, + bool has_positions, Slice meta_block, LogicalIndexReader* out) { + if (file_reader == nullptr) { + return Status::InvalidArgument("logical_index: null file reader"); + } + if (out == nullptr) return Status::InvalidArgument("logical_index: null out"); + *out = LogicalIndexReader {}; + + out->reader_ = file_reader; + out->tier_ = tier; + out->has_positions_ = has_positions; + + SNII_RETURN_IF_ERROR(PerIndexMetaReader::open(meta_block, &out->meta_)); + SNII_RETURN_IF_ERROR( + SampledTermIndexReader::open(out->meta_.sampled_term_index_bytes(), &out->sti_)); + SNII_RETURN_IF_ERROR( + DictBlockDirectoryReader::open(out->meta_.dict_block_directory_bytes(), &out->dbd_)); + SNII_RETURN_IF_ERROR(out->load_resident_dict_blocks()); + + // Block-split bloom XFilter: derive the resident header from the section ref + // (offset+length) -- ZERO open-time I/O, the whole point of the on-demand + // design. 
The bitset starts at the constant offset section.offset + 28; one + // 32-byte block is read on demand per probe in lookup(). + const RegionRef& bsbf = out->meta_.section_refs().bsbf; + if (bsbf.length > 0) { + if (bsbf.length <= kBsbfHeaderSize) + return Status::Corruption("logical_index: bsbf section too small"); + const uint64_t num_bytes = bsbf.length - kBsbfHeaderSize; + const bool resident = bsbf.length <= bsbf_resident_max_bytes(); + // L0: read the WHOLE section (header + bitset) so probes are in-memory AND + // the bitset crc can be verified once. L1: read only the 28-byte header so + // open stays near-zero I/O; the on-demand single-block probe cannot verify + // a whole-bitset crc, so L1 relies on the storage layer's own integrity for + // the bitset body. Either way the header (magic/version/strategy/geometry + + // header crc) is parsed and verified -- BsbfHeader::parse rejects a corrupt + // header. + std::vector head; + SNII_RETURN_IF_ERROR( + file_reader->read_at(bsbf.offset, resident ? bsbf.length : kBsbfHeaderSize, &head)); + if (head.size() < kBsbfHeaderSize) + return Status::Corruption("logical_index: short bsbf header read"); + SNII_RETURN_IF_ERROR(snii::format::BsbfHeader::parse(Slice(head.data(), kBsbfHeaderSize), + bsbf.offset, &out->bsbf_header_)); + // Cross-check the header geometry against the section ref. 
+ if (out->bsbf_header_.num_bytes != num_bytes) + return Status::Corruption("logical_index: bsbf header/section size mismatch"); + out->has_bsbf_ = true; + if (resident) { + if (head.size() < bsbf.length) + return Status::Corruption("logical_index: short bsbf resident read"); + const Slice bitset(head.data() + kBsbfHeaderSize, out->bsbf_header_.num_bytes); + if (snii::crc32c(bitset) != out->bsbf_header_.bitset_crc) + return Status::Corruption("logical_index: bsbf bitset crc mismatch"); + out->bsbf_resident_bitset_.assign(bitset.data(), bitset.data() + bitset.size()); + out->bsbf_resident_ = true; + } + } + return Status::OK(); +} + +Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* entry, + uint64_t* frq_base, uint64_t* prx_base) const { + *found = false; + if (reader_ == nullptr) return Status::InvalidArgument("logical_index: not opened"); + + // 1. XFilter fast rejection. DEFINITELY-ABSENT returns empty without the + // DICT read. L0 probes the resident bitset; L1 reads one 32-byte block. + if (has_bsbf_) { + const uint64_t h = bsbf_hash(term); + bool maybe = false; + if (bsbf_resident_) { + // L0: in-memory probe of the resident bitset (no round). + const uint32_t blk = snii::format::bsbf_block_index(h, bsbf_header_.num_blocks); + maybe = snii::format::bsbf_block_contains( + h, + bsbf_resident_bitset_.data() + static_cast(blk) * kBsbfBytesPerBlock); + } else { + // L1: on-demand single-block probe. + SNII_RETURN_IF_ERROR(bsbf_probe(reader_, bsbf_header_, h, &maybe)); + } + if (!maybe) return Status::OK(); + } + + // 2. SampledTermIndex -> candidate block ordinal. + bool maybe = false; + uint32_t ordinal = 0; + SNII_RETURN_IF_ERROR(sti_.locate(term, &maybe, &ordinal)); + if (!maybe) return Status::OK(); + + // 3. Use a resident small-DICT block when present; otherwise read the DICT + // block on demand and parse it with the same validation path used at open. 
+ const DictBlockReader* br = nullptr; + OnDemandDictBlock on_demand; + SNII_RETURN_IF_ERROR(dict_block_reader_for_ordinal(ordinal, &on_demand, &br)); + + bool hit = false; + SNII_RETURN_IF_ERROR(br->find_term(term, &hit, entry)); + if (!hit) return Status::OK(); + + *found = true; + *frq_base = br->frq_base(); + *prx_base = br->prx_base(); + return Status::OK(); +} + +Status LogicalIndexReader::prefix_terms(std::string_view prefix, + std::vector* out) const { + if (out == nullptr) return Status::InvalidArgument("logical_index: null out"); + out->clear(); + if (reader_ == nullptr) return Status::InvalidArgument("logical_index: not opened"); + + // Seek the start block: the SampledTermIndex block whose first term <= prefix + // (terms with `prefix` are >= prefix, so they begin in that block or later). + // If the prefix sorts before every sample (or is empty), start at block 0. + uint32_t start = 0; + if (!prefix.empty()) { + bool maybe = false; + uint32_t ordinal = 0; + SNII_RETURN_IF_ERROR(sti_.locate(prefix, &maybe, &ordinal)); + if (maybe) start = ordinal; + } + + for (uint32_t ord = start; ord < dbd_.n_blocks(); ++ord) { + const DictBlockReader* br = nullptr; + OnDemandDictBlock on_demand; + SNII_RETURN_IF_ERROR(dict_block_reader_for_ordinal(ord, &on_demand, &br)); + std::vector entries; + SNII_RETURN_IF_ERROR(br->decode_all(&entries)); + + for (DictEntry& e : entries) { + const std::string_view t(e.term); + if (t < prefix) continue; // not yet at the prefix range + const bool has_prefix = + t.size() >= prefix.size() && t.compare(0, prefix.size(), prefix) == 0; + if (!has_prefix) return Status::OK(); // past the prefix range; sorted -> done + PrefixHit hit; + hit.term = e.term; + hit.entry = std::move(e); + hit.frq_base = br->frq_base(); + hit.prx_base = br->prx_base(); + out->push_back(std::move(hit)); + } + } + return Status::OK(); +} + +namespace { + +// Validates a pod_ref window locator against the posting region and returns the +// absolute window range 
(after the prelude). Rejects corrupt locators rather +// than letting size_t underflow / uint64 overflow reach read_at. +Status resolve_window(const snii::format::RegionRef& section, uint64_t base, uint64_t off_delta, + uint64_t total_len, uint64_t prelude_len, uint64_t* abs_off, uint64_t* len) { + if (prelude_len > total_len) { + return Status::Corruption("logical_index: prelude_len exceeds window len"); + } + const uint64_t in_region = base + off_delta; + if (in_region < base) return Status::Corruption("logical_index: locator overflow"); + if (in_region > section.length || total_len > section.length - in_region) { + return Status::Corruption("logical_index: window past posting region"); + } + *abs_off = section.offset + in_region + prelude_len; + *len = total_len - prelude_len; + return Status::OK(); +} + +} // namespace + +Status LogicalIndexReader::resolve_frq_window(const snii::format::DictEntry& entry, + uint64_t frq_base, uint64_t* abs_off, + uint64_t* len) const { + return resolve_window(section_refs().posting_region, frq_base, entry.frq_off_delta, + entry.frq_len, entry.prelude_len, abs_off, len); +} + +Status LogicalIndexReader::resolve_prx_window(const snii::format::DictEntry& entry, + uint64_t prx_base, uint64_t* abs_off, + uint64_t* len) const { + // .prx windows carry no prelude (prelude_len = 0); both spans live in the + // same posting region (prx span precedes frq span for the same term). 
+ return resolve_window(section_refs().posting_region, prx_base, entry.prx_off_delta, + entry.prx_len, 0, abs_off, len); +} + +} // namespace snii::reader diff --git a/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp b/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp new file mode 100644 index 00000000000000..41e6ba06800152 --- /dev/null +++ b/be/src/storage/index/snii/core/src/reader/snii_segment_reader.cpp @@ -0,0 +1,97 @@ +#include "snii/reader/snii_segment_reader.h" + +#include + +#include "snii/encoding/crc32c.h" +#include "snii/format/bootstrap_header.h" +#include "snii/format/format_constants.h" +#include "snii/format/per_index_meta.h" +#include "snii/format/stats_block.h" +#include "snii/format/tail_pointer.h" + +namespace snii::reader { + +using snii::format::BootstrapHeader; +using snii::format::IndexTier; +using snii::format::PerIndexMetaReader; +using snii::format::StatsBlock; +using snii::format::TailMetaRegionReader; +using snii::format::TailPointer; + +namespace { + +// Reads the bootstrap header from the front of the file and validates it. +Status ReadBootstrap(snii::io::FileReader* reader, BootstrapHeader* bh) { + std::vector buf; + SNII_RETURN_IF_ERROR(reader->read_at(0, snii::format::kBootstrapHeaderSize, &buf)); + return snii::format::decode_bootstrap_header(Slice(buf), bh); +} + +// Reads the fixed tail pointer (last tail_pointer_size() bytes) of the file. 
+Status ReadTailPointer(snii::io::FileReader* reader, TailPointer* tp) { + const size_t tp_size = snii::format::tail_pointer_size(); + const uint64_t total = reader->size(); + if (total < tp_size) { + return Status::Corruption("segment: file smaller than tail pointer"); + } + std::vector buf; + SNII_RETURN_IF_ERROR(reader->read_at(total - tp_size, tp_size, &buf)); + return snii::format::decode_tail_pointer(Slice(buf), tp); +} + +} // namespace + +Status SniiSegmentReader::open(snii::io::FileReader* reader, SniiSegmentReader* out) { + if (reader == nullptr) return Status::InvalidArgument("segment: null reader"); + if (out == nullptr) return Status::InvalidArgument("segment: null out"); + + BootstrapHeader bh; + SNII_RETURN_IF_ERROR(ReadBootstrap(reader, &bh)); + + TailPointer tp; + SNII_RETURN_IF_ERROR(ReadTailPointer(reader, &tp)); + if (tp.meta_region_length == 0) { + return Status::Corruption("segment: empty tail meta region"); + } + + out->reader_ = reader; + SNII_RETURN_IF_ERROR( + reader->read_at(tp.meta_region_offset, tp.meta_region_length, &out->meta_region_)); + // Verify the whole meta region against the tail pointer's checksum BEFORE parsing + // it. (TailMetaRegionReader::open also checks the region's own internal checksum; + // this is the read-boundary check that makes tp.meta_region_checksum meaningful and + // catches corruption before any framed sub-section is touched.) 
+ if (snii::crc32c(Slice(out->meta_region_)) != tp.meta_region_checksum) { + return Status::Corruption("segment: meta region checksum mismatch"); + } + return TailMetaRegionReader::open(Slice(out->meta_region_), &out->region_reader_); +} + +Status SniiSegmentReader::open_index(uint64_t index_id, std::string_view suffix, + LogicalIndexReader* out) const { + if (out == nullptr) return Status::InvalidArgument("segment: null index out"); + if (reader_ == nullptr) return Status::InvalidArgument("segment: not opened"); + + bool found = false; + Slice meta_bytes; + SNII_RETURN_IF_ERROR(region_reader_.find(index_id, suffix, &found, &meta_bytes)); + if (!found) return Status::NotFound("segment: logical index not found"); + + // Determine tier / positions capability from the per-index meta. Positions + // capability is read from the PERSISTED header flag (kHasPositions), NOT from + // any region length: after the frq/prx merge, posting_region.length is non-zero + // for ANY index with a pod_ref term -- docs-only included -- so a region-length + // heuristic would mis-classify a docs-only index as positional and make + // DictBlockReader::check_flags hard-fail. The "|| has_norms" is kept only as a + // defensive belt-and-suspenders (a scoring index always has positions). + PerIndexMetaReader meta; + SNII_RETURN_IF_ERROR(PerIndexMetaReader::open(meta_bytes, &meta)); + const bool has_norms = meta.section_refs().norms.length > 0; + const bool has_positions = meta.has_positions() || has_norms; + const IndexTier tier = + has_norms ? IndexTier::kT3 : (has_positions ? 
IndexTier::kT2 : IndexTier::kT1); + + return LogicalIndexReader::open(reader_, tier, has_positions, meta_bytes, out); +} + +} // namespace snii::reader diff --git a/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp b/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp new file mode 100644 index 00000000000000..1299660f0658a8 --- /dev/null +++ b/be/src/storage/index/snii/core/src/reader/windowed_posting.cpp @@ -0,0 +1,253 @@ +#include "snii/reader/windowed_posting.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_source.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/format/prx_pod.h" +#include "snii/io/batch_range_fetcher.h" + +namespace snii::reader { + +using snii::format::DictEntry; +using snii::format::FrqPreludeReader; +using snii::format::FrqRegionMeta; +using snii::format::WindowMeta; + +namespace { + +// Resolves the absolute file offset of the prelude bytes for a windowed entry. +// The frq span lives in the interleaved posting region (after the term's prx span). +uint64_t PreludeAbs(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base) { + const auto& region = idx.section_refs().posting_region; + return region.offset + frq_base + entry.frq_off_delta; +} + +// Validates that [off, off+len) fits within [0, total). +Status InBounds(uint64_t off, uint64_t len, uint64_t total) { + if (off > total || len > total - off) { + return Status::Corruption("windowed_posting: range out of section"); + } + return Status::OK(); +} + +// Block geometry of a windowed entry's grouped .frq payload (all offsets absolute). 
+struct BlockGeometry { + uint64_t dd_block_off = 0; // absolute start of the dd-block + uint64_t dd_block_len = 0; + uint64_t freq_block_off = 0; // absolute start of the freq-block + uint64_t freq_block_len = 0; + uint64_t frq_region_len = 0; // entry.frq_len - prelude_len (dd-block + freq-block) +}; + +// Derives the dd-block / freq-block absolute ranges from the entry + prelude, +// validating they tile the post-prelude .frq region exactly. +Status ResolveBlocks(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + const FrqPreludeReader& prelude, BlockGeometry* g) { + if (entry.prelude_len > entry.frq_len) { + return Status::Corruption("windowed_posting: prelude_len exceeds frq_len"); + } + const uint64_t frq_window_start = PreludeAbs(idx, entry, frq_base) + entry.prelude_len; + g->frq_region_len = entry.frq_len - entry.prelude_len; + g->dd_block_len = prelude.dd_block_len(); + g->freq_block_len = prelude.freq_block_len(); + // dd-block + freq-block must fit exactly within the post-prelude region. + if (g->dd_block_len > g->frq_region_len || + g->freq_block_len > g->frq_region_len - g->dd_block_len) { + return Status::Corruption("windowed_posting: blocks exceed frq region"); + } + g->dd_block_off = frq_window_start; + g->freq_block_off = frq_window_start + g->dd_block_len; + return Status::OK(); +} + +// Per-window decode state for the full-posting path. +struct WindowSlices { + WindowMeta meta; + Slice dd_region; + Slice freq_region; + Slice prx_window; +}; + +// Carves window w's dd (and freq when want_freq) sub-slices out of the fetched +// blocks, validating each locator against its block length. 
+Status CarveRegionSlices(const WindowMeta& m, Slice dd_block, Slice freq_block, bool want_freq, + WindowSlices* out) { + SNII_RETURN_IF_ERROR(InBounds(m.dd_off, m.dd_disk_len, dd_block.size())); + out->dd_region = + dd_block.subslice(static_cast(m.dd_off), static_cast(m.dd_disk_len)); + if (!want_freq) return Status::OK(); + SNII_RETURN_IF_ERROR(InBounds(m.freq_off, m.freq_disk_len, freq_block.size())); + out->freq_region = freq_block.subslice(static_cast(m.freq_off), + static_cast(m.freq_disk_len)); + return Status::OK(); +} + +// Decodes window w from the fetched blocks (+ optional prx slice) and appends to out. +Status AppendWindow(const WindowSlices& ws, bool want_positions, bool want_freq, + DecodedPosting* out) { + std::vector docids, freqs; + std::vector> pos; + SNII_RETURN_IF_ERROR(decode_window_slices(ws.meta, ws.dd_region, ws.freq_region, ws.prx_window, + want_positions, want_freq, &docids, &freqs, &pos)); + out->docids.insert(out->docids.end(), docids.begin(), docids.end()); + out->freqs.insert(out->freqs.end(), freqs.begin(), freqs.end()); + if (want_positions) { + for (auto& v : pos) out->positions.push_back(std::move(v)); + } + return Status::OK(); +} + +} // namespace + +Status fetch_windowed_prelude(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, FrqPreludeReader* prelude) { + if (entry.prelude_len == 0) { + return Status::Corruption("windowed_posting: windowed entry has no prelude"); + } + if (entry.prelude_len > entry.frq_len) { + return Status::Corruption("windowed_posting: prelude_len exceeds frq_len"); + } + const uint64_t prelude_abs = PreludeAbs(idx, entry, frq_base); + snii::io::BatchRangeFetcher fetcher(idx.reader()); + const size_t h = fetcher.add(prelude_abs, entry.prelude_len); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + return FrqPreludeReader::open(fetcher.get(h), prelude); +} + +Status windowed_window_range(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, const 
FrqPreludeReader& prelude, + uint32_t w, bool want_positions, bool want_freq, WindowAbsRange* out) { + if (out == nullptr) return Status::InvalidArgument("windowed_posting: null range"); + *out = WindowAbsRange {}; + BlockGeometry g; + SNII_RETURN_IF_ERROR(ResolveBlocks(idx, entry, frq_base, prelude, &g)); + WindowMeta meta; + SNII_RETURN_IF_ERROR(prelude.window(w, &meta)); + + // dd sub-range within the dd-block. + SNII_RETURN_IF_ERROR(InBounds(meta.dd_off, meta.dd_disk_len, g.dd_block_len)); + out->dd_off = g.dd_block_off + meta.dd_off; + out->dd_len = meta.dd_disk_len; + + if (want_freq) { + SNII_RETURN_IF_ERROR(InBounds(meta.freq_off, meta.freq_disk_len, g.freq_block_len)); + out->freq_off = g.freq_block_off + meta.freq_off; + out->freq_len = meta.freq_disk_len; + } + + if (!want_positions) return Status::OK(); + if (!prelude.has_prx()) { + return Status::Corruption("windowed_posting: positions requested but prelude has none"); + } + const uint64_t prx_region_start = + idx.section_refs().posting_region.offset + prx_base + entry.prx_off_delta; + SNII_RETURN_IF_ERROR(InBounds(meta.prx_off, meta.prx_len, entry.prx_len)); + out->prx_off = prx_region_start + meta.prx_off; + out->prx_len = meta.prx_len; + return Status::OK(); +} + +Status decode_window_slices(const WindowMeta& meta, Slice dd_region, Slice freq_region, + Slice prx_window, bool want_positions, bool want_freq, + std::vector* docids, std::vector* freqs, + std::vector>* positions) { + FrqRegionMeta dd_meta; + dd_meta.zstd = meta.dd_zstd; + dd_meta.uncomp_len = meta.dd_uncomp_len; + dd_meta.disk_len = meta.dd_disk_len; + dd_meta.crc = meta.crc_dd; + dd_meta.verify_crc = meta.verify_crc; + SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd_region, dd_meta, meta.win_base, docids)); + if (docids->size() != meta.doc_count) { + return Status::Corruption("windowed_posting: frq doc_count mismatch"); + } + if (want_freq) { + FrqRegionMeta freq_meta; + freq_meta.zstd = meta.freq_zstd; + freq_meta.uncomp_len = 
meta.freq_uncomp_len; + freq_meta.disk_len = meta.freq_disk_len; + freq_meta.crc = meta.crc_freq; + freq_meta.verify_crc = meta.verify_crc; + SNII_RETURN_IF_ERROR( + snii::format::decode_freq_region(freq_region, freq_meta, meta.doc_count, freqs)); + } else { + freqs->clear(); + } + if (!want_positions) return Status::OK(); + + ByteSource psrc(prx_window); + SNII_RETURN_IF_ERROR(snii::format::read_prx_window(&psrc, positions)); + if (positions->size() != docids->size()) { + return Status::Corruption("windowed_posting: prx/frq doc-count mismatch"); + } + return Status::OK(); +} + +namespace { + +// Fetches the dd-block (always), the freq-block (when want_freq) and the whole .prx +// region (when want_positions) of a windowed entry in ONE batch and returns the +// in-memory block slices. The dd-block is a single contiguous range -> the +// docid-only / phrase path reads it as one Range GET (the byte-saving core). +Status FetchBlocks(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t prx_base, + const BlockGeometry& g, bool want_positions, bool want_freq, + snii::io::BatchRangeFetcher* fetcher, size_t* dd_h, size_t* freq_h, + size_t* prx_h) { + *dd_h = fetcher->add(g.dd_block_off, g.dd_block_len); + if (want_freq) { + *freq_h = fetcher->add(g.freq_block_off, g.freq_block_len); + } + if (want_positions) { + const uint64_t prx_region_start = + idx.section_refs().posting_region.offset + prx_base + entry.prx_off_delta; + *prx_h = fetcher->add(prx_region_start, entry.prx_len); + } + return fetcher->fetch(); +} + +} // namespace + +Status read_windowed_posting(const LogicalIndexReader& idx, const DictEntry& entry, + uint64_t frq_base, uint64_t prx_base, bool want_positions, + bool want_freq, DecodedPosting* out) { + if (out == nullptr) { + return Status::InvalidArgument("windowed_posting: null out"); + } + *out = DecodedPosting {}; + + FrqPreludeReader prelude; + SNII_RETURN_IF_ERROR(fetch_windowed_prelude(idx, entry, frq_base, &prelude)); + if (want_positions 
&& !prelude.has_prx()) { + return Status::Corruption("windowed_posting: positions requested but prelude has none"); + } + BlockGeometry g; + SNII_RETURN_IF_ERROR(ResolveBlocks(idx, entry, frq_base, prelude, &g)); + + snii::io::BatchRangeFetcher fetcher(idx.reader()); + size_t dd_h = 0, freq_h = 0, prx_h = 0; + SNII_RETURN_IF_ERROR(FetchBlocks(idx, entry, prx_base, g, want_positions, want_freq, &fetcher, + &dd_h, &freq_h, &prx_h)); + const Slice dd_block = fetcher.get(dd_h); + const Slice freq_block = want_freq ? fetcher.get(freq_h) : Slice(); + const Slice prx_region = want_positions ? fetcher.get(prx_h) : Slice(); + + const uint32_t n = prelude.n_windows(); + for (uint32_t w = 0; w < n; ++w) { + WindowSlices ws; + SNII_RETURN_IF_ERROR(prelude.window(w, &ws.meta)); + SNII_RETURN_IF_ERROR(CarveRegionSlices(ws.meta, dd_block, freq_block, want_freq, &ws)); + if (want_positions) { + SNII_RETURN_IF_ERROR(InBounds(ws.meta.prx_off, ws.meta.prx_len, prx_region.size())); + ws.prx_window = prx_region.subslice(static_cast(ws.meta.prx_off), + static_cast(ws.meta.prx_len)); + } + SNII_RETURN_IF_ERROR(AppendWindow(ws, want_positions, want_freq, out)); + } + return Status::OK(); +} + +} // namespace snii::reader diff --git a/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp b/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp new file mode 100644 index 00000000000000..f4457c96273f40 --- /dev/null +++ b/be/src/storage/index/snii/core/src/stats/snii_stats_provider.cpp @@ -0,0 +1,93 @@ +#include "snii/stats/snii_stats_provider.h" + +#include +#include + +#include "snii/common/slice.h" +#include "snii/format/dict_entry.h" +#include "snii/format/format_constants.h" +#include "snii/format/stats_block.h" +#include "snii/io/batch_range_fetcher.h" + +namespace snii::stats { + +using snii::format::DictEntry; +using snii::format::NormsPodReader; +using snii::format::RegionRef; + +namespace { + +// Resolves a term's DictEntry. 
*found=false for an absent term (OK status). +Status LookupEntry(const snii::reader::LogicalIndexReader& idx, std::string_view term, bool* found, + DictEntry* entry) { + uint64_t frq_base = 0; + uint64_t prx_base = 0; + return idx.lookup(term, found, entry, &frq_base, &prx_base); +} + +} // namespace + +Status SniiStatsProvider::open(const snii::reader::LogicalIndexReader* idx, + SniiStatsProvider* out) { + if (idx == nullptr || out == nullptr) { + return Status::InvalidArgument("stats_provider: null argument"); + } + out->idx_ = idx; + const auto& sb = idx->stats(); + out->doc_count_ = sb.doc_count; + out->indexed_doc_count_ = sb.indexed_doc_count; + out->sum_total_term_freq_ = sb.sum_total_term_freq; + + const RegionRef& norms = idx->section_refs().norms; + if (norms.length == 0) { + out->has_norms_ = false; + return Status::OK(); + } + + snii::io::BatchRangeFetcher fetcher(idx->reader()); + const size_t h = fetcher.add(norms.offset, norms.length); + SNII_RETURN_IF_ERROR(fetcher.fetch()); + Slice framed = fetcher.get(h); + out->norms_bytes_.assign(framed.data(), framed.data() + framed.size()); + SNII_RETURN_IF_ERROR(NormsPodReader::open(Slice(out->norms_bytes_), &out->norms_reader_)); + out->has_norms_ = true; + return Status::OK(); +} + +double SniiStatsProvider::avgdl() const { + const uint64_t denom = std::max(1, indexed_doc_count_); + return static_cast(sum_total_term_freq_) / static_cast(denom); +} + +Status SniiStatsProvider::doc_freq(std::string_view term, uint64_t* df) const { + if (df == nullptr) return Status::InvalidArgument("stats_provider: null df"); + *df = 0; + bool found = false; + DictEntry entry; + SNII_RETURN_IF_ERROR(LookupEntry(*idx_, term, &found, &entry)); + if (found) *df = entry.df; + return Status::OK(); +} + +Status SniiStatsProvider::total_term_freq(std::string_view term, uint64_t* ttf) const { + if (ttf == nullptr) return Status::InvalidArgument("stats_provider: null ttf"); + *ttf = 0; + bool found = false; + DictEntry entry; + 
SNII_RETURN_IF_ERROR(LookupEntry(*idx_, term, &found, &entry)); + if (!found) return Status::OK(); + // tier>=T2 entries carry the total term frequency directly in ttf_delta (the + // LogicalIndexWriter stores ttf there, not a delta from df). + *ttf = entry.ttf_delta; + return Status::OK(); +} + +Status SniiStatsProvider::encoded_norm(uint32_t docid, uint8_t* out) const { + if (out == nullptr) return Status::InvalidArgument("stats_provider: null out"); + if (!has_norms_) { + return Status::InvalidArgument("stats_provider: index has no norms"); + } + return norms_reader_.try_encoded_norm(docid, out); +} + +} // namespace snii::stats diff --git a/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp b/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp new file mode 100644 index 00000000000000..a6ce29aee03557 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/compact_posting_pool.cpp @@ -0,0 +1,155 @@ +#include "snii/writer/compact_posting_pool.h" + +#include +#include +#include + +namespace snii::writer { + +// Gentle (~1.5x) many-level payload-capacity schedule. Starting at 5 bytes with a +// slow ramp keeps the over-allocated FINAL slice small for the millions of low-df +// terms (the dominant arena-overhead source) while still reaching multi-KiB slices +// for high-df chains in a bounded number of hops (so the per-slice 4-byte forward +// pointer stays a small fraction of a large chain's bytes). 
+const uint32_t CompactPostingPool::kSliceSizes[kLevelCount] = { + 5, 8, 12, 18, 27, 40, 60, 90, 135, 202, 303, 455, 683, 1024, 1536, 2304}; +const uint8_t CompactPostingPool::kNextLevel[kLevelCount] = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 15}; + +CompactPostingPool::CompactPostingPool() = default; + +uint32_t CompactPostingPool::kSliceSizes_level0() { + return kSliceSizes[0]; +} + +uint32_t CompactPostingPool::kSliceSize_at(int level) { + return kSliceSizes[level]; +} + +uint8_t CompactPostingPool::kNextLevel_at(int level) { + return kNextLevel[level]; +} + +void CompactPostingPool::reset() { + std::vector>().swap(blocks_); + next_offset_ = 0; + payload_bytes_ = 0; +} + +uint32_t CompactPostingPool::alloc_run(uint32_t bytes) { + const uint32_t in_block = next_offset_ & kBlockMask; + // A fresh block is needed when (a) there is no tail block yet, (b) the run does + // not fit in the current tail block's remaining space, or (c) next_offset_ sits + // exactly on a block boundary whose block has not been allocated (a previous run + // that exactly filled the tail leaves next_offset_ == blocks_.size()*kBlockSize, + // so in_block == 0 must NOT be mistaken for an empty fresh block). + const bool tail_exists = (next_offset_ >> kBlockShift) < blocks_.size(); + const bool need_block = !tail_exists || in_block + bytes > kBlockSize; + // Hard invariant (see arena_bytes()): the uint32 offset must never wrap. The spimi + // accumulator force-spills below 4 GiB, but enforce it here too -- in release as + // well as debug -- so any direct user of the pool fails loudly instead of silently + // aliasing block 0. We are a library: throw and let the caller decide how to + // handle it, rather than aborting the process. The run starts either in the + // current tail or at a new block's base; compute that start in 64 bits before the + // uint32 arithmetic can wrap. + const uint64_t run_start = + need_block ? 
static_cast(blocks_.size()) * kBlockSize : next_offset_; + if (run_start + bytes > UINT32_MAX) { + throw std::overflow_error( + "snii: CompactPostingPool arena exceeded the 4 GiB uint32 offset limit; " + "the caller must spill before this point"); + } + if (need_block) { + blocks_.emplace_back(kBlockSize, 0); + next_offset_ = static_cast((blocks_.size() - 1) * kBlockSize); + } + const uint32_t off = next_offset_; + next_offset_ += bytes; + return off; +} + +uint32_t CompactPostingPool::alloc_slice(int level, uint32_t* slice_end) { + const uint32_t cap = kSliceSizes[level]; + const uint32_t first = alloc_run(cap + kPtrBytes); + *slice_end = first + cap; + // Zero the forward pointer so a not-yet-extended tail slice reads next_head == 0. + std::memset(at(*slice_end), 0, kPtrBytes); + return first; +} + +uint32_t CompactPostingPool::read_ptr(uint32_t slice_end) const { + uint32_t v; + std::memcpy(&v, at(slice_end), sizeof(v)); + return v; +} + +void CompactPostingPool::write_ptr(uint32_t slice_end, uint32_t next_head) { + std::memcpy(at(slice_end), &next_head, sizeof(next_head)); +} + +uint32_t CompactPostingPool::start_chain(SliceWriter* w, uint8_t* level) { + *level = 0; + const uint32_t head = alloc_slice(0, &w->slice_end); + w->cur = head; + return head; +} + +void CompactPostingPool::append_byte(SliceWriter* w, uint8_t* level, uint8_t value) { + if (w->cur == w->slice_end) { + // Current slice payload region is full: grow the chain with a larger slice and + // record the link in the old slice's trailing pointer bytes. 
+ const uint8_t next_level = kNextLevel[*level]; + uint32_t new_end = 0; + const uint32_t new_head = alloc_slice(next_level, &new_end); + write_ptr(w->slice_end, new_head); + *level = next_level; + w->cur = new_head; + w->slice_end = new_end; + } + *at(w->cur) = value; + ++w->cur; + ++payload_bytes_; +} + +CompactPostingPool::Cursor::Cursor(const CompactPostingPool* pool, uint32_t head, uint64_t budget) + : pool_(pool), cur_(head), level_(0), budget_(budget) { + // The first slice is level 0; its payload region ends kSliceSizes[0] bytes in. + slice_end_ = head + CompactPostingPool::kSliceSizes[0]; +} + +bool CompactPostingPool::Cursor::has_next() const { + if (budget_ == 0) return false; + // At a slice boundary, the chain continues only if the forward pointer is non-zero; + // a zero pointer is the tail marker (offset 0 is never a valid next-slice head). Peek + // it so has_next() never reports a phantom byte that next() would have to fabricate. + if (cur_ == slice_end_) return pool_->read_ptr(slice_end_) != 0; + return true; +} + +uint8_t CompactPostingPool::Cursor::next() { + // Budget guard: the caller's stated upper bound is spent -- yield nothing more. + if (budget_ == 0) return 0; + if (cur_ == slice_end_) { + // Reached this slice's payload boundary. Follow the forward pointer to the next + // slice -- UNLESS it is zero, which marks the CHAIN TAIL (offset 0 is always the + // pool's very first slice, never a valid *next*-slice head, so a zero pointer is + // unambiguously "no more slices"). Without this tail check, an over-reading caller + // would follow the zero pointer to offset 0 and alias block 0's bytes (or read an + // unallocated block) -- UB. Stopping here makes the cursor self-terminating and + // safe regardless of how large a budget the caller passed. 
+ const uint32_t next_head = pool_->read_ptr(slice_end_); + if (next_head == 0) { + budget_ = 0; // chain exhausted: no further bytes exist + return 0; + } + level_ = CompactPostingPool::kNextLevel[level_]; + cur_ = next_head; + slice_end_ = next_head + CompactPostingPool::kSliceSizes[level_]; + } + const uint8_t v = *pool_->at(cur_); + ++cur_; + --budget_; + return v; +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp b/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp new file mode 100644 index 00000000000000..8cbf1de2eee0d3 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/logical_index_writer.cpp @@ -0,0 +1,686 @@ +#include "snii/writer/logical_index_writer.h" + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/encoding/crc32c.h" +#include "snii/encoding/zstd_codec.h" +#include "snii/format/bsbf.h" +#include "snii/format/dict_block.h" +#include "snii/format/dict_block_directory.h" +#include "snii/format/frq_pod.h" +#include "snii/format/frq_prelude.h" +#include "snii/format/norms_pod.h" +#include "snii/format/null_bitmap.h" +#include "snii/format/prx_pod.h" + +namespace snii::writer { + +using snii::format::BlockRef; +using snii::format::DictBlockBuilder; +using snii::format::DictBlockDirectoryBuilder; +using snii::format::DictEntry; +using snii::format::DictEntryEnc; +using snii::format::DictEntryKind; +using snii::format::FrqPreludeColumns; +using snii::format::PerIndexMetaBuilder; +using snii::format::SampledTermIndexBuilder; +using snii::format::SectionRefs; +using snii::format::WindowMeta; + +namespace { + +// Target false-positive probability for the block-split bloom XFilter. Sizes +// the filter via Parquet OptimalNumOfBytes; L0 keeps the probe in memory and L1 +// keeps the per-query cost at one 32-byte block. +constexpr double kBsbfFpp = 0.01; +// Zstd "auto" sentinel for window builders (raw for tiny payloads). 
+constexpr int kAutoZstd = -1; +// Force-raw level for .frq dd/freq regions. Their plaintext is PFOR-bit-packed +// doc-deltas/freqs -- already high-entropy, so zstd shrinks ~30 MB of input by +// <0.1 MiB while burning ~0.4s CPU (and an extra crc pass over the compressed +// bytes) at 5M. We force raw here and keep zstd only on .prx (which compresses +// ~77%). Output stays self-describing: the region meta records zstd=false. +constexpr int kRawFrqRegion = 0; +// Windows per super-block in the two-level prelude directory (design section +// 5). +constexpr uint32_t kPreludeGroupSize = 64; +// zstd level for whole-DICT-block compression. Level 3 (zstd default) +// compresses the 64KiB front-coded term-key + entry-meta + inline-posting +// blocks ~40% at ~120 MiB/s encode / ~600 MiB/s decode -- a large size win for +// a small build-CPU cost, and a per-lookup decode (~0.1ms/64KiB) that is +// dominated by the S3 round trip it shrinks. Higher levels gain <1% here for +// materially more CPU. +constexpr int kDictBlockZstdLevel = 3; + +using snii::format::FrqRegionMeta; + +// Encodes one window's dd region (and freq region when has_freq) into separate +// buffers, returning their codec metadata. The dd region is the docs-only data; +// the freq region is the skippable suffix. Used for both the grouped windowed +// layout (regions concatenated into posting-level blocks) and the single-window +// slim/inline layout ([dd_region][freq_region]). 
+Status EncodeRegions(std::span docids, std::span freqs, + uint64_t win_base, bool has_freq, std::vector* dd_out, + FrqRegionMeta* dd_meta, std::vector* freq_out, + FrqRegionMeta* freq_meta) { + ByteSink dd_sink; + SNII_RETURN_IF_ERROR( + snii::format::build_dd_region(docids, win_base, kRawFrqRegion, &dd_sink, dd_meta)); + *dd_out = dd_sink.take(); + if (!has_freq) { + *freq_out = std::vector(); + *freq_meta = FrqRegionMeta {}; + return Status::OK(); + } + ByteSink freq_sink; + SNII_RETURN_IF_ERROR( + snii::format::build_freq_region(freqs, kRawFrqRegion, &freq_sink, freq_meta)); + *freq_out = freq_sink.take(); + return Status::OK(); +} + +// Reusable per-window scratch for the windowed builder. Each ByteSink RETAINS +// its capacity across windows (clear(), not re-construct), so encoding a +// high-df term split into thousands of windows allocates the scratch ONCE +// instead of churning thousands of small buffers (which fragment the heap arena +// and raise peak RSS). +struct WindowScratch { + ByteSink dd_sink; + ByteSink freq_sink; + ByteSink prx_sink; +}; + +// Encodes one window's dd (and freq) region into the scratch sinks and appends +// the bytes directly to the grouped blocks via LayoutWindowRegions. Reuses the +// sinks. +Status EncodeRegionsInto(WindowScratch* sc, std::span docids, + std::span freqs, uint64_t win_base, bool has_freq, + FrqRegionMeta* dd_meta, FrqRegionMeta* freq_meta) { + sc->dd_sink.clear(); + SNII_RETURN_IF_ERROR( + snii::format::build_dd_region(docids, win_base, kRawFrqRegion, &sc->dd_sink, dd_meta)); + if (has_freq) { + sc->freq_sink.clear(); + SNII_RETURN_IF_ERROR( + snii::format::build_freq_region(freqs, kRawFrqRegion, &sc->freq_sink, freq_meta)); + } else { + *freq_meta = FrqRegionMeta {}; + } + return Status::OK(); +} + +// Builds a single .prx window directly from a FLAT positions slice + its +// parallel freqs slice (doc d owns the next freqs[d] entries). 
Byte-identical +// to building from per-doc vectors, but with NO vector-of-vectors +// materialization: the writer indexes straight into the term's flat positions +// buffer. +Status MakePrxWindow(std::span positions_flat, std::span freqs, + std::vector* out) { + ByteSink sink; + SNII_RETURN_IF_ERROR( + snii::format::build_prx_window_flat(positions_flat, freqs, kAutoZstd, &sink)); + *out = sink.take(); + return Status::OK(); +} + +uint32_t MaxOf(std::span v) { + uint32_t m = 0; + for (uint32_t x : v) { + if (x > m) m = x; + } + return m; +} + +uint64_t SumOf(const std::vector& v) { + uint64_t s = 0; + for (uint32_t x : v) s += x; + return s; +} + +// Computes a window's WAND max_norm: the encoded norm yielding the LARGEST BM25 +// length contribution (smallest length penalty), i.e. the SMALLEST encoded norm +// among the window's docs (smaller dl => higher score). When norms are +// unavailable (no scoring), returns 0 -- decode_norm(0)=1.0 is the smallest +// possible dl, giving a correct (loosest) upper bound. +uint8_t WindowMaxNorm(const std::vector& norms, std::span docs) { + if (norms.empty() || docs.empty()) return 0; + uint8_t best = 0xFF; // decode_norm uses the byte directly; min byte => max score + for (uint32_t docid : docs) { + if (docid >= norms.size()) continue; // defensive: out-of-range doc has no norm + if (norms[docid] < best) best = norms[docid]; + } + return best == 0xFF ? 0 : best; +} + +// Window doc count by df: high-df windowed terms combine kFrqBaseUnit units +// into larger (kAdaptiveWindowDocs) windows; both are whole multiples of the +// base unit so .prx alignment and win_base/last_docid semantics are preserved. +uint32_t AdaptiveWindowDocs(uint32_t df) { + return df >= snii::format::kAdaptiveWindowDfThreshold ? snii::format::kAdaptiveWindowDocs + : snii::format::kFrqBaseUnit; +} + +// Builds the two-level .frq prelude for a windowed term and returns its bytes. 
+Status BuildPrelude(const std::vector& windows, bool has_freq, bool has_prx, + std::vector* out) { + FrqPreludeColumns cols; + cols.has_freq = has_freq; + cols.has_prx = has_prx; + cols.group_size = kPreludeGroupSize; + cols.windows = windows; + ByteSink sink; + SNII_RETURN_IF_ERROR(snii::format::build_frq_prelude(cols, &sink)); + *out = sink.take(); + return Status::OK(); +} + +void AppendBytes(std::vector* dst, const std::vector& src) { + dst->insert(dst->end(), src.begin(), src.end()); +} + +// One windowed term's grouped .frq layout (design 1.6): all dd regions form the +// dd-block, all freq regions form the freq-block. The final frq span is +// [prelude][dd-block][freq-block]. The .prx windows are STREAMED straight to +// the posting sink (the container output) during pass 1 (not buffered here) -- +// so the widest term's ~tens-of-MiB prx bytes never co-exist with the dd/freq +// blocks at peak RSS; only prx_total_len (the entry's prx byte span) is +// tracked. Per-window metadata (region offsets/lens/modes/crcs, prx_off within +// the entry) is recorded for the prelude. +struct WindowedPosting { + std::vector dd_block; // dd_region_0 ++ dd_region_1 ++ ... + std::vector freq_block; // freq_region_0 ++ ... (empty if !has_freq) + uint64_t prx_total_len = 0; // total .prx bytes streamed for this entry + std::vector windows; +}; + +// Fills a window's region locator fields in m from its dd/freq region metas and +// the running dd-block / freq-block offsets, then appends the region bytes to +// the blocks. has_freq controls whether the freq region is laid out. 
+void LayoutWindowRegions(const FrqRegionMeta& dd_meta, const std::vector& dd_bytes, + const FrqRegionMeta& freq_meta, const std::vector& freq_bytes, + bool has_freq, WindowedPosting* out, WindowMeta* m) { + m->dd_zstd = dd_meta.zstd; + m->dd_off = static_cast(out->dd_block.size()); + m->dd_disk_len = dd_meta.disk_len; + m->dd_uncomp_len = dd_meta.uncomp_len; + m->crc_dd = dd_meta.crc; + AppendBytes(&out->dd_block, dd_bytes); + if (!has_freq) return; + m->freq_zstd = freq_meta.zstd; + m->freq_off = static_cast(out->freq_block.size()); + m->freq_disk_len = freq_meta.disk_len; + m->freq_uncomp_len = freq_meta.uncomp_len; + m->crc_freq = freq_meta.crc; + AppendBytes(&out->freq_block, freq_bytes); +} + +// Splits a windowed term's postings into base-unit-aligned windows (size chosen +// by df via AdaptiveWindowDocs). Each window's dd/freq regions are encoded +// separately and grouped: all dd regions into the dd-block, all freq regions +// into the freq-block. Records per-window region metadata + WAND max_norm. +// +// TWO-PASS, MEMORY-AWARE: the widest term (df in the millions) is the dominant +// merge-phase peak-RSS source -- its flat positions_flat alone is tens of MiB +// and would otherwise co-exist with the encoded output blocks at the peak +// moment. +// pass 1 (prx): builds every window's .prx bytes, then FREES positions_flat +// (the single largest source array) before any dd/freq block +// grows. +// pass 2 (dd/freq): encodes the dd/freq regions from docids/freqs only. +// `tp` is taken by mutable reference; positions_flat is freed after pass 1 and +// docids/freqs are freed by the caller after this returns. Output bytes are +// byte-identical to the single-pass build (regions/prelude/prx are +// independent). 
+Status BuildWindowedPosting(TermPostings& tp, bool has_freq, bool has_prx, + const std::vector& norms, snii::io::FileWriter* posting_out, + WindowedPosting* out) { + const uint32_t unit = AdaptiveWindowDocs(static_cast(tp.docids.size())); + const size_t n = tp.docids.size(); + const std::span all_docs(tp.docids); + const std::span all_freqs(tp.freqs); + + // Size the per-term transient blocks up front so a very-high-df term (split + // into thousands of windows, dd/freq blocks of MiB) does not grow them by + // geometric doubling -- which would briefly hold the old+new buffer + // co-resident at the build peak. windows count is exact; dd/freq use a + // conservative byte/doc upper estimate (PFOR-packed deltas are typically <= 2 + // B/doc). Slack is freed when the term ends. + out->windows.reserve((n + unit - 1) / unit); + out->dd_block.reserve(n * 2); + if (has_freq) out->freq_block.reserve(n); + + WindowScratch sc; // reused across all windows (no per-window allocation churn) + + // ---- pass 1: prx (STREAMED to the output) + window skeleton ---- + // Each window's .prx bytes are appended straight to the posting sink + // (container output) as they are built, so the entry's full prx payload (tens + // of MiB for the widest term) is never buffered in RAM alongside the dd/freq + // blocks that pass 2 grows. m.prx_off is the byte offset WITHIN this entry's + // prx span (running prx_total_len), matching the reader's prx_off_delta + + // meta.prx_off contract. + { + // Positions come either from the flat buffer or, for very-high-df terms, + // from a sequential pump (so the term's full positions are never + // materialized). Both yield the SAME positions in the SAME order, so the + // prx bytes are identical. 
+ const bool streamed = static_cast(tp.pos_pump); + const std::span all_pos(tp.positions_flat); + std::vector win_pos_buf; // reused per window when streaming + uint64_t win_base = 0; + size_t pos_off = 0; + for (size_t start = 0; start < n; start += unit) { + const size_t len = std::min(unit, n - start); + const auto docs = all_docs.subspan(start, len); + const auto freqs = all_freqs.subspan(start, len); + WindowMeta m; + m.last_docid = docs.back(); + m.win_base = win_base; + m.doc_count = static_cast(docs.size()); + m.max_freq = MaxOf(freqs); + m.max_norm = WindowMaxNorm(norms, docs); + size_t win_pos = 0; + for (uint32_t f : freqs) win_pos += f; + if (has_prx) { + std::span pos_span; + if (streamed) { + win_pos_buf.resize(win_pos); + if (win_pos != 0) tp.pos_pump(win_pos_buf.data(), win_pos); + pos_span = std::span(win_pos_buf); + } else { + pos_span = all_pos.subspan(pos_off, win_pos); + } + sc.prx_sink.clear(); + SNII_RETURN_IF_ERROR(snii::format::build_prx_window_flat(pos_span, freqs, kAutoZstd, + &sc.prx_sink)); + m.prx_off = out->prx_total_len; + m.prx_len = static_cast(sc.prx_sink.size()); + SNII_RETURN_IF_ERROR(posting_out->append(sc.prx_sink.view())); + out->prx_total_len += m.prx_len; + } + pos_off += win_pos; + out->windows.push_back(m); + win_base = m.last_docid; + } + } + // Positions are fully consumed; free the largest source array before pass 2 + // grows the dd/freq blocks, so the source positions never co-exist with them. 
+ std::vector().swap(tp.positions_flat); + + // ---- pass 2: dd (and freq) regions from docids/freqs only ---- + uint64_t win_base = 0; + size_t wi = 0; + for (size_t start = 0; start < n; start += unit, ++wi) { + const size_t len = std::min(unit, n - start); + const auto docs = all_docs.subspan(start, len); + const auto freqs = all_freqs.subspan(start, len); + FrqRegionMeta dd_meta, freq_meta; + SNII_RETURN_IF_ERROR( + EncodeRegionsInto(&sc, docs, freqs, win_base, has_freq, &dd_meta, &freq_meta)); + LayoutWindowRegions(dd_meta, sc.dd_sink.buffer(), freq_meta, sc.freq_sink.buffer(), + has_freq, out, &out->windows[wi]); + win_base = out->windows[wi].last_docid; + } + return Status::OK(); +} + +} // namespace + +LogicalIndexWriter::LogicalIndexWriter(const SniiIndexInput& in) + : index_id_(in.index_id), + index_suffix_(in.index_suffix), + tier_(snii::format::tier_of(in.config)), + has_prx_(snii::format::has_positions(in.config)), + has_freq_(snii::format::tier_of(in.config) >= snii::format::IndexTier::kT2), + has_norms_(snii::format::has_scoring(in.config)), + doc_count_(in.doc_count), + null_docids_(in.null_docids), + terms_(in.terms), + term_source_(in.term_source), + encoded_norms_(in.encoded_norms), + target_dict_block_bytes_(in.target_dict_block_bytes != 0 + ? in.target_dict_block_bytes + : snii::format::kDefaultTargetDictBlockBytes), + // No independent dict cap: the dict spills via the writer's UNIFIED + // gate-2 cap (in.mem_reporter->over_cap()); UINT64_MAX disables the local + // per-buffer cap. 
+ dict_buf_(UINT64_MAX, "dict", in.mem_reporter) {} + +Status LogicalIndexWriter::validate_term(const TermPostings& tp) const { + if (tp.freqs.size() != tp.docids.size()) { + return Status::InvalidArgument("logical_index: freqs length must equal docids"); + } + if (has_prx_) { + uint64_t total_pos = 0; + for (uint32_t f : tp.freqs) total_pos += f; + // Streamed positions (pos_pump set): validate against the declared + // pos_total (positions_flat is intentionally empty). Otherwise validate the + // flat buffer. + const uint64_t have = tp.pos_pump ? tp.pos_total : tp.positions_flat.size(); + if (total_pos != have) { + return Status::InvalidArgument("logical_index: positions count must equal sum(freqs)"); + } + } + for (size_t i = 1; i < tp.docids.size(); ++i) { + if (tp.docids[i] <= tp.docids[i - 1]) { + return Status::InvalidArgument("logical_index: docids must be strictly ascending"); + } + } + return Status::OK(); +} + +// Emits a windowed term: splits into base-unit windows, encodes each window's +// dd/freq regions separately, groups them at posting level, builds a two-level +// prelude, and lays out [prx span][prelude][dd-block][freq-block] CONTIGUOUSLY +// in the single posting region (prx span first, then the frq span). Sets +// enc=windowed + has_sb. frq_docs_len = prelude_len + dd_block_len is the +// contiguous docs-only prefix, which stays INSIDE the frq span. +Status LogicalIndexWriter::build_windowed_entry(TermPostings& tp, uint64_t frq_base, + uint64_t prx_base, DictEntry* e) { + // The prx span starts here: pass 1 streams each .prx window straight into + // the posting sink, so prx_off_delta is measured against the live + // posting-sink size. + const uint64_t prx_off = posting_size(); + WindowedPosting wp; + SNII_RETURN_IF_ERROR( + BuildWindowedPosting(tp, has_freq_, has_prx_, encoded_norms_, posting_out_, &wp)); + // wp.prx_total_len bytes were just streamed straight to the posting sink (0 + // when !has_prx). 
docids/freqs are now fully encoded into wp; release the + // source arrays before the (potentially large) wp blocks are appended to + // disk. + std::vector().swap(tp.docids); + std::vector().swap(tp.freqs); + std::vector prelude; + SNII_RETURN_IF_ERROR(BuildPrelude(wp.windows, has_freq_, has_prx_, &prelude)); + + e->kind = DictEntryKind::kPodRef; + e->enc = DictEntryEnc::kWindowed; + e->has_sb = true; // prelude is always a two-level skip directory. + e->prelude_len = static_cast(prelude.size()); + e->frq_docs_len = + e->prelude_len + static_cast(wp.dd_block.size()); // [prelude][dd-block] + + // The frq span starts immediately AFTER the prx span, in the SAME sink. The + // writer-side property frq_off == prx_off + wp.prx_total_len holds because + // nothing is appended to the posting sink between the prx pass and here -- + // but the delta is measured from the live size, not assumed. + const uint64_t frq_off = posting_size(); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(prelude))); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(wp.dd_block))); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(wp.freq_block))); + e->frq_off_delta = frq_off - frq_base; + e->frq_len = posting_size() - frq_off; + if (has_prx_) { + e->prx_off_delta = prx_off - prx_base; + e->prx_len = wp.prx_total_len; // == frq_off - prx_off + } + return Status::OK(); +} + +// Emits a slim term as a single .frq window (win_base=0) laid out [dd][freq]: +// inline when the encoded bytes are tiny, otherwise a slim pod_ref (no +// prelude). The dd region is the docs-only prefix; the freq region (when +// has_freq) is the skippable suffix. Region codecs are recorded in the +// DictEntry. For a pod_ref, the term's [prx][frq] spans are appended to the +// single posting region with the prx span FIRST (consistent with the windowed +// path); the reader resolves each delta independently so the relative order is +// not load-bearing. 
+Status LogicalIndexWriter::build_slim_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + DictEntry* e) { + std::vector dd_bytes, freq_bytes; + FrqRegionMeta dd_meta, freq_meta; + SNII_RETURN_IF_ERROR(EncodeRegions(tp.docids, tp.freqs, /*win_base=*/0, has_freq_, &dd_bytes, + &dd_meta, &freq_bytes, &freq_meta)); + std::vector frq_win = dd_bytes; // [dd_region][freq_region] + AppendBytes(&frq_win, freq_bytes); + std::vector prx_win; + if (has_prx_) { + SNII_RETURN_IF_ERROR(MakePrxWindow(tp.positions_flat, tp.freqs, &prx_win)); + } + + e->enc = DictEntryEnc::kSlim; + e->dd_meta = dd_meta; + e->freq_meta = freq_meta; + + if (frq_win.size() <= snii::format::kDefaultInlineThreshold) { + e->kind = DictEntryKind::kInline; + e->inline_dd_disk_len = dd_meta.disk_len; + e->frq_bytes = std::move(frq_win); + if (has_prx_) e->prx_bytes = std::move(prx_win); + return Status::OK(); + } + + // POD_REF: write [prx][frq] into the single posting sink, prx span first. + e->kind = DictEntryKind::kPodRef; + e->frq_docs_len = dd_meta.disk_len; // docs-only prefix = the single dd region + if (has_prx_) { + const uint64_t prx_off = posting_size(); + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(prx_win))); + e->prx_off_delta = prx_off - prx_base; + e->prx_len = posting_size() - prx_off; + } + const uint64_t frq_off = posting_size(); // immediately after the prx span + SNII_RETURN_IF_ERROR(posting_out_->append(Slice(frq_win))); + e->frq_off_delta = frq_off - frq_base; + e->frq_len = posting_size() - frq_off; + return Status::OK(); +} + +// Builds the DictEntry for one term. Inline entries embed their .frq/.prx +// bytes; pod_ref entries append [prx][frq] bytes to the single posting region +// and record off_delta relative to frq_base/prx_base (the posting-region size +// captured when the block opened; both bases hold that same value). 
+Status LogicalIndexWriter::build_entry(TermPostings& tp, uint64_t frq_base, uint64_t prx_base, + DictEntry* e) { + e->term = tp.term; + e->df = static_cast(tp.docids.size()); + e->ttf_delta = SumOf(tp.freqs); // simple: ttf stored directly as ttf_delta + e->max_freq = MaxOf(tp.freqs); + + if (e->df >= snii::format::kSlimDfThreshold) { + return build_windowed_entry(tp, frq_base, prx_base, e); + } + return build_slim_entry(tp, frq_base, prx_base, e); +} + +// Serializes the current open block, zstd-compresses it (the dict region is the +// single largest section -- term keys + entry meta + inline postings -- and the +// 64KiB blocks compress ~40%), streams the compressed bytes into the dict +// scratch file, and records a directory entry. The block-level crc32c +// (rec.checksum) covers the UNCOMPRESSED bytes, so DictBlockReader::open +// verifies integrity after the reader decompresses. A compressed block also +// shrinks the bytes a term lookup fetches from S3 -- aligning with the +// read-byte thesis. If zstd does not shrink a (tiny) block, it is stored raw so +// a lookup never pays a pointless decompress. 
+Status LogicalIndexWriter::flush_block(DictBlockBuilder* block, std::string first_term) { + ByteSink bsink; + block->finish(&bsink); + const Slice plain = bsink.view(); + BlockRecord rec; + rec.rel_offset = dict_buf_.size(); + rec.n_entries = block->n_entries(); + rec.checksum = snii::crc32c(plain); // crc over UNCOMPRESSED block bytes + rec.first_term = std::move(first_term); + + std::vector comp; + Status zs = snii::zstd_compress(plain, kDictBlockZstdLevel, &comp); + if (zs.ok() && comp.size() < plain.size()) { + rec.flags = snii::format::block_ref_flags::kZstd; + rec.uncomp_len = static_cast(plain.size()); + rec.length = static_cast(comp.size()); + SNII_RETURN_IF_ERROR(dict_buf_.append_move(std::move(comp))); + } else { + rec.flags = 0; + rec.uncomp_len = 0; + rec.length = static_cast(plain.size()); + SNII_RETURN_IF_ERROR(dict_buf_.append_move(bsink.take())); + } + blocks_.push_back(std::move(rec)); + return Status::OK(); +} + +// Running state for the in-flight DICT block while terms stream past. +struct LogicalIndexWriter::BlockState { + std::unique_ptr block; + std::string block_first_term; + uint64_t frq_base = 0; + uint64_t prx_base = 0; +}; + +Status LogicalIndexWriter::process_term(TermPostings& tp, BlockState* st) { + SNII_RETURN_IF_ERROR(validate_term(tp)); + // Collect only the 8-byte filter key per term (no whole-vocabulary string + // copy). BSBF key = XXH64 seed 0 (Parquet-canonical). + term_hashes_.push_back(snii::format::bsbf_hash(tp.term)); + ++term_count_; + stats_.sum_total_term_freq += SumOf(tp.freqs); + + if (!st->block) { + // Both bases come from the SAME posting sink, snapshotted at block open. 
+ const uint64_t base = posting_size(); + st->frq_base = base; + st->prx_base = base; + st->block = std::make_unique(tier_, has_prx_, st->frq_base, st->prx_base); + st->block_first_term = tp.term; + } + + DictEntry e; + SNII_RETURN_IF_ERROR(build_entry(tp, st->frq_base, st->prx_base, &e)); + st->block->add_entry(e); + + if (st->block->estimated_bytes() >= target_dict_block_bytes_) { + SNII_RETURN_IF_ERROR(flush_block(st->block.get(), st->block_first_term)); + st->block.reset(); + } + return Status::OK(); +} + +Status LogicalIndexWriter::build_blocks() { + BlockState st; + if (term_source_ != nullptr) { + Status streamed = Status::OK(); + // Drain the SPIMI buffer term-by-term; only one TermPostings is alive at a + // time, so the input+output never fully coexist. The returned Status covers + // both spill/merge I/O errors and add_token validation errors (the latter + // flow through merge_runs -> spill_status_), so a separate status() check + // is no longer needed. + SNII_RETURN_IF_ERROR(term_source_->for_each_term_sorted([&](TermPostings&& tp) { + if (streamed.ok()) streamed = process_term(tp, &st); + })); + SNII_RETURN_IF_ERROR(streamed); + } else { + // Materialized fallback (tests / callers holding a vector): process_term + // frees the term's arrays, so feed a per-term COPY to keep terms_ intact + // for the caller. This path is not the large out-of-core build, so the copy + // is cheap. 
+ for (const auto& tp : terms_) { + TermPostings copy = tp; + SNII_RETURN_IF_ERROR(process_term(copy, &st)); + } + } + if (st.block) SNII_RETURN_IF_ERROR(flush_block(st.block.get(), st.block_first_term)); + return Status::OK(); +} + +Status LogicalIndexWriter::build(snii::io::FileWriter* posting_out) { + if (posting_out == nullptr) { + return Status::InvalidArgument("logical_index: null posting sink"); + } + if (has_norms_ && encoded_norms_.size() != doc_count_) { + return Status::InvalidArgument("logical_index: norms length must equal doc_count"); + } + // The interleaved posting region streams STRAIGHT into the container output + // (no temp round-trip): posting_size() is the region-relative byte count, + // derived from the output offset advanced since this index's region began. + // The DICT region is staged in dict_buf_ (tiered: RAM under the cap = + // spill-only; spills above it) since it must land contiguously after the + // concurrently-streamed posting region. + posting_out_ = posting_out; + posting_off0_ = posting_out->bytes_written(); + + SNII_RETURN_IF_ERROR(build_blocks()); + // Seal the dict buffer so a spilled temp is flushed before + // stream_dict_region_into reads it back. A no-op for a RAM-resident dict. 
+ SNII_RETURN_IF_ERROR(dict_buf_.seal()); + + stats_.doc_count = doc_count_; + stats_.indexed_doc_count = doc_count_ - static_cast(null_docids_.size()); + stats_.term_count = term_count_; + stats_.null_count = static_cast(null_docids_.size()); + + if (has_norms_) { + snii::format::NormsPodWriter nw; + for (uint8_t n : encoded_norms_) nw.add(n); + ByteSink nsink; + nw.finish(&nsink); + norms_section_ = nsink.take(); + } + + if (!null_docids_.empty()) { + snii::format::NullBitmapWriter null_writer; + for (uint32_t docid : null_docids_) null_writer.add_null(docid); + ByteSink null_sink; + null_writer.finish(doc_count_, &null_sink); + null_bitmap_section_ = null_sink.take(); + } + + // Build the absent-term filter (block-split bloom, Parquet-canonical) from + // the per-term keys (no retained strings) as a [28B header][bitset] blob; the + // compound writer places it as a PHYSICAL section probed one 32-byte block on + // demand. + bsbf_bytes_.clear(); + if (!term_hashes_.empty()) { + snii::format::BsbfBuilder bf; + SNII_RETURN_IF_ERROR(snii::format::BsbfBuilder::create( + static_cast(term_hashes_.size()), kBsbfFpp, &bf)); + for (uint64_t k : term_hashes_) bf.insert(k); + ByteSink bsink; + SNII_RETURN_IF_ERROR(bf.serialize(&bsink)); + bsbf_bytes_ = bsink.take(); + } + std::vector().swap(term_hashes_); // release + return Status::OK(); +} + +Status LogicalIndexWriter::finish_meta(const SectionRefs& abs_refs, uint64_t dict_region_offset, + ByteSink* out) const { + if (out == nullptr) return Status::InvalidArgument("logical_index: null meta sink"); + + SampledTermIndexBuilder sti; + for (const auto& b : blocks_) sti.add_block_first_term(b.first_term); + ByteSink sti_sink; + sti.finish(&sti_sink); + + DictBlockDirectoryBuilder dir; + for (const auto& b : blocks_) { + BlockRef ref; + ref.offset = dict_region_offset + b.rel_offset; + ref.length = b.length; + ref.n_entries = b.n_entries; + ref.flags = b.flags; + ref.checksum = b.checksum; + ref.uncomp_len = b.uncomp_len; + 
dir.add(ref); + } + ByteSink dir_sink; + dir.finish(&dir_sink); + + uint32_t flags = bsbf_bytes_.empty() ? 0u : PerIndexMetaBuilder::kHasBsbf; + // Persist positions capability explicitly (the R1 fix): the reader must NOT + // infer it from posting_region.length, which is non-zero for any docs-only + // pod_ref index. + if (has_prx_) flags |= PerIndexMetaBuilder::kHasPositions; + PerIndexMetaBuilder builder(index_id_, index_suffix_, flags); + builder.set_stats(stats_); + builder.set_sampled_term_index(sti_sink.view()); + builder.set_dict_block_directory(dir_sink.view()); + // The BSBF is a physical section (abs_refs.bsbf), not embedded in the meta. + builder.set_section_refs(abs_refs); + return builder.finish(out); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp b/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp new file mode 100644 index 00000000000000..8e6f9b9adc61b3 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/snii_compound_writer.cpp @@ -0,0 +1,146 @@ +#include "snii/writer/snii_compound_writer.h" + +#include + +#include "snii/common/slice.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/crc32c.h" +#include "snii/format/bootstrap_header.h" +#include "snii/format/per_index_meta.h" // SectionRefs +#include "snii/format/tail_meta_region.h" +#include "snii/format/tail_pointer.h" + +namespace snii::writer { + +using snii::format::BootstrapHeader; +using snii::format::SectionRefs; +using snii::format::TailMetaRegionBuilder; +using snii::format::TailPointer; + +SniiCompoundWriter::SniiCompoundWriter(snii::io::FileWriter* out) : out_(out) {} + +Status SniiCompoundWriter::append(const std::vector& bytes) { + if (bytes.empty()) return Status::OK(); + return out_->append(Slice(bytes)); +} + +// The bootstrap header occupies offset 0 and must precede the first posting region, +// which streams straight into the output during build(). 
Written lazily exactly once +// (on the first add, or in finish() for an empty container). +Status SniiCompoundWriter::ensure_bootstrap() { + if (bootstrap_written_) return Status::OK(); + bootstrap_written_ = true; + return write_bootstrap(); +} + +Status SniiCompoundWriter::add_logical_index(const SniiIndexInput& in) { + if (out_ == nullptr) return Status::InvalidArgument("compound: null file writer"); + if (finished_) return Status::Internal("compound: add after finish"); + SNII_RETURN_IF_ERROR(ensure_bootstrap()); + auto liw = std::make_unique(in); + Placement p; + // The posting region streams DIRECTLY into the container during build() -- no temp + // round-trip for the bulk -- followed immediately by this index's compact DICT + // trailer (produced interleaved into a temp, but laid out right after its posting + // region, preserving the per-index [posting][dict] layout). Offsets are read off + // the output writer (the single source of truth -- no separate cursor). + p.post_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(liw->build(out_)); + p.post_len = out_->bytes_written() - p.post_off; + p.dict_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(liw->stream_dict_region_into(out_)); + p.dict_len = out_->bytes_written() - p.dict_off; + indexes_.push_back(std::move(liw)); + placements_.push_back(p); + return Status::OK(); +} + +Status SniiCompoundWriter::write_bootstrap() { + BootstrapHeader bh; + bh.tail_pointer_size = static_cast(snii::format::tail_pointer_size()); + ByteSink sink; + SNII_RETURN_IF_ERROR(snii::format::encode_bootstrap_header(bh, &sink)); + return append(sink.buffer()); +} + +// Writes each index's norms POD then bsbf section (in add order), after all the +// per-index [posting][dict] regions. 
+Status SniiCompoundWriter::write_norms() { + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + if (!w.has_norms() || w.norms_bytes().empty()) continue; + Placement& p = placements_[i]; + p.norms_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(w.norms_bytes())); + p.norms_len = out_->bytes_written() - p.norms_off; + } + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + if (!w.has_null_bitmap()) continue; + Placement& p = placements_[i]; + p.null_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(w.null_bitmap_bytes())); + p.null_len = out_->bytes_written() - p.null_off; + } + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + if (!w.has_bsbf()) continue; + Placement& p = placements_[i]; + p.bsbf_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(w.bsbf_bytes())); + p.bsbf_len = out_->bytes_written() - p.bsbf_off; + } + return Status::OK(); +} + +Status SniiCompoundWriter::write_tail() { + TailMetaRegionBuilder region; + for (size_t i = 0; i < indexes_.size(); ++i) { + const LogicalIndexWriter& w = *indexes_[i]; + const Placement& p = placements_[i]; + + SectionRefs refs; + refs.dict_region = {p.dict_off, p.dict_len}; + refs.posting_region = {p.post_off, p.post_len}; + refs.norms = {p.norms_off, p.norms_len}; + refs.null_bitmap = {p.null_off, p.null_len}; + refs.bsbf = {p.bsbf_off, p.bsbf_len}; + + ByteSink meta; + SNII_RETURN_IF_ERROR(w.finish_meta(refs, p.dict_off, &meta)); + region.add_index(w.index_id(), w.index_suffix(), meta.view()); + } + + ByteSink region_sink; + region.finish(®ion_sink); + const uint64_t region_off = out_->bytes_written(); + SNII_RETURN_IF_ERROR(append(region_sink.buffer())); + const uint64_t region_len = out_->bytes_written() - region_off; + + TailPointer tp; + tp.meta_region_offset = region_off; + tp.meta_region_length = region_len; + tp.hot_off = 0; + tp.meta_region_checksum = 
snii::crc32c(region_sink.view()); + // Reserved: the bootstrap header carries (and decode_bootstrap_header verifies) its + // OWN internal crc32c, so a tail-pointer copy is redundant. Left 0 until a cross- + // region check needs it; the tail pointer's own tail_checksum still covers this + // field's bytes. + tp.bootstrap_header_checksum = 0; + ByteSink tail_sink; + SNII_RETURN_IF_ERROR(snii::format::encode_tail_pointer(tp, &tail_sink)); + return append(tail_sink.buffer()); +} + +Status SniiCompoundWriter::finish() { + if (out_ == nullptr) return Status::InvalidArgument("compound: null file writer"); + if (finished_) return Status::Internal("compound: finish called twice"); + finished_ = true; + + SNII_RETURN_IF_ERROR(ensure_bootstrap()); // empty container still gets a header + SNII_RETURN_IF_ERROR(write_norms()); + SNII_RETURN_IF_ERROR(write_tail()); + return out_->finalize(); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp b/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp new file mode 100644 index 00000000000000..e68ba24b9a4164 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/spill_run_codec.cpp @@ -0,0 +1,597 @@ +#include "snii/writer/spill_run_codec.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "snii/encoding/varint.h" +#include "snii/format/format_constants.h" + +namespace snii::writer { + +namespace { + +// Flush staging once it grows past this. A LARGE write buffer (4 MiB) collapses +// the per-flush write() syscall count by ~64x: at 64 KiB the 5M build issued +// ~8800 write()s to ext4 (~9s of syscall overhead) for ~553 MiB of runs, versus +// a raw dd of the same bytes taking ~1.2s. Runs are PRIVATE temp files, so the +// on-disk index is unaffected; the only cost is a slightly larger transient +// RunWriter staging buffer (4 MiB, bounded, freed at close()). 
+constexpr size_t kWriteFlushBytes = 1u << 22; // 4 MiB +// RunReader reads this much per disk fill; the window slides so a single record +// never needs the whole run in RAM (only the current term's encoded span). KEEP +// this small (64 KiB): a large read chunk x many open runs would inflate the +// merge-phase peak RSS at low spill thresholds (each reader holds a window). +constexpr size_t kReadChunkBytes = 1u << 16; // 64 KiB + +void AppendVarint(std::vector* buf, uint64_t v) { + uint8_t tmp[10]; + const size_t n = encode_varint64(v, tmp); + buf->insert(buf->end(), tmp, tmp + n); +} + +// Appends a block of `count` uint32 values as RAW little-endian fixed-width bytes +// (memcpy from contiguous source). Runs are private temp files; the on-disk index +// is unaffected. Raw blocks make encode/decode ~10x cheaper than per-value varint +// for the freqs/positions streams (which compress poorly as varints anyway), at +// the cost of a modestly larger temp run. Empty source is a no-op. +void AppendRawU32(std::vector* buf, const uint32_t* src, size_t count) { + if (count == 0) return; + const auto* bytes = reinterpret_cast(src); + buf->insert(buf->end(), bytes, bytes + count * sizeof(uint32_t)); +} + +// Writes the full byte range [data, data+len) to fd, looping over short writes. 
+Status WriteAll(int fd, const uint8_t* data, size_t len) { + size_t off = 0; + while (off < len) { + const ssize_t n = ::write(fd, data + off, len - off); + if (n < 0) { + if (errno == EINTR) continue; + return Status::IoError(std::string("run write failed: ") + std::strerror(errno)); + } + off += static_cast(n); + } + return Status::OK(); +} + +} // namespace + +// --------------------------------------------------------------------------- +// RunWriter +// --------------------------------------------------------------------------- + +RunWriter::~RunWriter() { + if (fd_ >= 0) ::close(fd_); +} + +Status RunWriter::open(const std::string& path) { + fd_ = ::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd_ < 0) { + return Status::IoError("run open(" + path + "): " + std::strerror(errno)); + } + buf_.clear(); + return Status::OK(); +} + +Status RunWriter::flush() { + if (buf_.empty()) return Status::OK(); + SNII_RETURN_IF_ERROR(WriteAll(fd_, buf_.data(), buf_.size())); + buf_.clear(); + return Status::OK(); +} + +Status RunWriter::write_term(uint32_t term_id, const TermPostings& tp) { + AppendVarint(&buf_, term_id); + AppendVarint(&buf_, tp.docids.size()); + // Docids are a RAW fixed-width u32 block (bulk memcpy), NOT per-value VInt. + // Per-value varint over ~60M docids cost ~1.5s of encode CPU on the spill feed + // side; raw is a single memcpy and the decode side becomes a memcpy too. Runs + // are PRIVATE temp files written then read back from page cache, so the modestly + // larger run (no delta packing) costs ~0 extra real I/O. Absolute docids are + // stored (the merge concatenates per-term across runs and re-deltas at encode). + AppendRawU32(&buf_, tp.docids.data(), tp.docids.size()); + // Freqs + positions are RAW fixed-width u32 blocks (bulk memcpy). The decoder + // reads them back the same way; n_pos == positions_flat.size() is recoverable + // from sum(freqs), but is written explicitly so a reader can size the block. 
+ AppendRawU32(&buf_, tp.freqs.data(), tp.freqs.size()); + const uint64_t n_pos = tp.positions_flat.size(); + AppendVarint(&buf_, n_pos); + AppendRawU32(&buf_, tp.positions_flat.data(), tp.positions_flat.size()); + if (buf_.size() >= kWriteFlushBytes) SNII_RETURN_IF_ERROR(flush()); + return Status::OK(); +} + +Status RunWriter::close() { + if (fd_ < 0) return Status::OK(); + SNII_RETURN_IF_ERROR(flush()); + const int fd = fd_; + fd_ = -1; + if (::close(fd) != 0) { + return Status::IoError(std::string("run close: ") + std::strerror(errno)); + } + return Status::OK(); +} + +// --------------------------------------------------------------------------- +// RunReader +// --------------------------------------------------------------------------- + +RunReader::~RunReader() { + if (fd_ >= 0) ::close(fd_); +} + +Status RunReader::open(const std::string& path, bool has_positions) { + fd_ = ::open(path.c_str(), O_RDONLY); + if (fd_ < 0) { + return Status::IoError("run reopen(" + path + "): " + std::strerror(errno)); + } + // Record the run's byte size so every length decoded from the stream can be + // bounded against it before allocating (no record holds more u32s than the whole + // file). Honors the header's "lengths validated against the file size" contract, + // turning a corrupt/truncated length into Status::Corruption rather than an + // uncaught std::bad_alloc from a giant resize(). + struct stat st {}; + if (::fstat(fd_, &st) != 0) { + return Status::IoError(std::string("run fstat: ") + std::strerror(errno)); + } + file_size_ = static_cast(st.st_size); + has_positions_ = has_positions; + exhausted_ = false; + eof_ = false; + pos_ = 0; + pos_count_ = 0; + pos_remaining_ = 0; + window_.clear(); + return advance(); +} + +// Slides consumed bytes out of the window, then appends one disk chunk. 
+Status RunReader::fill() { + if (pos_ > 0) { + window_.erase(window_.begin(), window_.begin() + pos_); + pos_ = 0; + } + if (eof_) return Status::OK(); + const size_t base = window_.size(); + window_.resize(base + kReadChunkBytes); + ssize_t n; + do { + n = ::read(fd_, window_.data() + base, kReadChunkBytes); + } while (n < 0 && errno == EINTR); + if (n < 0) return Status::IoError(std::string("run read: ") + std::strerror(errno)); + window_.resize(base + static_cast(n)); + if (n == 0) eof_ = true; + return Status::OK(); +} + +// Buffered bytes available to the decoder right now (from pos_ to window end). +// fill() may slide the window (erasing consumed bytes), so callers must compare +// THIS quantity -- not window_.size() -- to decide whether more data arrived. +size_t RunReader::available() const { + return window_.size() - pos_; +} + +Status RunReader::ensure(size_t n) { + while (available() < n) { + const size_t had = available(); + SNII_RETURN_IF_ERROR(fill()); + if (available() == had && eof_) { + return Status::Corruption("run truncated: needed more bytes than available"); + } + } + return Status::OK(); +} + +// Streamed varint: decode from the current window; if it straddles the buffered +// boundary, top up from disk and retry. A varint is at most 10 bytes, so this +// loops at most a couple of times. Bounds-safe: decode_varint64 never reads past +// `end`, and a partial varint at true eof is reported as corruption. 
+Status RunReader::read_varint(uint64_t* v) { + while (true) { + const uint8_t* p = window_.data() + pos_; + const uint8_t* end = window_.data() + window_.size(); + const uint8_t* next = nullptr; + Status s = decode_varint64(p, end, v, &next); + if (s.ok()) { + pos_ += static_cast(next - p); + return Status::OK(); + } + if (eof_) return Status::Corruption("run truncated: incomplete varint"); + const size_t had = available(); + SNII_RETURN_IF_ERROR(fill()); + if (available() == had && eof_) { + return Status::Corruption("run truncated: incomplete varint at eof"); + } + } +} + +// Streams `count` raw little-endian u32s from the window into `dst` (caller-owned +// storage of at least count*4 bytes), topping up the window from disk as needed. +// Copies whatever is buffered each pass (the window may hold only part of a large +// block), so a high-df term's freqs/positions stream through in 64 KiB chunks +// without ever needing the whole block resident at once. +Status RunReader::pull_raw_u32(uint8_t* dst, size_t count) { + if (count == 0) return Status::OK(); + size_t need = count * sizeof(uint32_t); + size_t written = 0; + while (need > 0) { + if (available() == 0) { + const size_t had = available(); + SNII_RETURN_IF_ERROR(fill()); + if (available() == had && eof_) { + return Status::Corruption("run truncated: needed more raw bytes than available"); + } + } + const size_t take = std::min(need, available()); + std::memcpy(dst + written, window_.data() + pos_, take); + pos_ += take; + written += take; + need -= take; + } + return Status::OK(); +} + +// Bulk-decodes `count` raw u32s into `out` (resized to count). +Status RunReader::read_raw_u32(size_t count, std::vector* out) { + // Bound `count` against the run's byte size BEFORE resize(): a record can never + // hold more u32s than the whole file. Rejects a corrupt/truncated length varint + // (which is otherwise an unbounded resize -> uncaught std::bad_alloc). 
+ if (count > file_size_ / sizeof(uint32_t)) { + return Status::Corruption("run: raw u32 count exceeds file size"); + } + out->resize(count); + if (count == 0) return Status::OK(); + return pull_raw_u32(reinterpret_cast(out->data()), count); +} + +// Materializes the current term's deferred position block into positions_flat. +// A no-op once the positions are already drained (idempotent within a term). +Status RunReader::materialize_positions() { + if (pos_remaining_ == 0) { + current_.positions_flat.clear(); + return Status::OK(); + } + const size_t n = static_cast(pos_remaining_); + if (has_positions_) { + SNII_RETURN_IF_ERROR(read_raw_u32(n, ¤t_.positions_flat)); + } else { + // No-positions runs should carry n_pos == 0; tolerate (skip) a stray block. + std::vector skip; + SNII_RETURN_IF_ERROR(read_raw_u32(n, &skip)); + current_.positions_flat.clear(); + } + pos_remaining_ = 0; + return Status::OK(); +} + +// Streams the next `n` positions of the current term straight from the window. +Status RunReader::stream_positions(uint32_t* dst, size_t n) { + if (n == 0) return Status::OK(); + if (n > pos_remaining_) { + return Status::Corruption("run: stream_positions past block end"); + } + SNII_RETURN_IF_ERROR(pull_raw_u32(reinterpret_cast(dst), n)); + pos_remaining_ -= n; + return Status::OK(); +} + +// Discards any positions of the current term left unread, so the window cursor +// lands at the next record boundary before advance() reads the next term. +Status RunReader::skip_remaining_positions() { + if (pos_remaining_ == 0) return Status::OK(); + const size_t n = static_cast(pos_remaining_); + std::vector skip; + SNII_RETURN_IF_ERROR(read_raw_u32(n, &skip)); + pos_remaining_ = 0; + return Status::OK(); +} + +Status RunReader::advance() { + // Drain any positions the owner left unread for the previous term so the window + // cursor lands at the next record boundary. 
+ SNII_RETURN_IF_ERROR(skip_remaining_positions()); + // End-of-run detection: at a record boundary, if no bytes remain we are done. + if (available() == 0) { + SNII_RETURN_IF_ERROR(fill()); + if (available() == 0 && eof_) { + exhausted_ = true; + return Status::OK(); + } + } + uint64_t term_id = 0; + SNII_RETURN_IF_ERROR(read_varint(&term_id)); + if (term_id > UINT32_MAX) return Status::Corruption("run term_id exceeds uint32"); + current_id_ = static_cast(term_id); + current_.term.clear(); // runs store only the id; owner resolves the string + + uint64_t n_docs = 0; + SNII_RETURN_IF_ERROR(read_varint(&n_docs)); + // Docids: RAW absolute u32 block (bulk read), matching the writer's AppendRawU32. + SNII_RETURN_IF_ERROR(read_raw_u32(static_cast(n_docs), ¤t_.docids)); + // Freqs: RAW u32 block (bulk read), matching the writer's AppendRawU32. + SNII_RETURN_IF_ERROR(read_raw_u32(static_cast(n_docs), ¤t_.freqs)); + uint64_t n_pos = 0; + SNII_RETURN_IF_ERROR(read_varint(&n_pos)); + // Positions are LAZY: record the block count and leave the window cursor parked + // at the block start. The owner picks materialize_positions() (default) or + // stream_positions() (wide-term merge pump). The widest term's tens-of-MiB + // position block is thus never resident unless the owner asks for it whole. + current_.positions_flat.clear(); + pos_count_ = n_pos; + pos_remaining_ = n_pos; + return Status::OK(); +} + +// --------------------------------------------------------------------------- +// K-way merge +// --------------------------------------------------------------------------- + +namespace { + +// Min-heap entry: orders by the run's current term-id's VOCAB STRING, tie-broken +// by run index so equal terms are gathered run-order (keeping concatenated +// docids ascending). The comparator resolves id -> string via the shared vocab, +// so the merged stream is lexicographic (the dictionary order the writer needs). 
+struct HeapItem { + uint32_t term_id; + size_t run; +}; +struct HeapGreater { + const std::vector* vocab; + bool operator()(const HeapItem& a, const HeapItem& b) const { + const std::string& sa = (*vocab)[a.term_id]; + const std::string& sb = (*vocab)[b.term_id]; + if (sa != sb) return sa > sb; + return a.run > b.run; + } +}; + +// Appends src's postings onto dst (run order). Later runs only cover docids +// >= dst's last, so docids stay ascending. COALESCE the boundary doc: if a spill +// fell BETWEEN two tokens of the same doc, that doc ends one run and begins the +// next with the SAME docid -- merge them (sum freqs, splice positions) so the +// merged term has exactly one entry per docid (matching the in-memory build). +// +// Positions are FLAT: doc order, partitioned by freqs. Because both dst and src +// already store doc-ordered flat positions, the common (no-boundary-overlap) case +// is a single bulk append. The boundary-overlap case must INSERT src's first +// doc's positions right after dst's last doc's positions so flat order stays +// consistent with the merged (coalesced) freqs. +void Concat(TermPostings* dst, const TermPostings& src, bool has_positions) { + if (src.docids.empty()) return; + size_t start = 0; + size_t src_pos_start = 0; // flat offset of src positions to append after splice + if (!dst->docids.empty() && dst->docids.back() == src.docids.front()) { + const uint32_t head_fc = src.freqs.front(); + if (has_positions && head_fc != 0) { + // Splice src's first-doc positions in right after dst's last-doc positions. + // dst's last doc owns dst->freqs.back() entries at the tail of positions_flat + // BEFORE we bump that freq, so insert at end() (last doc is the tail run). 
+ auto& flat = dst->positions_flat; + flat.insert(flat.end(), src.positions_flat.begin(), + src.positions_flat.begin() + head_fc); + } + dst->freqs.back() += head_fc; + src_pos_start = head_fc; + start = 1; // boundary doc folded in; append the rest + } + dst->docids.insert(dst->docids.end(), src.docids.begin() + start, src.docids.end()); + dst->freqs.insert(dst->freqs.end(), src.freqs.begin() + start, src.freqs.end()); + if (has_positions) { + dst->positions_flat.insert(dst->positions_flat.end(), + src.positions_flat.begin() + src_pos_start, + src.positions_flat.end()); + } +} + +// Coalesces ONLY docids/freqs (no positions). Used by the WIDE-term path, whose +// positions are streamed via a pos_pump instead of materialized. The boundary-doc +// freq merge (dst->freqs.back() += head_fc) is identical to Concat's, so the +// merged df / freqs / ttf are bit-for-bit the same; positions are emitted in pure +// run-order concatenation by the pump (the same byte stream Concat would build). +void ConcatDocsFreqs(TermPostings* dst, const TermPostings& src) { + if (src.docids.empty()) return; + size_t start = 0; + if (!dst->docids.empty() && dst->docids.back() == src.docids.front()) { + dst->freqs.back() += src.freqs.front(); + start = 1; // boundary doc folded in; append the rest + } + dst->docids.insert(dst->docids.end(), src.docids.begin() + start, src.docids.end()); + dst->freqs.insert(dst->freqs.end(), src.freqs.begin() + start, src.freqs.end()); +} + +// A merged term is emitted with a STREAMED position pump (instead of a +// materialized positions_flat) when it is wide enough that its full flat +// positions would dominate the merge-phase peak RSS. The writer routes any term +// with df >= kSlimDfThreshold through the windowed path (build_windowed_entry), +// which is the only path that consumes pos_pump; a slim term reads positions_flat +// directly, so it must always be materialized. 
Gating on the same df threshold +// the writer uses keeps the two in lockstep and is conservative: only the few +// genuinely-wide terms (led by the single widest, the merge-phase peak driver) +// take the streamed path. total_pos is also required so a degenerate wide term +// with no positions still has something to stream. +bool ShouldStreamPositions(uint64_t total_docs, uint64_t total_pos, bool has_positions) { + return has_positions && total_pos != 0 && total_docs >= snii::format::kSlimDfThreshold; +} + +} // namespace + +Status MergeRuns(const std::vector& run_paths, const std::vector& vocab, + bool has_positions, const std::function& fn, + bool allow_stream_positions) { + std::vector> readers; + readers.reserve(run_paths.size()); + std::priority_queue, HeapGreater> heap(HeapGreater {&vocab}); + for (size_t i = 0; i < run_paths.size(); ++i) { + auto r = std::make_unique(); + SNII_RETURN_IF_ERROR(r->open(run_paths[i], has_positions)); + if (!r->exhausted()) { + if (r->current_id() >= vocab.size()) { + return Status::Corruption("run term_id out of vocab range"); + } + heap.push({r->current_id(), i}); + } + readers.push_back(std::move(r)); + } + + std::vector matching; // run indices contributing the current term + while (!heap.empty()) { + const uint32_t id = heap.top().term_id; + TermPostings merged; + merged.term = vocab[id]; // resolve the id -> dictionary string once + // Gather every run whose head id maps to the same string (the heap's run + // tie-break keeps them in run order, so concatenated docids stay ascending). + // Equal strings imply equal ids for a dense vocab; compare by string so a + // duplicate string still groups correctly. The matching runs' current slices + // are already loaded in their readers (they were read to seed the heap), so + // summing their sizes here costs nothing extra in RAM. 
+ matching.clear(); + uint64_t total_docs = 0, total_pos = 0; + while (!heap.empty() && vocab[heap.top().term_id] == merged.term) { + const size_t ri = heap.top().run; + heap.pop(); + const RunReader* r = readers[ri].get(); + total_docs += r->current().docids.size(); + total_pos += r->current_pos_count(); // positions are LAZY: use the count + matching.push_back(ri); + } + // Reserve EXACTLY the summed sizes (an upper bound -- boundary-doc coalescing + // only shrinks the final size). This eliminates std::vector's geometric + // over-allocation, which left ~32 MiB of dead capacity on the widest term (df + // in the millions split across spills) -- a dominant merge-phase peak-RSS + // overhang at 5M. The reserved-but-unwritten pages are not faulted in, so the + // empty reservation itself does not raise RSS; only the actual data does. + merged.docids.reserve(static_cast(total_docs)); + merged.freqs.reserve(static_cast(total_docs)); + + bool stream = allow_stream_positions && + ShouldStreamPositions(total_docs, total_pos, has_positions); + if (!stream && has_positions) { + merged.positions_flat.reserve(static_cast(total_pos)); + } + // Coalesce docids/freqs from every matching run (always materialized -- a few + // u32 vectors). For the non-wide case, also coalesce positions here. For the + // wide case, leave positions for the streamed pump and keep the readers PARKED + // at their position blocks until fn() drains the pump. + for (size_t ri : matching) { + RunReader* r = readers[ri].get(); + if (stream) { + ConcatDocsFreqs(&merged, r->current()); + } else { + if (has_positions) SNII_RETURN_IF_ERROR(r->materialize_positions()); + Concat(&merged, r->current(), has_positions); + } + } + + // The stream gate keyed on PRE-coalesce total_docs, but the writer's slim vs + // windowed dispatch keys on the POST-coalesce df (merged.docids.size()). 
+ // Boundary-doc coalescing across spill seams can drop df below kSlimDfThreshold + // while total_docs stayed above it; that term routes to build_slim_entry, which + // reads positions_flat directly and ignores pos_pump. Materialize positions now + // from the still-parked readers (mirrors drain_sorted()'s slim fallback). + if (stream && merged.docids.size() < snii::format::kSlimDfThreshold) { + merged.positions_flat.reserve(static_cast(total_pos)); + for (size_t ri : matching) { + RunReader* r = readers[ri].get(); + SNII_RETURN_IF_ERROR(r->materialize_positions()); + const std::vector& pf = r->current().positions_flat; + merged.positions_flat.insert(merged.positions_flat.end(), pf.begin(), pf.end()); + } + stream = false; + } + + if (stream) { + // WIDE term: STREAM positions via a pump that walks the matching readers in + // run order (pure flat concatenation == the coalesced positions_flat, + // byte-for-byte). positions_flat stays empty -- the widest term's tens-of-MiB + // position buffer is never resident; only one ~64 KiB window per pull is. The + // readers are still parked at this term's blocks, so the pump pulls from them + // synchronously while fn() runs (fn consumes synchronously -- the windowed + // writer does). After fn(), advance the readers past the (now-drained) blocks. + merged.pos_total = total_pos; + size_t cursor = 0; // index into `matching` for the run currently being drained + Status pump_status = Status::OK(); + std::vector>* rd = &readers; + const std::vector* match = &matching; + // Self-contained liveness guard. The pump captures references into THIS stack + // frame (&cursor, &pump_status) and the parked run readers (rd/match), valid + // ONLY while fn() runs synchronously -- after fn() the readers advance past the + // drained blocks. `pump_alive` is heap-owned and captured BY VALUE, so a + // stored/deferred pos_pump fails loudly (throws) instead of dereferencing + // dangling state. See the contract on TermPostings::pos_pump. 
+ auto pump_alive = std::make_shared(true); + merged.pos_pump = [rd, match, &cursor, &pump_status, pump_alive](uint32_t* dst, + size_t n) { + if (!*pump_alive) { + throw std::logic_error( + "TermPostings::pos_pump invoked after its producing merge scope ended; " + "the streamed TermPostings must be consumed synchronously inside fn() " + "and never stored for later use"); + } + size_t off = 0; + while (off < n) { + // Advance to the next run that still has positions to yield. + while (cursor < match->size() && + (*rd)[(*match)[cursor]]->positions_remaining() == 0) { + ++cursor; + } + if (cursor >= match->size()) break; // defensive: pump over-pulled + RunReader* r = (*rd)[(*match)[cursor]].get(); + const size_t take = + std::min(n - off, static_cast(r->positions_remaining())); + Status s = r->stream_positions(dst + off, take); + if (!s.ok()) { + // Mid-stream I/O / corruption: zero-fill the UNFILLED tail before + // returning. fn() has the pump and will consume dst BEFORE pump_status + // is surfaced after fn(); never hand it uninitialized bytes (the + // failed stream_positions wrote nothing into dst[off..]). The error is + // still latched and surfaced after fn(), so the build aborts -- the + // zero fill only guarantees deterministic, defined bytes meanwhile. + std::memset(dst + off, 0, (n - off) * sizeof(uint32_t)); + if (pump_status.ok()) pump_status = std::move(s); + return; + } + off += take; + } + // Short-fill on over-pull (cursor ran past the matching runs without an + // error status): the readers held fewer positions than n. Zero-fill the + // unfilled tail so the writer never reads uninitialized storage. With + // valid runs n == pos_total == sum(positions_remaining), so off == n and + // this memset spans zero bytes -- the produced .idx is unchanged. 
+ if (off < n) std::memset(dst + off, 0, (n - off) * sizeof(uint32_t)); + }; + fn(std::move(merged)); + *pump_alive = false; // any later pos_pump call now throws instead of UAF + SNII_RETURN_IF_ERROR(pump_status); // surface a streamed-read I/O error + } else { + fn(std::move(merged)); + } + + // Advance every matching reader to its next term and re-seed the heap. For the + // wide path this also skips any positions the pump did not pull (none, when fn + // drained the whole stream); for the non-wide path positions were already + // materialized so nothing remains. + for (size_t ri : matching) { + RunReader* r = readers[ri].get(); + SNII_RETURN_IF_ERROR(r->advance()); // frees this run's slice, loads next term + if (!r->exhausted()) { + if (r->current_id() >= vocab.size()) { + return Status::Corruption("run term_id out of vocab range"); + } + heap.push({r->current_id(), ri}); + } + } + } + return Status::OK(); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp b/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp new file mode 100644 index 00000000000000..7fc8cd58ec0bf6 --- /dev/null +++ b/be/src/storage/index/snii/core/src/writer/spimi_term_buffer.cpp @@ -0,0 +1,594 @@ +#include "snii/writer/spimi_term_buffer.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "snii/encoding/varint.h" +#include "snii/format/format_constants.h" +#include "snii/writer/spill_run_codec.h" +#include "snii/writer/temp_dir.h" + +#if defined(__GLIBC__) +#include +#endif + +namespace snii::writer { + +namespace { + +// Returns freed heap arenas to the OS (glibc only). The spill encode churns many +// small allocations whose freed chunks glibc retains in its arenas; trimming +// before the peak-RSS-defining merge phase recovers that retention. No-op (and +// harmless) on non-glibc libcs. 
+void TrimMalloc() { +#if defined(__GLIBC__) + ::malloc_trim(0); +#endif +} + +// Process-unique temp path for a spill run under `dir` (pid + monotonic counter so +// parallel builds / multiple buffers never collide). +std::string MakeRunPath(const std::string& dir) { + static std::atomic counter {0}; + const uint64_t n = counter.fetch_add(1); + return dir + "/snii_spill_" + std::to_string(::getpid()) + "_" + std::to_string(n) + ".run"; +} + +} // namespace + +SpimiTermBuffer::SpimiTermBuffer(const std::vector* vocab, bool has_positions, + size_t spill_threshold_bytes, MemoryReporter* reporter) + : vocab_(vocab), + has_positions_(has_positions), + spill_threshold_bytes_(spill_threshold_bytes), + mem_reporter_(reporter) { + // Borrowed-vocab mode: only the 4 B/id slot-index array is sized to the + // vocabulary; the Term pool (slots_) grows with the LIVE touched count, so an + // all-but-empty vocabulary costs ~4 B/id instead of ~80 B/id. + slot_of_.assign(vocab_->size(), 0); + // The vocab-sized slot index is resident immediately and survives spills; report + // its initial positive delta now. + report_arena_delta(); +} + +SpimiTermBuffer::SpimiTermBuffer(bool has_positions, size_t spill_threshold_bytes, + MemoryReporter* reporter) + : vocab_(&owned_vocab_), + has_positions_(has_positions), + spill_threshold_bytes_(spill_threshold_bytes), + mem_reporter_(reporter) { + // Owned-vocab mode: the vocabulary grows as strings are interned; terms_ / + // present_ grow alongside it in add_token(string_view, ...). +} + +SpimiTermBuffer::~SpimiTermBuffer() { + // Balance the writer-level / Doris tracker on the error path: if the buffer is + // destroyed while resident bytes were reported but not yet freed-and-reported + // (e.g. a build aborts before draining), return them here so nothing leaks. 
+ if (mem_reporter_ != nullptr && reported_resident_ != 0) { + mem_reporter_->report(-reported_resident_); + reported_resident_ = 0; + } + cleanup_runs(); +} + +void SpimiTermBuffer::report_arena_delta() { + if (mem_reporter_ == nullptr) return; + // Diff the REAL resident bytes (arena + slot index) against the last reported + // total; emit the signed delta exactly once. + const int64_t now = static_cast(resident_bytes()); + mem_reporter_->report(now - reported_resident_); + reported_resident_ = now; +} + +size_t SpimiTermBuffer::unique_terms() const { + return live_term_count_; +} + +uint64_t SpimiTermBuffer::resident_bytes() const { + // REAL resident accumulator bytes: the posting arena plus the vocab-sized slot + // index (capacity, since the reserved-but-unused tail is still resident RSS and + // survives spills -- spill_to_run does NOT free slot_of_). This is the gate-2 + // spill trigger metric and the spill space-precheck figure -- NOT the old gated + // live_bytes_ estimate. + return pool_.arena_bytes() + static_cast(slot_of_.capacity()) * sizeof(uint32_t); +} + +// Returns the live Term for `term_id`, claiming a pool slot on first touch (1 == +// new). Reuses a freed slot from free_slots_ when available; otherwise appends a +// fresh Term to slots_. slot_of_[term_id] holds (slot index + 1); 0 means empty. +SpimiTermBuffer::Term& SpimiTermBuffer::term_slot(uint32_t term_id, bool* new_term) { + uint32_t enc = slot_of_[term_id]; + if (enc != 0) { + *new_term = false; + return slots_[enc - 1]; + } + *new_term = true; + uint32_t slot; + if (!free_slots_.empty()) { + slot = free_slots_.back(); + free_slots_.pop_back(); + } else { + slot = static_cast(slots_.size()); + slots_.emplace_back(); + } + slot_of_[term_id] = slot + 1; + return slots_[slot]; +} + +// Appends one byte to a term's chain, starting the chain lazily on first use. 
+void SpimiTermBuffer::put_byte(Term* t, uint8_t b) { + if (t->head == kNoChain) t->head = pool_.start_chain(&t->w, &t->level); + pool_.append_byte(&t->w, &t->level, b); +} + +void SpimiTermBuffer::put_varint(Term* t, uint64_t v) { + uint8_t tmp[10]; + const size_t n = encode_varint64(v, tmp); + for (size_t i = 0; i < n; ++i) put_byte(t, tmp[i]); +} + +void SpimiTermBuffer::accumulate(uint32_t term_id, uint32_t docid, uint32_t pos) { + bool new_term = false; + Term& t = term_slot(term_id, &new_term); + if (new_term) { + touched_ids_.push_back(term_id); + ++live_term_count_; + } + // A token starts a new doc unless it continues the most-recent doc for this term. + const bool new_doc = !t.started || t.cur_docid != docid; + // Tagged entry: varint((pos << 1) | new_doc). Positions are tagged 0 when + // disabled. The new_doc bit lets the decoder recover per-doc freqs by counting. + // Widen to 64-bit so a full 32-bit position survives the << 1 without truncation. + const uint64_t tagged = has_positions_ + ? ((static_cast(pos) << 1) | (new_doc ? 1u : 0u)) + : (new_doc ? 1u : 0u); + put_varint(&t, tagged); + if (new_doc) { + // Out-of-order docids are tolerated (zigzag delta is signed) and reordered at + // finalize; flag them so to_postings sorts. The delta base is the previous + // distinct doc (cur_docid), which is 0 for the very first doc (started==false). + const int64_t base = t.started ? static_cast(t.cur_docid) : 0; + if (t.started && docid < t.cur_docid) t.sorted = false; + const int64_t delta = static_cast(docid) - base; + put_varint(&t, zigzag_encode(delta)); + t.cur_docid = docid; + t.started = true; + } + ++t.ntok; + ++total_tokens_; + + // Gate-2 spill: trigger on REAL resident bytes (arena + slot index), NOT the old + // gated live_bytes_ estimate. arena_bytes() is monotonic per fill and reset to 0 + // by spill_to_run()'s pool_.reset(), so the trigger self-rearms after each spill. 
+ // The OTHER trigger is the hard arena safety stop (active even in unlimited mode): + // when the arena nears the 4 GiB uint32-offset limit -- without it, a single + // >4 GiB in-memory segment wraps alloc_run and silently corrupts data. A forced + // spill + final k-way merge stays byte-identical regardless of when it fires. + constexpr uint64_t kArenaSpillCap = 0xE0000000ull; // 3.5 GiB, < UINT32_MAX margin + // Report this token's REAL resident growth FIRST so the writer's unified total + // (reporter_->current_bytes()) reflects it before the gate-2 check. Single-source + // diff: cheap (subtraction + relaxed atomic add; arena_bytes() is two field reads). + report_arena_delta(); + // Gate-2 spill (UNIFIED): when a reporter is attached, trigger on the writer's TOTAL + // build RAM (arena + slot index + dict) crossing the one configured cap -- the same + // total and cap every buffer of this writer shares, not a per-buffer threshold. Off + // Doris (no reporter) fall back to the local spill_threshold_bytes_. The hard arena + // safety stop (4 GiB uint32-offset limit) is always active. spill_to_run() resets the + // arena and reports its negative internally, so the unified total drops after a spill. + const bool over_cap = mem_reporter_ != nullptr ? mem_reporter_->over_cap() + : (spill_threshold_bytes_ != 0 && + resident_bytes() >= spill_threshold_bytes_); + const bool arena_near_limit = pool_.arena_bytes() >= kArenaSpillCap; + if ((over_cap || arena_near_limit) && spill_status_.ok()) { + spill_status_ = spill_to_run(); + } +} + +void SpimiTermBuffer::add_token(uint32_t term_id, uint32_t docid, uint32_t pos) { + // Hot path: a pooled slot lookup + a couple of pushes. No hashing, no string + // construction per token. Reject (and latch) an out-of-range id. 
+ if (term_id >= slot_of_.size()) { + if (spill_status_.ok()) { + spill_status_ = Status::InvalidArgument("spimi: term_id out of vocab range"); + } + return; + } + accumulate(term_id, docid, pos); +} + +void SpimiTermBuffer::add_token(std::string_view term, uint32_t docid, uint32_t pos) { + // Compatibility path: intern the term into the owned vocabulary on first + // occurrence, then accumulate by its id. ONLY valid in OWNED-vocab mode. In + // BORROWED-vocab mode vocab_ points at the caller's vector, NOT &owned_vocab_: + // interning here would grow owned_vocab_ / intern_ / slot_of_ out of step with + // the active (borrowed) vocab, so the new id indexes the WRONG string and writes + // a slot_of_ entry the borrowed-vocab build never reconciles -- silent + // corruption. Reject (and latch) instead of forwarding by a bogus id. + if (vocab_ != &owned_vocab_) { + if (spill_status_.ok()) { + spill_status_ = Status::InvalidArgument( + "spimi: add_token(string_view) requires owned-vocab mode"); + } + return; + } + auto it = intern_.find(std::string(term)); + uint32_t term_id; + if (it == intern_.end()) { + term_id = static_cast(owned_vocab_.size()); + owned_vocab_.emplace_back(term); + intern_.emplace(owned_vocab_.back(), term_id); + slot_of_.push_back(0); // vocab grows: new id starts with no live slot + } else { + term_id = it->second; + } + accumulate(term_id, docid, pos); +} + +namespace { + +// Reorders a term's flat arrays into ascending-docid order, COALESCING any +// same-docid groups so the result has exactly one entry per docid -- matching the +// k-way-merge path's boundary-doc coalescing and the writer's strictly-ascending +// precondition. Only invoked for the rare term that received out-of-order docids +// (the common ascending path leaves t.sorted true and skips it). +// +// A docid may REVISIT (e.g. feed 5,1,5): the chain holds two separate doc-groups +// for doc 5. 
// A STABLE sort keeps equal-docid groups in arrival order, then the
// coalesce pass sums their freqs and concatenates their positions in that same
// (document/arrival) order -- so the merged positions stay consistent with the
// merged freqs, exactly as the run-order merge would have produced.
//
// Parameters (all rewritten in place):
//   docids          one entry per doc-group, in arrival order.
//   freqs           parallel to docids: token count of each doc-group.
//   positions_flat  every group's positions back to back, freqs[i] entries for
//                   group i. Only read or written when has_positions is true, so
//                   it may be null otherwise.
//   has_positions   whether positions_flat takes part in the reorder.
// Precondition: freqs has at least docids->size() entries and, when has_positions
// is true, positions_flat holds at least sum(freqs) entries.
void SortByDocid(std::vector<uint32_t>* docids, std::vector<uint32_t>* freqs,
                 std::vector<uint32_t>* positions_flat, bool has_positions) {
    const size_t n = docids->size();
    // Sort a permutation rather than the arrays, so all three stay in step.
    std::vector<size_t> order(n);
    std::iota(order.begin(), order.end(), size_t {0});
    // STABLE so equal docids keep arrival order: their positions then concatenate in
    // document order, the same order the merge path's run concatenation yields.
    std::stable_sort(order.begin(), order.end(),
                     [&](size_t a, size_t b) { return (*docids)[a] < (*docids)[b]; });

    // pos_off[i] = index in positions_flat where group i's positions begin.
    std::vector<size_t> pos_off;
    if (has_positions) {
        pos_off.resize(n);
        size_t running = 0;
        for (size_t i = 0; i < n; ++i) {
            pos_off[i] = running;
            running += (*freqs)[i];
        }
    }
    std::vector<uint32_t> nd;
    std::vector<uint32_t> nf;
    std::vector<uint32_t> np;
    nd.reserve(n);
    nf.reserve(n);
    if (has_positions) np.reserve(positions_flat->size());
    for (size_t k : order) {
        // Coalesce a revisited docid into the previous entry (it sorts adjacent now):
        // sum freqs and append this group's positions right after the prior group's,
        // so flat doc order stays partitioned by the merged freqs.
        if (!nd.empty() && nd.back() == (*docids)[k]) {
            nf.back() += (*freqs)[k];
        } else {
            nd.push_back((*docids)[k]);
            nf.push_back((*freqs)[k]);
        }
        if (has_positions) {
            const auto first = positions_flat->begin() + static_cast<std::ptrdiff_t>(pos_off[k]);
            np.insert(np.end(), first, first + static_cast<std::ptrdiff_t>((*freqs)[k]));
        }
    }
    *docids = std::move(nd);
    *freqs = std::move(nf);
    if (has_positions) *positions_flat = std::move(np);
}

// Decodes one varint from a pool chain cursor. The chain was written by
// encode_varint*, so the same LEB128 continuation-bit loop reconstructs it.
+uint64_t DecodeChainVarint(CompactPostingPool::Cursor* c) { + uint64_t result = 0; + int shift = 0; + for (;;) { + const uint8_t b = c->next(); + result |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) break; + shift += 7; + } + return result; +} + +} // namespace + +// Decodes a term's compact tagged chain back into a flat TermPostings (the exact +// docids/freqs/positions_flat the writer consumes), so the produced index is +// byte-identical to the legacy raw-uint32 accumulator. The chain holds one entry +// per token: varint((pos << 1) | new_doc); each new_doc entry is followed by a +// zigzag(docid-delta). A doc's freq is the run length of consecutive same-doc +// tokens; positions stream out in document order (empty when positions disabled). +// Stream positions for a sorted term whose token count exceeds this: such a term's +// flat positions buffer (uint32 per token) would be the peak-RSS transient (tens of +// MiB for the widest term). Below it, the flat buffer is cheap and simpler. +static constexpr uint32_t kStreamPositionsTokenThreshold = 1u << 16; // 65536 + +TermPostings SpimiTermBuffer::to_postings(std::string term, Term&& t, + bool allow_stream_positions) const { + TermPostings tp; + tp.term = std::move(term); + if (t.ntok == 0 || t.head == kNoChain) return tp; + + // Reserve docids/freqs by ntok (an upper bound on the doc count: ntok >= ndocs). + // The doc count is not stored separately to keep Term compact; since the corpus + // is freq~1 per (term, doc), ntok ~= ndocs so the over-reserve is negligible. + tp.docids.reserve(t.ntok); + tp.freqs.reserve(t.ntok); + + // For a large SORTED term, stream positions on demand instead of materializing a + // multi-MiB flat buffer: the writer (prx builder) pulls them window by window via + // pos_pump, decoding straight from the still-resident arena chain. Out-of-order + // terms (rare, defensive) need a full sort, so they always use the flat path. 
+ const bool stream_pos = allow_stream_positions && has_positions_ && t.sorted && + t.ntok >= kStreamPositionsTokenThreshold; + if (has_positions_ && !stream_pos) tp.positions_flat.reserve(t.ntok); + + CompactPostingPool::Cursor c = pool_.cursor(t.head, t.w.cur); + int64_t prev = 0; + for (uint32_t i = 0; i < t.ntok; ++i) { + const uint64_t tagged = DecodeChainVarint(&c); + const bool new_doc = (tagged & 1u) != 0; + if (new_doc) { + prev += zigzag_decode(DecodeChainVarint(&c)); + tp.docids.push_back(static_cast(prev)); + tp.freqs.push_back(0); + } + ++tp.freqs.back(); // count this token toward the current doc's freq + if (has_positions_ && !stream_pos) { + tp.positions_flat.push_back(static_cast(tagged >> 1)); + } + } + + // Decide the FINAL position handling now that df (= docids.size()) is known. + // pos_pump is honored ONLY by the windowed writer path (build_windowed_entry), + // taken when df >= kSlimDfThreshold. A SLIM term (df below it) goes through + // build_slim_entry, which reads positions_flat directly -- so streaming would + // leave it empty and crash. A high-ntok but low-df term (many repeats in few + // docs) therefore falls back to materializing its df-bounded positions here. + const bool windowed_path = tp.docids.size() >= snii::format::kSlimDfThreshold; + if (stream_pos && windowed_path) { + // Hand the writer a sequential position source backed by a SECOND pass over the + // same chain (the chain stays resident in pool_ for the whole drain). The pump + // yields positions in document order -- identical to positions_flat -- so the + // produced .prx is byte-for-byte the same. The cursor is shared/advanced across + // calls (the writer pulls in order, exactly pos_total positions total). + tp.pos_total = t.ntok; + auto cur = std::make_shared(pool_.cursor(t.head, t.w.cur)); + tp.pos_pump = [cur](uint32_t* dst, size_t count) { + // Re-walk the tagged token stream, yielding one position per token. 
A new-doc + // token is followed by a zigzag docid-delta varint that must be consumed and + // discarded so the cursor stays aligned with the encoding. + for (size_t k = 0; k < count; ++k) { + const uint64_t tagged = DecodeChainVarint(cur.get()); + if ((tagged & 1u) != 0) (void)DecodeChainVarint(cur.get()); // skip docid delta + dst[k] = static_cast(tagged >> 1); + } + }; + } else if (stream_pos && has_positions_) { + // Slim fallback: the decode loop skipped positions (stream candidate) but the + // term is slim, so materialize positions_flat in a second pass for build_slim. + tp.positions_flat.reserve(t.ntok); + CompactPostingPool::Cursor pc = pool_.cursor(t.head, t.w.cur); + for (uint32_t i = 0; i < t.ntok; ++i) { + const uint64_t tagged = DecodeChainVarint(&pc); + if ((tagged & 1u) != 0) (void)DecodeChainVarint(&pc); // skip docid delta + tp.positions_flat.push_back(static_cast(tagged >> 1)); + } + } else if (!t.sorted) { + // Defensive reorder for the rare out-of-order-docid feed (merge of pre-sorted + // runs). The common ascending path leaves t.sorted true and skips it. + SortByDocid(&tp.docids, &tp.freqs, &tp.positions_flat, has_positions_); + } + return tp; +} + +void SpimiTermBuffer::ensure_string_rank() const { + const std::vector& v = vocab(); + if (string_rank_.size() == v.size()) return; // already built (or empty vocab) + // One full lexicographic sort of the vocabulary, amortized over every spill. 
+ std::vector order(v.size()); + std::iota(order.begin(), order.end(), 0u); + std::sort(order.begin(), order.end(), [&](uint32_t a, uint32_t b) { return v[a] < v[b]; }); + string_rank_.assign(v.size(), 0u); + for (uint32_t rank = 0; rank < order.size(); ++rank) { + string_rank_[order[rank]] = rank; + } +} + +std::vector SpimiTermBuffer::sorted_ids() const { + ensure_string_rank(); + std::vector ids = touched_ids_; + const std::vector& rank = string_rank_; + // Integer rank compare instead of full std::string compare: equal-string ids + // cannot occur for a dense vocab, so a strict rank order matches the original + // lexicographic order exactly. + std::sort(ids.begin(), ids.end(), [&](uint32_t a, uint32_t b) { return rank[a] < rank[b]; }); + return ids; +} + +void SpimiTermBuffer::release_term(uint32_t term_id) { + const uint32_t enc = slot_of_[term_id]; + if (enc == 0) return; // not live (defensive) + const uint32_t slot = enc - 1; + slots_[slot] = Term(); // free this term's arrays; the empty Term slot is reusable + free_slots_.push_back(slot); + slot_of_[term_id] = 0; + --live_term_count_; +} + +Status SpimiTermBuffer::drain_sorted(const std::function& fn, + bool allow_stream_positions) { + const std::vector& v = vocab(); + for (uint32_t id : sorted_ids()) { + Term term = std::move(slots_[slot_of_[id] - 1]); + release_term(id); // release this term's slot before building the next + // Allow streaming positions only when the caller consumes synchronously (the + // arena chain stays resident for the whole drain, so the pump can read from it). + TermPostings tp = to_postings(v[id], std::move(term), allow_stream_positions); + fn(std::move(tp)); + } + touched_ids_.clear(); + // Drop the arena + the slot pool (their bytes are fully decoded) and return the + // freed chunks to the OS so the process peak reflects only what survives the + // drain, not retained input-phase arena memory. 
+ pool_.reset(); + std::vector().swap(slots_); + std::vector().swap(free_slots_); + std::vector().swap(slot_of_); + TrimMalloc(); + // Arena reset + slot_of_ freed: now real resident ~0, so this emits the final + // negative that returns every reported byte (no leak after the in-memory drain). + report_arena_delta(); + return Status::OK(); +} + +Status SpimiTermBuffer::drain_to_writer(RunWriter* w) { + Status st = Status::OK(); + const std::vector& v = vocab(); + // Spill writes by term-id (no string IO). Iterate touched ids in vocab-string + // order so each run is sorted; the k-way merge re-orders runs by the same key. + for (uint32_t id : sorted_ids()) { + Term term = std::move(slots_[slot_of_[id] - 1]); + release_term(id); + // Spill path: the run codec serializes positions_flat directly, so positions + // must be materialized (no streaming pump). + TermPostings tp = to_postings(v[id], std::move(term), /*allow_stream=*/false); + if (st.ok()) st = w->write_term(id, tp); + } + touched_ids_.clear(); + pool_.reset(); // all chains decoded into the run; free the arena for the refill + // The spill returns the arena to 0; slot_of_ keeps its capacity (survives + // the spill). Report the arena-drop negative now so the gate-2 spill is balanced + // immediately, not deferred to the next token. + report_arena_delta(); + return st; +} + +Status SpimiTermBuffer::spill_to_run() { + const std::string dir = resolve_temp_dir(); + // Best-effort space pre-check: fail with a clear, early error rather than a + // mid-write IoError that leaves a half-written run. Best-effort only (TOCTOU; on + // tmpfs this reports RAM). resident_bytes() (arena + slot index) is the REAL + // resident figure about to drain -- a conservative over-estimate of the run size. 
+ const uint64_t resident = resident_bytes(); + const uint64_t avail = temp_dir_available_bytes(dir); + if (avail < resident) { + return Status::IoError("spimi: insufficient temp space in '" + dir + "' to spill ~" + + std::to_string(resident) + " B (~" + std::to_string(avail) + + " B free); set SNII_TEMP_DIR/TMPDIR to a larger disk"); + } + const std::string path = MakeRunPath(dir); + RunWriter w; + SNII_RETURN_IF_ERROR(w.open(path)); + run_paths_.push_back(path); // tracked for cleanup even if a later step fails + SNII_RETURN_IF_ERROR(drain_to_writer(&w)); + // drain emptied touched_ids_ and freed each term's arrays; terms_/present_ keep + // their (vocab-sized) capacity so the next fill reuses the dense slots with no + // re-allocation. present_ is already all-zero after release_term per id. + return w.close(); +} + +Status SpimiTermBuffer::merge_runs(const std::function& fn, + bool allow_stream_positions) { + // Flush whatever is still resident as one final sorted run so the k-way merge + // sees a uniform set of run files (and never holds two term sources at once). + if (!touched_ids_.empty()) { + Status s = spill_to_run(); + if (!s.ok() && spill_status_.ok()) spill_status_ = s; + } + if (!spill_status_.ok()) return spill_status_; // a spill or add_token error; emit nothing + // All terms are now spilled; the merge reads runs and never touches the + // accumulators. Free the pool + the vocab-sized slot index so the merge phase + // holds none of the input-side arrays resident -- keeps spill-mode peak RSS + // down. malloc_trim(0) returns the freed glibc arenas to the OS so the peak RSS + // measurement reflects the merge transient, not retained input-phase chunks. + std::vector().swap(slots_); + std::vector().swap(free_slots_); + std::vector().swap(slot_of_); + TrimMalloc(); + // pool_ was already reset by the final spill_to_run -> drain_to_writer (reported + // there); this swap frees slot_of_, so report the remaining negative now. 
After a + // full spilled drain reported_resident_ returns to 0 (no leak). + report_arena_delta(); + Status s = MergeRuns(run_paths_, vocab(), has_positions_, fn, allow_stream_positions); + // The merge churns one large coalesced TermPostings per term (the widest term's + // arrays are tens of MiB) plus per-run reader windows; on completion glibc + // retains those freed chunks in its arenas. Trim again so the post-merge resident + // set (and thus the process peak high-water if a later phase allocates) reflects + // only live state, not merge-transient retention. + TrimMalloc(); + return s; +} + +Status SpimiTermBuffer::for_each_term_sorted(const std::function& fn) { + // Single-drain contract: a second call would re-merge the (still-present) run + // files and re-emit every term, or emit nothing in the in-memory path. Return + // an error and emit NOTHING rather than produce a wrong second stream. + if (drained_) { + return Status::Internal("spimi: already drained (single-drain contract)"); + } + drained_ = true; + // The callback is invoked synchronously while the arena is resident, so large + // sorted terms may stream positions via pos_pump (peak-RSS win for the writer). + if (run_paths_.empty() && spill_status_.ok()) { + return drain_sorted(fn, /*allow_stream_positions=*/true); // pure in-memory path + } + // Spilled path (or add_token latched a validation error): the merge may STREAM + // a wide term's positions via pos_pump (fn consumes each term synchronously + // while the run readers stay parked). merge_runs returns the I/O status + // directly; add_token validation errors surface via spill_status_ inside it. + return merge_runs(fn, /*allow_stream_positions=*/true); +} + +std::vector SpimiTermBuffer::finalize_sorted() { + std::vector out; + // Single-drain contract (mirrors for_each_term_sorted): a second drain (including + // a finalize_sorted after a for_each_term_sorted, or vice versa) would re-emit or + // emit nothing. 
Latch an error and return EMPTY rather than a wrong result. + if (drained_) { + if (spill_status_.ok()) { + spill_status_ = Status::Internal("spimi: already drained (single-drain contract)"); + } + return out; + } + drained_ = true; + out.reserve(touched_ids_.size()); + // RETAINS each TermPostings past the drain, so positions must be MATERIALIZED + // (a streamed pos_pump would reference the arena, freed when the drain ends). + if (run_paths_.empty() && spill_status_.ok()) { + Status s = drain_sorted([&out](TermPostings&& tp) { out.push_back(std::move(tp)); }, + /*allow_stream_positions=*/false); + if (!s.ok() && spill_status_.ok()) spill_status_ = s; + } else { + // RETAINS each TermPostings past the merge, so positions MUST be materialized + // (a streamed pos_pump would reference run readers freed when the merge ends). + Status s = merge_runs([&out](TermPostings&& tp) { out.push_back(std::move(tp)); }, + /*allow_stream_positions=*/false); + if (!s.ok() && spill_status_.ok()) spill_status_ = s; + } + return out; +} + +void SpimiTermBuffer::cleanup_runs() { + for (const std::string& p : run_paths_) std::remove(p.c_str()); + run_paths_.clear(); +} + +} // namespace snii::writer diff --git a/be/src/storage/index/snii/snii_doris_adapter.cpp b/be/src/storage/index/snii/snii_doris_adapter.cpp new file mode 100644 index 00000000000000..00176daba08ac3 --- /dev/null +++ b/be/src/storage/index/snii/snii_doris_adapter.cpp @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/snii/snii_doris_adapter.h" + +#include + +namespace doris::segment_v2::snii_doris { + +Status to_doris_status(const ::snii::Status& status) { + if (status.ok()) { + return Status::OK(); + } + switch (status.code()) { + case ::snii::StatusCode::kNotFound: + return Status::Error("SNII: {}", + status.message()); + case ::snii::StatusCode::kUnsupported: + return Status::Error("SNII: {}", status.message()); + case ::snii::StatusCode::kInvalidArgument: + return Status::Error("SNII: {}", status.message()); + case ::snii::StatusCode::kCorruption: + return Status::Error("SNII: {}", + status.message()); + case ::snii::StatusCode::kIoError: + return Status::IOError("SNII: {}", status.message()); + case ::snii::StatusCode::kInternal: + return Status::InternalError("SNII: {}", status.message()); + case ::snii::StatusCode::kOk: + break; + } + return Status::InternalError("SNII: {}", status.message()); +} + +::snii::Status to_snii_status(const Status& status) { + if (status.ok()) { + return ::snii::Status::OK(); + } + return ::snii::Status::IoError(status.to_string_no_stack()); +} + +::snii::Status DorisSniiFileWriter::append(::snii::Slice data) { + if (_writer == nullptr) { + return ::snii::Status::InvalidArgument("doris writer is null"); + } + return to_snii_status( + _writer->append(Slice(reinterpret_cast(data.data()), data.size()))); +} + +::snii::Status DorisSniiFileWriter::finalize() { + if (_writer == nullptr) { + return ::snii::Status::InvalidArgument("doris writer is null"); + } + return ::snii::Status::OK(); +} + +uint64_t 
DorisSniiFileWriter::bytes_written() const { + return _writer == nullptr ? 0 : _writer->bytes_appended(); +} + +::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, + std::vector* out) { + if (_reader == nullptr) { + return ::snii::Status::InvalidArgument("doris reader is null"); + } + if (out == nullptr) { + return ::snii::Status::InvalidArgument("output buffer is null"); + } + out->resize(len); + size_t bytes_read = 0; + auto status = _reader->read_at(offset, Slice(out->data(), len), &bytes_read, _io_ctx); + if (!status.ok()) { + return to_snii_status(status); + } + if (bytes_read != len) { + return ::snii::Status::IoError( + fmt::format("short read at offset {}, expect {}, got {}", offset, len, bytes_read)); + } + return ::snii::Status::OK(); +} + +uint64_t DorisSniiFileReader::size() const { + return _reader == nullptr ? 0 : _reader->size(); +} + +} // namespace doris::segment_v2::snii_doris diff --git a/be/src/storage/index/snii/snii_doris_adapter.h b/be/src/storage/index/snii/snii_doris_adapter.h new file mode 100644 index 00000000000000..bcd50bca99de28 --- /dev/null +++ b/be/src/storage/index/snii/snii_doris_adapter.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "common/status.h" +#include "io/fs/file_reader.h" +#include "io/fs/file_writer.h" +#include "snii/common/status.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" +#include "util/slice.h" + +namespace doris::segment_v2::snii_doris { + +Status to_doris_status(const ::snii::Status& status); +::snii::Status to_snii_status(const Status& status); + +class DorisSniiFileWriter final : public ::snii::io::FileWriter { +public: + explicit DorisSniiFileWriter(io::FileWriter* writer) : _writer(writer) {} + + ::snii::Status append(::snii::Slice data) override; + ::snii::Status finalize() override; + uint64_t bytes_written() const override; + +private: + io::FileWriter* _writer = nullptr; +}; + +class DorisSniiFileReader final : public ::snii::io::FileReader { +public: + explicit DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx = nullptr) + : _reader(std::move(reader)), _io_ctx(io_ctx) {} + + ::snii::Status read_at(uint64_t offset, size_t len, std::vector* out) override; + uint64_t size() const override; + +private: + io::FileReaderSPtr _reader; + const io::IOContext* _io_ctx = nullptr; +}; + +} // namespace doris::segment_v2::snii_doris diff --git a/be/src/storage/index/snii/snii_index_reader.cpp b/be/src/storage/index/snii/snii_index_reader.cpp new file mode 100644 index 00000000000000..7cb6dcf05137ee --- /dev/null +++ b/be/src/storage/index/snii/snii_index_reader.cpp @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/snii/snii_index_reader.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "runtime/runtime_profile.h" +#include "runtime/runtime_state.h" +#include "snii/format/null_bitmap.h" +#include "snii/query/boolean_query.h" +#include "snii/query/phrase_query.h" +#include "snii/query/prefix_query.h" +#include "snii/query/regexp_query.h" +#include "snii/query/term_query.h" +#include "snii/query/wildcard_query.h" +#include "storage/index/index_file_reader.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/inverted_index_cache.h" +#include "storage/index/inverted/inverted_index_iterator.h" +#include "storage/index/snii/snii_doris_adapter.h" + +namespace doris::segment_v2 { + +namespace { + +std::vector to_terms(const InvertedIndexQueryInfo& query_info) { + std::vector terms; + terms.reserve(query_info.term_infos.size()); + for (const auto& term_info : query_info.term_infos) { + DCHECK(term_info.is_single_term()); + terms.push_back(term_info.get_single_term()); + } + return terms; +} + +void parse_phrase_slop(std::string* query, InvertedIndexQueryInfo* query_info) { + DCHECK(query != nullptr); + DCHECK(query_info != nullptr); + const auto is_digits = [](std::string_view str) { + return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); }); + }; + + const size_t last_space_pos = query->find_last_of(' '); + if (last_space_pos == std::string::npos) { + return; + } + const size_t 
tilde_pos = last_space_pos + 1; + if (tilde_pos >= query->size() - 1 || (*query)[tilde_pos] != '~') { + return; + } + + const size_t slop_pos = tilde_pos + 1; + std::string_view slop_str(query->data() + slop_pos, query->size() - slop_pos); + if (slop_str.empty()) { + return; + } + + bool ordered = false; + if (slop_str.size() == 1) { + if (!std::isdigit(static_cast(slop_str[0]))) { + return; + } + } else if (slop_str.back() == '+') { + ordered = true; + slop_str.remove_suffix(1); + } + + if (!is_digits(slop_str)) { + return; + } + auto result = std::from_chars(slop_str.begin(), slop_str.end(), query_info->slop); + if (result.ec != std::errc()) { + return; + } + query_info->ordered = ordered; + *query = query->substr(0, last_space_pos); +} + +} // namespace + +Status SniiIndexReader::new_iterator(std::unique_ptr* iterator) { + if (*iterator == nullptr) { + *iterator = InvertedIndexIterator::create_unique(); + } + dynamic_cast(iterator->get()) + ->add_reader(_reader_type, + dynamic_pointer_cast(shared_from_this())); + return Status::OK(); +} + +Status SniiIndexReader::_parse_query_terms(const IndexQueryContextPtr& context, + std::string search_str, + InvertedIndexQueryType query_type, + const InvertedIndexAnalyzerCtx* analyzer_ctx, + InvertedIndexQueryInfo* query_info) { + DCHECK(query_info != nullptr); + if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY || + query_type == InvertedIndexQueryType::WILDCARD_QUERY) { + query_info->term_infos.emplace_back(search_str, 0); + return Status::OK(); + } + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { + parse_phrase_slop(&search_str, query_info); + SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer); + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, _index_meta.properties()); + return Status::OK(); + } + + 
// Evaluates `query_value` against this SNII index and stores the matching doc ids in `bit_map`.
//
// Flow, as implemented below:
//   1. For STRING_TYPE readers, reject search strings longer than the index's ignore_above.
//   2. Turn the search string into terms via _parse_query_terms().
//   3. Build a query-cache key and ask handle_query_cache(); when it returns true we return OK
//      without touching the index.
//   4. Otherwise open the SNII logical index, dispatch on `query_type`, convert the resulting
//      doc ids into a bitmap and insert that bitmap into the cache.
//
// Range-style query types and any unlisted type return an error Status.
Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::string& column_name,
                              const Field& query_value, InvertedIndexQueryType query_type,
                              std::shared_ptr& bit_map,
                              const InvertedIndexAnalyzerCtx* analyzer_ctx) {
    SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
    std::string search_str = query_value.get();

    // NOTE(review): std::stoi throws on a non-numeric property value and is not guarded here;
    // presumably the property is validated upstream — confirm.
    // NOTE(review): size() is unsigned while ignore_above is int, so a negative ignore_above
    // would be converted to a very large unsigned value in this comparison — confirm that
    // negative values cannot occur.
    if (int ignore_above =
                std::stoi(get_parser_ignore_above_value_from_properties(_index_meta.properties()));
        _reader_type == InvertedIndexReaderType::STRING_TYPE && search_str.size() > ignore_above) {
        return Status::Error(
                "query value is too long, evaluate skipped.");
    }

    InvertedIndexQueryInfo query_info;
    RETURN_IF_ERROR(_parse_query_terms(context, search_str, query_type, analyzer_ctx, &query_info));
    if (query_info.term_infos.empty()) {
        auto msg = fmt::format("token parser result is empty for SNII query '{}'", search_str);
        // Match-style queries with no tokens yield an empty result instead of an error.
        if (is_match_query(query_type)) {
            LOG(WARNING) << msg;
            bit_map = std::make_shared();
            return Status::OK();
        }
        return Status::Error(msg);
    }

    auto terms = to_terms(query_info);
    // The cache key is derived from the analysed tokens; phrase queries additionally encode
    // slop and ordering so differently-parameterised phrases do not share an entry.
    std::string cache_value = query_info.generate_tokens_key();
    if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
        cache_value += " " + std::to_string(query_info.slop);
        cache_value += " " + std::to_string(query_info.ordered);
    }
    auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
    InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type,
                                                 std::move(cache_value)};
    auto* cache = InvertedIndexQueryCache::instance();
    InvertedIndexQueryCacheHandle cache_handler;
    // Presumably fills `bit_map` from the cache when it returns true — confirm in the helper.
    if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) {
        return Status::OK();
    }

    // Cache miss: initialise the index file reader and open this index's SNII reader.
    RETURN_IF_ERROR(
            _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx));
    auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta));

    std::vector docids;
    snii::Status status;
    switch (query_type) {
    case InvertedIndexQueryType::EQUAL_QUERY:
    case InvertedIndexQueryType::MATCH_ANY_QUERY:
        // A single term is a plain term lookup; several terms are OR-ed together.
        status = terms.size() == 1
                         ? snii::query::term_query(*logical_reader, terms.front(), &docids)
                         : snii::query::boolean_or(*logical_reader, terms, &docids);
        break;
    case InvertedIndexQueryType::MATCH_ALL_QUERY:
        status = snii::query::boolean_and(*logical_reader, terms, &docids);
        break;
    case InvertedIndexQueryType::MATCH_PHRASE_QUERY:
        // NOTE(review): _parse_query_terms() also parses a slop suffix for
        // MATCH_PHRASE_PREFIX_QUERY, but only this branch rejects a non-zero slop —
        // confirm that ignoring it for phrase-prefix queries is intended.
        if (query_info.slop != 0) {
            return Status::Error(
                    "SNII does not support sloppy phrase query yet");
        }
        // A one-term phrase degenerates to a term lookup.
        status = terms.size() == 1
                         ? snii::query::term_query(*logical_reader, terms.front(), &docids)
                         : snii::query::phrase_query(*logical_reader, terms, &docids);
        break;
    case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY:
        status = snii::query::phrase_prefix_query(*logical_reader, terms, &docids);
        break;
    case InvertedIndexQueryType::MATCH_REGEXP_QUERY:
        // Regexp and wildcard queries use the raw search string, not the analysed terms.
        status = snii::query::regexp_query(*logical_reader, search_str, &docids);
        break;
    case InvertedIndexQueryType::WILDCARD_QUERY:
        status = snii::query::wildcard_query(*logical_reader, search_str, &docids);
        break;
    case InvertedIndexQueryType::LESS_THAN_QUERY:
    case InvertedIndexQueryType::LESS_EQUAL_QUERY:
    case InvertedIndexQueryType::GREATER_THAN_QUERY:
    case InvertedIndexQueryType::GREATER_EQUAL_QUERY:
    case InvertedIndexQueryType::RANGE_QUERY:
        return Status::Error(
                "SNII inverted index storage format does not support BKD/range query");
    default:
        return Status::Error(
                "SNII unsupported inverted index query type {}", query_type_to_string(query_type));
    }
    RETURN_IF_ERROR(snii_doris::to_doris_status(status));
    _docids_to_bitmap(docids, &bit_map);
    cache->insert(cache_key, bit_map, &cache_handler);
    return Status::OK();
}
(cache->lookup(cache_key, cache_handle)) { + return Status::OK(); + } + + RETURN_IF_ERROR( + _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx)); + auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta)); + auto null_bitmap = std::make_shared(); + const auto& ref = logical_reader->section_refs().null_bitmap; + if (ref.length > 0) { + std::vector bytes; + RETURN_IF_ERROR(snii_doris::to_doris_status( + logical_reader->reader()->read_at(ref.offset, ref.length, &bytes))); + snii::format::NullBitmapReader reader; + RETURN_IF_ERROR(snii_doris::to_doris_status( + snii::format::NullBitmapReader::open(snii::Slice(bytes), &reader))); + for (uint32_t docid = 0; docid < reader.doc_count(); ++docid) { + if (reader.is_null(docid)) { + null_bitmap->add(docid); + } + } + null_bitmap->runOptimize(); + } + cache->insert(cache_key, null_bitmap, cache_handle); + return Status::OK(); +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/snii/snii_index_reader.h b/be/src/storage/index/snii/snii_index_reader.h new file mode 100644 index 00000000000000..e7c4b00bf68c0b --- /dev/null +++ b/be/src/storage/index/snii/snii_index_reader.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
// InvertedIndexReader implementation for the SNII inverted index storage format.
// The reader type is fixed at construction and reported by type().
class SniiIndexReader final : public InvertedIndexReader {
    ENABLE_FACTORY_CREATOR(SniiIndexReader);

public:
    // `index_meta` and `index_file_reader` are forwarded to the InvertedIndexReader base;
    // `reader_type` is stored and returned unchanged by type().
    SniiIndexReader(const TabletIndex* index_meta,
                    const std::shared_ptr& index_file_reader,
                    InvertedIndexReaderType reader_type)
            : InvertedIndexReader(index_meta, index_file_reader), _reader_type(reader_type) {}

    // Creates `*iterator` when it is null and registers this reader on it.
    Status new_iterator(std::unique_ptr* iterator) override;
    // Runs `query_type` for `query_value` and returns the matching doc ids in `bit_map`.
    // Range-style query types are rejected with an error Status.
    Status query(const IndexQueryContextPtr& context, const std::string& column_name,
                 const Field& query_value, InvertedIndexQueryType query_type,
                 std::shared_ptr& bit_map,
                 const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) override;
    // Not supported by SNII: the implementation always returns an error Status.
    Status try_query(const IndexQueryContextPtr& context, const std::string& column_name,
                     const Field& query_value, InvertedIndexQueryType query_type,
                     size_t* count) override;
    // Loads the null bitmap of this index into the query cache and returns it through
    // `cache_handle`. The implementation ignores `dir`.
    Status read_null_bitmap(const IndexQueryContextPtr& context,
                            InvertedIndexQueryCacheHandle* cache_handle,
                            lucene::store::Directory* dir = nullptr) override;
    InvertedIndexReaderType type() override { return _reader_type; }

private:
    // Turns `search_str` into the term list of `query_info` according to `query_type`.
    Status _parse_query_terms(const IndexQueryContextPtr& context, std::string search_str,
                              InvertedIndexQueryType query_type,
                              const InvertedIndexAnalyzerCtx* analyzer_ctx,
                              InvertedIndexQueryInfo* query_info);
    // Builds a new bitmap from `docids` and stores it in `*bit_map`.
    static void _docids_to_bitmap(const std::vector& docids,
                                  std::shared_ptr* bit_map);

    InvertedIndexReaderType _reader_type;
};
// Reads the index properties and prepares the writer: decides whether values are analysed
// and whether positions are recorded, creates the term buffer, fills the analyzer
// configuration, and creates the char-filter reader (plus the analyzer when analysis is on).
// Returns an error Status when reader/analyzer creation throws CLuceneError or Exception.
Status SniiIndexColumnWriter::init() {
    _should_analyzer =
            inverted_index::InvertedIndexAnalyzer::should_analyzer(_index_meta->properties());
    // Positions are stored only when the index declares phrase support.
    _has_positions = get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
                     INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES;
    _config = _has_positions ? snii::format::IndexConfig::kDocsPositions
                             : snii::format::IndexConfig::kDocsOnly;
    auto ignore_above_value =
            get_parser_ignore_above_value_from_properties(_index_meta->properties());
    // NOTE(review): std::stoul throws std::invalid_argument / std::out_of_range on bad input
    // and this call sits outside the try block below; presumably the property is validated
    // before it reaches the BE — confirm.
    _ignore_above = cast_set(std::stoul(ignore_above_value));
    // The config value is scaled by 1024 * 1024 before being handed to the term buffer
    // (presumably MiB -> bytes; confirm the config's unit).
    const auto spill_threshold =
            static_cast(config::inverted_index_ram_buffer_size * 1024 * 1024);
    _term_buffer = std::make_unique(_has_positions, spill_threshold);
    _analyzer_config.analyzer_name = get_analyzer_name_from_properties(_index_meta->properties());
    _analyzer_config.parser_type = get_inverted_index_parser_type_from_string(
            get_parser_string_from_properties(_index_meta->properties()));
    _analyzer_config.parser_mode =
            get_parser_mode_string_from_properties(_index_meta->properties());
    _analyzer_config.char_filter_map =
            get_parser_char_filter_map_from_properties(_index_meta->properties());
    _analyzer_config.lower_case =
            get_parser_lowercase_from_properties(_index_meta->properties());
    _analyzer_config.stop_words = get_parser_stopwords_from_properties(_index_meta->properties());
    try {
        // The char-filter reader is always created; the analyzer only when values are analysed.
        _char_string_reader = inverted_index::InvertedIndexAnalyzer::create_reader(
                _analyzer_config.char_filter_map);
        if (_should_analyzer) {
            _analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer(&_analyzer_config);
        }
    } catch (const CLuceneError& e) {
        return Status::Error(
                "SNII create analyzer failed: {}", e.what());
    } catch (const Exception& e) {
        return Status::Error(
                "SNII create analyzer failed: {}", e.what());
    }
    return Status::OK();
}
inverted_index::InvertedIndexAnalyzer::get_analyse_result(_char_string_reader, + _analyzer.get()); + } catch (const CLuceneError& e) { + return Status::Error( + "SNII analyze value failed: {}", e.what()); + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::_add_value_tokens(const Slice& value, uint32_t docid, + uint32_t position_base, uint32_t* max_position) { + DCHECK(max_position != nullptr); + *max_position = position_base; + if ((!_should_analyzer && value.size > _ignore_above) || (_should_analyzer && value.empty())) { + return Status::OK(); + } + + std::vector terms; + RETURN_IF_ERROR(_analyze(value, &terms)); + for (const auto& term_info : terms) { + DCHECK(term_info.is_single_term()); + const auto& term = term_info.get_single_term(); + const uint32_t position = + _has_positions ? position_base + cast_set(term_info.position) : 0; + _term_buffer->add_token(term, docid, position); + *max_position = std::max(*max_position, position); + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_values(const std::string /*name*/, const void* values, + size_t count) { + const auto* v = reinterpret_cast(values); + for (size_t i = 0; i < count; ++i) { + uint32_t max_position = 0; + RETURN_IF_ERROR(_add_value_tokens(*v, _rid, 0, &max_position)); + ++v; + ++_rid; + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, + const uint8_t* nested_null_map, + const uint8_t* offsets_ptr, size_t count) { + if (count == 0) { + return Status::OK(); + } + const auto* offsets = reinterpret_cast(offsets_ptr); + size_t start_off = 0; + for (size_t i = 0; i < count; ++i) { + auto array_elem_size = offsets[i + 1] - offsets[i]; + uint32_t position_base = 0; + for (auto j = start_off; j < start_off + array_elem_size; ++j) { + if (nested_null_map != nullptr && nested_null_map[j] == 1) { + continue; + } + const auto* value = reinterpret_cast( + reinterpret_cast(value_ptr) + j * field_size); + uint32_t 
max_position = position_base; + RETURN_IF_ERROR(_add_value_tokens(*value, _rid, position_base, &max_position)); + position_base = max_position + 1; + } + start_off += array_elem_size; + ++_rid; + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_nulls(uint32_t count) { + _null_docids.reserve(_null_docids.size() + count); + for (uint32_t i = 0; i < count; ++i) { + _null_docids.push_back(_rid + i); + } + _rid += count; + return Status::OK(); +} + +Status SniiIndexColumnWriter::add_array_nulls(const uint8_t* null_map, size_t num_rows) { + DCHECK(_rid >= num_rows); + if (num_rows == 0 || null_map == nullptr) { + return Status::OK(); + } + const auto first_row = _rid - num_rows; + for (size_t i = 0; i < num_rows; ++i) { + if (null_map[i] == 1) { + _null_docids.push_back(cast_set(first_row + i)); + } + } + return Status::OK(); +} + +Status SniiIndexColumnWriter::finish() { + DCHECK(_term_buffer != nullptr); + auto status = _term_buffer->status(); + if (!status.ok()) { + return Status::InternalError("SNII term buffer error: {}", status.to_string()); + } + RETURN_IF_ERROR(_index_file_writer->add_snii_index(_index_meta, cast_set(_rid), + std::move(_null_docids), _term_buffer.get(), + _config)); + _term_buffer.reset(); + return Status::OK(); +} + +void SniiIndexColumnWriter::close_on_error() { + _term_buffer.reset(); + _null_docids.clear(); +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/snii/snii_index_writer.h b/be/src/storage/index/snii/snii_index_writer.h new file mode 100644 index 00000000000000..bbdcd3389df630 --- /dev/null +++ b/be/src/storage/index/snii/snii_index_writer.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
// IndexColumnWriter implementation that collects the tokens and null rows of one column and
// hands them to the IndexFileWriter as an SNII index when finish() is called.
class SniiIndexColumnWriter final : public IndexColumnWriter {
public:
    // Stores the two pointers as raw pointers. The constructor's definition ignores
    // `single_field`.
    // NOTE(review): presumably both pointers are non-owning and must outlive the writer — confirm.
    SniiIndexColumnWriter(IndexFileWriter* index_file_writer, const TabletIndex* index_meta,
                          bool single_field);
    ~SniiIndexColumnWriter() override = default;

    // Reads the index properties and creates the term buffer / analyzer.
    Status init() override;
    // Adds `count` consecutive values; each value occupies one row id.
    Status add_values(const std::string name, const void* values, size_t count) override;
    // Adds `count` array rows; all non-null elements of one array share that row's id.
    Status add_array_values(size_t field_size, const void* value_ptr,
                            const uint8_t* nested_null_map, const uint8_t* offsets_ptr,
                            size_t count) override;
    // Records the next `count` rows as NULL and advances the row cursor.
    Status add_nulls(uint32_t count) override;
    // Marks rows among the last `num_rows` already-added rows as NULL according to `null_map`;
    // it does not advance the row cursor.
    Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override;
    // Hands the collected terms and null rows to the index file writer.
    Status finish() override;
    // NOTE(review): always reports 0 — confirm whether callers rely on size() for memory
    // accounting.
    int64_t size() const override { return 0; }
    // Drops the term buffer and the collected null rows.
    void close_on_error() override;

private:
    // Tokenises `value` and adds its tokens to the term buffer for row `docid`.
    Status _add_value_tokens(const Slice& value, uint32_t docid, uint32_t position_base,
                             uint32_t* max_position);
    // Produces the terms of `value`: the whole value as one term when analysis is off,
    // the analyzer's output otherwise.
    Status _analyze(const Slice& value, std::vector* terms);

    IndexFileWriter* _index_file_writer = nullptr;
    const TabletIndex* _index_meta = nullptr;
    // True when values are run through the analyzer.
    bool _should_analyzer = false;
    // True when token positions are recorded (index declares phrase support).
    bool _has_positions = false;
    // Non-analysed values longer than this are skipped.
    uint32_t _ignore_above = 0;
    // Row cursor: id of the next row to be added.
    uint32_t _rid = 0;
    snii::format::IndexConfig _config = snii::format::IndexConfig::kDocsOnly;
    InvertedIndexAnalyzerConfig _analyzer_config;
    inverted_index::ReaderPtr _char_string_reader;
    std::shared_ptr _analyzer;
    std::unique_ptr _term_buffer;
    // Row ids recorded as NULL, passed to the index file writer in finish().
    std::vector _null_docids;
};
rapidjson::Value(seg_id).Move(), allocator); @@ -846,24 +862,20 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, fs, std::string(index_file_path_prefix), storage_format, InvertedIndexFileInfo(), _rowset_meta->tablet_id()); RETURN_IF_ERROR(index_file_reader->init()); + if (storage_format == InvertedIndexStorageFormatPB::SNII) { + rapidjson::Value index_file(rapidjson::kObjectType); + auto index_file_path = + InvertedIndexDescriptor::get_index_file_path_v2(index_file_path_prefix); + RETURN_IF_ERROR(add_file_info_to_json(index_file_path, index_file)); + segment.AddMember("index_files", rapidjson::Value(rapidjson::kArrayType).Move(), + allocator); + auto& index_files = segment["index_files"]; + index_files.PushBack(index_file, allocator); + segments.PushBack(segment, allocator); + continue; + } auto dirs = index_file_reader->get_all_directories(); - auto add_file_info_to_json = [&](const std::string& path, - rapidjson::Value& json_value) -> Status { - json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator), - allocator); - int64_t idx_file_size = 0; - auto st = fs->file_size(path, &idx_file_size); - if (st != Status::OK()) { - LOG(WARNING) << "show nested index file get file size error, file: " << path - << ", error: " << st.msg(); - return st; - } - json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(), - allocator); - return Status::OK(); - }; - auto process_files = [&allocator, &index_file_reader](auto& index_meta, rapidjson::Value& indices, rapidjson::Value& index) -> Status { diff --git a/be/src/storage/segment/column_reader.cpp b/be/src/storage/segment/column_reader.cpp index ebb1887c8ee920..262c1dd048be16 100644 --- a/be/src/storage/segment/column_reader.cpp +++ b/be/src/storage/segment/column_reader.cpp @@ -55,6 +55,7 @@ #include "storage/index/index_reader.h" #include "storage/index/inverted/analyzer/analyzer.h" #include "storage/index/inverted/inverted_index_reader.h" +#include 
"storage/index/snii/snii_index_reader.h" #include "storage/index/zone_map/zone_map_index.h" #include "storage/iterators.h" #include "storage/olap_common.h" @@ -647,6 +648,17 @@ Status ColumnReader::_load_index(const std::shared_ptr& index_f } IndexReaderPtr index_reader; + if (index_file_reader->get_storage_format() == InvertedIndexStorageFormatPB::SNII) { + if (!is_string_type(type)) { + return Status::Error( + "SNII inverted index storage format does not support BKD index type {}", type); + } + auto reader_type = should_analyzer ? InvertedIndexReaderType::FULLTEXT + : InvertedIndexReaderType::STRING_TYPE; + index_reader = SniiIndexReader::create_shared(index_meta, index_file_reader, reader_type); + _index_readers[index_meta->index_id()] = index_reader; + return Status::OK(); + } if (is_string_type(type)) { if (should_analyzer) { diff --git a/be/src/storage/tablet/tablet_meta.cpp b/be/src/storage/tablet/tablet_meta.cpp index b289cda58e7d3b..1e0660339fb4ec 100644 --- a/be/src/storage/tablet/tablet_meta.cpp +++ b/be/src/storage/tablet/tablet_meta.cpp @@ -101,6 +101,9 @@ TabletMetaSharedPtr TabletMeta::create( case TInvertedIndexStorageFormat::V2: inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V2; break; + case TInvertedIndexStorageFormat::SNII: + inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::SNII; + break; default: break; } @@ -495,6 +498,9 @@ void TabletMeta::init_schema_from_thrift(const TTabletSchema& tablet_schema, case TInvertedIndexFileStorageFormat::V3: tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3); break; + case TInvertedIndexFileStorageFormat::SNII: + tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::SNII); + break; default: tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3); break; diff --git a/be/src/storage/task/index_builder.cpp b/be/src/storage/task/index_builder.cpp index 
ef49626e143ab5..7f6f4632c184a2 100644 --- a/be/src/storage/task/index_builder.cpp +++ b/be/src/storage/task/index_builder.cpp @@ -421,6 +421,11 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta _olap_data_convertor->reserve(_alter_inverted_indexes.size()); std::unique_ptr index_file_writer = nullptr; + if (output_rowset_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::SNII) { + return Status::Error( + "BUILD INDEX is not supported for SNII inverted index storage format yet"); + } if (output_rowset_schema->get_inverted_index_storage_format() >= InvertedIndexStorageFormatPB::V2) { auto idx_file_reader_iter = _index_file_readers.find( diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 54129adf81bed0..3e8def3c9710a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -105,6 +105,12 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c checkInvertedIndexProperties(properties, colType, invertedIndexFileStorageFormat); } + if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII + && !colType.isStringType() && !colType.isArrayType()) { + throw new AnalysisException("SNII inverted index storage format only supports string columns, column: " + + indexColName + " type: " + colType); + } + // default is "none" if not set if (parser == null) { parser = INVERTED_INDEX_PARSER_NONE; diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java index b208e712c273c4..24337cd4929316 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java @@ -379,11 +379,15 @@ public OlapFile.TabletMetaCloudPB.Builder createTabletMetaBuilder(long tableId, schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2); } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V3) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3); + } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.SNII); } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.DEFAULT) { if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V1); } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2); + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) { + schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.SNII); } else { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index b27db96bbe176b..392131a8cd4ea1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -1219,6 +1219,8 @@ public static TInvertedIndexFileStorageFormat analyzeInvertedIndexFileStorageFor return TInvertedIndexFileStorageFormat.V1; } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) { return TInvertedIndexFileStorageFormat.V2; + } else if 
(Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) { + return TInvertedIndexFileStorageFormat.SNII; } else { return TInvertedIndexFileStorageFormat.V3; } @@ -1230,11 +1232,15 @@ public static TInvertedIndexFileStorageFormat analyzeInvertedIndexFileStorageFor return TInvertedIndexFileStorageFormat.V2; } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v3")) { return TInvertedIndexFileStorageFormat.V3; + } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("snii")) { + return TInvertedIndexFileStorageFormat.SNII; } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("default")) { if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) { return TInvertedIndexFileStorageFormat.V1; } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V2")) { return TInvertedIndexFileStorageFormat.V2; + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("SNII")) { + return TInvertedIndexFileStorageFormat.SNII; } else { return TInvertedIndexFileStorageFormat.V3; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java index 8630d80b7dc0ab..414052ef4f096c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java @@ -177,6 +177,17 @@ public void checkColumn(ColumnDefinition column, KeysType keysType, throw new AnalysisException(colType + " is not supported in " + indexType.toString() + " index. 
" + "invalid index: " + name); } + if (indexType == IndexType.INVERTED + && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + boolean isStringIndex = colType.isStringLikeType() + || (colType.isArrayType() + && ((ArrayType) colType).getItemType().isStringLikeType()); + if (!isStringIndex) { + throw new AnalysisException( + "SNII inverted index storage format does not support BKD index on column: " + + indexColName); + } + } // In inverted index format v1, each subcolumn of a variant has its own index file, leading to high IOPS. // when the subcolumn type changes, it may result in missing files, causing link file failure. @@ -280,6 +291,17 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe throw new AnalysisException(colType + " is not supported in " + indexType.toString() + " index. " + "invalid index: " + name); } + if (indexType == IndexType.INVERTED + && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + boolean isStringIndex = colType.isStringType() + || (colType.isArrayType() + && ((org.apache.doris.catalog.ArrayType) columnType).getItemType().isStringType()); + if (!isStringIndex) { + throw new AnalysisException( + "SNII inverted index storage format does not support BKD index on column: " + + indexColName); + } + } if (indexType == IndexType.ANN && !colType.isArrayType()) { throw new AnalysisException("ANN index column must be array type"); diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 210a5ba0a1cf89..8577957927e64d 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -446,6 +446,7 @@ enum InvertedIndexStorageFormatPB { V1 = 0; V2 = 1; V3 = 2; + SNII = 3; } // Tablet-level storage format. 
Values match TStorageFormat (Thrift) integer values so diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift index 4b4780d933c6e2..8917efd68cd31a 100644 --- a/gensrc/thrift/AgentService.thrift +++ b/gensrc/thrift/AgentService.thrift @@ -196,7 +196,8 @@ enum TCompressionType { enum TInvertedIndexStorageFormat { DEFAULT = 0, // Default format, unspecified storage method. V1 = 1, // Index per idx: Each index is stored separately based on its identifier. - V2 = 2 // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. + V2 = 2, // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. + SNII = 4 // SNII native inverted index storage format } enum TBinlogFormat { diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift index c6b9c705307380..d088a936b9e05f 100644 --- a/gensrc/thrift/Types.thrift +++ b/gensrc/thrift/Types.thrift @@ -130,7 +130,8 @@ enum TInvertedIndexFileStorageFormat { DEFAULT = 0, // Default format, unspecified storage method. V1 = 1, // Index per idx: Each index is stored separately based on its identifier. V2 = 2, // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. - V3 = 3 // Position and dictionary compression + V3 = 3, // Position and dictionary compression + SNII = 4 // SNII native inverted index storage format } struct TScalarType { From 9f036b8fa041cc017e4ae7235fdd7b021d7d3488 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sat, 27 Jun 2026 10:23:22 +0800 Subject: [PATCH 02/12] [test](regression) Add SNII storage format regression ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Add a focused regression case for the SNII inverted index storage format. 
The test creates a string inverted index with inverted_index_storage_format=SNII, verifies MATCH_ANY, MATCH_ALL, MATCH_PHRASE, and NULL bitmap behavior, and validates that numeric/BKD inverted index creation is rejected for SNII. ### Release note None ### Check List (For Author) - Test: Regression test - bash /mnt/disk1/jiangkai/cloud_sim/bootstrap.sh run -- --run -d inverted_index_p0/storage_format -s test_storage_format_snii -genOut - bash /mnt/disk1/jiangkai/cloud_sim/bootstrap.sh run -- --run -d inverted_index_p0/storage_format -s test_storage_format_snii - Behavior changed: No - Does this need documentation: No --- .../test_storage_format_snii.out | 13 +++ .../test_storage_format_snii.groovy | 89 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out create mode 100644 regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy diff --git a/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out new file mode 100644 index 00000000000000..68526469767db5 --- /dev/null +++ b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !match_any -- +1 +2 + +-- !match_all -- +1 + +-- !match_phrase -- +5 + +-- !null_bitmap -- +4 diff --git a/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy new file mode 100644 index 00000000000000..8101e9b1b32c98 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_storage_format_snii", "p0, nonConcurrent") { + sql "DROP TABLE IF EXISTS test_storage_format_snii" + sql "DROP TABLE IF EXISTS test_storage_format_snii_bkd" + + sql """ + CREATE TABLE test_storage_format_snii ( + id INT NULL, + body TEXT NULL, + INDEX idx_body (`body`) USING INVERTED PROPERTIES( + "parser" = "english", + "support_phrase" = "true", + "lower_case" = "true" + ) COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true", + "inverted_index_storage_format" = "SNII" + ); + """ + + sql """ + INSERT INTO test_storage_format_snii VALUES + (1, 'alpha beta gamma'), + (2, 'alpha delta'), + (3, 'beta epsilon'), + (4, NULL), + (5, 'quick brown fox'), + (6, 'quick fox'); + """ + sql "sync" + + order_qt_match_any """ + SELECT id FROM test_storage_format_snii + WHERE body MATCH_ANY 'alpha' + ORDER BY id + """ + order_qt_match_all """ + SELECT id FROM test_storage_format_snii + WHERE body MATCH_ALL 'alpha beta' + ORDER BY id + """ + order_qt_match_phrase """ + SELECT id FROM test_storage_format_snii + WHERE body MATCH_PHRASE 'quick brown' + ORDER BY id + """ + order_qt_null_bitmap """ + SELECT id FROM test_storage_format_snii + WHERE body IS NULL + ORDER BY id + """ + + test { + sql """ + CREATE TABLE test_storage_format_snii_bkd ( + id INT NULL, + score INT NULL, + INDEX idx_score (`score`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + exception "SNII inverted index storage format" + } +} From de80cb3b1868c03c3fbfb04ef7863a25744b2326 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sat, 27 Jun 2026 12:56:56 +0800 Subject: [PATCH 03/12] [fix](be) Harden SNII inverted index integration ### What problem does this PR solve? 
Issue Number: None Related PR: None Problem Summary: This change completes the SNII inverted index storage-format split by routing SNII reads and writes through the SNII implementation, preserving Doris IO context during SNII file reads, bounding expanding queries before materializing all prefix terms, and rejecting unsupported SNII operations such as BKD-backed indexes, ANN indexes, and BUILD INDEX. It also avoids applying old CLucene index-compaction/drop-index paths to SNII files and adds focused FE and regression coverage for unsupported paths. ### Release note SNII inverted index storage format rejects unsupported BKD, ANN, and BUILD INDEX operations. ### Check List (For Author) - Test: - Build: ./build.sh --be --fe -j 192 - Build: ./build.sh --be -j 192 - Unit Test: ./run-fe-ut.sh --run org.apache.doris.nereids.trees.plans.commands.IndexDefinitionTest,org.apache.doris.alter.IndexChangeJobTest - Regression test: ./bootstrap.sh run -- --run -d inverted_index_p0/storage_format -s test_storage_format_snii -forceGenOut; ./bootstrap.sh run -- --run -d inverted_index_p0/storage_format -s test_storage_format_snii - Format: ./build-support/clang-format.sh; ./build-support/check-format.sh; git diff --check - Static Analysis: ./build-support/run-clang-tidy.sh and ./build-support/run-clang-tidy.sh --build-dir be/build_Release attempted; failed because clang-tidy could not resolve system stddef.h and also reported existing large-function/NOLINT diagnostics outside the safe scope of this SNII integration. - Behavior changed: Yes. SNII explicitly rejects unsupported BKD/ANN/BUILD INDEX paths instead of falling through to non-SNII index handling. 
- Does this need documentation: No --- be/src/snii/format/null_bitmap.h | 3 + be/src/snii/query/internal/term_expansion.h | 3 +- be/src/snii/query/phrase_query.h | 8 +- be/src/snii/query/prefix_query.h | 7 +- be/src/snii/query/regexp_query.h | 7 +- be/src/snii/query/wildcard_query.h | 7 +- be/src/snii/reader/logical_index_reader.h | 10 +- be/src/storage/compaction/compaction.cpp | 6 + be/src/storage/index/index_file_reader.cpp | 10 +- be/src/storage/index/index_file_writer.cpp | 12 +- be/src/storage/index/index_file_writer.h | 8 +- .../snii/core/src/format/null_bitmap.cpp | 12 +- .../snii/core/src/query/phrase_query.cpp | 49 ++++--- .../snii/core/src/query/prefix_query.cpp | 22 ++-- .../snii/core/src/query/regexp_query.cpp | 29 +++-- .../snii/core/src/query/term_expansion.cpp | 25 ++-- .../snii/core/src/query/wildcard_query.cpp | 26 ++-- .../core/src/reader/logical_index_reader.cpp | 99 ++++++++++---- .../storage/index/snii/snii_doris_adapter.cpp | 108 ++++++++++++++- .../storage/index/snii/snii_doris_adapter.h | 22 +++- .../storage/index/snii/snii_index_reader.cpp | 82 ++++++++---- .../storage/index/snii/snii_index_writer.cpp | 11 +- be/src/storage/index/snii/snii_index_writer.h | 2 + be/src/storage/task/index_builder.cpp | 7 + .../plans/commands/info/BuildIndexOp.java | 5 + .../plans/commands/info/CreateTableInfo.java | 6 +- .../plans/commands/info/IndexDefinition.java | 15 ++- .../doris/alter/IndexChangeJobTest.java | 42 ++++++ .../plans/commands/IndexDefinitionTest.java | 64 +++++++++ .../test_storage_format_snii.out | 3 + .../test_storage_format_snii.groovy | 123 ++++++++++++++++++ 31 files changed, 698 insertions(+), 135 deletions(-) diff --git a/be/src/snii/format/null_bitmap.h b/be/src/snii/format/null_bitmap.h index efe5880a101f55..21c6f92be59709 100644 --- a/be/src/snii/format/null_bitmap.h +++ b/be/src/snii/format/null_bitmap.h @@ -76,6 +76,9 @@ class NullBitmapReader { // Number of distinct null docids in the bitmap. 
uint32_t null_count() const; + // Copies the decoded bitmap into the caller-owned Roaring object. + void copy_to(roaring::Roaring* out) const; + // Total doc count of the logical index, as recorded by the writer. uint32_t doc_count() const { return doc_count_; } diff --git a/be/src/snii/query/internal/term_expansion.h b/be/src/snii/query/internal/term_expansion.h index 3b9753b4df267e..3393c31dc8457a 100644 --- a/be/src/snii/query/internal/term_expansion.h +++ b/be/src/snii/query/internal/term_expansion.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -16,6 +17,6 @@ using TermMatcher = std::function; // DictEntry and block bases, so callers avoid a second lookup per expanded term. Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx, std::string_view enum_prefix, const TermMatcher& matches, - DocIdSink* sink); + DocIdSink* const sink, int32_t max_expansions = 0); } // namespace snii::query::internal diff --git a/be/src/snii/query/phrase_query.h b/be/src/snii/query/phrase_query.h index bcafc9dcb67516..0de44c1fdbd921 100644 --- a/be/src/snii/query/phrase_query.h +++ b/be/src/snii/query/phrase_query.h @@ -29,9 +29,11 @@ Status phrase_query(const snii::reader::LogicalIndexReader& idx, // term prefix and preceding items are exact terms. For example {"quick", "bro"} // matches "quick brown" and "quick bronze". Empty terms -> empty result. 
Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx, - const std::vector& terms, std::vector* docids); + const std::vector& terms, + std::vector* const docids, int32_t max_expansions = 0); Status phrase_prefix_query(const snii::reader::LogicalIndexReader& idx, - const std::vector& terms, std::vector* docids, - QueryProfile* profile); + const std::vector& terms, + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); } // namespace snii::query diff --git a/be/src/snii/query/prefix_query.h b/be/src/snii/query/prefix_query.h index e7937733396797..cd8dc5559f3232 100644 --- a/be/src/snii/query/prefix_query.h +++ b/be/src/snii/query/prefix_query.h @@ -15,10 +15,11 @@ namespace snii::query { Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, - std::vector* docids); + std::vector* const docids, int32_t max_expansions = 0); Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, - std::vector* docids, QueryProfile* profile); + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); Status prefix_query(const snii::reader::LogicalIndexReader& idx, std::string_view prefix, - DocIdSink* sink); + DocIdSink* const sink, int32_t max_expansions = 0); } // namespace snii::query diff --git a/be/src/snii/query/regexp_query.h b/be/src/snii/query/regexp_query.h index 801dec8f2c677d..a088ed42dcc1f8 100644 --- a/be/src/snii/query/regexp_query.h +++ b/be/src/snii/query/regexp_query.h @@ -15,10 +15,11 @@ namespace snii::query { Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids); + std::vector* const docids, int32_t max_expansions = 0); Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids, QueryProfile* profile); + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); Status regexp_query(const 
snii::reader::LogicalIndexReader& idx, std::string_view pattern, - DocIdSink* sink); + DocIdSink* const sink, int32_t max_expansions = 0); } // namespace snii::query diff --git a/be/src/snii/query/wildcard_query.h b/be/src/snii/query/wildcard_query.h index de66450e3fda69..1cb0d5551dcf09 100644 --- a/be/src/snii/query/wildcard_query.h +++ b/be/src/snii/query/wildcard_query.h @@ -15,10 +15,11 @@ namespace snii::query { Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids); + std::vector* const docids, int32_t max_expansions = 0); Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids, QueryProfile* profile); + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions = 0); Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - DocIdSink* sink); + DocIdSink* const sink, int32_t max_expansions = 0); } // namespace snii::query diff --git a/be/src/snii/reader/logical_index_reader.h b/be/src/snii/reader/logical_index_reader.h index 64c87203daabd2..b10a5d7c7791f5 100644 --- a/be/src/snii/reader/logical_index_reader.h +++ b/be/src/snii/reader/logical_index_reader.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -61,13 +62,18 @@ class LogicalIndexReader { uint64_t prx_base = 0; }; + using PrefixHitVisitor = std::function; + // Ordered term enumeration: every term with `prefix`, in lexicographic order, // by seeking the start DICT block via the SampledTermIndex and scanning // forward across contiguous blocks until the terms pass the prefix range. // Empty prefix enumerates all terms. This is the contiguous-DICT-block design // the term-anchor layout was built for (MATCH_PHRASE_PREFIX / prefix / range - // queries). - Status prefix_terms(std::string_view prefix, std::vector* out) const; + // queries). 
The visitor form avoids materializing all hits when callers only + // need a bounded expansion. + Status visit_prefix_terms(std::string_view prefix, const PrefixHitVisitor& visitor) const; + Status prefix_terms(std::string_view prefix, std::vector* const out, + int32_t max_terms = 0) const; // Resolves a pod_ref entry's absolute .frq / .prx window byte range, // validating the locator against the posting_region length (defends against diff --git a/be/src/storage/compaction/compaction.cpp b/be/src/storage/compaction/compaction.cpp index df2fee8b1146d8..5f040fae3ac00f 100644 --- a/be/src/storage/compaction/compaction.cpp +++ b/be/src/storage/compaction/compaction.cpp @@ -1221,6 +1221,12 @@ static bool check_rowset_has_inverted_index(const RowsetSharedPtr& src_rs, int32 } void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { + if (_cur_tablet_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::SNII) { + LOG(INFO) << "tablet[" << _tablet->tablet_id() + << "] uses SNII inverted index storage format, skip CLucene index compaction"; + return; + } for (const auto& index : _cur_tablet_schema->inverted_indexes()) { auto col_unique_ids = index->col_unique_ids(); // check if column unique ids is empty to avoid crash diff --git a/be/src/storage/index/index_file_reader.cpp b/be/src/storage/index/index_file_reader.cpp index bb43015fa966d4..433987e1d5f80b 100644 --- a/be/src/storage/index/index_file_reader.cpp +++ b/be/src/storage/index/index_file_reader.cpp @@ -139,7 +139,7 @@ Status IndexFileReader::_init_from(int32_t read_buffer_size, const io::IOContext return Status::OK(); } -Status IndexFileReader::_init_snii(const io::IOContext* /*io_ctx*/) { +Status IndexFileReader::_init_snii(const io::IOContext* io_ctx) { auto index_file_full_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); int64_t file_size = -1; if (_idx_file_info.has_index_size()) { @@ -154,6 +154,7 @@ Status 
IndexFileReader::_init_snii(const io::IOContext* /*io_ctx*/) { RETURN_IF_ERROR(_fs->open_file(index_file_full_path, &reader, &opts)); _snii_file_reader = std::make_shared(std::move(reader)); _snii_segment_reader = std::make_unique(); + snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(io_ctx); RETURN_IF_ERROR(snii_doris::to_doris_status(snii::reader::SniiSegmentReader::open( _snii_file_reader.get(), _snii_segment_reader.get()))); return Status::OK(); @@ -309,7 +310,12 @@ Status IndexFileReader::index_file_exist(const TabletIndex* index_meta, bool* re } else if (_storage_format == InvertedIndexStorageFormatPB::SNII) { auto index_file_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); RETURN_IF_ERROR(_fs->exists(index_file_path, res)); - if (!*res || _snii_segment_reader == nullptr) { + if (!*res) { + return Status::OK(); + } + std::shared_lock lock(_mutex); + if (_snii_segment_reader == nullptr) { + *res = false; return Status::OK(); } auto logical_reader = std::make_unique(); diff --git a/be/src/storage/index/index_file_writer.cpp b/be/src/storage/index/index_file_writer.cpp index 96541efc436404..665cb185d4aae7 100644 --- a/be/src/storage/index/index_file_writer.cpp +++ b/be/src/storage/index/index_file_writer.cpp @@ -105,8 +105,9 @@ Result> IndexFileWriter::open(const TabletInde Status IndexFileWriter::add_snii_index(const TabletIndex* index_meta, uint32_t doc_count, std::vector null_docids, - snii::writer::SpimiTermBuffer* term_buffer, - snii::format::IndexConfig index_config) { + snii::writer::SpimiTermBuffer* const term_buffer, + snii::format::IndexConfig index_config, + snii::writer::MemoryReporter* const mem_reporter) { DCHECK(_storage_format == InvertedIndexStorageFormatPB::SNII); DCHECK(index_meta != nullptr); DCHECK(term_buffer != nullptr); @@ -127,11 +128,18 @@ Status IndexFileWriter::add_snii_index(const TabletIndex* index_meta, uint32_t d input.doc_count = doc_count; input.null_docids = std::move(null_docids); 
input.term_source = term_buffer; + input.mem_reporter = mem_reporter; RETURN_IF_ERROR(snii_doris::to_doris_status(_snii_compound_writer->add_logical_index(input))); ++_snii_index_count; return Status::OK(); } +void IndexFileWriter::retain_snii_memory_reporter( + std::unique_ptr mem_reporter) { + DCHECK(mem_reporter != nullptr); + _snii_memory_reporters.push_back(std::move(mem_reporter)); +} + Status IndexFileWriter::delete_index(const TabletIndex* index_meta) { DBUG_EXECUTE_IF("IndexFileWriter::delete_index_index_meta_nullptr", { index_meta = nullptr; }); if (!index_meta) { diff --git a/be/src/storage/index/index_file_writer.h b/be/src/storage/index/index_file_writer.h index 7cf02c686400ed..7f16d19cb90e74 100644 --- a/be/src/storage/index/index_file_writer.h +++ b/be/src/storage/index/index_file_writer.h @@ -39,6 +39,7 @@ #include "storage/index/snii/snii_doris_adapter.h" namespace snii::writer { +class MemoryReporter; class SpimiTermBuffer; class SniiCompoundWriter; } // namespace snii::writer @@ -69,8 +70,10 @@ class IndexFileWriter { MOCK_FUNCTION Result> open(const TabletIndex* index_meta); Status add_snii_index(const TabletIndex* index_meta, uint32_t doc_count, std::vector null_docids, - snii::writer::SpimiTermBuffer* term_buffer, - snii::format::IndexConfig config); + snii::writer::SpimiTermBuffer* const term_buffer, + snii::format::IndexConfig config, + snii::writer::MemoryReporter* const mem_reporter); + void retain_snii_memory_reporter(std::unique_ptr mem_reporter); Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); Status add_into_searcher_cache(); @@ -130,6 +133,7 @@ class IndexFileWriter { IndexStorageFormatPtr _index_storage_format; int64_t _tablet_id = -1; std::unique_ptr _snii_file_writer; + std::vector> _snii_memory_reporters; std::unique_ptr _snii_compound_writer; size_t _snii_index_count = 0; diff --git a/be/src/storage/index/snii/core/src/format/null_bitmap.cpp 
b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp index 2ca7be630fe06d..d805cd2e945563 100644 --- a/be/src/storage/index/snii/core/src/format/null_bitmap.cpp +++ b/be/src/storage/index/snii/core/src/format/null_bitmap.cpp @@ -11,7 +11,9 @@ namespace snii::format { -NullBitmapWriter::NullBitmapWriter() : bitmap_(std::make_unique()) {} +NullBitmapWriter:: + NullBitmapWriter() // NOLINT(modernize-use-equals-default): roaring type is incomplete in the header. + : bitmap_(std::make_unique()) {} NullBitmapWriter::~NullBitmapWriter() = default; @@ -39,7 +41,9 @@ void NullBitmapWriter::finish(uint32_t doc_count, ByteSink* sink) const { SectionFramer::write(*sink, kNullBitmapSectionType, payload.view()); } -NullBitmapReader::NullBitmapReader() : bitmap_(std::make_unique()) {} +NullBitmapReader:: + NullBitmapReader() // NOLINT(modernize-use-equals-default): roaring type is incomplete in the header. + : bitmap_(std::make_unique()) {} NullBitmapReader::~NullBitmapReader() = default; @@ -96,4 +100,8 @@ uint32_t NullBitmapReader::null_count() const { return static_cast(bitmap_->cardinality()); } +void NullBitmapReader::copy_to(roaring::Roaring* out) const { + *out = *bitmap_; +} + } // namespace snii::format diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index a86a620a014992..7389d7a5ec96b6 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -565,11 +565,17 @@ Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx, } // namespace Status phrase_query(const LogicalIndexReader& idx, const std::vector& terms, - std::vector* docids) { - if (docids == nullptr) return Status::InvalidArgument("phrase_query: null out"); + std::vector* const docids) { + if (docids == nullptr) { + return Status::InvalidArgument("phrase_query: null out"); + } docids->clear(); - if (terms.empty()) return 
Status::OK(); - if (terms.size() == 1) return term_query(idx, terms.front(), docids); + if (terms.empty()) { + return Status::OK(); + } + if (terms.size() == 1) { + return term_query(idx, terms.front(), docids); + } if (!idx.has_positions()) { return Status::Unsupported("phrase_query: index has no positions"); } @@ -590,17 +596,23 @@ Status phrase_query(const LogicalIndexReader& idx, const std::vector& terms, - std::vector* docids, QueryProfile* profile) { + std::vector* const docids, QueryProfile* profile) { QueryProfileScope profile_scope(idx.reader(), profile); return phrase_query(idx, terms, docids); } Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector& terms, - std::vector* docids) { - if (docids == nullptr) return Status::InvalidArgument("phrase_prefix_query: null out"); + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("phrase_prefix_query: null out"); + } docids->clear(); - if (terms.empty()) return Status::OK(); - if (terms.size() == 1) return prefix_query(idx, terms.front(), docids); + if (terms.empty()) { + return Status::OK(); + } + if (terms.size() == 1) { + return prefix_query(idx, terms.front(), docids, max_expansions); + } if (!idx.has_positions()) { return Status::Unsupported("phrase_prefix_query: index has no positions"); } @@ -611,17 +623,23 @@ Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector tail_hits; - SNII_RETURN_IF_ERROR(idx.prefix_terms(terms.back(), &tail_hits)); - if (tail_hits.empty()) return Status::OK(); + SNII_RETURN_IF_ERROR(idx.prefix_terms(terms.back(), &tail_hits, max_expansions)); + if (tail_hits.empty()) { + return Status::OK(); + } std::vector expected; SNII_RETURN_IF_ERROR(CollectExpectedTailPositions(idx, exact_terms, &expected)); - if (expected.empty()) return Status::OK(); + if (expected.empty()) { + return Status::OK(); + } std::vector acc; for (LogicalIndexReader::PrefixHit& hit : tail_hits) { @@ -636,9 
+654,10 @@ Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector& terms, - std::vector* docids, QueryProfile* profile) { + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { QueryProfileScope profile_scope(idx.reader(), profile); - return phrase_prefix_query(idx, terms, docids); + return phrase_prefix_query(idx, terms, docids, max_expansions); } } // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/prefix_query.cpp b/be/src/storage/index/snii/core/src/query/prefix_query.cpp index 50d37cbbf38383..4ad9b6629bdf77 100644 --- a/be/src/storage/index/snii/core/src/query/prefix_query.cpp +++ b/be/src/storage/index/snii/core/src/query/prefix_query.cpp @@ -11,24 +11,30 @@ namespace snii::query { using snii::reader::LogicalIndexReader; Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, - std::vector* docids) { - if (docids == nullptr) return Status::InvalidArgument("prefix_query: null out"); + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("prefix_query: null out"); + } docids->clear(); VectorDocIdSink sink(*docids); - return prefix_query(idx, prefix, &sink); + return prefix_query(idx, prefix, &sink, max_expansions); } Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, - std::vector* docids, QueryProfile* profile) { + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { QueryProfileScope profile_scope(idx.reader(), profile); - return prefix_query(idx, prefix, docids); + return prefix_query(idx, prefix, docids, max_expansions); } -Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, DocIdSink* sink) { - if (sink == nullptr) return Status::InvalidArgument("prefix_query: null sink"); +Status prefix_query(const LogicalIndexReader& idx, std::string_view prefix, DocIdSink* const sink, + int32_t max_expansions) { + if (sink == nullptr) { + return 
Status::InvalidArgument("prefix_query: null sink"); + } std::vector hits; - SNII_RETURN_IF_ERROR(idx.prefix_terms(prefix, &hits)); + SNII_RETURN_IF_ERROR(idx.prefix_terms(prefix, &hits, max_expansions)); std::vector postings; postings.reserve(hits.size()); diff --git a/be/src/storage/index/snii/core/src/query/regexp_query.cpp b/be/src/storage/index/snii/core/src/query/regexp_query.cpp index 2078654e85fbf7..13377732b17201 100644 --- a/be/src/storage/index/snii/core/src/query/regexp_query.cpp +++ b/be/src/storage/index/snii/core/src/query/regexp_query.cpp @@ -36,10 +36,14 @@ bool is_regex_metachar(char c) { std::string literal_prefix_for_regex(std::string_view pattern) { std::string out; size_t i = 0; - if (!pattern.empty() && pattern.front() == '^') i = 1; + if (!pattern.empty() && pattern.front() == '^') { + i = 1; + } for (; i < pattern.size(); ++i) { const char c = pattern[i]; - if (is_regex_metachar(c)) break; + if (is_regex_metachar(c)) { + break; + } out.push_back(c); } return out; @@ -48,22 +52,27 @@ std::string literal_prefix_for_regex(std::string_view pattern) { } // namespace Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids) { - if (docids == nullptr) return Status::InvalidArgument("regexp_query: null out"); + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("regexp_query: null out"); + } docids->clear(); VectorDocIdSink sink(*docids); - return regexp_query(idx, pattern, &sink); + return regexp_query(idx, pattern, &sink, max_expansions); } Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids, QueryProfile* profile) { + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { QueryProfileScope profile_scope(idx.reader(), profile); - return regexp_query(idx, pattern, docids); + return regexp_query(idx, pattern, docids, max_expansions); } Status 
regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - DocIdSink* sink) { - if (sink == nullptr) return Status::InvalidArgument("regexp_query: null sink"); + DocIdSink* const sink, int32_t max_expansions) { + if (sink == nullptr) { + return Status::InvalidArgument("regexp_query: null sink"); + } std::regex re; try { @@ -76,7 +85,7 @@ Status regexp_query(const snii::reader::LogicalIndexReader& idx, std::string_vie return internal::emit_expanded_docid_union( idx, enum_prefix, [&re](std::string_view term) { return std::regex_match(term.begin(), term.end(), re); }, - sink); + sink, max_expansions); } } // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/query/term_expansion.cpp b/be/src/storage/index/snii/core/src/query/term_expansion.cpp index 4af0209bda9411..ce1cffb0f141f1 100644 --- a/be/src/storage/index/snii/core/src/query/term_expansion.cpp +++ b/be/src/storage/index/snii/core/src/query/term_expansion.cpp @@ -10,18 +10,23 @@ namespace snii::query::internal { Status emit_expanded_docid_union(const snii::reader::LogicalIndexReader& idx, std::string_view enum_prefix, const TermMatcher& matches, - DocIdSink* sink) { - if (sink == nullptr) return Status::InvalidArgument("term_expansion: null sink"); - - std::vector hits; - SNII_RETURN_IF_ERROR(idx.prefix_terms(enum_prefix, &hits)); + DocIdSink* const sink, int32_t max_expansions) { + if (sink == nullptr) { + return Status::InvalidArgument("term_expansion: null sink"); + } std::vector postings; - postings.reserve(hits.size()); - for (snii::reader::LogicalIndexReader::PrefixHit& hit : hits) { - if (!matches(hit.term)) continue; - postings.push_back({std::move(hit.entry), hit.frq_base, hit.prx_base}); - } + int32_t count = 0; + SNII_RETURN_IF_ERROR(idx.visit_prefix_terms( + enum_prefix, [&](snii::reader::LogicalIndexReader::PrefixHit&& hit, bool* stop) { + if (!matches(hit.term)) { + return Status::OK(); + } + postings.push_back({std::move(hit.entry), hit.frq_base, 
hit.prx_base}); + ++count; + *stop = max_expansions > 0 && count >= max_expansions; + return Status::OK(); + })); return emit_docid_union(idx, postings, sink); } diff --git a/be/src/storage/index/snii/core/src/query/wildcard_query.cpp b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp index 3398f4bcdedabd..a3d5fd72bfbb71 100644 --- a/be/src/storage/index/snii/core/src/query/wildcard_query.cpp +++ b/be/src/storage/index/snii/core/src/query/wildcard_query.cpp @@ -15,7 +15,9 @@ namespace { std::string literal_prefix_for_wildcard(std::string_view pattern) { std::string out; for (char c : pattern) { - if (c == '*' || c == '?') break; + if (c == '*' || c == '?') { + break; + } out.push_back(c); } return out; @@ -46,26 +48,32 @@ bool wildcard_match(std::string_view pattern, std::string_view text) { } // namespace Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids) { - if (docids == nullptr) return Status::InvalidArgument("wildcard_query: null out"); + std::vector* const docids, int32_t max_expansions) { + if (docids == nullptr) { + return Status::InvalidArgument("wildcard_query: null out"); + } docids->clear(); VectorDocIdSink sink(*docids); - return wildcard_query(idx, pattern, &sink); + return wildcard_query(idx, pattern, &sink, max_expansions); } Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - std::vector* docids, QueryProfile* profile) { + std::vector* const docids, QueryProfile* profile, + int32_t max_expansions) { QueryProfileScope profile_scope(idx.reader(), profile); - return wildcard_query(idx, pattern, docids); + return wildcard_query(idx, pattern, docids, max_expansions); } Status wildcard_query(const snii::reader::LogicalIndexReader& idx, std::string_view pattern, - DocIdSink* sink) { - if (sink == nullptr) return Status::InvalidArgument("wildcard_query: null sink"); + DocIdSink* const sink, int32_t max_expansions) { + if (sink == nullptr) 
{ + return Status::InvalidArgument("wildcard_query: null sink"); + } const std::string enum_prefix = literal_prefix_for_wildcard(pattern); return internal::emit_expanded_docid_union( idx, enum_prefix, - [pattern](std::string_view term) { return wildcard_match(pattern, term); }, sink); + [pattern](std::string_view term) { return wildcard_match(pattern, term); }, sink, + max_expansions); } } // namespace snii::query diff --git a/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp index bb3cb6b684388b..be6c01b2cb97d6 100644 --- a/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp +++ b/be/src/storage/index/snii/core/src/reader/logical_index_reader.cpp @@ -26,8 +26,8 @@ using snii::format::RegionRef; using snii::format::SampledTermIndexReader; namespace { -constexpr uint64_t kMaxDictBlockUncompBytes = 256ull * 1024 * 1024; -constexpr uint64_t kDefaultDictResidentMaxBytes = 256ull * 1024; +constexpr uint64_t kMaxDictBlockUncompBytes = 256ULL * 1024 * 1024; +constexpr uint64_t kDefaultDictResidentMaxBytes = 256ULL * 1024; // L0/L1 tiering threshold (bytes). 
Defaults to kBsbfResidentMaxBytes; the env // SNII_BSBF_RESIDENT_MAX overrides it for tuning and for exercising the @@ -37,7 +37,9 @@ uint64_t bsbf_resident_max_bytes() { if (s != nullptr) { char* end = nullptr; const unsigned long long v = std::strtoull(s, &end, 10); - if (end != s) return v; + if (end != s) { + return v; + } } return snii::format::kBsbfResidentMaxBytes; } @@ -47,7 +49,9 @@ uint64_t dict_resident_max_bytes() { if (s != nullptr) { char* end = nullptr; const unsigned long long v = std::strtoull(s, &end, 10); - if (end != s) return v; + if (end != s) { + return v; + } } return kDefaultDictResidentMaxBytes; } @@ -108,7 +112,9 @@ Status LogicalIndexReader::load_resident_dict_blocks() { resident_dict_blocks_.clear(); const uint64_t max_bytes = dict_resident_max_bytes(); - if (max_bytes == 0 || dbd_.n_blocks() == 0) return Status::OK(); + if (max_bytes == 0 || dbd_.n_blocks() == 0) { + return Status::OK(); + } uint64_t total_bytes = 0; for (uint32_t ord = 0; ord < dbd_.n_blocks(); ++ord) { @@ -159,7 +165,9 @@ Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tie if (file_reader == nullptr) { return Status::InvalidArgument("logical_index: null file reader"); } - if (out == nullptr) return Status::InvalidArgument("logical_index: null out"); + if (out == nullptr) { + return Status::InvalidArgument("logical_index: null out"); + } *out = LogicalIndexReader {}; out->reader_ = file_reader; @@ -179,8 +187,9 @@ Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tie // 32-byte block is read on demand per probe in lookup(). 
const RegionRef& bsbf = out->meta_.section_refs().bsbf; if (bsbf.length > 0) { - if (bsbf.length <= kBsbfHeaderSize) + if (bsbf.length <= kBsbfHeaderSize) { return Status::Corruption("logical_index: bsbf section too small"); + } const uint64_t num_bytes = bsbf.length - kBsbfHeaderSize; const bool resident = bsbf.length <= bsbf_resident_max_bytes(); // L0: read the WHOLE section (header + bitset) so probes are in-memory AND @@ -193,20 +202,24 @@ Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tie std::vector head; SNII_RETURN_IF_ERROR( file_reader->read_at(bsbf.offset, resident ? bsbf.length : kBsbfHeaderSize, &head)); - if (head.size() < kBsbfHeaderSize) + if (head.size() < kBsbfHeaderSize) { return Status::Corruption("logical_index: short bsbf header read"); + } SNII_RETURN_IF_ERROR(snii::format::BsbfHeader::parse(Slice(head.data(), kBsbfHeaderSize), bsbf.offset, &out->bsbf_header_)); // Cross-check the header geometry against the section ref. - if (out->bsbf_header_.num_bytes != num_bytes) + if (out->bsbf_header_.num_bytes != num_bytes) { return Status::Corruption("logical_index: bsbf header/section size mismatch"); + } out->has_bsbf_ = true; if (resident) { - if (head.size() < bsbf.length) + if (head.size() < bsbf.length) { return Status::Corruption("logical_index: short bsbf resident read"); + } const Slice bitset(head.data() + kBsbfHeaderSize, out->bsbf_header_.num_bytes); - if (snii::crc32c(bitset) != out->bsbf_header_.bitset_crc) + if (snii::crc32c(bitset) != out->bsbf_header_.bitset_crc) { return Status::Corruption("logical_index: bsbf bitset crc mismatch"); + } out->bsbf_resident_bitset_.assign(bitset.data(), bitset.data() + bitset.size()); out->bsbf_resident_ = true; } @@ -217,7 +230,9 @@ Status LogicalIndexReader::open(snii::io::FileReader* file_reader, IndexTier tie Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* entry, uint64_t* frq_base, uint64_t* prx_base) const { *found = false; - if 
(reader_ == nullptr) return Status::InvalidArgument("logical_index: not opened"); + if (reader_ == nullptr) { + return Status::InvalidArgument("logical_index: not opened"); + } // 1. XFilter fast rejection. DEFINITELY-ABSENT returns empty without the // DICT read. L0 probes the resident bitset; L1 reads one 32-byte block. @@ -234,14 +249,18 @@ Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* // L1: on-demand single-block probe. SNII_RETURN_IF_ERROR(bsbf_probe(reader_, bsbf_header_, h, &maybe)); } - if (!maybe) return Status::OK(); + if (!maybe) { + return Status::OK(); + } } // 2. SampledTermIndex -> candidate block ordinal. bool maybe = false; uint32_t ordinal = 0; SNII_RETURN_IF_ERROR(sti_.locate(term, &maybe, &ordinal)); - if (!maybe) return Status::OK(); + if (!maybe) { + return Status::OK(); + } // 3. Use a resident small-DICT block when present; otherwise read the DICT // block on demand and parse it with the same validation path used at open. @@ -251,7 +270,9 @@ Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* bool hit = false; SNII_RETURN_IF_ERROR(br->find_term(term, &hit, entry)); - if (!hit) return Status::OK(); + if (!hit) { + return Status::OK(); + } *found = true; *frq_base = br->frq_base(); @@ -259,11 +280,14 @@ Status LogicalIndexReader::lookup(std::string_view term, bool* found, DictEntry* return Status::OK(); } -Status LogicalIndexReader::prefix_terms(std::string_view prefix, - std::vector* out) const { - if (out == nullptr) return Status::InvalidArgument("logical_index: null out"); - out->clear(); - if (reader_ == nullptr) return Status::InvalidArgument("logical_index: not opened"); +Status LogicalIndexReader::visit_prefix_terms(std::string_view prefix, + const PrefixHitVisitor& visitor) const { + if (!visitor) { + return Status::InvalidArgument("logical_index: null prefix visitor"); + } + if (reader_ == nullptr) { + return Status::InvalidArgument("logical_index: not opened"); + } // 
Seek the start block: the SampledTermIndex block whose first term <= prefix // (terms with `prefix` are >= prefix, so they begin in that block or later). @@ -273,7 +297,9 @@ Status LogicalIndexReader::prefix_terms(std::string_view prefix, bool maybe = false; uint32_t ordinal = 0; SNII_RETURN_IF_ERROR(sti_.locate(prefix, &maybe, &ordinal)); - if (maybe) start = ordinal; + if (maybe) { + start = ordinal; + } } for (uint32_t ord = start; ord < dbd_.n_blocks(); ++ord) { @@ -285,21 +311,42 @@ Status LogicalIndexReader::prefix_terms(std::string_view prefix, for (DictEntry& e : entries) { const std::string_view t(e.term); - if (t < prefix) continue; // not yet at the prefix range + if (t < prefix) { + continue; // not yet at the prefix range + } const bool has_prefix = t.size() >= prefix.size() && t.compare(0, prefix.size(), prefix) == 0; - if (!has_prefix) return Status::OK(); // past the prefix range; sorted -> done + if (!has_prefix) { + return Status::OK(); // past the prefix range; sorted -> done + } PrefixHit hit; hit.term = e.term; hit.entry = std::move(e); hit.frq_base = br->frq_base(); hit.prx_base = br->prx_base(); - out->push_back(std::move(hit)); + bool stop = false; + SNII_RETURN_IF_ERROR(visitor(std::move(hit), &stop)); + if (stop) { + return Status::OK(); + } } } return Status::OK(); } +Status LogicalIndexReader::prefix_terms(std::string_view prefix, std::vector* const out, + int32_t max_terms) const { + if (out == nullptr) { + return Status::InvalidArgument("logical_index: null out"); + } + out->clear(); + return visit_prefix_terms(prefix, [&](PrefixHit&& hit, bool* stop) { + out->push_back(std::move(hit)); + *stop = max_terms > 0 && out->size() >= static_cast(max_terms); + return Status::OK(); + }); +} + namespace { // Validates a pod_ref window locator against the posting region and returns the @@ -311,7 +358,9 @@ Status resolve_window(const snii::format::RegionRef& section, uint64_t base, uin return Status::Corruption("logical_index: prelude_len exceeds 
window len"); } const uint64_t in_region = base + off_delta; - if (in_region < base) return Status::Corruption("logical_index: locator overflow"); + if (in_region < base) { + return Status::Corruption("logical_index: locator overflow"); + } if (in_region > section.length || total_len > section.length - in_region) { return Status::Corruption("logical_index: window past posting region"); } diff --git a/be/src/storage/index/snii/snii_doris_adapter.cpp b/be/src/storage/index/snii/snii_doris_adapter.cpp index 00176daba08ac3..40ac1767a0e76b 100644 --- a/be/src/storage/index/snii/snii_doris_adapter.cpp +++ b/be/src/storage/index/snii/snii_doris_adapter.cpp @@ -19,8 +19,16 @@ #include +#include +#include +#include + +#include "common/cast_set.h" + namespace doris::segment_v2::snii_doris { +thread_local const io::IOContext* DorisSniiFileReader::_scoped_io_ctx = nullptr; + Status to_doris_status(const ::snii::Status& status) { if (status.ok()) { return Status::OK(); @@ -72,6 +80,15 @@ uint64_t DorisSniiFileWriter::bytes_written() const { return _writer == nullptr ? 
0 : _writer->bytes_appended(); } +DorisSniiFileReader::ScopedIOContext::ScopedIOContext(const io::IOContext* io_ctx) + : _previous(_scoped_io_ctx) { + _scoped_io_ctx = io_ctx; +} + +DorisSniiFileReader::ScopedIOContext::~ScopedIOContext() { + _scoped_io_ctx = _previous; +} + ::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, std::vector* out) { if (_reader == nullptr) { @@ -80,9 +97,14 @@ ::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, if (out == nullptr) { return ::snii::Status::InvalidArgument("output buffer is null"); } + SNII_RETURN_IF_ERROR(_check_read_range(offset, len)); + if (len == 0) { + out->clear(); + return ::snii::Status::OK(); + } out->resize(len); size_t bytes_read = 0; - auto status = _reader->read_at(offset, Slice(out->data(), len), &bytes_read, _io_ctx); + auto status = _reader->read_at(offset, Slice(out->data(), len), &bytes_read, _current_io_ctx()); if (!status.ok()) { return to_snii_status(status); } @@ -93,8 +115,92 @@ ::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, return ::snii::Status::OK(); } +::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Range>& ranges, + std::vector>* outs) { + if (outs == nullptr) { + return ::snii::Status::InvalidArgument("output buffers is null"); + } + outs->clear(); + outs->resize(ranges.size()); + if (ranges.empty()) { + return ::snii::Status::OK(); + } + + struct IndexedRange { + uint64_t offset = 0; + size_t len = 0; + size_t index = 0; + }; + std::vector sorted; + sorted.reserve(ranges.size()); + for (size_t i = 0; i < ranges.size(); ++i) { + SNII_RETURN_IF_ERROR(_check_read_range(ranges[i].offset, ranges[i].len)); + if (ranges[i].len == 0) { + continue; + } + sorted.push_back({ranges[i].offset, ranges[i].len, i}); + } + if (sorted.empty()) { + return ::snii::Status::OK(); + } + std::sort(sorted.begin(), sorted.end(), [](const IndexedRange& lhs, const IndexedRange& rhs) { + return lhs.offset < rhs.offset; + 
}); + + constexpr uint64_t max_coalesced_gap = 4096; + constexpr uint64_t max_coalesced_read = 1ULL << 20; + for (size_t begin = 0; begin < sorted.size();) { + uint64_t read_offset = sorted[begin].offset; + uint64_t read_end = sorted[begin].offset + sorted[begin].len; + size_t end = begin + 1; + while (end < sorted.size()) { + const uint64_t next_end = sorted[end].offset + sorted[end].len; + if ((sorted[end].offset > read_end && + sorted[end].offset - read_end > max_coalesced_gap) || + next_end - read_offset > max_coalesced_read) { + break; + } + read_end = std::max(read_end, next_end); + ++end; + } + + std::vector bytes; + SNII_RETURN_IF_ERROR( + read_at(read_offset, cast_set(read_end - read_offset), &bytes)); + for (size_t i = begin; i < end; ++i) { + const uint64_t pos = sorted[i].offset - read_offset; + auto& out = (*outs)[sorted[i].index]; + out.assign(bytes.begin() + cast_set(pos), + bytes.begin() + cast_set(pos + sorted[i].len)); + } + begin = end; + } + return ::snii::Status::OK(); +} + uint64_t DorisSniiFileReader::size() const { return _reader == nullptr ? 0 : _reader->size(); } +const io::IOContext* DorisSniiFileReader::_current_io_ctx() const { + return _scoped_io_ctx != nullptr ? 
_scoped_io_ctx : _default_io_ctx; +} + +::snii::Status DorisSniiFileReader::_check_read_range(uint64_t offset, size_t len) const { + if (_reader == nullptr) { + return ::snii::Status::InvalidArgument("doris reader is null"); + } + if (offset > std::numeric_limits::max() - len) { + return ::snii::Status::Corruption( + fmt::format("read range overflows: offset {}, len {}", offset, len)); + } + const uint64_t end = offset + len; + if (end > _reader->size()) { + return ::snii::Status::Corruption( + fmt::format("read range exceeds file size: offset {}, len {}, file size {}", offset, + len, _reader->size())); + } + return ::snii::Status::OK(); +} + } // namespace doris::segment_v2::snii_doris diff --git a/be/src/storage/index/snii/snii_doris_adapter.h b/be/src/storage/index/snii/snii_doris_adapter.h index bcd50bca99de28..38158998e7c65c 100644 --- a/be/src/storage/index/snii/snii_doris_adapter.h +++ b/be/src/storage/index/snii/snii_doris_adapter.h @@ -47,15 +47,33 @@ class DorisSniiFileWriter final : public ::snii::io::FileWriter { class DorisSniiFileReader final : public ::snii::io::FileReader { public: + class ScopedIOContext { + public: + explicit ScopedIOContext(const io::IOContext* io_ctx); + ~ScopedIOContext(); + + ScopedIOContext(const ScopedIOContext&) = delete; + ScopedIOContext& operator=(const ScopedIOContext&) = delete; + + private: + const io::IOContext* _previous = nullptr; + }; + explicit DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx = nullptr) - : _reader(std::move(reader)), _io_ctx(io_ctx) {} + : _reader(std::move(reader)), _default_io_ctx(io_ctx) {} ::snii::Status read_at(uint64_t offset, size_t len, std::vector* out) override; + ::snii::Status read_batch(const std::vector<::snii::io::Range>& ranges, + std::vector>* outs) override; uint64_t size() const override; private: + ::snii::Status _check_read_range(uint64_t offset, size_t len) const; + const io::IOContext* _current_io_ctx() const; + io::FileReaderSPtr _reader; - const 
io::IOContext* _io_ctx = nullptr; + const io::IOContext* _default_io_ctx = nullptr; + static thread_local const io::IOContext* _scoped_io_ctx; }; } // namespace doris::segment_v2::snii_doris diff --git a/be/src/storage/index/snii/snii_index_reader.cpp b/be/src/storage/index/snii/snii_index_reader.cpp index 7cb6dcf05137ee..995c4ba51c3980 100644 --- a/be/src/storage/index/snii/snii_index_reader.cpp +++ b/be/src/storage/index/snii/snii_index_reader.cpp @@ -17,6 +17,7 @@ #include "storage/index/snii/snii_index_reader.h" +#include #include #include @@ -101,6 +102,21 @@ void parse_phrase_slop(std::string* query, InvertedIndexQueryInfo* query_info) { *query = query->substr(0, last_space_pos); } +std::string build_snii_query_cache_value(const InvertedIndexQueryInfo& query_info) { + std::string cache_value; + for (const auto& term_info : query_info.term_infos) { + DCHECK(term_info.is_single_term()); + const auto& term = term_info.get_single_term(); + cache_value.append(std::to_string(term.size())); + cache_value.push_back(':'); + cache_value.append(term); + cache_value.push_back('@'); + cache_value.append(std::to_string(term_info.position)); + cache_value.push_back(';'); + } + return cache_value; +} + } // namespace Status SniiIndexReader::new_iterator(std::unique_ptr* iterator) { @@ -128,23 +144,39 @@ Status SniiIndexReader::_parse_query_terms(const IndexQueryContextPtr& context, query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { parse_phrase_slop(&search_str, query_info); SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer); - query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( - search_str, _index_meta.properties()); + try { + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, _index_meta.properties()); + } catch (const CLuceneError& e) { + return Status::Error( + "SNII analyze query failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( + 
"SNII analyze query failed: {}", e.what()); + } return Status::OK(); } SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer); - if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) { - query_info->term_infos.emplace_back(search_str); - } else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) { - auto reader = - inverted_index::InvertedIndexAnalyzer::create_reader(analyzer_ctx->char_filter_map); - reader->init(search_str.data(), static_cast(search_str.size()), true); - query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( - reader, analyzer_ctx->analyzer.get()); - } else { - query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( - search_str, _index_meta.properties()); + try { + if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) { + query_info->term_infos.emplace_back(search_str); + } else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) { + auto reader = inverted_index::InvertedIndexAnalyzer::create_reader( + analyzer_ctx->char_filter_map); + reader->init(search_str.data(), static_cast(search_str.size()), true); + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + reader, analyzer_ctx->analyzer.get()); + } else { + query_info->term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, _index_meta.properties()); + } + } catch (const CLuceneError& e) { + return Status::Error( + "SNII analyze query failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( + "SNII analyze query failed: {}", e.what()); } return Status::OK(); } @@ -186,10 +218,18 @@ Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::st } auto terms = to_terms(query_info); - std::string cache_value = query_info.generate_tokens_key(); + const int32_t max_expansions = + context->runtime_state == nullptr + ? 
50 + : context->runtime_state->query_options().inverted_index_max_expansions; + std::string cache_value = build_snii_query_cache_value(query_info); if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { cache_value += " " + std::to_string(query_info.slop); cache_value += " " + std::to_string(query_info.ordered); + } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY || + query_type == InvertedIndexQueryType::WILDCARD_QUERY) { + cache_value += " " + std::to_string(max_expansions); } auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta); InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type, @@ -200,6 +240,7 @@ Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::st return Status::OK(); } + snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(context->io_ctx); RETURN_IF_ERROR( _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx)); auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta)); @@ -226,13 +267,13 @@ Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::st : snii::query::phrase_query(*logical_reader, terms, &docids); break; case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: - status = snii::query::phrase_prefix_query(*logical_reader, terms, &docids); + status = snii::query::phrase_prefix_query(*logical_reader, terms, &docids, max_expansions); break; case InvertedIndexQueryType::MATCH_REGEXP_QUERY: - status = snii::query::regexp_query(*logical_reader, search_str, &docids); + status = snii::query::regexp_query(*logical_reader, search_str, &docids, max_expansions); break; case InvertedIndexQueryType::WILDCARD_QUERY: - status = snii::query::wildcard_query(*logical_reader, search_str, &docids); + status = snii::query::wildcard_query(*logical_reader, search_str, &docids, max_expansions); break; 
case InvertedIndexQueryType::LESS_THAN_QUERY: case InvertedIndexQueryType::LESS_EQUAL_QUERY: @@ -269,6 +310,7 @@ Status SniiIndexReader::read_null_bitmap(const IndexQueryContextPtr& context, return Status::OK(); } + snii_doris::DorisSniiFileReader::ScopedIOContext io_context_scope(context->io_ctx); RETURN_IF_ERROR( _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx)); auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta)); @@ -281,11 +323,7 @@ Status SniiIndexReader::read_null_bitmap(const IndexQueryContextPtr& context, snii::format::NullBitmapReader reader; RETURN_IF_ERROR(snii_doris::to_doris_status( snii::format::NullBitmapReader::open(snii::Slice(bytes), &reader))); - for (uint32_t docid = 0; docid < reader.doc_count(); ++docid) { - if (reader.is_null(docid)) { - null_bitmap->add(docid); - } - } + reader.copy_to(null_bitmap.get()); null_bitmap->runOptimize(); } cache->insert(cache_key, null_bitmap, cache_handle); diff --git a/be/src/storage/index/snii/snii_index_writer.cpp b/be/src/storage/index/snii/snii_index_writer.cpp index 4cc84eb3226f98..37f2d41963fb9a 100644 --- a/be/src/storage/index/snii/snii_index_writer.cpp +++ b/be/src/storage/index/snii/snii_index_writer.cpp @@ -46,7 +46,9 @@ Status SniiIndexColumnWriter::init() { _ignore_above = cast_set(std::stoul(ignore_above_value)); const auto spill_threshold = static_cast(config::inverted_index_ram_buffer_size * 1024 * 1024); - _term_buffer = std::make_unique(_has_positions, spill_threshold); + _memory_reporter = std::make_unique(nullptr, spill_threshold); + _term_buffer = std::make_unique(_has_positions, spill_threshold, + _memory_reporter.get()); _analyzer_config.analyzer_name = get_analyzer_name_from_properties(_index_meta->properties()); _analyzer_config.parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta->properties())); @@ -89,6 +91,9 @@ Status SniiIndexColumnWriter::_analyze(const Slice& 
value, std::vector } catch (const CLuceneError& e) { return Status::Error( "SNII analyze value failed: {}", e.what()); + } catch (const Exception& e) { + return Status::Error( + "SNII analyze value failed: {}", e.what()); } return Status::OK(); } @@ -184,13 +189,15 @@ Status SniiIndexColumnWriter::finish() { } RETURN_IF_ERROR(_index_file_writer->add_snii_index(_index_meta, cast_set(_rid), std::move(_null_docids), _term_buffer.get(), - _config)); + _config, _memory_reporter.get())); + _index_file_writer->retain_snii_memory_reporter(std::move(_memory_reporter)); _term_buffer.reset(); return Status::OK(); } void SniiIndexColumnWriter::close_on_error() { _term_buffer.reset(); + _memory_reporter.reset(); _null_docids.clear(); } diff --git a/be/src/storage/index/snii/snii_index_writer.h b/be/src/storage/index/snii/snii_index_writer.h index bbdcd3389df630..f9c6686bbed4cf 100644 --- a/be/src/storage/index/snii/snii_index_writer.h +++ b/be/src/storage/index/snii/snii_index_writer.h @@ -22,6 +22,7 @@ #include #include "snii/format/format_constants.h" +#include "snii/writer/memory_reporter.h" #include "snii/writer/spimi_term_buffer.h" #include "storage/index/index_writer.h" #include "storage/index/inverted/inverted_index_parser.h" @@ -67,6 +68,7 @@ class SniiIndexColumnWriter final : public IndexColumnWriter { InvertedIndexAnalyzerConfig _analyzer_config; inverted_index::ReaderPtr _char_string_reader; std::shared_ptr _analyzer; + std::unique_ptr _memory_reporter; std::unique_ptr _term_buffer; std::vector _null_docids; }; diff --git a/be/src/storage/task/index_builder.cpp b/be/src/storage/task/index_builder.cpp index 7f6f4632c184a2..0e0ffeeb1d1036 100644 --- a/be/src/storage/task/index_builder.cpp +++ b/be/src/storage/task/index_builder.cpp @@ -338,6 +338,13 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta if (_is_drop_op) { const auto& output_rs_tablet_schema = output_rowset_meta->tablet_schema(); + if 
(output_rs_tablet_schema->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::SNII) { + LOG(INFO) << "skip physical SNII inverted index rewrite for drop index. tablet_id=" + << _tablet->tablet_id() + << " rowset_id=" << output_rowset_meta->rowset_id().to_string(); + return Status::OK(); + } if (output_rs_tablet_schema->get_inverted_index_storage_format() != InvertedIndexStorageFormatPB::V1) { const auto& fs = output_rowset_meta->fs(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java index 494e756538b112..bf5aac95225629 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java @@ -31,6 +31,7 @@ import org.apache.doris.common.Config; import org.apache.doris.common.UserException; import org.apache.doris.qe.ConnectContext; +import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import com.google.common.collect.Maps; import org.apache.commons.lang3.StringUtils; @@ -134,6 +135,10 @@ public void validate(ConnectContext ctx) throws UserException { } IndexType indexType = existedIdx.getIndexType(); + OlapTable olapTable = (OlapTable) table; + if (olapTable.getInvertedIndexFileStorageFormat() == TInvertedIndexFileStorageFormat.SNII) { + throw new AnalysisException("BUILD INDEX is not supported for SNII inverted index storage format yet"); + } if ((Config.isNotCloudMode() && indexType == IndexType.NGRAM_BF) || indexType == IndexType.BLOOMFILTER || (Config.isCloudMode() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java index 14869e7925cf86..9303ebf95bcb7b 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java @@ -848,8 +848,10 @@ public void validate(ConnectContext ctx) { } if (indexDef.getIndexType() == IndexType.ANN) { if (invertedIndexFileStorageFormat != null - && invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) { - throw new AnalysisException("ANN index is not supported in index format V1"); + && (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1 + || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII)) { + throw new AnalysisException("ANN index is not supported in index format " + + invertedIndexFileStorageFormat); } } for (String indexColName : indexDef.getColumnNames()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java index 414052ef4f096c..36f256994a7116 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java @@ -164,6 +164,11 @@ public void checkColumn(ColumnDefinition column, KeysType keysType, "ANN index can only be used in DUP_KEYS table or UNIQUE_KEYS table with" + " merge-on-write enabled"); } + if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1 + || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + throw new AnalysisException("ANN index is not supported in index format " + + invertedIndexFileStorageFormat); + } return; } @@ -275,8 +280,10 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe "ANN index can only be used in DUP_KEYS table or UNIQUE_KEYS table with" + " merge-on-write enabled"); } - if 
(invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) { - throw new AnalysisException("ANN index is not supported in index format V1"); + if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1 + || invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.SNII) { + throw new AnalysisException("ANN index is not supported in index format " + + invertedIndexFileStorageFormat); } return; } @@ -303,10 +310,6 @@ public void checkColumn(Column column, KeysType keysType, boolean enableUniqueKe } } - if (indexType == IndexType.ANN && !colType.isArrayType()) { - throw new AnalysisException("ANN index column must be array type"); - } - // In inverted index format v1, each subcolumn of a variant has its own index file, leading to high IOPS. // when the subcolumn type changes, it may result in missing files, causing link file failure. // There are two cases in which the inverted index format v1 is not supported: diff --git a/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java index fa6260d19f7a8d..8a836b6b5d6f2c 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/alter/IndexChangeJobTest.java @@ -46,6 +46,7 @@ import org.apache.doris.qe.ConnectContext; import org.apache.doris.task.AgentTask; import org.apache.doris.task.AgentTaskQueue; +import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import org.apache.doris.thrift.TStatusCode; import org.apache.doris.thrift.TTaskType; import org.apache.doris.transaction.FakeTransactionIDGenerator; @@ -195,6 +196,47 @@ public void testBuildIndexIndexChange() throws UserException { Assert.assertEquals(OlapTableState.NORMAL, olapTable.getState()); } + @Test + public void testBuildIndexRejectedForSniiStorageFormat() throws UserException { + if (fakeEnv != null) { + fakeEnv.close(); + } + fakeEnv = new FakeEnv(); + if 
(fakeEditLog != null) { + fakeEditLog.close(); + } + fakeEditLog = new FakeEditLog(); + FakeEnv.setEnv(masterEnv); + SchemaChangeHandler schemaChangeHandler = Env.getCurrentEnv().getSchemaChangeHandler(); + ArrayList alterOps = new ArrayList<>(); + Database db = masterEnv.getInternalCatalog().getDbOrDdlException(CatalogTestUtil.testDbId1); + OlapTable olapTable = (OlapTable) db.getTableOrDdlException(CatalogTestUtil.testTableId1); + String indexName = "index1"; + TableNameInfo tableNameInfo = new TableNameInfo(masterEnv.getInternalCatalog().getName(), db.getName(), + olapTable.getName()); + IndexDefinition indexDefinition = new IndexDefinition(indexName, false, + Lists.newArrayList(olapTable.getBaseSchema().get(1).getName()), + "INVERTED", + Maps.newHashMap(), "balabala"); + CreateIndexOp createIndexClause = new CreateIndexOp(tableNameInfo, indexDefinition, false); + ConnectContext connectContext = new ConnectContext(); + createIndexClause.validate(connectContext); + alterOps.add(createIndexClause); + schemaChangeHandler.process(alterOps, db, olapTable); + TInvertedIndexFileStorageFormat originalFormat = olapTable.getInvertedIndexFileStorageFormat(); + try { + olapTable.setInvertedIndexFileStorageFormat(TInvertedIndexFileStorageFormat.SNII); + BuildIndexOp buildIndexClause = new BuildIndexOp(tableNameInfo, indexName, null, false); + buildIndexClause.validate(connectContext); + Assert.fail("BUILD INDEX should be rejected for SNII inverted index storage format."); + } catch (AnalysisException e) { + Assert.assertTrue(e.getMessage().contains( + "BUILD INDEX is not supported for SNII inverted index storage format yet")); + } finally { + olapTable.setInvertedIndexFileStorageFormat(originalFormat); + } + } + @Test public void testDropIndexIndexChange() throws UserException { if (fakeEnv != null) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java index 7b41ddc95cf840..060e687b495242 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/IndexDefinitionTest.java @@ -18,7 +18,9 @@ package org.apache.doris.nereids.trees.plans.commands; import org.apache.doris.catalog.AggregateType; +import org.apache.doris.catalog.Column; import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.Type; import org.apache.doris.catalog.info.IndexType; import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.trees.plans.commands.info.ColumnDefinition; @@ -57,6 +59,68 @@ void testVariantIndexFormatV1() throws AnalysisException { } } + @Test + void testSniiInvertedIndexColumnTypes() throws AnalysisException { + IndexDefinition def = new IndexDefinition("snii_index", false, Lists.newArrayList("col1"), + "INVERTED", null, "comment"); + + def.checkColumn(new ColumnDefinition("col1", StringType.INSTANCE, false, AggregateType.NONE, true, + null, "comment"), KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII); + def.checkColumn(new ColumnDefinition("col1", ArrayType.of(StringType.INSTANCE), false, + AggregateType.NONE, true, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII); + + AnalysisException intException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new ColumnDefinition("col1", IntegerType.INSTANCE, false, AggregateType.NONE, + true, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(intException.getMessage().contains("does not support BKD index")); + + AnalysisException arrayIntException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new ColumnDefinition("col1", 
ArrayType.of(IntegerType.INSTANCE), false, + AggregateType.NONE, true, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(arrayIntException.getMessage().contains("does not support BKD index")); + } + + @Test + void testSniiInvertedIndexCatalogColumnTypes() throws AnalysisException { + IndexDefinition def = new IndexDefinition("snii_index", false, Lists.newArrayList("col1"), + "INVERTED", null, "comment"); + + def.checkColumn(new Column("col1", Type.STRING, true), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII); + def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.STRING), true), + KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII); + + AnalysisException intException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new Column("col1", Type.INT, true), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(intException.getMessage().contains("does not support BKD index")); + + AnalysisException arrayIntException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.INT), true), + KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(arrayIntException.getMessage().contains("does not support BKD index")); + } + + @Test + void testSniiRejectsAnnIndex() { + IndexDefinition def = new IndexDefinition("ann_index", false, Lists.newArrayList("col1"), + "ANN", null, "comment"); + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new ColumnDefinition("col1", ArrayType.of(FloatType.INSTANCE), false, + AggregateType.NONE, false, null, "comment"), KeysType.DUP_KEYS, false, + TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(exception.getMessage().contains("ANN index is not supported in index format SNII")); + + 
AnalysisException catalogException = Assertions.assertThrows(AnalysisException.class, () -> + def.checkColumn(new Column("col1", org.apache.doris.catalog.ArrayType.create(Type.FLOAT), false), + KeysType.DUP_KEYS, false, TInvertedIndexFileStorageFormat.SNII)); + Assertions.assertTrue(catalogException.getMessage().contains( + "ANN index is not supported in index format SNII")); + } + void testArrayTypeSupport() throws AnalysisException { IndexDefinition def = new IndexDefinition("array_index", false, Lists.newArrayList("col1"), "INVERTED", null, "array test"); diff --git a/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out index 68526469767db5..33e05cf4214d2f 100644 --- a/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out +++ b/regression-test/data/inverted_index_p0/storage_format/test_storage_format_snii.out @@ -11,3 +11,6 @@ -- !null_bitmap -- 4 + +-- !array_contains -- +1 diff --git a/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy index 8101e9b1b32c98..7800350fb6b753 100644 --- a/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy +++ b/regression-test/suites/inverted_index_p0/storage_format/test_storage_format_snii.groovy @@ -17,7 +17,11 @@ suite("test_storage_format_snii", "p0, nonConcurrent") { sql "DROP TABLE IF EXISTS test_storage_format_snii" + sql "DROP TABLE IF EXISTS test_storage_format_snii_array" + sql "DROP TABLE IF EXISTS test_storage_format_snii_add_index" sql "DROP TABLE IF EXISTS test_storage_format_snii_bkd" + sql "DROP TABLE IF EXISTS test_storage_format_snii_array_bkd" + sql "DROP TABLE IF EXISTS test_storage_format_snii_ann" sql """ CREATE TABLE test_storage_format_snii ( @@ -70,6 +74,87 @@ suite("test_storage_format_snii", "p0, 
nonConcurrent") { ORDER BY id """ + sql """ + CREATE TABLE test_storage_format_snii_array ( + id INT NULL, + tags ARRAY NULL, + INDEX idx_tags (`tags`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + + sql """ + INSERT INTO test_storage_format_snii_array VALUES + (1, '["alpha", "beta"]'), + (2, '["gamma"]'), + (3, NULL); + """ + sql "sync" + + order_qt_array_contains """ + SELECT id FROM test_storage_format_snii_array + WHERE array_contains(tags, 'alpha') + ORDER BY id + """ + + test { + if (isCloudMode()) { + sql "BUILD INDEX ON test_storage_format_snii" + } else { + sql "BUILD INDEX idx_body ON test_storage_format_snii" + } + exception "BUILD INDEX is not supported for SNII inverted index storage format yet" + } + + sql """ + CREATE TABLE test_storage_format_snii_add_index ( + id INT NULL, + body TEXT NULL, + score INT NULL, + scores ARRAY NULL, + embedding ARRAY NOT NULL, + INDEX idx_body_added_table (`body`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + + test { + sql """ + ALTER TABLE test_storage_format_snii_add_index + ADD INDEX idx_score_added (`score`) USING INVERTED COMMENT '' + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + ALTER TABLE test_storage_format_snii_add_index + ADD INDEX idx_scores_added (`scores`) USING INVERTED COMMENT '' + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + CREATE INDEX idx_ann_added ON test_storage_format_snii_add_index (`embedding`) USING ANN PROPERTIES( + "index_type" = "hnsw", + "metric_type" = "l2_distance", + "dim" = "1" + ) + """ + exception "ANN index is not supported in index format SNII" + } + test { sql 
""" CREATE TABLE test_storage_format_snii_bkd ( @@ -86,4 +171,42 @@ suite("test_storage_format_snii", "p0, nonConcurrent") { """ exception "SNII inverted index storage format" } + + test { + sql """ + CREATE TABLE test_storage_format_snii_array_bkd ( + id INT NULL, + scores ARRAY NULL, + INDEX idx_scores (`scores`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + exception "SNII inverted index storage format" + } + + test { + sql """ + CREATE TABLE test_storage_format_snii_ann ( + id INT NULL, + embedding ARRAY NOT NULL, + INDEX idx_ann (`embedding`) USING ANN PROPERTIES( + "index_type" = "hnsw", + "metric_type" = "l2_distance", + "dim" = "1" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "inverted_index_storage_format" = "SNII" + ); + """ + exception "ANN index is not supported in index format SNII" + } } From a244d2f46356396daf459c884871baa3ec52bb67 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sat, 27 Jun 2026 21:54:15 +0800 Subject: [PATCH 04/12] [improvement](be) Add inverted index IO profile metrics ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: SNII performance validation in cloud mode needs comparable IO observability against the existing CLucene/V3 inverted index path. Before this change, SNII opened remote index files without the same file-cache options as V3 and only part of the IO context reached SNII/CLucene readers, so query profiles could not compare logical requested index bytes, physical index reads, and serial read rounds. 
This change routes SNII index file opens through Doris file-cache options, propagates a copy of the inverted-index IO context through SNII reads, records request/read bytes and read round counters for both SNII and CLucene index readers, and exposes those counters in the file-cache profile reporter. ### Release note SNII and V3 inverted index scans now expose additional IO profile counters for request bytes, physical read bytes, range read count, and serial read rounds. ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --clean --run --filter='DorisSniiFileReaderTest.*:DorisFSDirectoryTest.FSIndexInputReadInternalRecordsIndexIOStatsAndContext:FileCacheProfileReporterTest.*' -j "192" - Unit Test: ./run-be-ut.sh --run --filter='DorisSniiFileReaderTest.*:DorisFSDirectoryTest.FSIndexInputReadInternalRecordsIndexIOStatsAndContext:FileCacheProfileReporterTest.*' -j "192" - Format: build-support/clang-format.sh; build-support/check-format.sh; git diff --check - Static Analysis: build-support/run-clang-tidy.sh --build-dir be/ut_build_ASAN was attempted but failed because clang-tidy could not resolve the system stddef.h; it also reported existing large-function/C-header/NOLINT diagnostics outside this change. Style warnings clearly introduced by the new SNII adapter code were fixed. - Behavior changed: Yes. SNII remote index file reads now use the same Doris file-cache reader options as V3 when file cache is enabled, and both SNII/V3 report additional profile counters.
- Does this need documentation: No --- be/src/exec/scan/olap_scanner.cpp | 5 +- be/src/io/cache/block_file_cache_profile.cpp | 17 ++ be/src/io/cache/block_file_cache_profile.h | 5 +- be/src/io/io_common.h | 4 + be/src/storage/index/index_file_reader.cpp | 4 + .../inverted/inverted_index_fs_directory.cpp | 15 +- .../storage/index/snii/snii_doris_adapter.cpp | 57 +++++- .../storage/index/snii/snii_doris_adapter.h | 15 +- ...block_file_cache_profile_reporter_test.cpp | 16 ++ .../storage/index/snii_doris_adapter_test.cpp | 168 ++++++++++++++++++ .../inverted_index_fs_directory_test.cpp | 54 +++++- 11 files changed, 339 insertions(+), 21 deletions(-) create mode 100644 be/test/storage/index/snii_doris_adapter_test.cpp diff --git a/be/src/exec/scan/olap_scanner.cpp b/be/src/exec/scan/olap_scanner.cpp index 320976814679b9..efa536ea690779 100644 --- a/be/src/exec/scan/olap_scanner.cpp +++ b/be/src/exec/scan/olap_scanner.cpp @@ -152,7 +152,10 @@ static bool has_file_cache_statistics(const io::FileCacheStatistics& stats) { stats.inverted_index_bytes_read_from_remote != 0 || stats.inverted_index_bytes_read_from_peer != 0 || stats.inverted_index_local_io_timer != 0 || stats.inverted_index_remote_io_timer != 0 || - stats.inverted_index_peer_io_timer != 0 || stats.inverted_index_io_timer != 0; + stats.inverted_index_peer_io_timer != 0 || stats.inverted_index_io_timer != 0 || + stats.inverted_index_request_bytes != 0 || stats.inverted_index_read_bytes != 0 || + stats.inverted_index_range_read_count != 0 || + stats.inverted_index_serial_read_rounds != 0; } Status OlapScanner::_prepare_impl() { diff --git a/be/src/io/cache/block_file_cache_profile.cpp b/be/src/io/cache/block_file_cache_profile.cpp index 8f9c167c9989e6..10ea52670789a0 100644 --- a/be/src/io/cache/block_file_cache_profile.cpp +++ b/be/src/io/cache/block_file_cache_profile.cpp @@ -98,6 +98,10 @@ FileCacheStatistics diff_file_cache_statistics(const FileCacheStatistics& curren 
SUBTRACT_FIELD(inverted_index_remote_io_timer); SUBTRACT_FIELD(inverted_index_peer_io_timer); SUBTRACT_FIELD(inverted_index_io_timer); + SUBTRACT_FIELD(inverted_index_request_bytes); + SUBTRACT_FIELD(inverted_index_read_bytes); + SUBTRACT_FIELD(inverted_index_range_read_count); + SUBTRACT_FIELD(inverted_index_serial_read_rounds); #undef SUBTRACT_FIELD return diff; } @@ -156,6 +160,14 @@ FileCacheProfileReporter::FileCacheProfileReporter(RuntimeProfile* profile) { ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexPeerIOUseTimer", cache_profile, 1); inverted_index_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexIOTimer", cache_profile, 1); + inverted_index_request_bytes = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexRequestBytes", TUnit::BYTES, cache_profile, 1); + inverted_index_read_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "InvertedIndexReadBytes", + TUnit::BYTES, cache_profile, 1); + inverted_index_range_read_count = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexRangeReadCount", TUnit::UNIT, cache_profile, 1); + inverted_index_serial_read_rounds = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexSerialReadRounds", TUnit::UNIT, cache_profile, 1); } void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) const { @@ -193,6 +205,11 @@ void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) con COUNTER_UPDATE(inverted_index_remote_io_timer, statistics->inverted_index_remote_io_timer); COUNTER_UPDATE(inverted_index_peer_io_timer, statistics->inverted_index_peer_io_timer); COUNTER_UPDATE(inverted_index_io_timer, statistics->inverted_index_io_timer); + COUNTER_UPDATE(inverted_index_request_bytes, statistics->inverted_index_request_bytes); + COUNTER_UPDATE(inverted_index_read_bytes, statistics->inverted_index_read_bytes); + COUNTER_UPDATE(inverted_index_range_read_count, statistics->inverted_index_range_read_count); + COUNTER_UPDATE(inverted_index_serial_read_rounds, + 
statistics->inverted_index_serial_read_rounds); } } // namespace doris::io diff --git a/be/src/io/cache/block_file_cache_profile.h b/be/src/io/cache/block_file_cache_profile.h index 6c95e49791c054..41cc2e0c01b41a 100644 --- a/be/src/io/cache/block_file_cache_profile.h +++ b/be/src/io/cache/block_file_cache_profile.h @@ -58,7 +58,6 @@ class FileCacheMetrics { void register_entity(); void update_metrics_callback(); -private: std::mutex _mtx; // use shared_ptr for concurrent std::shared_ptr _statistics; @@ -97,6 +96,10 @@ struct FileCacheProfileReporter { RuntimeProfile::Counter* inverted_index_remote_io_timer = nullptr; RuntimeProfile::Counter* inverted_index_peer_io_timer = nullptr; RuntimeProfile::Counter* inverted_index_io_timer = nullptr; + RuntimeProfile::Counter* inverted_index_request_bytes = nullptr; + RuntimeProfile::Counter* inverted_index_read_bytes = nullptr; + RuntimeProfile::Counter* inverted_index_range_read_count = nullptr; + RuntimeProfile::Counter* inverted_index_serial_read_rounds = nullptr; FileCacheProfileReporter(RuntimeProfile* profile); void update(const FileCacheStatistics* statistics) const; diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 36b20517afb87c..391f3b15c34e8d 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -74,6 +74,10 @@ struct FileCacheStatistics { int64_t inverted_index_remote_io_timer = 0; int64_t inverted_index_peer_io_timer = 0; int64_t inverted_index_io_timer = 0; + int64_t inverted_index_request_bytes = 0; + int64_t inverted_index_read_bytes = 0; + int64_t inverted_index_range_read_count = 0; + int64_t inverted_index_serial_read_rounds = 0; }; struct IOContext { diff --git a/be/src/storage/index/index_file_reader.cpp b/be/src/storage/index/index_file_reader.cpp index 433987e1d5f80b..e90d642b56c57b 100644 --- a/be/src/storage/index/index_file_reader.cpp +++ b/be/src/storage/index/index_file_reader.cpp @@ -21,6 +21,7 @@ #include #include "common/cast_set.h" +#include "common/config.h" 
#include "storage/index/inverted/inverted_index_compound_reader.h" #include "storage/index/inverted/inverted_index_fs_directory.h" #include "storage/tablet/tablet_schema.h" @@ -148,6 +149,9 @@ Status IndexFileReader::_init_snii(const io::IOContext* io_ctx) { file_size = file_size == 0 ? -1 : file_size; io::FileReaderOptions opts; + opts.cache_type = config::enable_file_cache ? io::FileCachePolicy::FILE_BLOCK_CACHE + : io::FileCachePolicy::NO_CACHE; + opts.is_doris_table = true; opts.file_size = file_size; opts.tablet_id = _tablet_id; io::FileReaderSPtr reader; diff --git a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp index e65025b25a4fc7..d03cdf38a9abf1 100644 --- a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp +++ b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp @@ -179,16 +179,15 @@ void DorisFSDirectory::FSIndexInput::close() { } void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) { + const bool is_index_data = _io_ctx.is_index_data; if (io_ctx) { const auto& ctx = static_cast(io_ctx); - _io_ctx.reader_type = ctx->reader_type; - _io_ctx.query_id = ctx->query_id; - _io_ctx.file_cache_stats = ctx->file_cache_stats; + _io_ctx = *ctx; } else { - _io_ctx.reader_type = ReaderType::UNKNOWN; - _io_ctx.query_id = nullptr; - _io_ctx.file_cache_stats = nullptr; + _io_ctx = io::IOContext {}; } + _io_ctx.is_index_data = is_index_data; + _io_ctx.is_inverted_index = true; } const void* DorisFSDirectory::FSIndexInput::getIoContext() { @@ -247,6 +246,10 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) if (_io_ctx.file_cache_stats != nullptr) { _io_ctx.file_cache_stats->inverted_index_io_timer += inverted_index_io_timer; + _io_ctx.file_cache_stats->inverted_index_request_bytes += len; + _io_ctx.file_cache_stats->inverted_index_read_bytes += len; + ++_io_ctx.file_cache_stats->inverted_index_range_read_count; + 
++_io_ctx.file_cache_stats->inverted_index_serial_read_rounds; } } diff --git a/be/src/storage/index/snii/snii_doris_adapter.cpp b/be/src/storage/index/snii/snii_doris_adapter.cpp index 40ac1767a0e76b..5756bdc8678540 100644 --- a/be/src/storage/index/snii/snii_doris_adapter.cpp +++ b/be/src/storage/index/snii/snii_doris_adapter.cpp @@ -80,9 +80,22 @@ uint64_t DorisSniiFileWriter::bytes_written() const { return _writer == nullptr ? 0 : _writer->bytes_appended(); } +DorisSniiFileReader::DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx) + : _reader(std::move(reader)), _default_io_ctx(_make_index_io_context(io_ctx)) {} + +io::IOContext DorisSniiFileReader::_make_index_io_context(const io::IOContext* io_ctx) { + io::IOContext index_io_ctx; + if (io_ctx != nullptr) { + index_io_ctx = *io_ctx; + } + index_io_ctx.is_inverted_index = true; + index_io_ctx.is_index_data = true; + return index_io_ctx; +} + DorisSniiFileReader::ScopedIOContext::ScopedIOContext(const io::IOContext* io_ctx) - : _previous(_scoped_io_ctx) { - _scoped_io_ctx = io_ctx; + : _previous(_scoped_io_ctx), _io_ctx(DorisSniiFileReader::_make_index_io_context(io_ctx)) { + _scoped_io_ctx = &_io_ctx; } DorisSniiFileReader::ScopedIOContext::~ScopedIOContext() { @@ -90,7 +103,16 @@ DorisSniiFileReader::ScopedIOContext::~ScopedIOContext() { } ::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, - std::vector* out) { + std::vector* const out) { + SNII_RETURN_IF_ERROR(_read_at(offset, len, out)); + if (len > 0) { + _record_read_stats(cast_set(len), cast_set(len), 1, 1); + } + return ::snii::Status::OK(); +} + +::snii::Status DorisSniiFileReader::_read_at(uint64_t offset, size_t len, + std::vector* const out) const { if (_reader == nullptr) { return ::snii::Status::InvalidArgument("doris reader is null"); } @@ -116,7 +138,7 @@ ::snii::Status DorisSniiFileReader::read_at(uint64_t offset, size_t len, } ::snii::Status DorisSniiFileReader::read_batch(const 
std::vector<::snii::io::Range>& ranges, - std::vector>* outs) { + std::vector>* const outs) { if (outs == nullptr) { return ::snii::Status::InvalidArgument("output buffers is null"); } @@ -131,10 +153,12 @@ ::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Ran size_t len = 0; size_t index = 0; }; + int64_t request_bytes = 0; std::vector sorted; sorted.reserve(ranges.size()); for (size_t i = 0; i < ranges.size(); ++i) { SNII_RETURN_IF_ERROR(_check_read_range(ranges[i].offset, ranges[i].len)); + request_bytes += cast_set(ranges[i].len); if (ranges[i].len == 0) { continue; } @@ -149,6 +173,8 @@ ::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Ran constexpr uint64_t max_coalesced_gap = 4096; constexpr uint64_t max_coalesced_read = 1ULL << 20; + int64_t read_bytes = 0; + int64_t range_read_count = 0; for (size_t begin = 0; begin < sorted.size();) { uint64_t read_offset = sorted[begin].offset; uint64_t read_end = sorted[begin].offset + sorted[begin].len; @@ -165,8 +191,10 @@ ::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Ran } std::vector bytes; - SNII_RETURN_IF_ERROR( - read_at(read_offset, cast_set(read_end - read_offset), &bytes)); + const size_t read_len = cast_set(read_end - read_offset); + SNII_RETURN_IF_ERROR(_read_at(read_offset, read_len, &bytes)); + read_bytes += cast_set(read_len); + ++range_read_count; for (size_t i = begin; i < end; ++i) { const uint64_t pos = sorted[i].offset - read_offset; auto& out = (*outs)[sorted[i].index]; @@ -175,6 +203,7 @@ ::snii::Status DorisSniiFileReader::read_batch(const std::vector<::snii::io::Ran } begin = end; } + _record_read_stats(request_bytes, read_bytes, range_read_count, range_read_count); return ::snii::Status::OK(); } @@ -183,7 +212,21 @@ uint64_t DorisSniiFileReader::size() const { } const io::IOContext* DorisSniiFileReader::_current_io_ctx() const { - return _scoped_io_ctx != nullptr ? 
_scoped_io_ctx : _default_io_ctx; + return _scoped_io_ctx != nullptr ? _scoped_io_ctx : &_default_io_ctx; +} + +void DorisSniiFileReader::_record_read_stats(int64_t request_bytes, int64_t read_bytes, + int64_t range_read_count, + int64_t serial_read_rounds) const { + const auto* io_ctx = _current_io_ctx(); + if (io_ctx->file_cache_stats == nullptr) { + return; + } + auto* stats = io_ctx->file_cache_stats; + stats->inverted_index_request_bytes += request_bytes; + stats->inverted_index_read_bytes += read_bytes; + stats->inverted_index_range_read_count += range_read_count; + stats->inverted_index_serial_read_rounds += serial_read_rounds; } ::snii::Status DorisSniiFileReader::_check_read_range(uint64_t offset, size_t len) const { diff --git a/be/src/storage/index/snii/snii_doris_adapter.h b/be/src/storage/index/snii/snii_doris_adapter.h index 38158998e7c65c..7f099466704d5b 100644 --- a/be/src/storage/index/snii/snii_doris_adapter.h +++ b/be/src/storage/index/snii/snii_doris_adapter.h @@ -23,6 +23,7 @@ #include "common/status.h" #include "io/fs/file_reader.h" #include "io/fs/file_writer.h" +#include "io/io_common.h" #include "snii/common/status.h" #include "snii/io/file_reader.h" #include "snii/io/file_writer.h" @@ -57,22 +58,26 @@ class DorisSniiFileReader final : public ::snii::io::FileReader { private: const io::IOContext* _previous = nullptr; + io::IOContext _io_ctx; }; - explicit DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx = nullptr) - : _reader(std::move(reader)), _default_io_ctx(io_ctx) {} + explicit DorisSniiFileReader(io::FileReaderSPtr reader, const io::IOContext* io_ctx = nullptr); - ::snii::Status read_at(uint64_t offset, size_t len, std::vector* out) override; + ::snii::Status read_at(uint64_t offset, size_t len, std::vector* const out) override; ::snii::Status read_batch(const std::vector<::snii::io::Range>& ranges, - std::vector>* outs) override; + std::vector>* const outs) override; uint64_t size() const override; private: 
+ static io::IOContext _make_index_io_context(const io::IOContext* io_ctx); ::snii::Status _check_read_range(uint64_t offset, size_t len) const; + ::snii::Status _read_at(uint64_t offset, size_t len, std::vector* const out) const; const io::IOContext* _current_io_ctx() const; + void _record_read_stats(int64_t request_bytes, int64_t read_bytes, int64_t range_read_count, + int64_t serial_read_rounds) const; io::FileReaderSPtr _reader; - const io::IOContext* _default_io_ctx = nullptr; + io::IOContext _default_io_ctx; static thread_local const io::IOContext* _scoped_io_ctx; }; diff --git a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp index e74ad758ac1db3..4e7bb6bb1d05a4 100644 --- a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp +++ b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp @@ -52,6 +52,10 @@ io::FileCacheStatistics make_file_cache_stats(int64_t multiplier) { stats.inverted_index_remote_io_timer = multiplier * 26; stats.inverted_index_peer_io_timer = multiplier * 27; stats.inverted_index_io_timer = multiplier * 28; + stats.inverted_index_request_bytes = multiplier * 29; + stats.inverted_index_read_bytes = multiplier * 30; + stats.inverted_index_range_read_count = multiplier * 31; + stats.inverted_index_serial_read_rounds = multiplier * 32; return stats; } @@ -89,6 +93,10 @@ void expect_file_cache_stats_eq(const io::FileCacheStatistics& actual, EXPECT_EQ(actual.inverted_index_remote_io_timer, expected.inverted_index_remote_io_timer); EXPECT_EQ(actual.inverted_index_peer_io_timer, expected.inverted_index_peer_io_timer); EXPECT_EQ(actual.inverted_index_io_timer, expected.inverted_index_io_timer); + EXPECT_EQ(actual.inverted_index_request_bytes, expected.inverted_index_request_bytes); + EXPECT_EQ(actual.inverted_index_read_bytes, expected.inverted_index_read_bytes); + EXPECT_EQ(actual.inverted_index_range_read_count, expected.inverted_index_range_read_count); + 
EXPECT_EQ(actual.inverted_index_serial_read_rounds, expected.inverted_index_serial_read_rounds); } } // namespace @@ -134,6 +142,14 @@ TEST(FileCacheProfileReporterTest, ReporterAggregatesDeltaReportsToExactFinalTot EXPECT_EQ(profile->get_counter("CacheGetOrSetTimer")->value(), after_second_report.cache_get_or_set_timer); EXPECT_EQ(profile->get_counter("LockWaitTimer")->value(), after_second_report.lock_wait_timer); + EXPECT_EQ(profile->get_counter("InvertedIndexRequestBytes")->value(), + after_second_report.inverted_index_request_bytes); + EXPECT_EQ(profile->get_counter("InvertedIndexReadBytes")->value(), + after_second_report.inverted_index_read_bytes); + EXPECT_EQ(profile->get_counter("InvertedIndexRangeReadCount")->value(), + after_second_report.inverted_index_range_read_count); + EXPECT_EQ(profile->get_counter("InvertedIndexSerialReadRounds")->value(), + after_second_report.inverted_index_serial_read_rounds); } } // namespace doris diff --git a/be/test/storage/index/snii_doris_adapter_test.cpp b/be/test/storage/index/snii_doris_adapter_test.cpp new file mode 100644 index 00000000000000..f307fb731daff5 --- /dev/null +++ b/be/test/storage/index/snii_doris_adapter_test.cpp @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/snii/snii_doris_adapter.h" + +#include + +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "io/fs/file_reader.h" +#include "io/fs/path.h" +#include "io/io_common.h" +#include "snii/io/file_reader.h" +#include "util/slice.h" + +namespace doris::segment_v2::snii_doris { +namespace { + +struct CapturedIOContext { + bool has_ctx = false; + bool is_inverted_index = false; + bool is_index_data = false; + bool read_file_cache = true; + bool is_disposable = false; + io::FileCacheStatistics* file_cache_stats = nullptr; +}; + +struct CapturedRead { + size_t offset = 0; + size_t len = 0; + CapturedIOContext io_ctx; +}; + +class RecordingFileReader final : public io::FileReader { +public: + explicit RecordingFileReader(std::string data) : _data(std::move(data)) {} + + Status close() override { + _closed = true; + return Status::OK(); + } + + const io::Path& path() const override { return _path; } + size_t size() const override { return _data.size(); } + bool closed() const override { return _closed; } + int64_t mtime() const override { return 0; } + + const std::vector& reads() const { return _reads; } + +protected: + Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, + const io::IOContext* io_ctx) override { + CapturedRead read; + read.offset = offset; + read.len = result.size; + if (io_ctx != nullptr) { + read.io_ctx.has_ctx = true; + read.io_ctx.is_inverted_index = io_ctx->is_inverted_index; + read.io_ctx.is_index_data = io_ctx->is_index_data; + read.io_ctx.read_file_cache = io_ctx->read_file_cache; + read.io_ctx.is_disposable = io_ctx->is_disposable; + read.io_ctx.file_cache_stats = io_ctx->file_cache_stats; + } + _reads.push_back(read); + + if (result.size > 0) { + std::memcpy(result.data, _data.data() + offset, result.size); + } + *bytes_read = result.size; + return Status::OK(); + 
} + +private: + std::string _data; + io::Path _path = "/tmp/snii_doris_adapter_test.idx"; + bool _closed = false; + std::vector _reads; +}; + +} // namespace + +TEST(DorisSniiFileReaderTest, ReadAtPropagatesIndexIOContextAndRecordsStats) { + auto recording_reader = std::make_shared("0123456789abcdef"); + DorisSniiFileReader reader(recording_reader); + + io::FileCacheStatistics stats; + io::IOContext io_ctx; + io_ctx.is_disposable = true; + io_ctx.is_index_data = false; + io_ctx.read_file_cache = false; + io_ctx.file_cache_stats = &stats; + + std::vector out; + { + DorisSniiFileReader::ScopedIOContext scope(&io_ctx); + auto status = reader.read_at(2, 5, &out); + ASSERT_TRUE(status.ok()) << status.message(); + } + + ASSERT_EQ(out.size(), 5); + EXPECT_EQ(std::string(out.begin(), out.end()), "23456"); + ASSERT_EQ(recording_reader->reads().size(), 1); + const auto& captured = recording_reader->reads()[0].io_ctx; + EXPECT_TRUE(captured.has_ctx); + EXPECT_TRUE(captured.is_inverted_index); + EXPECT_TRUE(captured.is_index_data); + EXPECT_FALSE(captured.read_file_cache); + EXPECT_TRUE(captured.is_disposable); + EXPECT_EQ(captured.file_cache_stats, &stats); + + EXPECT_EQ(stats.inverted_index_request_bytes, 5); + EXPECT_EQ(stats.inverted_index_read_bytes, 5); + EXPECT_EQ(stats.inverted_index_range_read_count, 1); + EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1); +} + +TEST(DorisSniiFileReaderTest, ReadBatchRecordsLogicalAndCoalescedPhysicalIO) { + auto recording_reader = + std::make_shared("0123456789abcdefghijklmnopqrstuvwxyz"); + DorisSniiFileReader reader(recording_reader); + + io::FileCacheStatistics stats; + io::IOContext io_ctx; + io_ctx.file_cache_stats = &stats; + + std::vector> outs; + { + DorisSniiFileReader::ScopedIOContext scope(&io_ctx); + std::vector<::snii::io::Range> ranges {{0, 4}, {6, 3}, {20, 2}}; + auto status = reader.read_batch(ranges, &outs); + ASSERT_TRUE(status.ok()) << status.message(); + } + + ASSERT_EQ(outs.size(), 3); + 
EXPECT_EQ(std::string(outs[0].begin(), outs[0].end()), "0123"); + EXPECT_EQ(std::string(outs[1].begin(), outs[1].end()), "678"); + EXPECT_EQ(std::string(outs[2].begin(), outs[2].end()), "kl"); + + ASSERT_EQ(recording_reader->reads().size(), 1); + EXPECT_EQ(recording_reader->reads()[0].offset, 0); + EXPECT_EQ(recording_reader->reads()[0].len, 22); + + EXPECT_EQ(stats.inverted_index_request_bytes, 9); + EXPECT_EQ(stats.inverted_index_read_bytes, 22); + EXPECT_EQ(stats.inverted_index_range_read_count, 1); + EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1); +} + +} // namespace doris::segment_v2::snii_doris diff --git a/be/test/storage/segment/inverted_index_fs_directory_test.cpp b/be/test/storage/segment/inverted_index_fs_directory_test.cpp index d42559a0e39975..99cd9d8b613cc7 100644 --- a/be/test/storage/segment/inverted_index_fs_directory_test.cpp +++ b/be/test/storage/segment/inverted_index_fs_directory_test.cpp @@ -287,6 +287,58 @@ TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalWithBytesReadError) { _CLDELETE(input); } +TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalRecordsIndexIOStatsAndContext) { + std::filesystem::path test_file = _tmp_dir / "test_file_with_stats"; + std::ofstream ofs(test_file); + ofs << "test content for stats"; + ofs.close(); + + lucene::store::IndexInput* input = nullptr; + CLuceneError error; + + bool result = + DorisFSDirectory::FSIndexInput::open(_fs, test_file.string().c_str(), input, error); + EXPECT_TRUE(result); + + io::FileCacheStatistics stats; + io::IOContext io_ctx; + io_ctx.is_disposable = true; + io_ctx.is_index_data = false; + io_ctx.read_file_cache = false; + io_ctx.file_cache_stats = &stats; + + input->setIoContext(&io_ctx); + input->setIndexFile(true); + + uint8_t buffer[6]; + input->readBytes(buffer, 6, false); + EXPECT_EQ(std::string(reinterpret_cast(buffer), 6), "test c"); + + const auto* captured = static_cast(input->getIoContext()); + EXPECT_TRUE(captured->is_inverted_index); + 
EXPECT_TRUE(captured->is_index_data); + EXPECT_FALSE(captured->read_file_cache); + EXPECT_TRUE(captured->is_disposable); + EXPECT_EQ(captured->file_cache_stats, &stats); + + EXPECT_EQ(stats.inverted_index_request_bytes, 6); + EXPECT_EQ(stats.inverted_index_read_bytes, 6); + EXPECT_EQ(stats.inverted_index_range_read_count, 1); + EXPECT_EQ(stats.inverted_index_serial_read_rounds, 1); + + input->setIoContext(nullptr); + captured = static_cast(input->getIoContext()); + EXPECT_TRUE(captured->is_inverted_index); + EXPECT_TRUE(captured->is_index_data); + EXPECT_EQ(captured->file_cache_stats, nullptr); + + input->setIndexFile(false); + captured = static_cast(input->getIoContext()); + EXPECT_FALSE(captured->is_index_data); + + _CLDELETE(input); +} + // Test 19: FSIndexOutput init error TEST_F(DorisFSDirectoryTest, FSIndexOutputInitError) { DebugPoints::instance()->add( @@ -841,4 +893,4 @@ TEST_F(DorisFSDirectoryTest, PrivGetFN) { } } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 From ac3d730391fa5798b112de8b9d61f54e408ae2aa Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 00:11:06 +0800 Subject: [PATCH 05/12] [improvement](be) Optimize SNII phrase query candidate filtering ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: SNII phrase and phrase-prefix queries spent most CPU time re-scanning phrase candidate docids for every PRX chunk and allocating per-doc expected tail position vectors. On the 10B TextBench cloud benchmark, MATCH_PHRASE 'failed order' took 253.0s wall / 3942.9 CPU-s and MATCH_PHRASE_PREFIX 'failed ord' took 438.3s wall / 6939.5 CPU-s before this optimization. The fix starts PRX candidate filtering from the chunk's first docid, keeps an all-selected fast path, stores expected tail positions in flat CSR-style arrays, and uses a single exact-term fast path for phrase-prefix expected tail positions. 
The same benchmark now runs in about 3.8s / 59.3 CPU-s for phrase and 3.9s / 61-62 CPU-s for phrase-prefix. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter=SniiPhraseQueryTest.* - Manual test: release BE build and cloud_sim E2E TextBench phrase benchmark - Static check: git diff --check; build-support/run-clang-tidy.sh --build-dir be/build_Release passed changed lines in phrase_query.cpp, while the new BE UT source is blocked by a local libstdc++ _POSIX_SEM_VALUE_MAX toolchain header error. - Behavior changed: No - Does this need documentation: No --- .../snii/core/src/query/phrase_query.cpp | 177 +++++++++++++++--- be/test/storage/index/snii_query_test.cpp | 173 +++++++++++++++++ 2 files changed, 319 insertions(+), 31 deletions(-) create mode 100644 be/test/storage/index/snii_query_test.cpp diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index 7389d7a5ec96b6..89601887b9d106 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -53,7 +53,23 @@ namespace { struct ExpectedTailPositions { uint32_t docid = 0; + size_t positions_begin = 0; + size_t positions_end = 0; +}; + +struct ExpectedTailPositionSet { + std::vector docs; std::vector positions; + + void clear() { + docs.clear(); + positions.clear(); + } + + void reserve_docs(size_t count) { + docs.reserve(count); + positions.reserve(count); + } }; // One decoded chunk of a term's posting: a windowed term's covering window, or @@ -115,44 +131,102 @@ Status append_prx_doc_ordinal(size_t ordinal, std::vector* out) { return Status::OK(); } +Status append_selected_ordinal(size_t doc_index, const std::vector& prx_doc_ordinals, + std::vector* selected_ordinals) { + if (!prx_doc_ordinals.empty()) { + selected_ordinals->push_back(prx_doc_ordinals[doc_index]); + return Status::OK(); + } + return 
append_prx_doc_ordinal(doc_index, selected_ordinals); +} + +Status append_selected_doc(size_t doc_index, uint32_t docid, + const std::vector& prx_doc_ordinals, + std::vector* selected_docids, + std::vector* selected_ordinals) { + selected_docids->push_back(docid); + return append_selected_ordinal(doc_index, prx_doc_ordinals, selected_ordinals); +} + +Status materialize_selected_prefix(size_t count, size_t capacity, + const std::vector& docids, + const std::vector& prx_doc_ordinals, + std::vector* selected_docids, + std::vector* selected_ordinals) { + selected_docids->reserve(capacity); + selected_ordinals->reserve(capacity); + selected_docids->insert(selected_docids->end(), docids.begin(), docids.begin() + count); + for (size_t i = 0; i < count; ++i) { + SNII_RETURN_IF_ERROR(append_selected_ordinal(i, prx_doc_ordinals, selected_ordinals)); + } + return Status::OK(); +} + +Status materialize_selected_prefix_if_needed(bool* selected_all, size_t count, size_t capacity, + const std::vector& docids, + const std::vector& prx_doc_ordinals, + std::vector* selected_docids, + std::vector* selected_ordinals) { + if (!*selected_all) { + return Status::OK(); + } + *selected_all = false; + return materialize_selected_prefix(count, capacity, docids, prx_doc_ordinals, selected_docids, + selected_ordinals); +} + Status SelectCandidateDocsForPrx(std::vector* docids, std::vector* prx_doc_ordinals, const std::vector& candidates, PosChunk* chunk) { chunk->docids.clear(); chunk->prx_doc_ordinals.clear(); - if (docids->empty() || candidates.empty()) return Status::OK(); + if (docids->empty() || candidates.empty()) { + return Status::OK(); + } if (!prx_doc_ordinals->empty() && prx_doc_ordinals->size() != docids->size()) { return Status::Corruption("phrase_query: prx ordinal/docid count mismatch"); } std::vector selected_docids; std::vector selected_ordinals; - selected_docids.reserve(std::min(docids->size(), candidates.size())); - selected_ordinals.reserve(selected_docids.capacity()); + 
bool selected_all = true; + const size_t selected_capacity = std::min(docids->size(), candidates.size()); - size_t candidate_index = 0; - for (size_t doc_index = 0; doc_index < docids->size() && candidate_index < candidates.size(); - ++doc_index) { + auto candidate_it = std::ranges::lower_bound(candidates, docids->front()); + size_t candidate_index = static_cast(candidate_it - candidates.begin()); + for (size_t doc_index = 0; doc_index < docids->size(); ++doc_index) { const uint32_t docid = (*docids)[doc_index]; while (candidate_index < candidates.size() && candidates[candidate_index] < docid) { ++candidate_index; } - if (candidate_index == candidates.size()) break; - if (candidates[candidate_index] != docid) continue; + if (candidate_index == candidates.size()) { + SNII_RETURN_IF_ERROR(materialize_selected_prefix_if_needed( + &selected_all, doc_index, selected_capacity, *docids, *prx_doc_ordinals, + &selected_docids, &selected_ordinals)); + break; + } + if (candidates[candidate_index] != docid) { + SNII_RETURN_IF_ERROR(materialize_selected_prefix_if_needed( + &selected_all, doc_index, selected_capacity, *docids, *prx_doc_ordinals, + &selected_docids, &selected_ordinals)); + continue; + } - selected_docids.push_back(docid); - if (prx_doc_ordinals->empty()) { - SNII_RETURN_IF_ERROR(append_prx_doc_ordinal(doc_index, &selected_ordinals)); - } else { - selected_ordinals.push_back((*prx_doc_ordinals)[doc_index]); + if (!selected_all) { + SNII_RETURN_IF_ERROR(append_selected_doc(doc_index, docid, *prx_doc_ordinals, + &selected_docids, &selected_ordinals)); } ++candidate_index; } - if (selected_docids.empty()) return Status::OK(); - if (selected_docids.size() == docids->size()) { + if (selected_all) { chunk->docids = std::move(*docids); chunk->prx_doc_ordinals = std::move(*prx_doc_ordinals); + docids->clear(); + prx_doc_ordinals->clear(); + return Status::OK(); + } + if (selected_docids.empty()) { return Status::OK(); } chunk->docids = std::move(selected_docids); @@ 
-452,7 +526,7 @@ Status CollectExpectedTailPositions(const std::vector& plans, const std::vector& position_offsets, std::vector& srcs, const std::vector& candidates, - std::vector* out) { + ExpectedTailPositionSet* out) { const size_t n = plans.size(); std::vector cur(n); for (size_t i = 0; i < n; ++i) cur[i].init(&srcs[i]); @@ -467,8 +541,7 @@ Status CollectExpectedTailPositions(const std::vector& plans, SNII_RETURN_IF_ERROR(ordered[pp]->positions(&span[pp])); } - ExpectedTailPositions match; - match.docid = d; + const size_t expected_begin = out->positions.size(); for (const uint32_t* p = span[0].first; p != span[0].second; ++p) { const uint32_t start = *p; bool ok = true; @@ -485,17 +558,47 @@ Status CollectExpectedTailPositions(const std::vector& plans, } uint32_t tail_pos = 0; if (ok && internal::add_position_offset(start, position_offsets[n], &tail_pos)) { - match.positions.push_back(tail_pos); + out->positions.push_back(tail_pos); } } - if (!match.positions.empty()) out->push_back(std::move(match)); + const size_t expected_end = out->positions.size(); + if (expected_end != expected_begin) { + out->docs.push_back({d, expected_begin, expected_end}); + } + } + return Status::OK(); +} + +Status CollectSingleTermExpectedTailPositions(std::vector& srcs, + const std::vector& candidates, + uint32_t tail_offset, ExpectedTailPositionSet* out) { + PostingCursor cursor; + cursor.init(srcs.data()); + out->reserve_docs(out->docs.size() + candidates.size()); + + for (uint32_t d : candidates) { + SNII_RETURN_IF_ERROR(cursor.seek(d)); + std::pair span; + SNII_RETURN_IF_ERROR(cursor.positions(&span)); + + const size_t expected_begin = out->positions.size(); + for (const uint32_t* p = span.first; p != span.second; ++p) { + uint32_t tail_pos = 0; + if (internal::add_position_offset(*p, tail_offset, &tail_pos)) { + out->positions.push_back(tail_pos); + } + } + const size_t expected_end = out->positions.size(); + if (expected_end != expected_begin) { + out->docs.push_back({d, 
expected_begin, expected_end}); + } } return Status::OK(); } Status CollectExpectedTailPositions(const LogicalIndexReader& idx, const std::vector& exact_terms, - std::vector* out) { + ExpectedTailPositionSet* out) { out->clear(); snii::io::BatchRangeFetcher round1(idx.reader()); std::vector plans; @@ -505,27 +608,37 @@ Status CollectExpectedTailPositions(const LogicalIndexReader& idx, PhraseExecutionState state; SNII_RETURN_IF_ERROR(BuildPhraseExecutionState(idx, &round1, &plans, &state)); if (state.candidates.empty()) return Status::OK(); + out->reserve_docs(state.candidates.size()); std::vector position_offsets; if (!internal::build_position_offsets(plans.size() + 1, &position_offsets)) { return Status::InvalidArgument( "phrase_prefix_query: phrase length exceeds doc position range"); } + if (plans.size() == 1) { + return CollectSingleTermExpectedTailPositions(state.srcs, state.candidates, + position_offsets[1], out); + } return CollectExpectedTailPositions(plans, position_offsets, state.srcs, state.candidates, out); } -bool contains_any_position(const std::vector& wanted, +bool contains_any_position(const ExpectedTailPositionSet& expected, + const ExpectedTailPositions& wanted, std::pair actual) { - for (uint32_t pos : wanted) { - if (std::binary_search(actual.first, actual.second, pos)) return true; + for (size_t i = wanted.positions_begin; i < wanted.positions_end; ++i) { + if (std::binary_search(actual.first, actual.second, expected.positions[i])) { + return true; + } } return false; } Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx, const ResolvedQueryTerm& tail, - const std::vector& expected, + const ExpectedTailPositionSet& expected, std::vector* out) { - if (expected.empty()) return Status::OK(); + if (expected.docs.empty()) { + return Status::OK(); + } snii::io::BatchRangeFetcher round1(idx.reader()); std::vector plans; @@ -540,8 +653,8 @@ Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx, 
cursor.init(&state.srcs[0]); size_t ei = 0; size_t ti = 0; - while (ei < expected.size() && ti < state.candidates.size()) { - const uint32_t want_doc = expected[ei].docid; + while (ei < expected.docs.size() && ti < state.candidates.size()) { + const uint32_t want_doc = expected.docs[ei].docid; const uint32_t tail_doc = state.candidates[ti]; if (want_doc < tail_doc) { ++ei; @@ -555,7 +668,9 @@ Status CollectTailMatchesAtExpectedPositions(const LogicalIndexReader& idx, SNII_RETURN_IF_ERROR(cursor.seek(want_doc)); std::pair actual; SNII_RETURN_IF_ERROR(cursor.positions(&actual)); - if (contains_any_position(expected[ei].positions, actual)) out->push_back(want_doc); + if (contains_any_position(expected, expected.docs[ei], actual)) { + out->push_back(want_doc); + } ++ei; ++ti; } @@ -635,9 +750,9 @@ Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector expected; + ExpectedTailPositionSet expected; SNII_RETURN_IF_ERROR(CollectExpectedTailPositions(idx, exact_terms, &expected)); - if (expected.empty()) { + if (expected.docs.empty()) { return Status::OK(); } diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp new file mode 100644 index 00000000000000..6604b44f62bad4 --- /dev/null +++ b/be/test/storage/index/snii_query_test.cpp @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include +#include + +#include "snii/common/slice.h" +#include "snii/format/format_constants.h" +#include "snii/io/file_reader.h" +#include "snii/io/file_writer.h" +#include "snii/query/phrase_query.h" +#include "snii/reader/logical_index_reader.h" +#include "snii/reader/snii_segment_reader.h" +#include "snii/writer/snii_compound_writer.h" +#include "snii/writer/spimi_term_buffer.h" + +namespace snii::query { +namespace { + +class MemoryFile final : public snii::io::FileReader, public snii::io::FileWriter { +public: + Status append(Slice data) override { + data_.insert(data_.end(), data.data(), data.data() + data.size()); + return Status::OK(); + } + + Status finalize() override { + finalized_ = true; + return Status::OK(); + } + + uint64_t bytes_written() const override { return data_.size(); } + + Status read_at(uint64_t offset, size_t len, std::vector* out) override { + if (offset > data_.size() || len > data_.size() - offset) { + return Status::Corruption("memory file read past eof"); + } + out->resize(len); + if (len != 0) { + std::memcpy(out->data(), data_.data() + offset, len); + } + return Status::OK(); + } + + uint64_t size() const override { return data_.size(); } + bool finalized() const { return finalized_; } + +private: + std::vector data_; + bool finalized_ = false; +}; + +struct PostingDoc { + uint32_t docid = 0; + std::vector positions; +}; + +writer::TermPostings make_term(std::string term, std::vector docs) { + std::ranges::sort(docs, [](const PostingDoc& lhs, const PostingDoc& rhs) { + 
return lhs.docid < rhs.docid; + }); + + writer::TermPostings posting; + posting.term = std::move(term); + posting.docids.reserve(docs.size()); + posting.freqs.reserve(docs.size()); + for (const PostingDoc& doc : docs) { + posting.docids.push_back(doc.docid); + posting.freqs.push_back(static_cast(doc.positions.size())); + posting.positions_flat.insert(posting.positions_flat.end(), doc.positions.begin(), + doc.positions.end()); + } + return posting; +} + +std::vector docs_with_one_position(uint32_t begin, uint32_t end, uint32_t position) { + std::vector docs; + docs.reserve(end - begin); + for (uint32_t docid = begin; docid < end; ++docid) { + docs.push_back({docid, {position}}); + } + return docs; +} + +void assert_ok(const Status& status) { + ASSERT_TRUE(status.ok()) << status.to_string(); +} + +Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, + reader::LogicalIndexReader* index_reader) { + constexpr uint32_t kDocCount = 9000; + auto failed_docs = docs_with_one_position(0, kDocCount, 0); + auto order_docs = docs_with_one_position(0, kDocCount, 2); + auto ordinal_docs = docs_with_one_position(0, kDocCount, 2); + failed_docs[8000].positions = {0, 4}; + for (PostingDoc& doc : order_docs) { + if (doc.docid == 5000 || doc.docid == 7000) { + doc.positions = {1}; + } else if (doc.docid == 8000) { + doc.positions = {5}; + } + } + for (PostingDoc& doc : ordinal_docs) { + if (doc.docid == 6000) { + doc.positions = {1}; + } + } + + writer::SniiIndexInput input; + input.index_id = 7; + input.index_suffix = "Body"; + input.config = format::IndexConfig::kDocsPositions; + input.doc_count = kDocCount; + input.terms = {make_term("failed", std::move(failed_docs)), + make_term("order", std::move(order_docs)), + make_term("ordinal", std::move(ordinal_docs))}; + + writer::SniiCompoundWriter writer(file); + SNII_RETURN_IF_ERROR(writer.add_logical_index(input)); + SNII_RETURN_IF_ERROR(writer.finish()); + EXPECT_TRUE(file->finalized()); + + 
SNII_RETURN_IF_ERROR(reader::SniiSegmentReader::open(file, segment_reader)); + return segment_reader->open_index(input.index_id, input.index_suffix, index_reader); +} + +TEST(SniiPhraseQueryTest, WindowedPhraseQueryKeepsCorrectCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"failed", "order"}, &docids)); + + const std::vector expected {5000, 7000, 8000}; + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, WindowedPhrasePrefixQueryKeepsCorrectCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_prefix_query(index_reader, {"failed", "ord"}, &docids, 10)); + + const std::vector expected {5000, 6000, 7000, 8000}; + EXPECT_EQ(docids, expected); +} + +} // namespace +} // namespace snii::query From 424591e6bfb36a8780c141df13d8305afb47ea1c Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 01:37:32 +0800 Subject: [PATCH 06/12] [improvement](be) Reduce SNII phrase query CPU ### What problem does this PR solve? Issue Number: N/A Related PR: N/A Problem Summary: SNII MATCH_PHRASE in cloud mode spent most CPU in selective PRX/PFOR decode, candidate ordinal materialization, and generic two-term position checks. This change removes per-doc ordinal Status overhead, uses selected PRX ranges to compact dense PFOR decodes, decodes sparse PFOR runs through a stack buffer, and adds a two-term phrase merge path. In cloud_sim PH5 on textbench_10b_perf.otel10b_phrase40_snii, BE CPU is now about 47.67s on average with SQL and inverted-index query cache disabled, down from roughly 55-59s observed before these optimizations. 
### Release note None ### Check List (For Author) - Test: Unit Test and Manual test - Unit Test: ./run-be-ut.sh --run --filter=SniiPhraseQueryTest.*:SniiPrxPodTest.* - Build: ./build.sh --be -j 192 - Manual test: deployed BE to cloud_sim and ran PH5 benchmark under /mnt/disk15/jiangkai/textbench/runs/20260628_phrase_cpu_opt_final_refactor_nocache - Static check: git diff --check; build-support/run-clang-tidy.sh --build-dir be/build_Release attempted but failed because the local clang/GCC sysroot cannot resolve stddef.h - Behavior changed: No - Does this need documentation: No --- .../index/snii/core/src/format/prx_pod.cpp | 191 ++++++++++++++---- .../snii/core/src/query/docid_conjunction.cpp | 20 +- .../snii/core/src/query/phrase_query.cpp | 31 +++ be/test/storage/index/snii_query_test.cpp | 47 +++++ 4 files changed, 233 insertions(+), 56 deletions(-) diff --git a/be/src/storage/index/snii/core/src/format/prx_pod.cpp b/be/src/storage/index/snii/core/src/format/prx_pod.cpp index a4a21f9056abfc..9bd28451ae3127 100644 --- a/be/src/storage/index/snii/core/src/format/prx_pod.cpp +++ b/be/src/storage/index/snii/core/src/format/prx_pod.cpp @@ -1,6 +1,7 @@ #include "snii/format/prx_pod.h" #include +#include #include #include #include @@ -298,54 +299,105 @@ Status validate_doc_ordinals(std::span doc_ordinals, uint32_t do return Status::OK(); } -Status decode_pfor_payload_csr_selective(Slice plain, std::span doc_ordinals, - std::vector* pos_flat, - std::vector* pos_off) { - ByteSource src(plain); - uint32_t doc_count = 0, total_pos = 0; - SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); - SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); - if (total_pos > kMaxWindowPositions) { - return Status::Corruption("prx: position count exceeds sane cap"); +struct SelectedRange { + SelectedRange(uint32_t begin_, uint32_t end_, uint32_t out_begin_) + : begin(begin_), end(end_), out_begin(out_begin_) {} + + uint32_t begin; + uint32_t end; + uint32_t out_begin; +}; + +uint32_t 
count_covered_pfor_runs(std::span selected, uint32_t total_pos) { + if (selected.empty() || total_pos == 0) { + return 0; } - if (doc_count > kMaxWindowDocs) { - return Status::Corruption("prx: doc count exceeds sane cap"); + uint32_t runs = 0; + uint32_t next_run = 0; + for (const SelectedRange& range : selected) { + if (range.begin == range.end) { + continue; + } + const uint32_t first_run = range.begin / kFrqBaseUnit; + const uint32_t last_run = (range.end - 1) / kFrqBaseUnit; + const uint32_t counted_first = std::max(first_run, next_run); + if (counted_first <= last_run) { + runs += last_run - counted_first + 1; + next_run = last_run + 1; + } } - SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); + return runs; +} - std::vector pos_counts; - SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); - uint64_t sum = 0; - for (uint32_t d = 0; d < doc_count; ++d) sum += pos_counts[d]; - if (sum != total_pos) return Status::Corruption("prx: pos_count sum mismatch"); +bool should_decode_full_prx_positions(std::span selected, + uint32_t selected_pos_count, uint32_t total_pos) { + if (selected.empty() || total_pos == 0) { + return false; + } + if (selected_pos_count * 2 >= total_pos) { + return true; + } + const uint32_t total_runs = (total_pos + kFrqBaseUnit - 1) / kFrqBaseUnit; + const uint32_t covered_runs = count_covered_pfor_runs(selected, total_pos); + return covered_runs * 4 >= total_runs * 3; +} - pos_flat->clear(); - pos_off->clear(); - pos_off->reserve(doc_ordinals.size() + 1); - pos_off->push_back(0); +void compact_selected_pfor_positions(std::span selected, + std::vector& pos_flat, + std::vector& pos_off) { + size_t write_off = 0; + pos_off.clear(); + pos_off.reserve(selected.size() + 1); + pos_off.push_back(0); + for (const SelectedRange& range : selected) { + const uint32_t count = range.end - range.begin; + if (count == 1) { + pos_flat[write_off++] = pos_flat[range.begin]; + pos_off.push_back(static_cast(write_off)); + 
continue; + } + uint32_t prev = 0; + for (uint32_t i = 0; i < count; ++i) { + const uint32_t delta = pos_flat[range.begin + i]; + prev = (i == 0) ? delta : prev + delta; + pos_flat[write_off++] = prev; + } + pos_off.push_back(static_cast(write_off)); + } + pos_flat.resize(write_off); +} - struct SelectedRange { - uint32_t begin = 0; - uint32_t end = 0; - uint32_t out_begin = 0; - }; - std::vector selected; +uint32_t build_selected_pfor_ranges(std::span pos_counts, + std::span doc_ordinals, + std::vector& selected, + std::vector& pos_off) { + selected.clear(); selected.reserve(doc_ordinals.size()); + pos_off.clear(); + pos_off.reserve(doc_ordinals.size() + 1); + pos_off.push_back(0); + + uint32_t selected_pos_count = 0; uint32_t delta_begin = 0; size_t next_doc = 0; - for (uint32_t d = 0; d < doc_count; ++d) { + for (uint32_t d = 0; d < static_cast(pos_counts.size()); ++d) { const uint32_t count = pos_counts[d]; if (next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == d) { - const uint32_t out_begin = static_cast(pos_flat->size()); - selected.push_back(SelectedRange {delta_begin, delta_begin + count, out_begin}); - pos_flat->resize(pos_flat->size() + count); - pos_off->push_back(static_cast(pos_flat->size())); + selected.push_back( + SelectedRange {delta_begin, delta_begin + count, selected_pos_count}); + selected_pos_count += count; + pos_off.push_back(selected_pos_count); ++next_doc; } delta_begin += count; } + return selected_pos_count; +} - std::vector run_buf; +Status decode_sparse_selected_pfor_positions(ByteSource* src, uint32_t total_pos, + std::span selected, + std::span pos_flat) { + std::array run_buf {}; size_t range_idx = 0; for (uint32_t run_begin = 0; run_begin < total_pos; run_begin += kFrqBaseUnit) { const uint32_t run_len = std::min(kFrqBaseUnit, total_pos - run_begin); @@ -354,31 +406,84 @@ Status decode_pfor_payload_csr_selective(Slice plain, std::span ++range_idx; } if (range_idx == selected.size() || selected[range_idx].begin >= run_end) 
{ - SNII_RETURN_IF_ERROR(pfor_skip(&src, run_len)); + SNII_RETURN_IF_ERROR(pfor_skip(src, run_len)); continue; } - SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, run_len, &run_buf)); + SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, run_buf.data())); for (size_t ri = range_idx; ri < selected.size() && selected[ri].begin < run_end; ++ri) { const SelectedRange& range = selected[ri]; const uint32_t copy_begin = std::max(range.begin, run_begin); const uint32_t copy_end = std::min(range.end, run_end); const uint32_t dst_begin = range.out_begin + copy_begin - range.begin; - for (uint32_t off = copy_begin; off < copy_end; ++off) { - (*pos_flat)[dst_begin + off - copy_begin] = run_buf[off - run_begin]; - } + std::copy_n(run_buf.data() + copy_begin - run_begin, copy_end - copy_begin, + pos_flat.data() + dst_begin); } } + return Status::OK(); +} - for (size_t i = 0; i < doc_ordinals.size(); ++i) { +void restore_selected_position_deltas(const std::vector& pos_off, + std::span pos_flat) { + for (size_t i = 0; i + 1 < pos_off.size(); ++i) { uint32_t prev = 0; - for (uint32_t off = (*pos_off)[i]; off < (*pos_off)[i + 1]; ++off) { - uint32_t& value = (*pos_flat)[off]; - prev = (off == (*pos_off)[i]) ? value : prev + value; + for (uint32_t off = pos_off[i]; off < pos_off[i + 1]; ++off) { + uint32_t& value = pos_flat[off]; + prev = (off == pos_off[i]) ? 
value : prev + value; value = prev; } } - if (!src.eof()) return Status::Corruption("prx: trailing bytes after pfor payload"); +} + +Status decode_pfor_payload_csr_selective(Slice plain, std::span doc_ordinals, + std::vector* pos_flat, + std::vector* pos_off) { + ByteSource src(plain); + uint32_t doc_count = 0, total_pos = 0; + SNII_RETURN_IF_ERROR(src.get_varint32(&doc_count)); + SNII_RETURN_IF_ERROR(src.get_varint32(&total_pos)); + if (total_pos > kMaxWindowPositions) { + return Status::Corruption("prx: position count exceeds sane cap"); + } + if (doc_count > kMaxWindowDocs) { + return Status::Corruption("prx: doc count exceeds sane cap"); + } + SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); + + std::vector pos_counts; + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); + uint64_t sum = 0; + for (uint32_t d = 0; d < doc_count; ++d) { + sum += pos_counts[d]; + } + if (sum != total_pos) { + return Status::Corruption("prx: pos_count sum mismatch"); + } + + pos_flat->clear(); + + std::vector selected; + const uint32_t selected_pos_count = + build_selected_pfor_ranges(pos_counts, doc_ordinals, selected, *pos_off); + + if (should_decode_full_prx_positions(selected, selected_pos_count, total_pos)) { + SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, pos_flat)); + compact_selected_pfor_positions(selected, *pos_flat, *pos_off); + if (!src.eof()) { + return Status::Corruption("prx: trailing bytes after pfor payload"); + } + return Status::OK(); + } + + pos_flat->resize(selected_pos_count); + SNII_RETURN_IF_ERROR(decode_sparse_selected_pfor_positions( + &src, total_pos, selected, std::span(pos_flat->data(), pos_flat->size()))); + + restore_selected_position_deltas(*pos_off, + std::span(pos_flat->data(), pos_flat->size())); + if (!src.eof()) { + return Status::Corruption("prx: trailing bytes after pfor payload"); + } return Status::OK(); } diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp 
b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp index a2477eaf576682..373cfc5fd0fbdb 100644 --- a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -126,14 +126,6 @@ Status append_docid_range(uint32_t first, uint32_t last, std::vector* return Status::OK(); } -Status append_docid_ordinal(size_t ordinal, std::vector* out) { - if (ordinal > std::numeric_limits::max()) { - return Status::Corruption("docid_conjunction: doc ordinal exceeds u32"); - } - out->push_back(static_cast(ordinal)); - return Status::OK(); -} - void append_candidate_range(const std::vector& candidates, uint32_t first, uint32_t last, std::vector* out) { const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); @@ -151,8 +143,7 @@ Status append_candidate_range_with_ordinals(const std::vector& candida for (auto it = begin; it != end; ++it) { out->push_back(*it); chunk->docids.push_back(*it); - SNII_RETURN_IF_ERROR(append_docid_ordinal( - static_cast(*it) - static_cast(first), &chunk->prx_doc_ordinals)); + chunk->prx_doc_ordinals.push_back(*it - first); } return Status::OK(); } @@ -224,8 +215,11 @@ Status intersect_window_candidates_with_ordinals(const std::vector& ca const auto end = std::upper_bound(begin, candidates.end(), last); if (begin == end || term_docids.empty()) return Status::OK(); - chunk->docids.reserve(static_cast(end - begin)); - chunk->prx_doc_ordinals.reserve(static_cast(end - begin)); + const size_t candidate_count = static_cast(end - begin); + out->reserve(out->size() + candidate_count); + chunk->docids.reserve(candidate_count); + chunk->prx_doc_ordinals.reserve(candidate_count); + size_t doc_index = 0; for (auto it = begin; it != end; ++it) { while (doc_index < term_docids.size() && term_docids[doc_index] < *it) { @@ -235,7 +229,7 @@ Status intersect_window_candidates_with_ordinals(const std::vector& ca if (term_docids[doc_index] != *it) continue; 
out->push_back(*it); chunk->docids.push_back(*it); - SNII_RETURN_IF_ERROR(append_docid_ordinal(doc_index, &chunk->prx_doc_ordinals)); + chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); ++doc_index; } return Status::OK(); diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index 89601887b9d106..75e9fe0fcf36b3 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -437,6 +437,30 @@ size_t AnchorPhrasePosition(const std::vector& plans, return anchor; } +bool ContainsTwoTermPhrase(std::pair left_span, + std::pair right_span, + uint32_t right_delta) { + const uint32_t* left = left_span.first; + const uint32_t* right = right_span.first; + while (left != left_span.second && right != right_span.second) { + uint32_t want = 0; + if (!internal::add_position_offset(*left, right_delta, &want)) { + return false; + } + while (right != right_span.second && *right < want) { + ++right; + } + if (right == right_span.second) { + return false; + } + if (*right == want) { + return true; + } + ++left; + } + return false; +} + // Single streaming pass over the candidates: for each (ascending) candidate, // advance every term's cursor to it, gather each term's positions IN PHRASE // ORDER, and test the consecutive-phrase predicate (term[0]@p, term[1]@p+1, @@ -462,6 +486,13 @@ Status EmitPhraseStreaming(const std::vector& plans, for (size_t pp = 0; pp < phrase_len; ++pp) { SNII_RETURN_IF_ERROR(cur[phrase_plan_index[pp]].positions(&span[pp])); } + if (phrase_len == 2) { + if (ContainsTwoTermPhrase(span[0], span[1], + position_offsets[1] - position_offsets[0])) { + docids->push_back(d); + } + continue; + } bool match = false; for (const uint32_t* p = span[anchor].first; p != span[anchor].second; ++p) { if (*p < anchor_offset) continue; diff --git a/be/test/storage/index/snii_query_test.cpp 
b/be/test/storage/index/snii_query_test.cpp index 6604b44f62bad4..2899eae10dd1d4 100644 --- a/be/test/storage/index/snii_query_test.cpp +++ b/be/test/storage/index/snii_query_test.cpp @@ -24,7 +24,10 @@ #include #include "snii/common/slice.h" +#include "snii/encoding/byte_sink.h" +#include "snii/encoding/byte_source.h" #include "snii/format/format_constants.h" +#include "snii/format/prx_pod.h" #include "snii/io/file_reader.h" #include "snii/io/file_writer.h" #include "snii/query/phrase_query.h" @@ -169,5 +172,49 @@ TEST(SniiPhraseQueryTest, WindowedPhrasePrefixQueryKeepsCorrectCandidateOrdinals EXPECT_EQ(docids, expected); } +TEST(SniiPrxPodTest, SelectivePforCsrMatchesFullCsrAcrossRuns) { + std::vector freqs; + std::vector positions; + freqs.reserve(320); + for (uint32_t doc = 0; doc < 320; ++doc) { + const uint32_t freq = (doc % 5 == 0) ? 2 : 1; + freqs.push_back(freq); + positions.push_back(doc * 3); + if (freq == 2) { + positions.push_back(doc * 3 + 2); + } + } + + ByteSink sink; + assert_ok(format::build_prx_window_flat(positions, freqs, -1, &sink)); + + std::vector full_positions; + std::vector full_offsets; + ByteSource full_source(sink.view()); + assert_ok(format::read_prx_window_csr(&full_source, &full_positions, &full_offsets)); + + auto assert_selected_matches_full = [&](const std::vector& selected_docs) { + std::vector selected_positions; + std::vector selected_offsets; + ByteSource selected_source(sink.view()); + assert_ok(format::read_prx_window_csr_selective(&selected_source, selected_docs, + &selected_positions, &selected_offsets)); + + ASSERT_EQ(selected_offsets.size(), selected_docs.size() + 1); + for (size_t i = 0; i < selected_docs.size(); ++i) { + const uint32_t doc = selected_docs[i]; + const std::vector expected(full_positions.begin() + full_offsets[doc], + full_positions.begin() + full_offsets[doc + 1]); + const std::vector actual( + selected_positions.begin() + selected_offsets[i], + selected_positions.begin() + selected_offsets[i + 1]); + 
EXPECT_EQ(actual, expected); + } + }; + + assert_selected_matches_full({0, 1, 2}); + assert_selected_matches_full({0, 1, 127, 128, 129, 255, 256, 319}); +} + } // namespace } // namespace snii::query From eae1bba79fc550908d2824623d932001e5b06aa1 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 02:06:20 +0800 Subject: [PATCH 07/12] [improvement](be) Optimize SNII phrase query hot paths ### What problem does this PR solve? Issue Number: N/A Related PR: #64909 Problem Summary: SNII phrase and phrase-prefix queries spent most CPU in docid intersection, selective PRX/PFOR decode, and position verification. This change avoids generating PRX ordinals for full on-disk windows so the reader can use the full CSR path, folds selective PRX count validation into selected-range construction, removes hot-loop overflow helper calls from two-term phrase matching, and routes single-tail phrase-prefix queries through the streaming phrase path to avoid materializing all expected tail positions. On the 10B cloud_sim PP5 cold query, SNII BE CPU improved from 72.20s to 63.29s and HWM dropped from 20.26GB to 7.08GB. 
### Release note None ### Check List (For Author) - Test: Unit Test and Manual test - Unit Test: ./run-be-ut.sh --run --filter=SniiPhraseQueryTest.*:SniiPrxPodTest.* - Static check: build-support/check-format.sh; git diff --check - Manual test: ./build.sh --be -j 192 - Manual test: cloud_sim cold query benchmark for PH5/PP5 via /mnt/disk15/jiangkai/textbench/cold_query_bench.sh - Behavior changed: No - Does this need documentation: No --- .../index/snii/core/src/format/prx_pod.cpp | 18 ++++++------- .../snii/core/src/query/docid_conjunction.cpp | 24 ++++++++++++++++-- .../snii/core/src/query/phrase_query.cpp | 25 +++++++++++++++++-- be/test/storage/index/snii_query_test.cpp | 13 ++++++++++ 4 files changed, 67 insertions(+), 13 deletions(-) diff --git a/be/src/storage/index/snii/core/src/format/prx_pod.cpp b/be/src/storage/index/snii/core/src/format/prx_pod.cpp index 9bd28451ae3127..ef966569e20455 100644 --- a/be/src/storage/index/snii/core/src/format/prx_pod.cpp +++ b/be/src/storage/index/snii/core/src/format/prx_pod.cpp @@ -370,7 +370,7 @@ void compact_selected_pfor_positions(std::span selected, uint32_t build_selected_pfor_ranges(std::span pos_counts, std::span doc_ordinals, std::vector& selected, - std::vector& pos_off) { + std::vector& pos_off, uint64_t* total_pos_count) { selected.clear(); selected.reserve(doc_ordinals.size()); pos_off.clear(); @@ -380,8 +380,10 @@ uint32_t build_selected_pfor_ranges(std::span pos_counts, uint32_t selected_pos_count = 0; uint32_t delta_begin = 0; size_t next_doc = 0; + uint64_t sum = 0; for (uint32_t d = 0; d < static_cast(pos_counts.size()); ++d) { const uint32_t count = pos_counts[d]; + sum += count; if (next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == d) { selected.push_back( SelectedRange {delta_begin, delta_begin + count, selected_pos_count}); @@ -391,6 +393,7 @@ uint32_t build_selected_pfor_ranges(std::span pos_counts, } delta_begin += count; } + *total_pos_count = sum; return selected_pos_count; } @@ -452,19 
+455,16 @@ Status decode_pfor_payload_csr_selective(Slice plain, std::span std::vector pos_counts; SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); - uint64_t sum = 0; - for (uint32_t d = 0; d < doc_count; ++d) { - sum += pos_counts[d]; - } - if (sum != total_pos) { - return Status::Corruption("prx: pos_count sum mismatch"); - } pos_flat->clear(); std::vector selected; + uint64_t sum = 0; const uint32_t selected_pos_count = - build_selected_pfor_ranges(pos_counts, doc_ordinals, selected, *pos_off); + build_selected_pfor_ranges(pos_counts, doc_ordinals, selected, *pos_off, &sum); + if (sum != total_pos) { + return Status::Corruption("prx: pos_count sum mismatch"); + } if (should_decode_full_prx_positions(selected, selected_pos_count, total_pos)) { SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, total_pos, pos_flat)); diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp index 373cfc5fd0fbdb..275d85ffe9b3ff 100644 --- a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -138,8 +138,17 @@ Status append_candidate_range_with_ordinals(const std::vector& candida DocidChunk* chunk) { const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); const auto end = std::upper_bound(begin, candidates.end(), last); - chunk->docids.reserve(static_cast(end - begin)); - chunk->prx_doc_ordinals.reserve(static_cast(end - begin)); + const size_t candidate_count = static_cast(end - begin); + chunk->docids.reserve(candidate_count); + const uint64_t width = static_cast(last) - first + 1; + const bool full_dense_range = + candidate_count == width && begin != end && *begin == first && *(end - 1) == last; + if (full_dense_range) { + out->insert(out->end(), begin, end); + chunk->docids.insert(chunk->docids.end(), begin, end); + return Status::OK(); + } + 
chunk->prx_doc_ordinals.reserve(candidate_count); for (auto it = begin; it != end; ++it) { out->push_back(*it); chunk->docids.push_back(*it); @@ -219,6 +228,12 @@ Status intersect_window_candidates_with_ordinals(const std::vector& ca out->reserve(out->size() + candidate_count); chunk->docids.reserve(candidate_count); chunk->prx_doc_ordinals.reserve(candidate_count); + if (candidate_count == term_docids.size() && *begin == term_docids.front() && + *(end - 1) == term_docids.back() && std::equal(begin, end, term_docids.begin())) { + out->insert(out->end(), begin, end); + chunk->docids.insert(chunk->docids.end(), begin, end); + return Status::OK(); + } size_t doc_index = 0; for (auto it = begin; it != end; ++it) { @@ -232,6 +247,11 @@ Status intersect_window_candidates_with_ordinals(const std::vector& ca chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); ++doc_index; } + if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && + chunk->docids.front() == term_docids.front() && + chunk->docids.back() == term_docids.back()) { + chunk->prx_doc_ordinals.clear(); + } return Status::OK(); } diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index 75e9fe0fcf36b3..0e419440e6af7d 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -442,11 +443,12 @@ bool ContainsTwoTermPhrase(std::pair left_span uint32_t right_delta) { const uint32_t* left = left_span.first; const uint32_t* right = right_span.first; + const uint32_t max_start = std::numeric_limits::max() - right_delta; while (left != left_span.second && right != right_span.second) { - uint32_t want = 0; - if (!internal::add_position_offset(*left, right_delta, &want)) { + if (*left > max_start) { return false; } + const uint32_t want = *left + right_delta; while (right 
!= right_span.second && *right < want) { ++right; } @@ -553,6 +555,18 @@ Status ExecutePhrasePlans(const LogicalIndexReader& idx, snii::io::BatchRangeFet state.candidates, docids); } +Status ExecuteResolvedPhraseTerms(const LogicalIndexReader& idx, + const std::vector& terms, + std::vector* docids) { + snii::io::BatchRangeFetcher round1(idx.reader()); + std::vector plans; + SNII_RETURN_IF_ERROR(internal::plan_resolved_terms(idx, terms, &round1, &plans, + /*need_positions=*/false)); + std::vector phrase_plan_index(terms.size()); + std::iota(phrase_plan_index.begin(), phrase_plan_index.end(), 0); + return ExecutePhrasePlans(idx, &round1, &plans, phrase_plan_index, docids); +} + Status CollectExpectedTailPositions(const std::vector& plans, const std::vector& position_offsets, std::vector& srcs, @@ -780,6 +794,13 @@ Status phrase_prefix_query(const LogicalIndexReader& idx, const std::vector resolved_terms = exact_terms; + resolved_terms.push_back(ResolvedQueryTerm {std::move(tail_hits.front().entry), + tail_hits.front().frq_base, + tail_hits.front().prx_base}); + return ExecuteResolvedPhraseTerms(idx, resolved_terms, docids); + } ExpectedTailPositionSet expected; SNII_RETURN_IF_ERROR(CollectExpectedTailPositions(idx, exact_terms, &expected)); diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp index 2899eae10dd1d4..aad1cff4449079 100644 --- a/be/test/storage/index/snii_query_test.cpp +++ b/be/test/storage/index/snii_query_test.cpp @@ -172,6 +172,19 @@ TEST(SniiPhraseQueryTest, WindowedPhrasePrefixQueryKeepsCorrectCandidateOrdinals EXPECT_EQ(docids, expected); } +TEST(SniiPhraseQueryTest, SingleTailPhrasePrefixUsesStreamingPhrasePath) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_prefix_query(index_reader, {"failed", "orde"}, &docids, 10)); + + const 
std::vector expected {5000, 7000, 8000}; + EXPECT_EQ(docids, expected); +} + TEST(SniiPrxPodTest, SelectivePforCsrMatchesFullCsrAcrossRuns) { std::vector freqs; std::vector positions; From 1b26ee87d6325c41967a7f00ea647699bbdd1804 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 03:35:34 +0800 Subject: [PATCH 08/12] [improvement](be) Optimize SNII inverted index query hot paths ### What problem does this PR solve? Issue Number: N/A Related PR: N/A Problem Summary: SNII high-df term and phrase queries spent avoidable CPU in vector-to-Roaring materialization, dense docid expansion, selected PRX range construction, repeated final-candidate filtering, and sorted docid conjunction. The CPU profile showed phrase execution dominated by docid conjunction, PFOR PRX decode, and selective PRX CSR compaction instead of remote I/O. This change streams eligible term, prefix, regexp, and wildcard query results directly into Roaring, emits dense docid windows as ranges, carries PRX doc ordinal context through phrase execution, builds selected PRX count ranges during PFOR decode, skips redundant final-candidate filtering, and adds sparse galloping paths for docid conjunction. It also caps conjunction reserve sizes to the maximum possible match count and refactors the SNII reader query dispatch to keep the storage reader control flow smaller. 
### Release note None ### Check List (For Author) - Test: Unit Test / Manual test - Unit Test: ./run-be-ut.sh --run --filter=SniiPhraseQueryTest.*:SniiTermQueryTest.*:SniiPrxPodTest.* - Manual test: ./build.sh --be -j 192 - Manual test: deployed BE to cloud_sim and ran support_phrase MATCH_PHRASE smoke query - Manual test: cloud_sim cold benchmark for OP_term_high_df, PH5_phrase_failed_order, and PP5_phrase_prefix_failed - Static check: git diff --check; build-support/check-format.sh - Static check: clang-tidy was attempted with build-support/run-clang-tidy.sh --build-dir be/build_Release, but this environment failed before useful changed-line validation with missing stddef.h/toolchain include errors and pre-existing full-file warnings. - Behavior changed: No - Does this need documentation: No --- be/src/snii/query/docid_sink.h | 20 ++ .../snii/query/internal/docid_conjunction.h | 2 + .../query/internal/docid_posting_reader.h | 5 + be/src/snii/query/term_query.h | 3 + .../index/snii/core/src/format/prx_pod.cpp | 52 +++--- .../snii/core/src/query/docid_conjunction.cpp | 70 ++++++- .../core/src/query/docid_posting_reader.cpp | 92 +++++++++- .../snii/core/src/query/phrase_query.cpp | 89 +++++++-- .../index/snii/core/src/query/term_query.cpp | 8 +- .../storage/index/snii/snii_index_reader.cpp | 171 ++++++++++++------ be/src/storage/index/snii/snii_index_reader.h | 2 - be/test/storage/index/snii_query_test.cpp | 37 ++++ 12 files changed, 445 insertions(+), 106 deletions(-) diff --git a/be/src/snii/query/docid_sink.h b/be/src/snii/query/docid_sink.h index de08d24b9ce164..9fc5dc2d9739d3 100644 --- a/be/src/snii/query/docid_sink.h +++ b/be/src/snii/query/docid_sink.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -14,6 +15,7 @@ class DocIdSink { public: virtual ~DocIdSink() = default; virtual Status append_sorted(std::span docids) = 0; + virtual Status append_range(uint32_t first, uint64_t last_exclusive) = 0; }; class VectorDocIdSink final : public 
DocIdSink { @@ -25,6 +27,24 @@ class VectorDocIdSink final : public DocIdSink { return Status::OK(); } + Status append_range(uint32_t first, uint64_t last_exclusive) override { + if (last_exclusive <= first) { + return Status::OK(); + } + if (last_exclusive > static_cast(std::numeric_limits::max()) + 1) { + return Status::InvalidArgument("docid_sink: range exceeds uint32 docid space"); + } + const uint64_t count = last_exclusive - first; + if (count > static_cast(docids_.max_size() - docids_.size())) { + return Status::InvalidArgument("docid_sink: range too large"); + } + docids_.reserve(docids_.size() + static_cast(count)); + for (uint64_t docid = first; docid < last_exclusive; ++docid) { + docids_.push_back(static_cast(docid)); + } + return Status::OK(); + } + private: std::vector& docids_; }; diff --git a/be/src/snii/query/internal/docid_conjunction.h b/be/src/snii/query/internal/docid_conjunction.h index f97ac781a2c364..3cb6cc42f5a294 100644 --- a/be/src/snii/query/internal/docid_conjunction.h +++ b/be/src/snii/query/internal/docid_conjunction.h @@ -36,12 +36,14 @@ struct TermPlan { struct DocidChunk { std::vector docids; std::vector prx_doc_ordinals; + uint32_t prx_doc_count = 0; bool windowed = false; uint32_t window = 0; }; struct DocidSource { std::vector chunks; + bool docids_are_final_candidates = false; }; Status resolve_query_term(const snii::reader::LogicalIndexReader& idx, const std::string& term, diff --git a/be/src/snii/query/internal/docid_posting_reader.h b/be/src/snii/query/internal/docid_posting_reader.h index cf297d4082ccd9..bf5927b5857335 100644 --- a/be/src/snii/query/internal/docid_posting_reader.h +++ b/be/src/snii/query/internal/docid_posting_reader.h @@ -5,6 +5,7 @@ #include "snii/common/status.h" #include "snii/format/dict_entry.h" +#include "snii/query/docid_sink.h" #include "snii/reader/logical_index_reader.h" namespace snii::query::internal { @@ -22,6 +23,10 @@ Status read_docid_posting(const snii::reader::LogicalIndexReader& idx, 
const snii::format::DictEntry& entry, uint64_t frq_base, uint64_t prx_base, std::vector* docids); +Status read_docid_posting(const snii::reader::LogicalIndexReader& idx, + const snii::format::DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, snii::query::DocIdSink* sink); + // Batch counterpart for multi-term docid-only operators. Windowed terms share one // prelude fetch round and one docid fetch round, so OR-style operators pay by // stage rather than by term. diff --git a/be/src/snii/query/term_query.h b/be/src/snii/query/term_query.h index 959e5a0ad20a7b..c804405a2ec104 100644 --- a/be/src/snii/query/term_query.h +++ b/be/src/snii/query/term_query.h @@ -5,6 +5,7 @@ #include #include "snii/common/status.h" +#include "snii/query/docid_sink.h" #include "snii/query/query_profile.h" #include "snii/reader/logical_index_reader.h" @@ -16,6 +17,8 @@ namespace snii::query { Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, std::vector* docids); +Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, + DocIdSink* sink); Status term_query(const snii::reader::LogicalIndexReader& idx, std::string_view term, std::vector* docids, QueryProfile* profile); diff --git a/be/src/storage/index/snii/core/src/format/prx_pod.cpp b/be/src/storage/index/snii/core/src/format/prx_pod.cpp index ef966569e20455..7d90cb3ead5df6 100644 --- a/be/src/storage/index/snii/core/src/format/prx_pod.cpp +++ b/be/src/storage/index/snii/core/src/format/prx_pod.cpp @@ -367,34 +367,42 @@ void compact_selected_pfor_positions(std::span selected, pos_flat.resize(write_off); } -uint32_t build_selected_pfor_ranges(std::span pos_counts, - std::span doc_ordinals, - std::vector& selected, - std::vector& pos_off, uint64_t* total_pos_count) { +Status decode_selected_pfor_count_ranges(ByteSource* src, uint32_t doc_count, + std::span doc_ordinals, + std::vector& selected, + std::vector& pos_off, uint64_t* total_pos_count, + uint32_t* 
selected_pos_count) { selected.clear(); selected.reserve(doc_ordinals.size()); pos_off.clear(); pos_off.reserve(doc_ordinals.size() + 1); pos_off.push_back(0); - uint32_t selected_pos_count = 0; + *selected_pos_count = 0; uint32_t delta_begin = 0; size_t next_doc = 0; - uint64_t sum = 0; - for (uint32_t d = 0; d < static_cast(pos_counts.size()); ++d) { - const uint32_t count = pos_counts[d]; - sum += count; - if (next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == d) { - selected.push_back( - SelectedRange {delta_begin, delta_begin + count, selected_pos_count}); - selected_pos_count += count; - pos_off.push_back(selected_pos_count); - ++next_doc; + *total_pos_count = 0; + std::array run_buf {}; + for (uint32_t run_begin = 0; run_begin < doc_count; run_begin += kFrqBaseUnit) { + const uint32_t run_len = std::min(kFrqBaseUnit, doc_count - run_begin); + SNII_RETURN_IF_ERROR(pfor_decode(src, run_len, run_buf.data())); + for (uint32_t i = 0; i < run_len; ++i) { + const uint32_t d = run_begin + i; + const uint32_t count = run_buf[i]; + *total_pos_count += count; + if (next_doc < doc_ordinals.size() && doc_ordinals[next_doc] == d) { + selected.emplace_back(delta_begin, delta_begin + count, *selected_pos_count); + *selected_pos_count += count; + pos_off.push_back(*selected_pos_count); + ++next_doc; + } + delta_begin += count; } - delta_begin += count; } - *total_pos_count = sum; - return selected_pos_count; + if (next_doc != doc_ordinals.size()) { + return Status::Corruption("prx: selected doc ordinal was not decoded"); + } + return Status::OK(); } Status decode_sparse_selected_pfor_positions(ByteSource* src, uint32_t total_pos, @@ -453,15 +461,13 @@ Status decode_pfor_payload_csr_selective(Slice plain, std::span } SNII_RETURN_IF_ERROR(validate_doc_ordinals(doc_ordinals, doc_count)); - std::vector pos_counts; - SNII_RETURN_IF_ERROR(decode_pfor_runs(&src, doc_count, &pos_counts)); - pos_flat->clear(); std::vector selected; uint64_t sum = 0; - const uint32_t 
selected_pos_count = - build_selected_pfor_ranges(pos_counts, doc_ordinals, selected, *pos_off, &sum); + uint32_t selected_pos_count = 0; + SNII_RETURN_IF_ERROR(decode_selected_pfor_count_ranges(&src, doc_count, doc_ordinals, selected, + *pos_off, &sum, &selected_pos_count)); if (sum != total_pos) { return Status::Corruption("prx: pos_count sum mismatch"); } diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp index 275d85ffe9b3ff..9964b903ec37d1 100644 --- a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -141,6 +141,10 @@ Status append_candidate_range_with_ordinals(const std::vector& candida const size_t candidate_count = static_cast(end - begin); chunk->docids.reserve(candidate_count); const uint64_t width = static_cast(last) - first + 1; + if (width > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: dense window exceeds doc count range"); + } + chunk->prx_doc_count = static_cast(width); const bool full_dense_range = candidate_count == width && begin != end && *begin == first && *(end - 1) == last; if (full_dense_range) { @@ -220,14 +224,18 @@ Status intersect_window_candidates_with_ordinals(const std::vector& ca const std::vector& term_docids, uint32_t first, uint32_t last, std::vector* out, DocidChunk* chunk) { + if (term_docids.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk->prx_doc_count = static_cast(term_docids.size()); const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); const auto end = std::upper_bound(begin, candidates.end(), last); if (begin == end || term_docids.empty()) return Status::OK(); const size_t candidate_count = static_cast(end - begin); - out->reserve(out->size() + candidate_count); - chunk->docids.reserve(candidate_count); - 
chunk->prx_doc_ordinals.reserve(candidate_count); + const size_t max_matches = std::min(candidate_count, term_docids.size()); + out->reserve(out->size() + max_matches); + chunk->docids.reserve(max_matches); if (candidate_count == term_docids.size() && *begin == term_docids.front() && *(end - 1) == term_docids.back() && std::equal(begin, end, term_docids.begin())) { out->insert(out->end(), begin, end); @@ -235,6 +243,50 @@ Status intersect_window_candidates_with_ordinals(const std::vector& ca return Status::OK(); } + chunk->prx_doc_ordinals.reserve(max_matches); + const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1; + if (candidate_count < term_docids.size() / probes_per_candidate) { + size_t doc_index = 0; + for (auto it = begin; it != end; ++it) { + const auto found = + std::lower_bound(term_docids.begin() + doc_index, term_docids.end(), *it); + if (found == term_docids.end()) break; + doc_index = static_cast(found - term_docids.begin()); + if (*found != *it) continue; + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); + ++doc_index; + } + if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && + chunk->docids.front() == term_docids.front() && + chunk->docids.back() == term_docids.back()) { + chunk->prx_doc_ordinals.clear(); + } + return Status::OK(); + } + + const size_t probes_per_term_doc = log2_ceil(candidate_count) + 1; + if (term_docids.size() < candidate_count / probes_per_term_doc) { + auto candidate_it = begin; + for (size_t doc_index = 0; doc_index < term_docids.size(); ++doc_index) { + const uint32_t docid = term_docids[doc_index]; + candidate_it = std::lower_bound(candidate_it, end, docid); + if (candidate_it == end) break; + if (*candidate_it != docid) continue; + out->push_back(docid); + chunk->docids.push_back(docid); + chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); + ++candidate_it; + } + if (chunk->docids.size() == term_docids.size() 
&& !chunk->docids.empty() && + chunk->docids.front() == term_docids.front() && + chunk->docids.back() == term_docids.back()) { + chunk->prx_doc_ordinals.clear(); + } + return Status::OK(); + } + size_t doc_index = 0; for (auto it = begin; it != end; ++it) { while (doc_index < term_docids.size() && term_docids[doc_index] < *it) { @@ -321,6 +373,7 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla DocidChunk chunk; chunk.windowed = true; chunk.window = w; + chunk.prx_doc_count = meta.doc_count; if (candidates == nullptr) { SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, &chunk.docids)); } else { @@ -365,6 +418,10 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla chunk.window = f.ordinal; if (candidates == nullptr) { chunk.docids = docs; + if (docs.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(docs.size()); source->chunks.push_back(std::move(chunk)); } else { uint32_t first = 0; @@ -407,6 +464,10 @@ Status collect_docids_only(const LogicalIndexReader& idx, const snii::io::BatchR SNII_RETURN_IF_ERROR(decode_flat_docids_only(round1, p, &term_docids)); if (source != nullptr) { DocidChunk chunk; + if (term_docids.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(term_docids.size()); if (candidates == nullptr) { chunk.docids = term_docids; } else if (!term_docids.empty()) { @@ -439,6 +500,9 @@ Status build_docid_only_conjunction_impl(const LogicalIndexReader& idx, DocidSource* source = sources == nullptr ? nullptr : &(*sources)[ti]; SNII_RETURN_IF_ERROR(collect_docids_only(idx, round1, plans[ti], k == 0 ? 
nullptr : candidates, &next, source)); + if (source != nullptr && k + 1 == order.size()) { + source->docids_are_final_candidates = true; + } *candidates = std::move(next); if (candidates->empty()) return Status::OK(); } diff --git a/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp index 18a487b31bac01..206221ffc5dbbc 100644 --- a/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_posting_reader.cpp @@ -116,13 +116,45 @@ Status window_dd_slice(Slice dd_block, const WindowMeta& meta, Slice* out) { return Status::OK(); } +Status first_docid_in_window(const WindowMeta& meta, uint32_t window_ordinal, uint32_t* first) { + if (window_ordinal == 0) { + *first = 0; + return Status::OK(); + } + if (meta.win_base >= std::numeric_limits::max()) { + return Status::Corruption("docid_posting_reader: window base exceeds docid range"); + } + *first = static_cast(meta.win_base + 1); + if (*first > meta.last_docid) { + return Status::Corruption("docid_posting_reader: invalid window docid range"); + } + return Status::OK(); +} + +Status is_dense_full_window(const WindowMeta& meta, uint32_t window_ordinal, bool* full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, window_ordinal, &first)); + const uint64_t width = static_cast(meta.last_docid) - first + 1; + *full = meta.doc_count == width; + return Status::OK(); +} + Status decode_flat_plan(const snii::io::BatchRangeFetcher& fetcher, const FlatPlan& plan, std::vector* out) { return decode_flat_docs(*plan.entry, fetcher.get(plan.handle), out); } +Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, + DocIdSink* sink); + Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, std::vector* out) { + VectorDocIdSink sink(*out); + return decode_window_prefix_plan(fetcher, 
plan, &sink); +} + +Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, const WindowPlan& plan, + DocIdSink* sink) { const DictEntry& entry = plan.posting->entry; const Slice prefix = fetcher.get(plan.prefix_handle); if (entry.prelude_len > prefix.size()) { @@ -140,18 +172,30 @@ Status decode_window_prefix_plan(const snii::io::BatchRangeFetcher& fetcher, con return Status::Corruption("docid_posting_reader: docs prefix length mismatch"); } const Slice dd_block = prefix.subslice(prelude_len, prefix.size() - prelude_len); + std::vector docs; + std::vector freqs; + std::vector> positions; for (uint32_t w = 0; w < prelude.n_windows(); ++w) { WindowMeta meta; Slice dd_region; SNII_RETURN_IF_ERROR(prelude.window(w, &meta)); SNII_RETURN_IF_ERROR(window_dd_slice(dd_block, meta, &dd_region)); - std::vector docs; - std::vector freqs; - std::vector> positions; + bool dense_full = false; + SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full)); + if (dense_full) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first)); + SNII_RETURN_IF_ERROR( + sink->append_range(first, static_cast(meta.last_docid) + 1)); + continue; + } + docs.clear(); + freqs.clear(); + positions.clear(); SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( meta, dd_region, Slice(), Slice(), /*want_positions=*/false, /*want_freq=*/false, &docs, &freqs, &positions)); - out->insert(out->end(), docs.begin(), docs.end()); + SNII_RETURN_IF_ERROR(sink->append_sorted(docs)); } return Status::OK(); } @@ -163,11 +207,41 @@ Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, if (docids == nullptr) { return Status::InvalidArgument("docid_posting_reader: null out"); } - std::vector> batched; - SNII_RETURN_IF_ERROR(read_docid_postings_batched( - idx, {ResolvedDocidPosting {entry, frq_base, prx_base}}, &batched)); - *docids = std::move(batched.front()); - return Status::OK(); + docids->clear(); + VectorDocIdSink sink(*docids); 
+ return read_docid_posting(idx, entry, frq_base, prx_base, &sink); +} + +Status read_docid_posting(const LogicalIndexReader& idx, const DictEntry& entry, uint64_t frq_base, + uint64_t prx_base, DocIdSink* sink) { + if (sink == nullptr) { + return Status::InvalidArgument("docid_posting_reader: null sink"); + } + ResolvedDocidPosting posting {entry, frq_base, prx_base}; + if (posting.entry.kind == DictEntryKind::kInline) { + std::vector docs; + SNII_RETURN_IF_ERROR(decode_inline_docs(posting.entry, &docs)); + return sink->append_sorted(docs); + } + + snii::io::BatchRangeFetcher docs_fetcher(idx.reader()); + if (posting.entry.enc == DictEntryEnc::kWindowed) { + WindowPlan plan; + plan.out_index = 0; + plan.posting = &posting; + SNII_RETURN_IF_ERROR(plan_window_prefix(idx, &plan, &docs_fetcher)); + if (docs_fetcher.pending() > 0) SNII_RETURN_IF_ERROR(docs_fetcher.fetch()); + return decode_window_prefix_plan(docs_fetcher, plan, sink); + } + + FlatPlan plan; + plan.out_index = 0; + plan.entry = &posting.entry; + SNII_RETURN_IF_ERROR(plan_flat_docs(idx, posting, &docs_fetcher, &plan)); + if (docs_fetcher.pending() > 0) SNII_RETURN_IF_ERROR(docs_fetcher.fetch()); + std::vector docs; + SNII_RETURN_IF_ERROR(decode_flat_plan(docs_fetcher, plan, &docs)); + return sink->append_sorted(docs); } Status read_docid_postings_batched(const LogicalIndexReader& idx, diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index 0e419440e6af7d..1fef749d78a5dd 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -85,6 +85,7 @@ struct PosChunk { // `prx_doc_ordinals[i]`, allowing PRX decode to skip positions for docs that // were removed by the docid-only conjunction. 
std::vector prx_doc_ordinals; + uint32_t prx_doc_count = 0; Slice prx; // .prx window bytes (reference fetcher/round1/entry) bool windowed = false; uint32_t window = 0; @@ -177,10 +178,15 @@ Status materialize_selected_prefix_if_needed(bool* selected_all, size_t count, s } Status SelectCandidateDocsForPrx(std::vector* docids, - std::vector* prx_doc_ordinals, + std::vector* prx_doc_ordinals, uint32_t prx_doc_count, const std::vector& candidates, PosChunk* chunk) { chunk->docids.clear(); chunk->prx_doc_ordinals.clear(); + if (prx_doc_count == 0 && docids->size() > std::numeric_limits::max()) { + return Status::Corruption("phrase_query: prx doc count exceeds u32"); + } + chunk->prx_doc_count = + prx_doc_count == 0 ? static_cast(docids->size()) : prx_doc_count; if (docids->empty() || candidates.empty()) { return Status::OK(); } @@ -243,9 +249,13 @@ Status BuildFlatPositionSource(const LogicalIndexReader& idx, PosChunk chunk; std::vector docids; std::vector prx_doc_ordinals; + const bool docids_are_final_candidates = + doc_source->docids_are_final_candidates && !doc_source->chunks.empty(); if (!doc_source->chunks.empty()) { - docids = std::move(doc_source->chunks.front().docids); - prx_doc_ordinals = std::move(doc_source->chunks.front().prx_doc_ordinals); + DocidChunk& doc_chunk = doc_source->chunks.front(); + docids = std::move(doc_chunk.docids); + prx_doc_ordinals = std::move(doc_chunk.prx_doc_ordinals); + chunk.prx_doc_count = doc_chunk.prx_doc_count; } if (p.pod_ref) { uint64_t poff = 0; @@ -268,8 +278,19 @@ Status BuildFlatPositionSource(const LogicalIndexReader& idx, } SNII_RETURN_IF_ERROR(snii::format::decode_dd_region(dd, p.entry.dd_meta, /*win_base=*/0, &docids)); + if (docids.size() > std::numeric_limits::max()) { + return Status::Corruption("phrase_query: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(docids.size()); } - SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx(&docids, &prx_doc_ordinals, candidates, &chunk)); + if 
(docids_are_final_candidates) { + chunk.docids = std::move(docids); + chunk.prx_doc_ordinals = std::move(prx_doc_ordinals); + if (!chunk.docids.empty()) src->chunks.push_back(std::move(chunk)); + return Status::OK(); + } + SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx(&docids, &prx_doc_ordinals, chunk.prx_doc_count, + candidates, &chunk)); if (!chunk.docids.empty()) src->chunks.push_back(std::move(chunk)); return Status::OK(); } @@ -295,13 +316,23 @@ Status DecodeWindowedPositionSource( fetched.reserve(doc_source->chunks.size()); for (size_t i = 0; i < doc_source->chunks.size(); ++i) { DocidChunk& doc_chunk = doc_source->chunks[i]; - if (!ChunkMayContainCandidate(doc_chunk, candidates)) continue; + if (!doc_source->docids_are_final_candidates && + !ChunkMayContainCandidate(doc_chunk, candidates)) { + continue; + } if (!doc_chunk.windowed) { return Status::Corruption("phrase_query: expected windowed doc chunk"); } PosChunk chunk; - SNII_RETURN_IF_ERROR(SelectCandidateDocsForPrx( - &doc_chunk.docids, &doc_chunk.prx_doc_ordinals, candidates, &chunk)); + if (doc_source->docids_are_final_candidates) { + chunk.docids = std::move(doc_chunk.docids); + chunk.prx_doc_ordinals = std::move(doc_chunk.prx_doc_ordinals); + chunk.prx_doc_count = doc_chunk.prx_doc_count; + } else { + SNII_RETURN_IF_ERROR( + SelectCandidateDocsForPrx(&doc_chunk.docids, &doc_chunk.prx_doc_ordinals, + doc_chunk.prx_doc_count, candidates, &chunk)); + } if (chunk.docids.empty()) continue; snii::reader::WindowAbsRange range; @@ -361,6 +392,7 @@ class PostingCursor { ci_ = 0; li_ = 0; decoded_pos_chunk_ = kNoChunk; + offsets_by_prx_ordinal_ = false; } // Positions the cursor at `target` (guaranteed present: candidates are the @@ -390,19 +422,32 @@ class PostingCursor { } if (decoded_pos_chunk_ != ci_) { ByteSource ps(src_->chunks[ci_].prx); - if (src_->chunks[ci_].prx_doc_ordinals.empty()) { + const PosChunk& chunk = src_->chunks[ci_]; + offsets_by_prx_ordinal_ = false; + if 
(chunk.prx_doc_ordinals.empty()) { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); + } else if (should_decode_full_prx_window(chunk)) { SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); + offsets_by_prx_ordinal_ = true; } else { SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr_selective( - &ps, src_->chunks[ci_].prx_doc_ordinals, &pflat_, &poff_)); + &ps, chunk.prx_doc_ordinals, &pflat_, &poff_)); } - if (poff_.size() != src_->chunks[ci_].docids.size() + 1) { - return Status::Corruption("phrase_query: prx/dd doc-count mismatch"); + if (offsets_by_prx_ordinal_) { + if (poff_.size() != static_cast(chunk.prx_doc_count) + 1) { + return Status::Corruption("phrase_query: full prx doc-count mismatch"); + } + } else if (poff_.size() != chunk.docids.size() + 1) { + return Status::Corruption("phrase_query: selected prx/doc-count mismatch"); } decoded_pos_chunk_ = ci_; } - const uint32_t begin = poff_[li_]; - const uint32_t end = poff_[li_ + 1]; + const size_t pos_index = position_offset_index(); + if (pos_index + 1 >= poff_.size()) { + return Status::Corruption("phrase_query: prx ordinal offset out of range"); + } + const uint32_t begin = poff_[pos_index]; + const uint32_t end = poff_[pos_index + 1]; if (begin == end) { *out = {nullptr, nullptr}; return Status::OK(); @@ -416,12 +461,26 @@ class PostingCursor { private: static constexpr size_t kNoChunk = static_cast(-1); + + static bool should_decode_full_prx_window(const PosChunk& chunk) { + return chunk.prx_doc_count != 0 && + static_cast(chunk.prx_doc_ordinals.size()) * 2 >= chunk.prx_doc_count; + } + + size_t position_offset_index() const { + if (!offsets_by_prx_ordinal_) { + return li_; + } + return src_->chunks[ci_].prx_doc_ordinals[li_]; + } + const PosSource* src_ = nullptr; size_t ci_ = 0; // current chunk size_t li_ = 0; // current local doc index within the chunk size_t decoded_pos_chunk_ = kNoChunk; // which chunk pflat_/poff_ currently hold - 
std::vector pflat_; // current chunk's flat positions (reused) - std::vector poff_; // current chunk's per-doc offsets (reused) + bool offsets_by_prx_ordinal_ = false; + std::vector pflat_; // current chunk's flat positions (reused) + std::vector poff_; // current chunk's per-doc offsets (reused) }; size_t AnchorPhrasePosition(const std::vector& plans, diff --git a/be/src/storage/index/snii/core/src/query/term_query.cpp b/be/src/storage/index/snii/core/src/query/term_query.cpp index 19b4e4138974d6..4cf6e97bc2471b 100644 --- a/be/src/storage/index/snii/core/src/query/term_query.cpp +++ b/be/src/storage/index/snii/core/src/query/term_query.cpp @@ -14,6 +14,12 @@ Status term_query(const LogicalIndexReader& idx, std::string_view term, std::vector* docids) { if (docids == nullptr) return Status::InvalidArgument("term_query: null out"); docids->clear(); + VectorDocIdSink sink(*docids); + return term_query(idx, term, &sink); +} + +Status term_query(const LogicalIndexReader& idx, std::string_view term, DocIdSink* sink) { + if (sink == nullptr) return Status::InvalidArgument("term_query: null sink"); bool found = false; DictEntry entry; @@ -21,7 +27,7 @@ Status term_query(const LogicalIndexReader& idx, std::string_view term, uint64_t prx_base = 0; SNII_RETURN_IF_ERROR(idx.lookup(term, &found, &entry, &frq_base, &prx_base)); if (!found) return Status::OK(); - return internal::read_docid_posting(idx, entry, frq_base, prx_base, docids); + return internal::read_docid_posting(idx, entry, frq_base, prx_base, sink); } Status term_query(const LogicalIndexReader& idx, std::string_view term, diff --git a/be/src/storage/index/snii/snii_index_reader.cpp b/be/src/storage/index/snii/snii_index_reader.cpp index 995c4ba51c3980..2b7129074d92a7 100644 --- a/be/src/storage/index/snii/snii_index_reader.cpp +++ b/be/src/storage/index/snii/snii_index_reader.cpp @@ -34,11 +34,13 @@ #include "runtime/runtime_state.h" #include "snii/format/null_bitmap.h" #include "snii/query/boolean_query.h" 
+#include "snii/query/docid_sink.h" #include "snii/query/phrase_query.h" #include "snii/query/prefix_query.h" #include "snii/query/regexp_query.h" #include "snii/query/term_query.h" #include "snii/query/wildcard_query.h" +#include "snii/reader/logical_index_reader.h" #include "storage/index/index_file_reader.h" #include "storage/index/inverted/analyzer/analyzer.h" #include "storage/index/inverted/inverted_index_cache.h" @@ -49,6 +51,34 @@ namespace doris::segment_v2 { namespace { +class RoaringDocIdSink final : public snii::query::DocIdSink { +public: + explicit RoaringDocIdSink(roaring::Roaring* bitmap) : _bitmap(bitmap) { + DCHECK(_bitmap != nullptr); + } + + snii::Status append_sorted(std::span docids) override { + if (!docids.empty()) { + _bitmap->addMany(docids.size(), docids.data()); + } + return snii::Status::OK(); + } + + snii::Status append_range(uint32_t first, uint64_t last_exclusive) override { + if (last_exclusive > first) { + _bitmap->addRange(first, last_exclusive); + } + return snii::Status::OK(); + } + +private: + roaring::Roaring* _bitmap; +}; + +struct SniiQueryExecutionResult { + std::shared_ptr bitmap; +}; + std::vector to_terms(const InvertedIndexQueryInfo& query_info) { std::vector terms; terms.reserve(query_info.term_infos.size()); @@ -117,6 +147,90 @@ std::string build_snii_query_cache_value(const InvertedIndexQueryInfo& query_inf return cache_value; } +std::shared_ptr docids_to_bitmap(const std::vector& docids) { + auto result = std::make_shared(); + if (!docids.empty()) { + result->addMany(docids.size(), docids.data()); + } + result->runOptimize(); + return result; +} + +Status execute_snii_query(const snii::reader::LogicalIndexReader& logical_reader, + InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, std::string_view search_str, + const std::vector& terms, int32_t max_expansions, + SniiQueryExecutionResult* result) { + result->bitmap = std::make_shared(); + RoaringDocIdSink sink(result->bitmap.get()); + 
std::vector docids; + bool emitted_to_sink = false; + snii::Status status; + switch (query_type) { + case InvertedIndexQueryType::EQUAL_QUERY: + case InvertedIndexQueryType::MATCH_ANY_QUERY: + status = terms.size() == 1 ? snii::query::term_query(logical_reader, terms.front(), &sink) + : snii::query::boolean_or(logical_reader, terms, &sink); + emitted_to_sink = true; + break; + case InvertedIndexQueryType::MATCH_ALL_QUERY: + if (terms.size() == 1) { + status = snii::query::term_query(logical_reader, terms.front(), &sink); + emitted_to_sink = true; + } else { + status = snii::query::boolean_and(logical_reader, terms, &docids); + } + break; + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: + if (query_info.slop != 0) { + return Status::Error( + "SNII does not support sloppy phrase query yet"); + } + if (terms.size() == 1) { + status = snii::query::term_query(logical_reader, terms.front(), &sink); + emitted_to_sink = true; + } else { + status = snii::query::phrase_query(logical_reader, terms, &docids); + } + break; + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: + if (terms.size() == 1) { + status = + snii::query::prefix_query(logical_reader, terms.front(), &sink, max_expansions); + emitted_to_sink = true; + } else { + status = snii::query::phrase_prefix_query(logical_reader, terms, &docids, + max_expansions); + } + break; + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: + status = snii::query::regexp_query(logical_reader, search_str, &sink, max_expansions); + emitted_to_sink = true; + break; + case InvertedIndexQueryType::WILDCARD_QUERY: + status = snii::query::wildcard_query(logical_reader, search_str, &sink, max_expansions); + emitted_to_sink = true; + break; + case InvertedIndexQueryType::LESS_THAN_QUERY: + case InvertedIndexQueryType::LESS_EQUAL_QUERY: + case InvertedIndexQueryType::GREATER_THAN_QUERY: + case InvertedIndexQueryType::GREATER_EQUAL_QUERY: + case InvertedIndexQueryType::RANGE_QUERY: + return Status::Error( + "SNII inverted index storage 
format does not support BKD/range query"); + default: + return Status::Error( + "SNII unsupported inverted index query type {}", query_type_to_string(query_type)); + } + RETURN_IF_ERROR(snii_doris::to_doris_status(status)); + if (emitted_to_sink) { + result->bitmap->runOptimize(); + } else { + result->bitmap = docids_to_bitmap(docids); + } + return Status::OK(); +} + } // namespace Status SniiIndexReader::new_iterator(std::unique_ptr* iterator) { @@ -181,16 +295,6 @@ Status SniiIndexReader::_parse_query_terms(const IndexQueryContextPtr& context, return Status::OK(); } -void SniiIndexReader::_docids_to_bitmap(const std::vector& docids, - std::shared_ptr* bit_map) { - auto result = std::make_shared(); - if (!docids.empty()) { - result->addMany(docids.size(), docids.data()); - } - result->runOptimize(); - *bit_map = std::move(result); -} - Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::string& column_name, const Field& query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map, @@ -245,49 +349,10 @@ Status SniiIndexReader::query(const IndexQueryContextPtr& context, const std::st _index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx)); auto logical_reader = DORIS_TRY(_index_file_reader->open_snii_index(&_index_meta)); - std::vector docids; - snii::Status status; - switch (query_type) { - case InvertedIndexQueryType::EQUAL_QUERY: - case InvertedIndexQueryType::MATCH_ANY_QUERY: - status = terms.size() == 1 - ? snii::query::term_query(*logical_reader, terms.front(), &docids) - : snii::query::boolean_or(*logical_reader, terms, &docids); - break; - case InvertedIndexQueryType::MATCH_ALL_QUERY: - status = snii::query::boolean_and(*logical_reader, terms, &docids); - break; - case InvertedIndexQueryType::MATCH_PHRASE_QUERY: - if (query_info.slop != 0) { - return Status::Error( - "SNII does not support sloppy phrase query yet"); - } - status = terms.size() == 1 - ? 
snii::query::term_query(*logical_reader, terms.front(), &docids) - : snii::query::phrase_query(*logical_reader, terms, &docids); - break; - case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: - status = snii::query::phrase_prefix_query(*logical_reader, terms, &docids, max_expansions); - break; - case InvertedIndexQueryType::MATCH_REGEXP_QUERY: - status = snii::query::regexp_query(*logical_reader, search_str, &docids, max_expansions); - break; - case InvertedIndexQueryType::WILDCARD_QUERY: - status = snii::query::wildcard_query(*logical_reader, search_str, &docids, max_expansions); - break; - case InvertedIndexQueryType::LESS_THAN_QUERY: - case InvertedIndexQueryType::LESS_EQUAL_QUERY: - case InvertedIndexQueryType::GREATER_THAN_QUERY: - case InvertedIndexQueryType::GREATER_EQUAL_QUERY: - case InvertedIndexQueryType::RANGE_QUERY: - return Status::Error( - "SNII inverted index storage format does not support BKD/range query"); - default: - return Status::Error( - "SNII unsupported inverted index query type {}", query_type_to_string(query_type)); - } - RETURN_IF_ERROR(snii_doris::to_doris_status(status)); - _docids_to_bitmap(docids, &bit_map); + SniiQueryExecutionResult query_result; + RETURN_IF_ERROR(execute_snii_query(*logical_reader, query_type, query_info, search_str, terms, + max_expansions, &query_result)); + bit_map = std::move(query_result.bitmap); cache->insert(cache_key, bit_map, &cache_handler); return Status::OK(); } diff --git a/be/src/storage/index/snii/snii_index_reader.h b/be/src/storage/index/snii/snii_index_reader.h index e7c4b00bf68c0b..5b504802a28f9f 100644 --- a/be/src/storage/index/snii/snii_index_reader.h +++ b/be/src/storage/index/snii/snii_index_reader.h @@ -53,8 +53,6 @@ class SniiIndexReader final : public InvertedIndexReader { InvertedIndexQueryType query_type, const InvertedIndexAnalyzerCtx* analyzer_ctx, InvertedIndexQueryInfo* query_info); - static void _docids_to_bitmap(const std::vector& docids, - std::shared_ptr* bit_map); 
InvertedIndexReaderType _reader_type; }; diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp index aad1cff4449079..2a67c74ae02520 100644 --- a/be/test/storage/index/snii_query_test.cpp +++ b/be/test/storage/index/snii_query_test.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,9 @@ #include "snii/format/prx_pod.h" #include "snii/io/file_reader.h" #include "snii/io/file_writer.h" +#include "snii/query/docid_sink.h" #include "snii/query/phrase_query.h" +#include "snii/query/term_query.h" #include "snii/reader/logical_index_reader.h" #include "snii/reader/snii_segment_reader.h" #include "snii/writer/snii_compound_writer.h" @@ -72,6 +75,25 @@ class MemoryFile final : public snii::io::FileReader, public snii::io::FileWrite bool finalized_ = false; }; +class RecordingDocIdSink final : public DocIdSink { +public: + Status append_sorted(std::span docids) override { + out.insert(out.end(), docids.begin(), docids.end()); + return Status::OK(); + } + + Status append_range(uint32_t first, uint64_t last_exclusive) override { + ++range_calls; + for (uint64_t docid = first; docid < last_exclusive; ++docid) { + out.push_back(static_cast(docid)); + } + return Status::OK(); + } + + std::vector out; + size_t range_calls = 0; +}; + struct PostingDoc { uint32_t docid = 0; std::vector positions; @@ -185,6 +207,21 @@ TEST(SniiPhraseQueryTest, SingleTailPhrasePrefixUsesStreamingPhrasePath) { EXPECT_EQ(docids, expected); } +TEST(SniiTermQueryTest, WindowedDenseTermEmitsRangesToSink) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + RecordingDocIdSink sink; + assert_ok(term_query(index_reader, "failed", &sink)); + + std::vector expected(9000); + std::iota(expected.begin(), expected.end(), 0); + EXPECT_EQ(sink.out, expected); + EXPECT_GT(sink.range_calls, 0); +} + 
TEST(SniiPrxPodTest, SelectivePforCsrMatchesFullCsrAcrossRuns) { std::vector freqs; std::vector positions; From 370b1e81b0b647932860ce62381ce9c75f09cc27 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 04:51:01 +0800 Subject: [PATCH 09/12] [improvement](be) Optimize SNII phrase query CPU path ### What problem does this PR solve? Issue Number: N/A Related PR: N/A Problem Summary: SNII phrase queries on the 10B cloud benchmark spent most CPU in PRX position decode, posting cursor iteration, and docid conjunction. This change adds a two-term phrase streaming path, uses an adjacent-pair prefilter for multi-term phrase verification, reuses candidate ranges during docid conjunction, and adds low bit-width PFOR unpack paths for common PRX count and delta widths. A near-dense ordinal shortcut was tested and removed because it regressed the PH5 and PP5 phrase cases. In the cloud_sim 10B cold benchmark, PH5 improved from 6181 ms wall time and 60.22 s BE CPU to 5794 ms wall time and 55.43 s BE CPU; PP5 improved from 6211 ms wall time and 59.82 s BE CPU to 6038 ms wall time and 56.26 s BE CPU. PH5 pprof shows pfor_decode self CPU reduced to about 11.4% after the low bit-width fast paths. 
### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter='SniiPforTest.*:SniiPhraseQueryTest.*:SniiTermQueryTest.*:SniiPrxPodTest.*' - Manual test: ./build.sh --be -j 192 - Manual test: cloud_sim PH5/PP5 cold benchmark at /mnt/disk15/jiangkai/textbench/runs/20260628_phrase_cpu_final_pfor_next_cold - Manual test: cloud_sim PH5 pprof at /mnt/disk15/jiangkai/textbench/runs/20260628_phrase_final_pfor_next_pprof - Static Check: build-support/clang-format.sh, build-support/check-format.sh, and build-support/run-clang-tidy.sh --build-dir be/build_Release - Behavior changed: No - Does this need documentation: No --- .../index/snii/core/src/encoding/pfor.cpp | 61 ++++++ .../snii/core/src/query/docid_conjunction.cpp | 83 ++++--- .../snii/core/src/query/phrase_query.cpp | 204 +++++++++++++++--- be/test/storage/index/snii_query_test.cpp | 78 ++++++- 4 files changed, 361 insertions(+), 65 deletions(-) diff --git a/be/src/storage/index/snii/core/src/encoding/pfor.cpp b/be/src/storage/index/snii/core/src/encoding/pfor.cpp index 19f6442185a556..98862d7f5ee8a9 100644 --- a/be/src/storage/index/snii/core/src/encoding/pfor.cpp +++ b/be/src/storage/index/snii/core/src/encoding/pfor.cpp @@ -89,6 +89,67 @@ Status bitunpack(ByteSource* src, size_t n, uint8_t w, uint32_t* out) { Slice buf; SNII_RETURN_IF_ERROR(src->get_bytes(packed, &buf)); const uint8_t* base = buf.data(); + + if (w == 1) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 1U; + out[i + 1] = (v >> 1) & 1U; + out[i + 2] = (v >> 2) & 1U; + out[i + 3] = (v >> 3) & 1U; + out[i + 4] = (v >> 4) & 1U; + out[i + 5] = (v >> 5) & 1U; + out[i + 6] = (v >> 6) & 1U; + out[i + 7] = (v >> 7) & 1U; + } + if (i < n) { + const uint8_t v = base[byte]; + for (uint8_t bit = 0; i < n; ++i, ++bit) { + out[i] = (v >> bit) & 1U; + } + } + return Status::OK(); + } + if (w == 2) { + size_t i = 0; + size_t byte = 0; + for (; i + 4 <= 
n; i += 4, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 3U; + out[i + 1] = (v >> 2) & 3U; + out[i + 2] = (v >> 4) & 3U; + out[i + 3] = (v >> 6) & 3U; + } + if (i < n) { + const uint8_t v = base[byte]; + for (uint8_t shift = 0; i < n; ++i, shift += 2) { + out[i] = (v >> shift) & 3U; + } + } + return Status::OK(); + } + if (w == 4) { + size_t i = 0; + size_t byte = 0; + for (; i + 2 <= n; i += 2, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 15U; + out[i + 1] = (v >> 4) & 15U; + } + if (i < n) { + out[i] = base[byte] & 15U; + } + return Status::OK(); + } + if (w == 8) { + for (size_t i = 0; i < n; ++i) { + out[i] = base[i]; + } + return Status::OK(); + } + const uint64_t mask = low_mask(w); // Fast path: values whose 8-byte load window stays inside the buffer diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp index 9964b903ec37d1..a8f946e9c4f7bf 100644 --- a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -19,6 +19,13 @@ using snii::reader::LogicalIndexReader; namespace { +using CandidateIt = std::vector::const_iterator; + +struct CandidateRange { + size_t begin = 0; + size_t end = 0; +}; + Status slim_frq_docs_len(const DictEntry& entry, uint64_t win_len, uint64_t* out) { if (entry.frq_docs_len > win_len) { return Status::Corruption("docid_conjunction: slim frq_docs_len exceeds frq window"); @@ -126,18 +133,23 @@ Status append_docid_range(uint32_t first, uint32_t last, std::vector* return Status::OK(); } -void append_candidate_range(const std::vector& candidates, uint32_t first, uint32_t last, - std::vector* out) { - const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); +CandidateRange find_candidate_range(const std::vector& candidates, size_t* search_begin, + uint32_t first, uint32_t last) { + const auto from = candidates.begin() + 
*search_begin; + const auto begin = std::lower_bound(from, candidates.end(), first); const auto end = std::upper_bound(begin, candidates.end(), last); + *search_begin = static_cast(end - candidates.begin()); + return {.begin = static_cast(begin - candidates.begin()), + .end = static_cast(end - candidates.begin())}; +} + +void append_candidate_range(CandidateIt begin, CandidateIt end, std::vector* out) { out->insert(out->end(), begin, end); } -Status append_candidate_range_with_ordinals(const std::vector& candidates, uint32_t first, +Status append_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, uint32_t first, uint32_t last, std::vector* out, DocidChunk* chunk) { - const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); - const auto end = std::upper_bound(begin, candidates.end(), last); const size_t candidate_count = static_cast(end - begin); chunk->docids.reserve(candidate_count); const uint64_t width = static_cast(last) - first + 1; @@ -172,11 +184,9 @@ size_t log2_ceil(size_t n) { return bits; } -void intersect_window_candidates(const std::vector& candidates, - const std::vector& term_docids, uint32_t first, - uint32_t last, std::vector* out) { - const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); - const auto end = std::upper_bound(begin, candidates.end(), last); +void intersect_window_candidate_range(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, uint32_t first, + uint32_t last, std::vector* out) { const size_t candidate_count = static_cast(end - begin); if (candidate_count == 0 || term_docids.empty()) return; @@ -220,16 +230,14 @@ void intersect_window_candidates(const std::vector& candidates, std::back_inserter(*out)); } -Status intersect_window_candidates_with_ordinals(const std::vector& candidates, - const std::vector& term_docids, - uint32_t first, uint32_t last, - std::vector* out, DocidChunk* chunk) { +Status 
intersect_window_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + std::vector* out, + DocidChunk* chunk) { if (term_docids.size() > std::numeric_limits::max()) { return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); } chunk->prx_doc_count = static_cast(term_docids.size()); - const auto begin = std::lower_bound(candidates.begin(), candidates.end(), first); - const auto end = std::upper_bound(begin, candidates.end(), last); if (begin == end || term_docids.empty()) return Status::OK(); const size_t candidate_count = static_cast(end - begin); @@ -354,6 +362,7 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla struct FetchedWindow { uint32_t ordinal = 0; WindowMeta meta; + CandidateRange candidates; size_t handle = 0; }; @@ -361,14 +370,23 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla std::vector fetched; fetched.reserve(windows.size()); out->reserve(candidates == nullptr ? 
p.entry.df : candidates->size()); + size_t candidate_search_begin = 0; for (uint32_t w : windows) { WindowMeta meta; SNII_RETURN_IF_ERROR(p.prelude.window(w, &meta)); + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first)); + CandidateRange candidate_range; + if (candidates != nullptr) { + candidate_range = find_candidate_range(*candidates, &candidate_search_begin, first, + meta.last_docid); + if (candidate_range.begin == candidate_range.end) { + continue; + } + } bool dense_full = false; SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full)); if (dense_full) { - uint32_t first = 0; - SNII_RETURN_IF_ERROR(first_docid_in_window(meta, w, &first)); if (source != nullptr) { DocidChunk chunk; chunk.windowed = true; @@ -377,15 +395,18 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla if (candidates == nullptr) { SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, &chunk.docids)); } else { + const auto begin = candidates->begin() + candidate_range.begin; + const auto end = candidates->begin() + candidate_range.end; SNII_RETURN_IF_ERROR(append_candidate_range_with_ordinals( - *candidates, first, meta.last_docid, out, &chunk)); + begin, end, first, meta.last_docid, out, &chunk)); } source->chunks.push_back(std::move(chunk)); } if (candidates == nullptr) { SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, out)); } else if (source == nullptr) { - append_candidate_range(*candidates, first, meta.last_docid, out); + append_candidate_range(candidates->begin() + candidate_range.begin, + candidates->begin() + candidate_range.end, out); } continue; } @@ -397,6 +418,7 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla FetchedWindow f; f.ordinal = w; f.meta = meta; + f.candidates = candidate_range; f.handle = fetcher.add(range.dd_off, range.dd_len); fetched.push_back(f); } @@ -424,10 +446,10 @@ Status collect_windowed_docids_only(const LogicalIndexReader& 
idx, const TermPla chunk.prx_doc_count = static_cast(docs.size()); source->chunks.push_back(std::move(chunk)); } else { - uint32_t first = 0; - SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); - SNII_RETURN_IF_ERROR(intersect_window_candidates_with_ordinals( - *candidates, docs, first, f.meta.last_docid, out, &chunk)); + const auto begin = candidates->begin() + f.candidates.begin; + const auto end = candidates->begin() + f.candidates.end; + SNII_RETURN_IF_ERROR(intersect_window_candidate_range_with_ordinals( + begin, end, docs, out, &chunk)); if (!chunk.docids.empty()) source->chunks.push_back(std::move(chunk)); } } @@ -438,7 +460,9 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla if (source != nullptr) continue; uint32_t first = 0; SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); - intersect_window_candidates(*candidates, docs, first, f.meta.last_docid, out); + intersect_window_candidate_range(candidates->begin() + f.candidates.begin, + candidates->begin() + f.candidates.end, docs, first, + f.meta.last_docid, out); } return Status::OK(); } @@ -471,9 +495,10 @@ Status collect_docids_only(const LogicalIndexReader& idx, const snii::io::BatchR if (candidates == nullptr) { chunk.docids = term_docids; } else if (!term_docids.empty()) { - SNII_RETURN_IF_ERROR(intersect_window_candidates_with_ordinals( - *candidates, term_docids, term_docids.front(), term_docids.back(), out, - &chunk)); + const auto begin = std::ranges::lower_bound(*candidates, term_docids.front()); + const auto end = std::upper_bound(begin, candidates->end(), term_docids.back()); + SNII_RETURN_IF_ERROR(intersect_window_candidate_range_with_ordinals( + begin, end, term_docids, out, &chunk)); } if (candidates == nullptr || !chunk.docids.empty()) source->chunks.push_back(std::move(chunk)); diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index 
1fef749d78a5dd..e5377e1071d527 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -459,6 +459,21 @@ class PostingCursor { return Status::OK(); } + Status next(uint32_t* docid, std::pair* out) { + while (ci_ < src_->chunks.size() && + (src_->chunks[ci_].docids.empty() || li_ >= src_->chunks[ci_].docids.size())) { + ++ci_; + li_ = 0; + } + if (ci_ >= src_->chunks.size()) { + return Status::Corruption("phrase_query: cursor exhausted before next docid"); + } + *docid = src_->chunks[ci_].docids[li_]; + SNII_RETURN_IF_ERROR(positions(out)); + ++li_; + return Status::OK(); + } + private: static constexpr size_t kNoChunk = static_cast(-1); @@ -483,20 +498,6 @@ class PostingCursor { std::vector poff_; // current chunk's per-doc offsets (reused) }; -size_t AnchorPhrasePosition(const std::vector& plans, - const std::vector& phrase_plan_index) { - size_t anchor = 0; - uint32_t best_df = std::numeric_limits::max(); - for (size_t phrase_pos = 0; phrase_pos < phrase_plan_index.size(); ++phrase_pos) { - const TermPlan& plan = plans[phrase_plan_index[phrase_pos]]; - if (plan.df < best_df) { - best_df = plan.df; - anchor = phrase_pos; - } - } - return anchor; -} - bool ContainsTwoTermPhrase(std::pair left_span, std::pair right_span, uint32_t right_delta) { @@ -522,45 +523,178 @@ bool ContainsTwoTermPhrase(std::pair left_span return false; } +size_t SelectPhraseVerificationPair(const std::vector& plans, + const std::vector& phrase_plan_index) { + size_t best_left = 0; + uint64_t best_score = std::numeric_limits::max(); + for (size_t left = 0; left + 1 < phrase_plan_index.size(); ++left) { + const uint64_t score = static_cast(plans[phrase_plan_index[left]].df) + + plans[phrase_plan_index[left + 1]].df; + if (score < best_score) { + best_score = score; + best_left = left; + } + } + return best_left; +} + +void CollectTwoTermPhraseStarts(std::pair left_span, + std::pair right_span, + uint32_t 
right_delta, uint32_t left_offset, + std::vector* starts) { + starts->clear(); + const uint32_t* left = left_span.first; + const uint32_t* right = right_span.first; + const uint32_t max_left = std::numeric_limits::max() - right_delta; + while (left != left_span.second && right != right_span.second) { + if (*left > max_left) { + return; + } + const uint32_t want = *left + right_delta; + while (right != right_span.second && *right < want) { + ++right; + } + if (right == right_span.second) { + return; + } + if (*right == want && *left >= left_offset) { + starts->push_back(*left - left_offset); + } + ++left; + } +} + +Status EmitTwoTermPhraseStreaming(const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, + const std::vector& candidates, + std::vector* docids) { + const size_t left_plan = phrase_plan_index[0]; + const size_t right_plan = phrase_plan_index[1]; + const uint32_t right_delta = position_offsets[1] - position_offsets[0]; + + if (left_plan == right_plan) { + PostingCursor cursor; + cursor.init(&srcs[left_plan]); + for (uint32_t expected_docid : candidates) { + uint32_t docid = 0; + std::pair span; + SNII_RETURN_IF_ERROR(cursor.next(&docid, &span)); + if (docid != expected_docid) { + return Status::Corruption("phrase_query: repeated-term cursor/docid mismatch"); + } + if (ContainsTwoTermPhrase(span, span, right_delta)) { + docids->push_back(docid); + } + } + return Status::OK(); + } + + PostingCursor left_cursor; + PostingCursor right_cursor; + left_cursor.init(&srcs[left_plan]); + right_cursor.init(&srcs[right_plan]); + for (uint32_t expected_docid : candidates) { + uint32_t left_docid = 0; + uint32_t right_docid = 0; + std::pair left_span; + std::pair right_span; + SNII_RETURN_IF_ERROR(left_cursor.next(&left_docid, &left_span)); + SNII_RETURN_IF_ERROR(right_cursor.next(&right_docid, &right_span)); + if (left_docid != expected_docid || right_docid != expected_docid) { + return Status::Corruption("phrase_query: 
two-term cursor/docid mismatch"); + } + if (ContainsTwoTermPhrase(left_span, right_span, right_delta)) { + docids->push_back(expected_docid); + } + } + return Status::OK(); +} + // Single streaming pass over the candidates: for each (ascending) candidate, -// advance every term's cursor to it, gather each term's positions IN PHRASE -// ORDER, and test the consecutive-phrase predicate (term[0]@p, term[1]@p+1, -// ...) with term-level short-circuit. Cursors decode each chunk's -// docids/positions exactly once and address positions by local index -- no -// per-candidate docid binary search, no full-candidate position -// materialization. Candidates are ascending so the emitted docids are already -// sorted. +// gather positions lazily, and test the consecutive-phrase predicate +// (term[0]@p, term[1]@p+1, ...). Multi-term phrases first test the cheapest +// adjacent pair by df before decoding the remaining terms for that document. +// Cursors decode each retained chunk at most once and address positions by +// local index -- no per-candidate docid binary search, no full-candidate +// position materialization. Candidates are ascending so the emitted docids are +// already sorted. Status EmitPhraseStreaming(const std::vector& plans, const std::vector& phrase_plan_index, const std::vector& position_offsets, std::vector& srcs, const std::vector& candidates, std::vector* docids) { + const size_t phrase_len = phrase_plan_index.size(); + if (phrase_len == 2) { + return EmitTwoTermPhraseStreaming(phrase_plan_index, position_offsets, srcs, candidates, + docids); + } + std::vector cur(plans.size()); for (size_t i = 0; i < plans.size(); ++i) cur[i].init(&srcs[i]); - const size_t phrase_len = phrase_plan_index.size(); + std::vector> plan_span(plans.size()); + std::vector loaded_epoch(plans.size(), 0); + const size_t pair_left = + phrase_len > 2 ? 
SelectPhraseVerificationPair(plans, phrase_plan_index) : 0; + const size_t pair_right = pair_left + 1; + std::vector starts; std::vector> span(phrase_len); - const size_t anchor = AnchorPhrasePosition(plans, phrase_plan_index); - const uint32_t anchor_offset = position_offsets[anchor]; + uint32_t epoch = 1; for (uint32_t d : candidates) { - for (size_t i = 0; i < cur.size(); ++i) SNII_RETURN_IF_ERROR(cur[i].seek(d)); - for (size_t pp = 0; pp < phrase_len; ++pp) { - SNII_RETURN_IF_ERROR(cur[phrase_plan_index[pp]].positions(&span[pp])); - } - if (phrase_len == 2) { - if (ContainsTwoTermPhrase(span[0], span[1], - position_offsets[1] - position_offsets[0])) { + if (++epoch == 0) { + std::ranges::fill(loaded_epoch, 0); + epoch = 1; + } + auto positions_for_phrase_pos = + [&](size_t phrase_pos, std::pair* out) -> Status { + const size_t plan_index = phrase_plan_index[phrase_pos]; + if (loaded_epoch[plan_index] != epoch) { + SNII_RETURN_IF_ERROR(cur[plan_index].seek(d)); + SNII_RETURN_IF_ERROR(cur[plan_index].positions(&plan_span[plan_index])); + loaded_epoch[plan_index] = epoch; + } + *out = plan_span[plan_index]; + return Status::OK(); + }; + + if (phrase_len == 1) { + std::pair single_span; + SNII_RETURN_IF_ERROR(positions_for_phrase_pos(0, &single_span)); + if (single_span.first != single_span.second) { docids->push_back(d); } continue; } + + std::pair left_span; + std::pair right_span; + SNII_RETURN_IF_ERROR(positions_for_phrase_pos(pair_left, &left_span)); + SNII_RETURN_IF_ERROR(positions_for_phrase_pos(pair_right, &right_span)); + + CollectTwoTermPhraseStarts(left_span, right_span, + position_offsets[pair_right] - position_offsets[pair_left], + position_offsets[pair_left], &starts); + if (starts.empty()) { + continue; + } + + span[pair_left] = left_span; + span[pair_right] = right_span; + for (size_t pp = 0; pp < phrase_len; ++pp) { + if (pp == pair_left || pp == pair_right) { + continue; + } + SNII_RETURN_IF_ERROR(positions_for_phrase_pos(pp, &span[pp])); + } + 
bool match = false; - for (const uint32_t* p = span[anchor].first; p != span[anchor].second; ++p) { - if (*p < anchor_offset) continue; - const uint32_t start = *p - anchor_offset; + for (uint32_t start : starts) { bool ok = true; for (size_t t = 0; t < phrase_len; ++t) { - if (t == anchor) continue; + if (t == pair_left || t == pair_right) { + continue; + } uint32_t want = 0; if (!internal::add_position_offset(start, position_offsets[t], &want)) { ok = false; diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp index 2a67c74ae02520..7d75ff6b49ba1e 100644 --- a/be/test/storage/index/snii_query_test.cpp +++ b/be/test/storage/index/snii_query_test.cpp @@ -27,6 +27,7 @@ #include "snii/common/slice.h" #include "snii/encoding/byte_sink.h" #include "snii/encoding/byte_source.h" +#include "snii/encoding/pfor.h" #include "snii/format/format_constants.h" #include "snii/format/prx_pod.h" #include "snii/io/file_reader.h" @@ -136,6 +137,11 @@ Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, auto failed_docs = docs_with_one_position(0, kDocCount, 0); auto order_docs = docs_with_one_position(0, kDocCount, 2); auto ordinal_docs = docs_with_one_position(0, kDocCount, 2); + std::vector repeat_docs; + repeat_docs.reserve(kDocCount); + for (uint32_t docid = 0; docid < kDocCount; ++docid) { + repeat_docs.push_back({docid, {0, 1, 2}}); + } failed_docs[8000].positions = {0, 4}; for (PostingDoc& doc : order_docs) { if (doc.docid == 5000 || doc.docid == 7000) { @@ -157,7 +163,8 @@ Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, input.doc_count = kDocCount; input.terms = {make_term("failed", std::move(failed_docs)), make_term("order", std::move(order_docs)), - make_term("ordinal", std::move(ordinal_docs))}; + make_term("ordinal", std::move(ordinal_docs)), + make_term("repeat", std::move(repeat_docs))}; writer::SniiCompoundWriter writer(file); 
SNII_RETURN_IF_ERROR(writer.add_logical_index(input)); @@ -207,6 +214,33 @@ TEST(SniiPhraseQueryTest, SingleTailPhrasePrefixUsesStreamingPhrasePath) { EXPECT_EQ(docids, expected); } +TEST(SniiPhraseQueryTest, MultiTermPhraseUsesPairPrefilter) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"failed", "order", "ordinal"}, &docids)); + + const std::vector expected {5000, 7000}; + EXPECT_EQ(docids, expected); +} + +TEST(SniiPhraseQueryTest, RepeatedTermPhraseUsesCachedPostingSpan) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"repeat", "repeat", "repeat"}, &docids)); + + std::vector expected(9000); + std::iota(expected.begin(), expected.end(), 0); + EXPECT_EQ(docids, expected); +} + TEST(SniiTermQueryTest, WindowedDenseTermEmitsRangesToSink) { MemoryFile file; reader::SniiSegmentReader segment_reader; @@ -266,5 +300,47 @@ TEST(SniiPrxPodTest, SelectivePforCsrMatchesFullCsrAcrossRuns) { assert_selected_matches_full({0, 1, 127, 128, 129, 255, 256, 319}); } +TEST(SniiPforTest, LowBitWidthFastPathsRoundTrip) { + auto assert_round_trip = [](const std::vector& values, uint8_t expected_width) { + ByteSink sink; + snii::pfor_encode(values.data(), values.size(), &sink); + ASSERT_FALSE(sink.buffer().empty()); + EXPECT_EQ(sink.buffer().front(), expected_width); + + std::vector decoded(values.size(), 0xFFFFFFFF); + ByteSource source(sink.view()); + assert_ok(snii::pfor_decode(&source, values.size(), decoded.data())); + EXPECT_TRUE(source.eof()); + EXPECT_EQ(decoded, values); + }; + + std::vector one_bit(128); + for (size_t i = 0; i < one_bit.size(); ++i) { + one_bit[i] = static_cast(i & 1); + } + 
assert_round_trip(one_bit, 1); + + one_bit[17] = 1000; + assert_round_trip(one_bit, 1); + + std::vector two_bit(128); + for (size_t i = 0; i < two_bit.size(); ++i) { + two_bit[i] = static_cast(i & 3); + } + assert_round_trip(two_bit, 2); + + std::vector four_bit(128); + for (size_t i = 0; i < four_bit.size(); ++i) { + four_bit[i] = static_cast(i & 15); + } + assert_round_trip(four_bit, 4); + + std::vector eight_bit(256); + for (size_t i = 0; i < eight_bit.size(); ++i) { + eight_bit[i] = static_cast(i); + } + assert_round_trip(eight_bit, 8); +} + } // namespace } // namespace snii::query From e0e1c9f165fdef6bcd4cd1090cbd21a07dda11d2 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 06:38:31 +0800 Subject: [PATCH 10/12] [improvement](be) Optimize SNII dense phrase conjunction ### What problem does this PR solve? Issue Number: N/A Related PR: N/A Problem Summary: SNII phrase queries on the 10B cloud benchmark still spent the largest CPU share in docid/ordinal intersection before PRX verification. The hot path had to intersect dense or near-dense candidate windows with term docids and produce PRX doc ordinals. This change adds a candidate-span fast path that directly accepts all term docids when candidates continuously cover the term span, adds dense/near-dense ordinal mapping for term spans with few missing docs, and fixes mixed dense/non-dense window output ordering by batching window work and flushing in original order. In cloud_sim PH5/PP5 cold benchmark, PH5 improved from 5794 ms wall and 55.43 s BE CPU to 5702 ms wall and 53.55 s BE CPU; PP5 kept similar wall time and reduced BE CPU from 56.26 s to 55.21 s. The inverted index read bytes, remote bytes, cache bytes, serial rounds, and IO counts stayed unchanged, so the improvement is from CPU-side algorithm work rather than reduced remote reads. A bulk dense append variant was tested and reverted because it regressed PH5/PP5. 
### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter='SniiPforTest.*:SniiPhraseQueryTest.*:SniiTermQueryTest.*:SniiPrxPodTest.*' - Manual test: ./build.sh --be -j 192 - Manual test: cloud_sim PH5/PP5 cold benchmark at /mnt/disk15/jiangkai/textbench/runs/20260628_phrase_final_cover_span_refactor_cold - Manual test: cloud_sim PH5 pprof at /mnt/disk15/jiangkai/textbench/runs/20260628_phrase_final_cover_span_refactor_pprof - Static Check: build-support/clang-format.sh, build-support/check-format.sh, git diff --check - Static Check: build-support/run-clang-tidy.sh --build-dir be/build_Release failed because the local ldb clang-tidy toolchain could not find stddef.h while parsing system headers - Behavior changed: No - Does this need documentation: No --- .../snii/core/src/query/docid_conjunction.cpp | 318 +++++++++++++----- be/test/storage/index/snii_query_test.cpp | 41 ++- 2 files changed, 272 insertions(+), 87 deletions(-) diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp index a8f946e9c4f7bf..1ebb6c86a7471a 100644 --- a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -147,6 +147,44 @@ void append_candidate_range(CandidateIt begin, CandidateIt end, std::vectorinsert(out->end(), begin, end); } +void clear_ordinals_if_all_term_docs_selected(const std::vector& term_docids, + DocidChunk* chunk) { + if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && + chunk->docids.front() == term_docids.front() && + chunk->docids.back() == term_docids.back()) { + chunk->prx_doc_ordinals.clear(); + } +} + +bool append_term_docs_if_candidates_cover_span(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + std::vector* out, DocidChunk* chunk) { + const uint32_t first = term_docids.front(); + const uint32_t last = 
term_docids.back(); + const uint64_t width = static_cast(last) - first + 1; + const size_t candidate_count = static_cast(end - begin); + if (width > candidate_count) { + return false; + } + + const auto span_begin = *begin == first ? begin : std::lower_bound(begin, end, first); + if (span_begin == end || *span_begin != first) { + return false; + } + if (static_cast(end - span_begin) < width) { + return false; + } + + const auto span_last = span_begin + static_cast(width) - 1; + if (*span_last != last) { + return false; + } + + out->insert(out->end(), term_docids.begin(), term_docids.end()); + chunk->docids.insert(chunk->docids.end(), term_docids.begin(), term_docids.end()); + return true; +} + Status append_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, uint32_t first, uint32_t last, std::vector* out, DocidChunk* chunk) { @@ -173,6 +211,81 @@ Status append_candidate_range_with_ordinals(CandidateIt begin, CandidateIt end, return Status::OK(); } +bool intersect_dense_term_span_with_ordinals(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + size_t candidate_count, std::vector* out, + DocidChunk* chunk) { + const uint32_t first = term_docids.front(); + const uint32_t last = term_docids.back(); + const uint64_t width = static_cast(last) - first + 1; + if (term_docids.size() > width) { + return false; + } + const uint64_t missing_count = width - term_docids.size(); + if (missing_count != 0 && + (missing_count * 8 > width || missing_count >= candidate_count || + missing_count > static_cast(std::numeric_limits::max()))) { + return false; + } + + if (missing_count == 0) { + for (auto it = begin; it != end; ++it) { + if (*it < first) { + continue; + } + if (*it > last) { + break; + } + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(*it - first); + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return true; + } + + std::vector missing; + 
missing.reserve(static_cast(missing_count)); + uint32_t expect = first; + for (uint32_t docid : term_docids) { + while (expect < docid) { + missing.push_back(expect); + ++expect; + } + if (docid < std::numeric_limits::max()) { + expect = docid + 1; + } + } + while (expect <= last) { + missing.push_back(expect); + if (expect == std::numeric_limits::max()) { + break; + } + ++expect; + } + + size_t miss = 0; + for (auto it = begin; it != end; ++it) { + if (*it < first) { + continue; + } + if (*it > last) { + break; + } + while (miss < missing.size() && missing[miss] < *it) { + ++miss; + } + if (miss < missing.size() && missing[miss] == *it) { + continue; + } + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back(static_cast(*it - first - miss)); + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return true; +} + size_t log2_ceil(size_t n) { if (n <= 1) return 1; --n; @@ -250,8 +363,16 @@ Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, Candida chunk->docids.insert(chunk->docids.end(), begin, end); return Status::OK(); } + if (append_term_docs_if_candidates_cover_span(begin, end, term_docids, out, chunk)) { + return Status::OK(); + } chunk->prx_doc_ordinals.reserve(max_matches); + if (intersect_dense_term_span_with_ordinals(begin, end, term_docids, candidate_count, out, + chunk)) { + return Status::OK(); + } + const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1; if (candidate_count < term_docids.size() / probes_per_candidate) { size_t doc_index = 0; @@ -266,11 +387,7 @@ Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, Candida chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); ++doc_index; } - if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && - chunk->docids.front() == term_docids.front() && - chunk->docids.back() == term_docids.back()) { - chunk->prx_doc_ordinals.clear(); - } + 
clear_ordinals_if_all_term_docs_selected(term_docids, chunk); return Status::OK(); } @@ -287,11 +404,7 @@ Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, Candida chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); ++candidate_it; } - if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && - chunk->docids.front() == term_docids.front() && - chunk->docids.back() == term_docids.back()) { - chunk->prx_doc_ordinals.clear(); - } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); return Status::OK(); } @@ -307,11 +420,7 @@ Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, Candida chunk->prx_doc_ordinals.push_back(static_cast(doc_index)); ++doc_index; } - if (chunk->docids.size() == term_docids.size() && !chunk->docids.empty() && - chunk->docids.front() == term_docids.front() && - chunk->docids.back() == term_docids.back()) { - chunk->prx_doc_ordinals.clear(); - } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); return Status::OK(); } @@ -355,20 +464,96 @@ Status decode_flat_docids_only(const snii::io::BatchRangeFetcher& round1, const return snii::format::decode_dd_region(dd, p.entry.dd_meta, /*win_base=*/0, docids); } +struct WindowWork { + uint32_t ordinal = 0; + WindowMeta meta; + CandidateRange candidates; + size_t handle = 0; + bool dense_full = false; +}; + +Status emit_dense_full_window_docids(const WindowWork& f, const std::vector* candidates, + std::vector& out, DocidSource* source) { + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); + if (source != nullptr) { + DocidChunk chunk; + chunk.windowed = true; + chunk.window = f.ordinal; + chunk.prx_doc_count = f.meta.doc_count; + if (candidates == nullptr) { + SNII_RETURN_IF_ERROR(append_docid_range(first, f.meta.last_docid, &chunk.docids)); + } else { + const auto begin = candidates->begin() + f.candidates.begin; + const auto end = candidates->begin() + f.candidates.end; 
+ SNII_RETURN_IF_ERROR(append_candidate_range_with_ordinals( + begin, end, first, f.meta.last_docid, &out, &chunk)); + } + source->chunks.push_back(std::move(chunk)); + } + if (candidates == nullptr) { + SNII_RETURN_IF_ERROR(append_docid_range(first, f.meta.last_docid, &out)); + } else if (source == nullptr) { + append_candidate_range(candidates->begin() + f.candidates.begin, + candidates->begin() + f.candidates.end, &out); + } + return Status::OK(); +} + +Status emit_decoded_window_docids(const WindowWork& f, const snii::io::BatchRangeFetcher& fetcher, + const std::vector* candidates, + std::vector& out, DocidSource* source, + std::vector& docs, std::vector& freqs, + std::vector>& positions) { + docs.clear(); + freqs.clear(); + positions.clear(); + SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( + f.meta, fetcher.get(f.handle), Slice(), Slice(), + /*want_positions=*/false, /*want_freq=*/false, &docs, &freqs, &positions)); + if (source != nullptr) { + DocidChunk chunk; + chunk.windowed = true; + chunk.window = f.ordinal; + if (candidates == nullptr) { + chunk.docids = docs; + if (docs.size() > std::numeric_limits::max()) { + return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); + } + chunk.prx_doc_count = static_cast(docs.size()); + source->chunks.push_back(std::move(chunk)); + } else { + const auto begin = candidates->begin() + f.candidates.begin; + const auto end = candidates->begin() + f.candidates.end; + SNII_RETURN_IF_ERROR( + intersect_window_candidate_range_with_ordinals(begin, end, docs, &out, &chunk)); + if (!chunk.docids.empty()) { + source->chunks.push_back(std::move(chunk)); + } + } + } + if (candidates == nullptr) { + out.insert(out.end(), docs.begin(), docs.end()); + return Status::OK(); + } + if (source != nullptr) { + return Status::OK(); + } + uint32_t first = 0; + SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); + intersect_window_candidate_range(candidates->begin() + f.candidates.begin, + 
candidates->begin() + f.candidates.end, docs, first, + f.meta.last_docid, &out); + return Status::OK(); +} + Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPlan& p, const std::vector& windows, const std::vector* candidates, std::vector* out, DocidSource* source) { - struct FetchedWindow { - uint32_t ordinal = 0; - WindowMeta meta; - CandidateRange candidates; - size_t handle = 0; - }; - snii::io::BatchRangeFetcher fetcher(idx.reader(), snii::reader::kSameTermCoalesceGap); - std::vector fetched; - fetched.reserve(windows.size()); + std::vector work; + work.reserve(windows.size()); out->reserve(candidates == nullptr ? p.entry.df : candidates->size()); size_t candidate_search_begin = 0; for (uint32_t w : windows) { @@ -387,27 +572,8 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const TermPla bool dense_full = false; SNII_RETURN_IF_ERROR(is_dense_full_window(meta, w, &dense_full)); if (dense_full) { - if (source != nullptr) { - DocidChunk chunk; - chunk.windowed = true; - chunk.window = w; - chunk.prx_doc_count = meta.doc_count; - if (candidates == nullptr) { - SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, &chunk.docids)); - } else { - const auto begin = candidates->begin() + candidate_range.begin; - const auto end = candidates->begin() + candidate_range.end; - SNII_RETURN_IF_ERROR(append_candidate_range_with_ordinals( - begin, end, first, meta.last_docid, out, &chunk)); - } - source->chunks.push_back(std::move(chunk)); - } - if (candidates == nullptr) { - SNII_RETURN_IF_ERROR(append_docid_range(first, meta.last_docid, out)); - } else if (source == nullptr) { - append_candidate_range(candidates->begin() + candidate_range.begin, - candidates->begin() + candidate_range.end, out); - } + work.push_back(WindowWork { + .ordinal = w, .meta = meta, .candidates = candidate_range, .dense_full = true}); continue; } @@ -415,54 +581,27 @@ Status collect_windowed_docids_only(const LogicalIndexReader& idx, const 
TermPla SNII_RETURN_IF_ERROR(snii::reader::windowed_window_range( idx, p.entry, p.frq_base, p.prx_base, p.prelude, w, /*want_positions=*/false, /*want_freq=*/false, &range)); - FetchedWindow f; + WindowWork f; f.ordinal = w; f.meta = meta; f.candidates = candidate_range; f.handle = fetcher.add(range.dd_off, range.dd_len); - fetched.push_back(f); + work.push_back(f); + } + if (fetcher.pending() > 0) { + SNII_RETURN_IF_ERROR(fetcher.fetch()); } - if (fetcher.pending() > 0) SNII_RETURN_IF_ERROR(fetcher.fetch()); std::vector docs; std::vector freqs; std::vector> positions; - for (const FetchedWindow& f : fetched) { - docs.clear(); - freqs.clear(); - positions.clear(); - SNII_RETURN_IF_ERROR(snii::reader::decode_window_slices( - f.meta, fetcher.get(f.handle), Slice(), Slice(), - /*want_positions=*/false, /*want_freq=*/false, &docs, &freqs, &positions)); - if (source != nullptr) { - DocidChunk chunk; - chunk.windowed = true; - chunk.window = f.ordinal; - if (candidates == nullptr) { - chunk.docids = docs; - if (docs.size() > std::numeric_limits::max()) { - return Status::Corruption("docid_conjunction: prx doc count exceeds u32"); - } - chunk.prx_doc_count = static_cast(docs.size()); - source->chunks.push_back(std::move(chunk)); - } else { - const auto begin = candidates->begin() + f.candidates.begin; - const auto end = candidates->begin() + f.candidates.end; - SNII_RETURN_IF_ERROR(intersect_window_candidate_range_with_ordinals( - begin, end, docs, out, &chunk)); - if (!chunk.docids.empty()) source->chunks.push_back(std::move(chunk)); - } - } - if (candidates == nullptr) { - out->insert(out->end(), docs.begin(), docs.end()); + for (const WindowWork& f : work) { + if (f.dense_full) { + SNII_RETURN_IF_ERROR(emit_dense_full_window_docids(f, candidates, *out, source)); continue; } - if (source != nullptr) continue; - uint32_t first = 0; - SNII_RETURN_IF_ERROR(first_docid_in_window(f.meta, f.ordinal, &first)); - intersect_window_candidate_range(candidates->begin() + 
f.candidates.begin, - candidates->begin() + f.candidates.end, docs, first, - f.meta.last_docid, out); + SNII_RETURN_IF_ERROR(emit_decoded_window_docids(f, fetcher, candidates, *out, source, docs, + freqs, positions)); } return Status::OK(); } @@ -500,14 +639,17 @@ Status collect_docids_only(const LogicalIndexReader& idx, const snii::io::BatchR SNII_RETURN_IF_ERROR(intersect_window_candidate_range_with_ordinals( begin, end, term_docids, out, &chunk)); } - if (candidates == nullptr || !chunk.docids.empty()) + if (candidates == nullptr || !chunk.docids.empty()) { source->chunks.push_back(std::move(chunk)); + } } if (candidates == nullptr) { *out = std::move(term_docids); return Status::OK(); } - if (source != nullptr) return Status::OK(); + if (source != nullptr) { + return Status::OK(); + } *out = intersect_sorted(*candidates, term_docids); return Status::OK(); } @@ -517,7 +659,9 @@ Status build_docid_only_conjunction_impl(const LogicalIndexReader& idx, const std::vector& plans, std::vector* candidates, std::vector* sources) { - if (sources != nullptr) sources->assign(plans.size(), DocidSource {}); + if (sources != nullptr) { + sources->assign(plans.size(), DocidSource {}); + } const std::vector order = ascending_df_order(plans); for (size_t k = 0; k < order.size(); ++k) { const size_t ti = order[k]; @@ -529,7 +673,9 @@ Status build_docid_only_conjunction_impl(const LogicalIndexReader& idx, source->docids_are_final_candidates = true; } *candidates = std::move(next); - if (candidates->empty()) return Status::OK(); + if (candidates->empty()) { + return Status::OK(); + } } return Status::OK(); } diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp index 7d75ff6b49ba1e..da085cdec66914 100644 --- a/be/test/storage/index/snii_query_test.cpp +++ b/be/test/storage/index/snii_query_test.cpp @@ -57,6 +57,7 @@ class MemoryFile final : public snii::io::FileReader, public snii::io::FileWrite uint64_t bytes_written() const override { 
return data_.size(); } + // NOLINTBEGIN(readability-non-const-parameter): FileReader interface writes into out. Status read_at(uint64_t offset, size_t len, std::vector* out) override { if (offset > data_.size() || len > data_.size() - offset) { return Status::Corruption("memory file read past eof"); @@ -67,6 +68,7 @@ class MemoryFile final : public snii::io::FileReader, public snii::io::FileWrite } return Status::OK(); } + // NOLINTEND(readability-non-const-parameter) uint64_t size() const override { return data_.size(); } bool finalized() const { return finalized_; } @@ -137,11 +139,14 @@ Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, auto failed_docs = docs_with_one_position(0, kDocCount, 0); auto order_docs = docs_with_one_position(0, kDocCount, 2); auto ordinal_docs = docs_with_one_position(0, kDocCount, 2); + auto driver_docs = docs_with_one_position(0, 8000, 0); + auto almost_docs = docs_with_one_position(0, kDocCount, 1); std::vector repeat_docs; repeat_docs.reserve(kDocCount); for (uint32_t docid = 0; docid < kDocCount; ++docid) { repeat_docs.push_back({docid, {0, 1, 2}}); } + almost_docs.erase(almost_docs.begin() + 4000); failed_docs[8000].positions = {0, 4}; for (PostingDoc& doc : order_docs) { if (doc.docid == 5000 || doc.docid == 7000) { @@ -161,7 +166,9 @@ Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, input.index_suffix = "Body"; input.config = format::IndexConfig::kDocsPositions; input.doc_count = kDocCount; - input.terms = {make_term("failed", std::move(failed_docs)), + input.terms = {make_term("almost", std::move(almost_docs)), + make_term("driver", std::move(driver_docs)), + make_term("failed", std::move(failed_docs)), make_term("order", std::move(order_docs)), make_term("ordinal", std::move(ordinal_docs)), make_term("repeat", std::move(repeat_docs))}; @@ -241,6 +248,38 @@ TEST(SniiPhraseQueryTest, RepeatedTermPhraseUsesCachedPostingSpan) { EXPECT_EQ(docids, expected); } 
+TEST(SniiPhraseQueryTest, DenseTermWithMissingDocKeepsCandidateOrdinals) { + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector driver_docids; + assert_ok(term_query(index_reader, "driver", &driver_docids)); + EXPECT_EQ(driver_docids.size(), 8000); + + std::vector almost_docids; + assert_ok(term_query(index_reader, "almost", &almost_docids)); + EXPECT_EQ(almost_docids.size(), 8999); + ASSERT_GT(almost_docids.size(), 6144); + EXPECT_EQ(almost_docids[3999], 3999); + EXPECT_EQ(almost_docids[4000], 4001); + EXPECT_EQ(almost_docids[6143], 6144); + EXPECT_EQ(almost_docids[6144], 6145); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"driver", "almost"}, &docids)); + + std::vector expected; + expected.reserve(7999); + for (uint32_t docid = 0; docid < 8000; ++docid) { + if (docid != 4000) { + expected.push_back(docid); + } + } + EXPECT_EQ(docids, expected); +} + TEST(SniiTermQueryTest, WindowedDenseTermEmitsRangesToSink) { MemoryFile file; reader::SniiSegmentReader segment_reader; From 508367b461fd4e1f1ae6c6399f9c4966b0f051b0 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 07:49:25 +0800 Subject: [PATCH 11/12] [improvement](be) Optimize SNII two-term phrase verification ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: SNII phrase verification still spent CPU in per-candidate cursor/status handling for exact two-term phrases. Profiles showed PostingCursor::next and positions handling in the hot path while IO metrics stayed unchanged. This change reuses a shared PRX chunk decoder and adds a two-term chunk merge path for non-repeated phrases so overlapping chunks decode once and docids are verified with a linear merge. Repeated-term phrases still use the existing streaming path, and multi-term streaming is split into smaller helpers. 
On 10B textbench cold cloud_sim runs, PH5 CPU dropped from 53.55s to 48.07s and PP5 CPU dropped from 55.21s to 49.95s with identical IO bytes. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter='SniiPhraseQueryTest.*:SniiTermQueryTest.*:SniiPrxPodTest.*:SniiPforTest.*' - Manual test: ./build.sh --be -j 192 - Manual test: cloud_sim BE redeploy and smoke MATCH_PHRASE / MATCH_PHRASE_PREFIX - Benchmark: textbench 10B cold PH5/PP5 comparison - Static check: build-support/clang-format.sh, build-support/check-format.sh, git diff --check; build-support/run-clang-tidy.sh --build-dir be/build_Release attempted but blocked by local clang-tidy system header resolution where stddef.h is not found. - Behavior changed: No - Does this need documentation: No --- .../snii/core/src/query/phrase_query.cpp | 419 ++++++++++++------ 1 file changed, 295 insertions(+), 124 deletions(-) diff --git a/be/src/storage/index/snii/core/src/query/phrase_query.cpp b/be/src/storage/index/snii/core/src/query/phrase_query.cpp index e5377e1071d527..72db2d628513e0 100644 --- a/be/src/storage/index/snii/core/src/query/phrase_query.cpp +++ b/be/src/storage/index/snii/core/src/query/phrase_query.cpp @@ -376,6 +376,85 @@ Status BuildPositionSourcesForCandidates( return Status::OK(); } +class PosChunkDecoder { +public: + void reset() { + chunk_ = nullptr; + offsets_by_prx_ordinal_ = false; + } + + Status decode(const PosChunk& chunk) { + chunk_ = &chunk; + ByteSource ps(chunk.prx); + offsets_by_prx_ordinal_ = false; + if (chunk.prx_doc_ordinals.empty()) { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); + } else if (should_decode_full_prx_window(chunk)) { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); + offsets_by_prx_ordinal_ = true; + } else { + SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr_selective( + &ps, chunk.prx_doc_ordinals, &pflat_, &poff_)); + } + if 
(offsets_by_prx_ordinal_) { + if (poff_.size() != static_cast(chunk.prx_doc_count) + 1) { + return Status::Corruption("phrase_query: full prx doc-count mismatch"); + } + } else if (poff_.size() != chunk.docids.size() + 1) { + return Status::Corruption("phrase_query: selected prx/doc-count mismatch"); + } + if (poff_.back() > pflat_.size()) { + return Status::Corruption("phrase_query: prx final offset out of range"); + } + return Status::OK(); + } + + Status positions(size_t doc_index, std::pair* out) const { + if (chunk_ == nullptr || doc_index >= chunk_->docids.size()) { + return Status::Corruption("phrase_query: decoded chunk doc index out of range"); + } + const size_t pos_index = + offsets_by_prx_ordinal_ ? chunk_->prx_doc_ordinals[doc_index] : doc_index; + if (pos_index + 1 >= poff_.size()) { + return Status::Corruption("phrase_query: prx ordinal offset out of range"); + } + const uint32_t begin = poff_[pos_index]; + const uint32_t end = poff_[pos_index + 1]; + if (begin == end) { + *out = {nullptr, nullptr}; + return Status::OK(); + } + if (end > pflat_.size()) { + return Status::Corruption("phrase_query: prx offset out of range"); + } + *out = {pflat_.data() + begin, pflat_.data() + end}; + return Status::OK(); + } + + inline __attribute__((always_inline)) std::pair + positions_unchecked(size_t doc_index) const { + const size_t pos_index = + offsets_by_prx_ordinal_ ? 
chunk_->prx_doc_ordinals[doc_index] : doc_index; + const uint32_t begin = poff_[pos_index]; + const uint32_t end = poff_[pos_index + 1]; + if (begin == end) { + return {nullptr, nullptr}; + } + return {pflat_.data() + begin, pflat_.data() + end}; + } + +private: + static bool should_decode_full_prx_window(const PosChunk& chunk) { + return chunk.prx_doc_count != 0 && + static_cast(chunk.prx_doc_ordinals.size()) * 2 >= chunk.prx_doc_count; + } + + const PosChunk* chunk_ = nullptr; + bool offsets_by_prx_ordinal_ = false; + std::vector pflat_; + std::vector poff_; +}; + // Streaming position cursor over one term's retained chunks. It advances ONLY // forward (callers seek ascending candidate docids), decodes each chunk's // docids once (reused from the conjunction phase) and each chunk's positions at @@ -392,7 +471,7 @@ class PostingCursor { ci_ = 0; li_ = 0; decoded_pos_chunk_ = kNoChunk; - offsets_by_prx_ordinal_ = false; + decoder_.reset(); } // Positions the cursor at `target` (guaranteed present: candidates are the @@ -421,42 +500,10 @@ class PostingCursor { return Status::Corruption("phrase_query: cursor positions out of range"); } if (decoded_pos_chunk_ != ci_) { - ByteSource ps(src_->chunks[ci_].prx); - const PosChunk& chunk = src_->chunks[ci_]; - offsets_by_prx_ordinal_ = false; - if (chunk.prx_doc_ordinals.empty()) { - SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); - } else if (should_decode_full_prx_window(chunk)) { - SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr(&ps, &pflat_, &poff_)); - offsets_by_prx_ordinal_ = true; - } else { - SNII_RETURN_IF_ERROR(snii::format::read_prx_window_csr_selective( - &ps, chunk.prx_doc_ordinals, &pflat_, &poff_)); - } - if (offsets_by_prx_ordinal_) { - if (poff_.size() != static_cast(chunk.prx_doc_count) + 1) { - return Status::Corruption("phrase_query: full prx doc-count mismatch"); - } - } else if (poff_.size() != chunk.docids.size() + 1) { - return Status::Corruption("phrase_query: 
selected prx/doc-count mismatch"); - } + SNII_RETURN_IF_ERROR(decoder_.decode(src_->chunks[ci_])); decoded_pos_chunk_ = ci_; } - const size_t pos_index = position_offset_index(); - if (pos_index + 1 >= poff_.size()) { - return Status::Corruption("phrase_query: prx ordinal offset out of range"); - } - const uint32_t begin = poff_[pos_index]; - const uint32_t end = poff_[pos_index + 1]; - if (begin == end) { - *out = {nullptr, nullptr}; - return Status::OK(); - } - if (end > pflat_.size()) { - return Status::Corruption("phrase_query: prx offset out of range"); - } - *out = {pflat_.data() + begin, pflat_.data() + end}; - return Status::OK(); + return decoder_.positions(li_, out); } Status next(uint32_t* docid, std::pair* out) { @@ -477,25 +524,49 @@ class PostingCursor { private: static constexpr size_t kNoChunk = static_cast(-1); - static bool should_decode_full_prx_window(const PosChunk& chunk) { - return chunk.prx_doc_count != 0 && - static_cast(chunk.prx_doc_ordinals.size()) * 2 >= chunk.prx_doc_count; + const PosSource* src_ = nullptr; + size_t ci_ = 0; // current chunk + size_t li_ = 0; // current local doc index within the chunk + size_t decoded_pos_chunk_ = kNoChunk; // which chunk decoder_ currently holds + PosChunkDecoder decoder_; +}; + +class PhrasePositionLoader { +public: + PhrasePositionLoader(size_t plan_count, std::vector& srcs) + : cursors_(plan_count), plan_spans_(plan_count), loaded_epoch_(plan_count, 0) { + for (size_t i = 0; i < plan_count; ++i) { + cursors_[i].init(&srcs[i]); + } } - size_t position_offset_index() const { - if (!offsets_by_prx_ordinal_) { - return li_; + void begin_doc(uint32_t docid) { + docid_ = docid; + ++epoch_; + if (epoch_ == 0) { + std::ranges::fill(loaded_epoch_, 0); + epoch_ = 1; } - return src_->chunks[ci_].prx_doc_ordinals[li_]; } - const PosSource* src_ = nullptr; - size_t ci_ = 0; // current chunk - size_t li_ = 0; // current local doc index within the chunk - size_t decoded_pos_chunk_ = kNoChunk; // which chunk 
pflat_/poff_ currently hold - bool offsets_by_prx_ordinal_ = false; - std::vector pflat_; // current chunk's flat positions (reused) - std::vector poff_; // current chunk's per-doc offsets (reused) + Status positions_for_phrase_pos(const std::vector& phrase_plan_index, size_t phrase_pos, + std::pair* out) { + const size_t plan_index = phrase_plan_index[phrase_pos]; + if (loaded_epoch_[plan_index] != epoch_) { + SNII_RETURN_IF_ERROR(cursors_[plan_index].seek(docid_)); + SNII_RETURN_IF_ERROR(cursors_[plan_index].positions(&plan_spans_[plan_index])); + loaded_epoch_[plan_index] = epoch_; + } + *out = plan_spans_[plan_index]; + return Status::OK(); + } + +private: + std::vector cursors_; + std::vector> plan_spans_; + std::vector loaded_epoch_; + uint32_t docid_ = 0; + uint32_t epoch_ = 0; }; bool ContainsTwoTermPhrase(std::pair left_span, @@ -541,8 +612,8 @@ size_t SelectPhraseVerificationPair(const std::vector& plans, void CollectTwoTermPhraseStarts(std::pair left_span, std::pair right_span, uint32_t right_delta, uint32_t left_offset, - std::vector* starts) { - starts->clear(); + std::vector& starts) { + starts.clear(); const uint32_t* left = left_span.first; const uint32_t* right = right_span.first; const uint32_t max_left = std::numeric_limits::max() - right_delta; @@ -558,7 +629,7 @@ void CollectTwoTermPhraseStarts(std::pair left return; } if (*right == want && *left >= left_offset) { - starts->push_back(*left - left_offset); + starts.push_back(*left - left_offset); } ++left; } @@ -611,70 +682,158 @@ Status EmitTwoTermPhraseStreaming(const std::vector& phrase_plan_index, return Status::OK(); } -// Single streaming pass over the candidates: for each (ascending) candidate, -// gather positions lazily, and test the consecutive-phrase predicate -// (term[0]@p, term[1]@p+1, ...). Multi-term phrases first test the cheapest -// adjacent pair by df before decoding the remaining terms for that document. 
-// Cursors decode each retained chunk at most once and address positions by -// local index -- no per-candidate docid binary search, no full-candidate -// position materialization. Candidates are ascending so the emitted docids are -// already sorted. -Status EmitPhraseStreaming(const std::vector& plans, - const std::vector& phrase_plan_index, - const std::vector& position_offsets, - std::vector& srcs, const std::vector& candidates, - std::vector* docids) { - const size_t phrase_len = phrase_plan_index.size(); - if (phrase_len == 2) { - return EmitTwoTermPhraseStreaming(phrase_plan_index, position_offsets, srcs, candidates, - docids); +void EmitTwoTermPhraseChunkPair(const PosChunk& left, const PosChunk& right, + const PosChunkDecoder& left_decoder, + const PosChunkDecoder& right_decoder, uint32_t right_delta, + std::vector& docids) { + size_t li = static_cast( + std::lower_bound(left.docids.begin(), left.docids.end(), right.docids.front()) - + left.docids.begin()); + size_t ri = static_cast( + std::lower_bound(right.docids.begin(), right.docids.end(), left.docids.front()) - + right.docids.begin()); + while (li < left.docids.size() && ri < right.docids.size()) { + const uint32_t left_docid = left.docids[li]; + const uint32_t right_docid = right.docids[ri]; + if (left_docid < right_docid) { + ++li; + continue; + } + if (right_docid < left_docid) { + ++ri; + continue; + } + + const std::pair left_span = + left_decoder.positions_unchecked(li); + const std::pair right_span = + right_decoder.positions_unchecked(ri); + if (ContainsTwoTermPhrase(left_span, right_span, right_delta)) { + docids.push_back(left_docid); + } + ++li; + ++ri; } +} - std::vector cur(plans.size()); - for (size_t i = 0; i < plans.size(); ++i) cur[i].init(&srcs[i]); +Status EmitTwoTermPhraseChunkMerge(const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, + std::vector* const docids) { + const size_t left_plan = phrase_plan_index[0]; + const size_t 
right_plan = phrase_plan_index[1]; + const uint32_t right_delta = position_offsets[1] - position_offsets[0]; + const PosSource& left_src = srcs[left_plan]; + const PosSource& right_src = srcs[right_plan]; + + PosChunkDecoder left_decoder; + PosChunkDecoder right_decoder; + size_t decoded_left_chunk = static_cast(-1); + size_t decoded_right_chunk = static_cast(-1); + size_t left_chunk = 0; + size_t right_chunk = 0; + while (left_chunk < left_src.chunks.size() && right_chunk < right_src.chunks.size()) { + const PosChunk& left = left_src.chunks[left_chunk]; + const PosChunk& right = right_src.chunks[right_chunk]; + if (left.docids.empty()) { + ++left_chunk; + continue; + } + if (right.docids.empty()) { + ++right_chunk; + continue; + } + if (left.docids.back() < right.docids.front()) { + ++left_chunk; + continue; + } + if (right.docids.back() < left.docids.front()) { + ++right_chunk; + continue; + } - std::vector> plan_span(plans.size()); - std::vector loaded_epoch(plans.size(), 0); - const size_t pair_left = - phrase_len > 2 ? 
SelectPhraseVerificationPair(plans, phrase_plan_index) : 0; - const size_t pair_right = pair_left + 1; - std::vector starts; - std::vector> span(phrase_len); - uint32_t epoch = 1; - for (uint32_t d : candidates) { - if (++epoch == 0) { - std::ranges::fill(loaded_epoch, 0); - epoch = 1; - } - auto positions_for_phrase_pos = - [&](size_t phrase_pos, std::pair* out) -> Status { - const size_t plan_index = phrase_plan_index[phrase_pos]; - if (loaded_epoch[plan_index] != epoch) { - SNII_RETURN_IF_ERROR(cur[plan_index].seek(d)); - SNII_RETURN_IF_ERROR(cur[plan_index].positions(&plan_span[plan_index])); - loaded_epoch[plan_index] = epoch; - } - *out = plan_span[plan_index]; - return Status::OK(); - }; + if (decoded_left_chunk != left_chunk) { + SNII_RETURN_IF_ERROR(left_decoder.decode(left)); + decoded_left_chunk = left_chunk; + } + if (decoded_right_chunk != right_chunk) { + SNII_RETURN_IF_ERROR(right_decoder.decode(right)); + decoded_right_chunk = right_chunk; + } - if (phrase_len == 1) { - std::pair single_span; - SNII_RETURN_IF_ERROR(positions_for_phrase_pos(0, &single_span)); - if (single_span.first != single_span.second) { - docids->push_back(d); - } + EmitTwoTermPhraseChunkPair(left, right, left_decoder, right_decoder, right_delta, *docids); + + const uint32_t left_last = left.docids.back(); + const uint32_t right_last = right.docids.back(); + if (left_last <= right_last) { + ++left_chunk; + } + if (right_last <= left_last) { + ++right_chunk; + } + } + return Status::OK(); +} + +bool PhraseStartMatchesAllTerms( + uint32_t start, size_t phrase_len, size_t pair_left, size_t pair_right, + const std::vector& position_offsets, + const std::vector>& span) { + for (size_t t = 0; t < phrase_len; ++t) { + if (t == pair_left || t == pair_right) { continue; } + uint32_t want = 0; + if (!internal::add_position_offset(start, position_offsets[t], &want)) { + return false; + } + if (!std::binary_search(span[t].first, span[t].second, want)) { + return false; + } + } + return true; 
+} + +Status EmitSingleTermPhraseStreaming(const std::vector& phrase_plan_index, + std::vector& srcs, + const std::vector& candidates, + std::vector* docids) { + PhrasePositionLoader loader(srcs.size(), srcs); + for (uint32_t d : candidates) { + loader.begin_doc(d); + std::pair single_span; + SNII_RETURN_IF_ERROR(loader.positions_for_phrase_pos(phrase_plan_index, 0, &single_span)); + if (single_span.first != single_span.second) { + docids->push_back(d); + } + } + return Status::OK(); +} +Status EmitMultiTermPhraseStreaming(const std::vector& plans, + const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, + const std::vector& candidates, + std::vector* docids) { + const size_t phrase_len = phrase_plan_index.size(); + PhrasePositionLoader loader(plans.size(), srcs); + std::vector> span(phrase_len); + std::vector starts; + const size_t pair_left = SelectPhraseVerificationPair(plans, phrase_plan_index); + const size_t pair_right = pair_left + 1; + for (uint32_t d : candidates) { + loader.begin_doc(d); std::pair left_span; std::pair right_span; - SNII_RETURN_IF_ERROR(positions_for_phrase_pos(pair_left, &left_span)); - SNII_RETURN_IF_ERROR(positions_for_phrase_pos(pair_right, &right_span)); + SNII_RETURN_IF_ERROR( + loader.positions_for_phrase_pos(phrase_plan_index, pair_left, &left_span)); + SNII_RETURN_IF_ERROR( + loader.positions_for_phrase_pos(phrase_plan_index, pair_right, &right_span)); CollectTwoTermPhraseStarts(left_span, right_span, position_offsets[pair_right] - position_offsets[pair_left], - position_offsets[pair_left], &starts); + position_offsets[pair_left], starts); if (starts.empty()) { continue; } @@ -685,36 +844,48 @@ Status EmitPhraseStreaming(const std::vector& plans, if (pp == pair_left || pp == pair_right) { continue; } - SNII_RETURN_IF_ERROR(positions_for_phrase_pos(pp, &span[pp])); + SNII_RETURN_IF_ERROR(loader.positions_for_phrase_pos(phrase_plan_index, pp, &span[pp])); } - bool match = false; for 
(uint32_t start : starts) { - bool ok = true; - for (size_t t = 0; t < phrase_len; ++t) { - if (t == pair_left || t == pair_right) { - continue; - } - uint32_t want = 0; - if (!internal::add_position_offset(start, position_offsets[t], &want)) { - ok = false; - break; - } - if (!std::binary_search(span[t].first, span[t].second, want)) { - ok = false; - break; - } - } - if (ok) { - match = true; + if (PhraseStartMatchesAllTerms(start, phrase_len, pair_left, pair_right, + position_offsets, span)) { + docids->push_back(d); break; } } - if (match) docids->push_back(d); } return Status::OK(); } +// Dispatches by phrase length. Single-term, repeated-term and 3+-term phrases stream +// the (ascending) candidates: gather positions lazily and test the consecutive-phrase +// predicate (term[0]@p, term[1]@p+1, ...). Multi-term phrases first test the cheapest +// adjacent pair by df before decoding the remaining terms for that document. Cursors +// decode each retained chunk at most once and address positions by local index -- no +// per-candidate docid binary search, no full-candidate position materialization. A +// phrase of two distinct terms instead merges their retained chunks linearly and never +// reads `candidates`. Candidates are ascending, so streamed docids are already sorted.
+Status EmitPhraseStreaming(const std::vector& plans, + const std::vector& phrase_plan_index, + const std::vector& position_offsets, + std::vector& srcs, const std::vector& candidates, + std::vector* docids) { + const size_t phrase_len = phrase_plan_index.size(); + if (phrase_len == 1) { + return EmitSingleTermPhraseStreaming(phrase_plan_index, srcs, candidates, docids); + } + if (phrase_len == 2) { + if (phrase_plan_index[0] != phrase_plan_index[1]) { + return EmitTwoTermPhraseChunkMerge(phrase_plan_index, position_offsets, srcs, docids); + } + return EmitTwoTermPhraseStreaming(phrase_plan_index, position_offsets, srcs, candidates, + docids); + } + return EmitMultiTermPhraseStreaming(plans, phrase_plan_index, position_offsets, srcs, + candidates, docids); +} + Status BuildPhraseExecutionState(const LogicalIndexReader& idx, snii::io::BatchRangeFetcher* round1, std::vector* plans, PhraseExecutionState* state) { if (round1->pending() > 0) SNII_RETURN_IF_ERROR(round1->fetch()); From d20ca92752b3b42193d1006d4473ed4932e93ba5 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 28 Jun 2026 09:06:32 +0800 Subject: [PATCH 12/12] [improvement](be) Optimize SNII phrase CPU hotspots ### What problem does this PR solve? Issue Number: N/A Related PR: #64909 Problem Summary: SNII phrase queries over high-df terms were CPU-bound in PFOR unpacking and docid conjunction ordinal mapping. PH5/PP5 profiling on the 10B cloud_sim dataset showed pfor_decode and intersect_window_candidate_range_with_ordinals as top self CPU consumers while remote bytes and serial read rounds stayed fixed. This change adds low-bit PFOR unpack fast paths for common widths 3/5/6/7 and a bounded-span bitset/rank intersection path that preserves PRX doc ordinals for 16K-doc windows. The optimized path keeps the on-disk format unchanged and reduces CPU in the cold cloud_sim phrase benchmark: PH5 BE CPU 48.07s -> 41.68s, PP5 BE CPU 49.95s -> 43.57s. 
### Release note None ### Check List (For Author) - Test: Unit Test / Manual test - build-support/clang-format.sh be/src/storage/index/snii/core/src/encoding/pfor.cpp be/src/storage/index/snii/core/src/query/docid_conjunction.cpp be/test/storage/index/snii_query_test.cpp - build-support/check-format.sh be/src/storage/index/snii/core/src/encoding/pfor.cpp be/src/storage/index/snii/core/src/query/docid_conjunction.cpp be/test/storage/index/snii_query_test.cpp - git diff --check - build-support/run-clang-tidy.sh --build-dir be/build_Release - ./run-be-ut.sh --run --filter='SniiPhraseQueryTest.*:SniiTermQueryTest.*:SniiPrxPodTest.*:SniiPforTest.*' - ./build.sh --be -j 192 - cloud_sim deploy/start BE and PH5/PP5 cold phrase benchmark under /mnt/disk15/jiangkai/textbench/runs/20260628_phrase_cpu_opt_final_verified - Behavior changed: No - Does this need documentation: No --- .../index/snii/core/src/encoding/pfor.cpp | 305 ++++++++++++------ .../snii/core/src/query/docid_conjunction.cpp | 56 ++++ be/test/storage/index/snii_query_test.cpp | 56 +++- 3 files changed, 322 insertions(+), 95 deletions(-) diff --git a/be/src/storage/index/snii/core/src/encoding/pfor.cpp b/be/src/storage/index/snii/core/src/encoding/pfor.cpp index 98862d7f5ee8a9..5cdf8fdb57f9d6 100644 --- a/be/src/storage/index/snii/core/src/encoding/pfor.cpp +++ b/be/src/storage/index/snii/core/src/encoding/pfor.cpp @@ -35,28 +35,35 @@ uint8_t bits_for(uint32_t v) { // Exception cost estimated at ~6 bytes each. 
uint8_t choose_width(const uint32_t* v, size_t n) { uint8_t maxw = 0; - for (size_t i = 0; i < n; ++i) maxw = std::max(maxw, bits_for(v[i])); + for (size_t i = 0; i < n; ++i) { + maxw = std::max(maxw, bits_for(v[i])); + } uint8_t best = maxw; size_t best_cost = SIZE_MAX; - for (int w = 0; w <= maxw; ++w) { + for (uint8_t w = 0; w <= maxw; ++w) { size_t exc = 0; - for (size_t i = 0; i < n; ++i) - if (bits_for(v[i]) > w) ++exc; + for (size_t i = 0; i < n; ++i) { + if (bits_for(v[i]) > w) { + ++exc; + } + } size_t cost = (static_cast(w) * n + 7) / 8 + exc * 6; if (cost < best_cost) { best_cost = cost; - best = static_cast(w); + best = w; } } return best; } uint32_t low_mask(uint8_t w) { - return (w >= 32) ? 0xFFFFFFFFu : ((1u << w) - 1u); + return (w >= 32) ? 0xFFFFFFFFU : ((1U << w) - 1U); } void bitpack(const uint32_t* v, size_t n, uint8_t w, ByteSink* out) { - if (w == 0) return; + if (w == 0) { + return; + } uint64_t acc = 0; int filled = 0; for (size_t i = 0; i < n; ++i) { @@ -68,112 +75,218 @@ void bitpack(const uint32_t* v, size_t n, uint8_t w, ByteSink* out) { filled -= 8; } } - if (filled > 0) out->put_u8(static_cast(acc)); + if (filled > 0) { + out->put_u8(static_cast(acc)); + } } -Status bitunpack(ByteSource* src, size_t n, uint8_t w, uint32_t* out) { - if (w == 0) { - std::memset(out, 0, n * sizeof(uint32_t)); - return Status::OK(); +void bitunpack_tail(const uint8_t* base, size_t packed, size_t n, uint8_t w, size_t i, + uint64_t mask, uint32_t* out) { + for (; i < n; ++i) { + const size_t bit_off = static_cast(w) * i; + const size_t byte_off = bit_off >> 3; + uint64_t word = 0; + for (size_t b = byte_off; b < packed && b < byte_off + 8; ++b) { + word |= static_cast(base[b]) << ((b - byte_off) * 8); + } + out[i] = static_cast((word >> (bit_off & 7)) & mask); } - // Pull the whole packed run in ONE bounds-checked slice (#3: was one get_u8 - // per byte -- a Status-returning call + bounds check each), then unpack - // straight from the contiguous buffer. 
Each value's w<=32 bits start at bit - // offset i*w and span at most ceil((7+32)/8)=5 bytes, so a single unaligned - // 64-bit load at byte (i*w)/8 always covers it: one load + shift + mask per - // value, branchless, no per-byte accumulator loop (#2). Measured fewest - // instructions and fewest cycles of the alternatives -- the dependency-free - // per-value form lets the core overlap the loads (the unaligned word reads - // all hit L1, the packed run being only KiB). - const size_t packed = (static_cast(w) * n + 7) / 8; - Slice buf; - SNII_RETURN_IF_ERROR(src->get_bytes(packed, &buf)); - const uint8_t* base = buf.data(); +} - if (w == 1) { - size_t i = 0; - size_t byte = 0; - for (; i + 8 <= n; i += 8, ++byte) { - const uint8_t v = base[byte]; - out[i] = v & 1U; - out[i + 1] = (v >> 1) & 1U; - out[i + 2] = (v >> 2) & 1U; - out[i + 3] = (v >> 3) & 1U; - out[i + 4] = (v >> 4) & 1U; - out[i + 5] = (v >> 5) & 1U; - out[i + 6] = (v >> 6) & 1U; - out[i + 7] = (v >> 7) & 1U; - } - if (i < n) { - const uint8_t v = base[byte]; - for (uint8_t bit = 0; i < n; ++i, ++bit) { - out[i] = (v >> bit) & 1U; - } - } - return Status::OK(); +void bitunpack_w1(const uint8_t* base, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 1U; + out[i + 1] = (v >> 1) & 1U; + out[i + 2] = (v >> 2) & 1U; + out[i + 3] = (v >> 3) & 1U; + out[i + 4] = (v >> 4) & 1U; + out[i + 5] = (v >> 5) & 1U; + out[i + 6] = (v >> 6) & 1U; + out[i + 7] = (v >> 7) & 1U; } - if (w == 2) { - size_t i = 0; - size_t byte = 0; - for (; i + 4 <= n; i += 4, ++byte) { - const uint8_t v = base[byte]; - out[i] = v & 3U; - out[i + 1] = (v >> 2) & 3U; - out[i + 2] = (v >> 4) & 3U; - out[i + 3] = (v >> 6) & 3U; - } - if (i < n) { - const uint8_t v = base[byte]; - for (uint8_t shift = 0; i < n; ++i, shift += 2) { - out[i] = (v >> shift) & 3U; - } + if (i < n) { + const uint8_t v = base[byte]; + for (uint8_t bit = 0; i < n; ++i, 
++bit) { + out[i] = (v >> bit) & 1U; } - return Status::OK(); } - if (w == 4) { - size_t i = 0; - size_t byte = 0; - for (; i + 2 <= n; i += 2, ++byte) { - const uint8_t v = base[byte]; - out[i] = v & 15U; - out[i + 1] = (v >> 4) & 15U; - } - if (i < n) { - out[i] = base[byte] & 15U; - } - return Status::OK(); +} + +void bitunpack_w2(const uint8_t* base, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 4 <= n; i += 4, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 3U; + out[i + 1] = (v >> 2) & 3U; + out[i + 2] = (v >> 4) & 3U; + out[i + 3] = (v >> 6) & 3U; } - if (w == 8) { - for (size_t i = 0; i < n; ++i) { - out[i] = base[i]; + if (i < n) { + const uint8_t v = base[byte]; + for (uint8_t shift = 0; i < n; ++i, shift += 2) { + out[i] = (v >> shift) & 3U; } - return Status::OK(); } +} - const uint64_t mask = low_mask(w); +void bitunpack_w3(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, byte += 3) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + out[i] = b0 & 7U; + out[i + 1] = (b0 >> 3) & 7U; + out[i + 2] = ((b0 >> 6) | (b1 << 2)) & 7U; + out[i + 3] = (b1 >> 1) & 7U; + out[i + 4] = (b1 >> 4) & 7U; + out[i + 5] = ((b1 >> 7) | (b2 << 1)) & 7U; + out[i + 6] = (b2 >> 2) & 7U; + out[i + 7] = (b2 >> 5) & 7U; + } + bitunpack_tail(base, packed, n, 3, i, 7U, out); +} - // Fast path: values whose 8-byte load window stays inside the buffer - // (byte_off + 8 - // <= packed). The final few are finished by the tail loop, which zero-pads - // past end. 
+void bitunpack_w4(const uint8_t* base, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 2 <= n; i += 2, ++byte) { + const uint8_t v = base[byte]; + out[i] = v & 15U; + out[i + 1] = (v >> 4) & 15U; + } + if (i < n) { + out[i] = base[byte] & 15U; + } +} + +void bitunpack_w5(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, byte += 5) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + const uint32_t b3 = base[byte + 3]; + const uint32_t b4 = base[byte + 4]; + out[i] = b0 & 31U; + out[i + 1] = ((b0 >> 5) | (b1 << 3)) & 31U; + out[i + 2] = (b1 >> 2) & 31U; + out[i + 3] = ((b1 >> 7) | (b2 << 1)) & 31U; + out[i + 4] = ((b2 >> 4) | (b3 << 4)) & 31U; + out[i + 5] = (b3 >> 1) & 31U; + out[i + 6] = ((b3 >> 6) | (b4 << 2)) & 31U; + out[i + 7] = (b4 >> 3) & 31U; + } + bitunpack_tail(base, packed, n, 5, i, 31U, out); +} + +void bitunpack_w6(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 4 <= n; i += 4, byte += 3) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + out[i] = b0 & 63U; + out[i + 1] = ((b0 >> 6) | (b1 << 2)) & 63U; + out[i + 2] = ((b1 >> 4) | (b2 << 4)) & 63U; + out[i + 3] = (b2 >> 2) & 63U; + } + bitunpack_tail(base, packed, n, 6, i, 63U, out); +} + +void bitunpack_w7(const uint8_t* base, size_t packed, size_t n, uint32_t* out) { + size_t i = 0; + size_t byte = 0; + for (; i + 8 <= n; i += 8, byte += 7) { + const uint32_t b0 = base[byte]; + const uint32_t b1 = base[byte + 1]; + const uint32_t b2 = base[byte + 2]; + const uint32_t b3 = base[byte + 3]; + const uint32_t b4 = base[byte + 4]; + const uint32_t b5 = base[byte + 5]; + const uint32_t b6 = base[byte + 6]; + out[i] = b0 & 127U; + out[i + 1] = ((b0 >> 7) | (b1 << 1)) & 127U; + out[i + 2] = ((b1 >> 6) | (b2 << 2)) & 127U; + 
out[i + 3] = ((b2 >> 5) | (b3 << 3)) & 127U; + out[i + 4] = ((b3 >> 4) | (b4 << 4)) & 127U; + out[i + 5] = ((b4 >> 3) | (b5 << 5)) & 127U; + out[i + 6] = ((b5 >> 2) | (b6 << 6)) & 127U; + out[i + 7] = (b6 >> 1) & 127U; + } + bitunpack_tail(base, packed, n, 7, i, 127U, out); +} + +void bitunpack_w8(const uint8_t* base, size_t n, uint32_t* out) { + for (size_t i = 0; i < n; ++i) { + out[i] = base[i]; + } +} + +void bitunpack_generic(const uint8_t* base, size_t packed, size_t n, uint8_t w, uint32_t* out) { + const uint64_t mask = low_mask(w); size_t i = 0; if (packed >= 8) { const size_t last_safe_byte = packed - 8; for (; i < n; ++i) { const size_t bit_off = static_cast(w) * i; const size_t byte_off = bit_off >> 3; - if (byte_off > last_safe_byte) break; + if (byte_off > last_safe_byte) { + break; + } out[i] = static_cast((load_u64_le(base + byte_off) >> (bit_off & 7)) & mask); } } - for (; i < n; ++i) { - const size_t bit_off = static_cast(w) * i; - const size_t byte_off = bit_off >> 3; - uint64_t word = 0; - for (size_t b = byte_off; b < packed && b < byte_off + 8; ++b) { - word |= static_cast(base[b]) << ((b - byte_off) * 8); - } - out[i] = static_cast((word >> (bit_off & 7)) & mask); + bitunpack_tail(base, packed, n, w, i, mask, out); +} + +Status bitunpack(ByteSource* src, size_t n, uint8_t w, uint32_t* out) { + if (w == 0) { + std::memset(out, 0, n * sizeof(uint32_t)); + return Status::OK(); + } + // Pull the packed run once and unpack from the contiguous slice; this keeps + // the hot decode path free of per-byte ByteSource calls. 
+ const size_t packed = (static_cast(w) * n + 7) / 8; + Slice buf; + SNII_RETURN_IF_ERROR(src->get_bytes(packed, &buf)); + const uint8_t* base = buf.data(); + + switch (w) { + case 1: + bitunpack_w1(base, n, out); + break; + case 2: + bitunpack_w2(base, n, out); + break; + case 3: + bitunpack_w3(base, packed, n, out); + break; + case 4: + bitunpack_w4(base, n, out); + break; + case 5: + bitunpack_w5(base, packed, n, out); + break; + case 6: + bitunpack_w6(base, packed, n, out); + break; + case 7: + bitunpack_w7(base, packed, n, out); + break; + case 8: + bitunpack_w8(base, n, out); + break; + default: + bitunpack_generic(base, packed, n, w, out); + break; } return Status::OK(); } @@ -214,7 +327,9 @@ Status pfor_decode(ByteSource* src, size_t n, uint32_t* out) { SNII_RETURN_IF_ERROR(src->get_varint32(&d)); SNII_RETURN_IF_ERROR(src->get_varint32(&val)); idx += d; - if (idx >= n) return Status::Corruption("pfor exception index out of range"); + if (idx >= n) { + return Status::Corruption("pfor exception index out of range"); + } out[idx] = val; } return Status::OK(); @@ -235,7 +350,9 @@ Status pfor_skip(ByteSource* src, size_t n) { SNII_RETURN_IF_ERROR(src->get_varint32(&d)); SNII_RETURN_IF_ERROR(src->get_varint32(&val)); idx += d; - if (idx >= n) return Status::Corruption("pfor exception index out of range"); + if (idx >= n) { + return Status::Corruption("pfor exception index out of range"); + } } return Status::OK(); } diff --git a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp index 1ebb6c86a7471a..cfbafd3ca7c1bb 100644 --- a/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp +++ b/be/src/storage/index/snii/core/src/query/docid_conjunction.cpp @@ -1,6 +1,7 @@ #include "snii/query/internal/docid_conjunction.h" #include +#include #include #include @@ -21,6 +22,10 @@ namespace { using CandidateIt = std::vector::const_iterator; +constexpr uint32_t kBoundedSpanBitsetDocs = 16 * 
1024; /* Count of 64-bit words needed for one bit per docid across a span of kBoundedSpanBitsetDocs docids. */ +constexpr size_t kBoundedSpanBitsetWords = kBoundedSpanBitsetDocs / 64; /* Both the candidate range and term_docids must have at least this many entries before the bounded-span bitset path is tried. */ +constexpr size_t kBoundedSpanBitsetMinInput = 32; + struct CandidateRange { size_t begin = 0; size_t end = 0; @@ -286,6 +291,53 @@ bool intersect_dense_term_span_with_ordinals(CandidateIt begin, CandidateIt end, return true; } /* Intersects the candidates in [begin, end) with term_docids through a fixed-size bitset laid over the docid span shared by both inputs. Returns false, leaving out and chunk untouched, when either input has fewer than kBoundedSpanBitsetMinInput entries, when the span (last - first + 1) is wider than kBoundedSpanBitsetDocs, or when term_docids has more entries than the span is wide. Otherwise every candidate found in term_docids is appended to *out and chunk->docids, its rank among the marked term docids is appended to chunk->prx_doc_ordinals, clear_ordinals_if_all_term_docs_selected is called, and true is returned. NOTE(review): first/last are read from *begin, *(end - 1), front() and back(), so this presumably expects both inputs sorted ascending and [begin, end) non-empty whenever candidate_count passes the size guard - confirm against the caller. */ +bool intersect_bounded_span_with_ordinals(CandidateIt begin, CandidateIt end, + const std::vector& term_docids, + size_t candidate_count, std::vector* out, + DocidChunk* chunk) { + if (candidate_count < kBoundedSpanBitsetMinInput || + term_docids.size() < kBoundedSpanBitsetMinInput) { + return false; + } + /* Span covering both inputs; width is stored as uint64_t. */ + const uint32_t first = std::min(*begin, term_docids.front()); + const uint32_t last = std::max(*(end - 1), term_docids.back()); + const uint64_t width = static_cast(last) - first + 1; + if (width > kBoundedSpanBitsetDocs || term_docids.size() > width) { + return false; + } + /* Mark each term docid as a bit at offset (docid - first). */ + std::array bits {}; + for (uint32_t docid : term_docids) { + const uint32_t off = docid - first; + bits[off >> 6] |= 1ULL << (off & 63); + } + /* ordinal_base[w] holds the number of set bits in all words before w (running popcount over the words the span uses). */ + const auto word_count = static_cast((width + 63) >> 6); + std::array ordinal_base {}; + uint32_t ordinal = 0; + for (size_t word = 0; word < word_count; ++word) { + ordinal_base[word] = ordinal; + ordinal += static_cast(__builtin_popcountll(bits[word])); + } + /* Test each candidate's bit; on a hit, emit the docid and its rank among the set bits (bits in earlier words plus the lower bits of the same word). */ + for (auto it = begin; it != end; ++it) { + const uint32_t off = *it - first; + const size_t word = off >> 6; + const uint64_t mask = 1ULL << (off & 63); + if ((bits[word] & mask) == 0) { + continue; + } + out->push_back(*it); + chunk->docids.push_back(*it); + chunk->prx_doc_ordinals.push_back( + ordinal_base[word] + + static_cast(__builtin_popcountll(bits[word] & (mask - 1)))); + } + clear_ordinals_if_all_term_docs_selected(term_docids, chunk); + return true; +} + size_t log2_ceil(size_t n) { if (n <= 1) return 1; --n; @@ -372,6 +424,10 @@ Status intersect_window_candidate_range_with_ordinals(CandidateIt begin, Candida chunk)) { return Status::OK(); } + if
(intersect_bounded_span_with_ordinals(begin, end, term_docids, candidate_count, out, + chunk)) { + return Status::OK(); + } const size_t probes_per_candidate = log2_ceil(term_docids.size()) + 1; if (candidate_count < term_docids.size() / probes_per_candidate) { diff --git a/be/test/storage/index/snii_query_test.cpp b/be/test/storage/index/snii_query_test.cpp index da085cdec66914..d735770d8402cc 100644 --- a/be/test/storage/index/snii_query_test.cpp +++ b/be/test/storage/index/snii_query_test.cpp @@ -141,9 +141,19 @@ Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, auto ordinal_docs = docs_with_one_position(0, kDocCount, 2); auto driver_docs = docs_with_one_position(0, 8000, 0); auto almost_docs = docs_with_one_position(0, kDocCount, 1); + std::vector sparse_left_docs; + std::vector sparse_right_docs; std::vector repeat_docs; + sparse_left_docs.reserve(kDocCount / 3 + 1); + sparse_right_docs.reserve(kDocCount); repeat_docs.reserve(kDocCount); for (uint32_t docid = 0; docid < kDocCount; ++docid) { /* sparse_left receives every docid divisible by 3 with the value list {0}; sparse_right receives every docid except those with docid % 4 == 1, with the value list {1}. NOTE(review): the inner lists are presumably term positions, as with docs_with_one_position - confirm. */ + if (docid % 3 == 0) { + sparse_left_docs.push_back({docid, {0}}); + } + if (docid % 4 != 1) { + sparse_right_docs.push_back({docid, {1}}); + } repeat_docs.push_back({docid, {0, 1, 2}}); } almost_docs.erase(almost_docs.begin() + 4000); @@ -171,7 +181,9 @@ Status build_reader(MemoryFile* file, reader::SniiSegmentReader* segment_reader, make_term("failed", std::move(failed_docs)), make_term("order", std::move(order_docs)), make_term("ordinal", std::move(ordinal_docs)), - make_term("repeat", std::move(repeat_docs))}; + make_term("repeat", std::move(repeat_docs)), + make_term("sparse_left", std::move(sparse_left_docs)), + make_term("sparse_right", std::move(sparse_right_docs))}; writer::SniiCompoundWriter writer(file); SNII_RETURN_IF_ERROR(writer.add_logical_index(input)); @@ -280,6 +292,24 @@ TEST(SniiPhraseQueryTest, DenseTermWithMissingDocKeepsCandidateOrdinals) { EXPECT_EQ(docids, expected); } +TEST(SniiPhraseQueryTest, 
SparseWindowBitsetKeepsCandidateOrdinals) { /* Runs the phrase query {"sparse_left", "sparse_right"} against the index from build_reader and expects exactly the docids below 9000 that satisfy both docid % 3 == 0 and docid % 4 != 1, in ascending order. NOTE(review): 9000 is presumably the value of kDocCount, which is not visible here - confirm. */ + MemoryFile file; + reader::SniiSegmentReader segment_reader; + reader::LogicalIndexReader index_reader; + assert_ok(build_reader(&file, &segment_reader, &index_reader)); + + std::vector docids; + assert_ok(phrase_query(index_reader, {"sparse_left", "sparse_right"}, &docids)); + + std::vector expected; + for (uint32_t docid = 0; docid < 9000; ++docid) { + if (docid % 3 == 0 && docid % 4 != 1) { + expected.push_back(docid); + } + } + EXPECT_EQ(docids, expected); +} + TEST(SniiTermQueryTest, WindowedDenseTermEmitsRangesToSink) { MemoryFile file; reader::SniiSegmentReader segment_reader; @@ -368,12 +398,36 @@ TEST(SniiPforTest, LowBitWidthFastPathsRoundTrip) { } assert_round_trip(two_bit, 2); /* The vectors added below for widths 3, 5, 6 and 7 are filled with i masked to that many bits and passed to assert_round_trip with the matching width. */ + std::vector three_bit(131); + for (size_t i = 0; i < three_bit.size(); ++i) { + three_bit[i] = static_cast(i & 7); + } + assert_round_trip(three_bit, 3); + std::vector four_bit(128); for (size_t i = 0; i < four_bit.size(); ++i) { four_bit[i] = static_cast(i & 15); } assert_round_trip(four_bit, 4); + std::vector five_bit(129); + for (size_t i = 0; i < five_bit.size(); ++i) { + five_bit[i] = static_cast(i & 31); + } + assert_round_trip(five_bit, 5); + + std::vector six_bit(130); + for (size_t i = 0; i < six_bit.size(); ++i) { + six_bit[i] = static_cast(i & 63); + } + assert_round_trip(six_bit, 6); + + std::vector seven_bit(131); + for (size_t i = 0; i < seven_bit.size(); ++i) { + seven_bit[i] = static_cast(i & 127); + } + assert_round_trip(seven_bit, 7); + std::vector eight_bit(256); for (size_t i = 0; i < eight_bit.size(); ++i) { eight_bit[i] = static_cast(i);