Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
907 changes: 284 additions & 623 deletions Cargo.lock

Large diffs are not rendered by default.

144 changes: 144 additions & 0 deletions crates/lance-graph/src/graph/fingerprint.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Fingerprint functions for SPO triple addressing.
//!
//! Labels (node names, relationship types) are hashed into fixed-width
//! fingerprints for compact storage and fast comparison in the SPO store.

/// Number of u64 words in a fingerprint vector.
///
/// Each word contributes 64 bits, so a fingerprint is
/// `FINGERPRINT_WORDS * 64` bits wide (512 bits at the current value of 8).
pub const FINGERPRINT_WORDS: usize = 8;

/// A fingerprint is a fixed-width hash of a label string.
///
/// Represented as an array of [`FINGERPRINT_WORDS`] `u64` words.
pub type Fingerprint = [u64; FINGERPRINT_WORDS];

/// Hash a label string into a sparse fingerprint.
///
/// The label's UTF-8 bytes are hashed with 64-bit FNV-1a, the hash is expanded
/// into one word per fingerprint slot with a cascading multiply/xor-shift mix,
/// and the result is thinned so that at most 11% of the bits are set
/// (56 of 512 bits for an 8-word fingerprint).
///
/// The density cap keeps the OR of several fingerprints (as done when packing
/// subject/predicate/object axes) from saturating.
///
/// # Guarantees
///
/// * Deterministic: the same label always produces the same fingerprint.
/// * `popcount(fingerprint) <= FINGERPRINT_WORDS * 64 * 11 / 100`.
/// * The fingerprint is never all-zero (for `FINGERPRINT_WORDS >= 2`).
pub fn label_fp(label: &str) -> Fingerprint {
    // Bit budget: 11% of the fingerprint width.
    const MAX_SET_BITS: u32 = (FINGERPRINT_WORDS as u32) * u64::BITS * 11 / 100;

    // Primary hash: 64-bit FNV-1a over the label bytes.
    let mut h: u64 = 0xcbf29ce484222325;
    for &b in label.as_bytes() {
        h ^= u64::from(b);
        h = h.wrapping_mul(0x100000001b3);
    }

    // Expand the seed into one dense (~50% set) candidate word per slot.
    let mut fp = [0u64; FINGERPRINT_WORDS];
    fp[0] = h;
    for (i, word) in fp.iter_mut().enumerate().skip(1) {
        h = h.wrapping_mul(0x517cc1b727220a95);
        h ^= h >> 17;
        h = h.wrapping_mul(0x6c62272e07bb0142);
        // Mixing in the slot index keeps the words distinct even if `h` hits 0.
        h ^= (i as u64).wrapping_mul(0x9e3779b97f4a7c15);
        *word = h;
    }

    let dense_popcount: u32 = fp.iter().map(|w| w.count_ones()).sum();
    if dense_popcount <= MAX_SET_BITS {
        return fp;
    }

    // Thin out: AND every word with two differently rotated neighbours.
    // A bit survives only if it is set in all three, which takes the expected
    // density from ~50% down to ~12.5% while keeping surviving bits spread
    // over the whole word.
    let dense = fp;
    for (i, word) in fp.iter_mut().enumerate() {
        let second = dense[(i + 1) % FINGERPRINT_WORDS].rotate_left(21);
        let third = dense[(i + 2) % FINGERPRINT_WORDS].rotate_left(42);
        *word &= second & third;
    }
    // Never hand out an all-zero fingerprint: if thinning wiped every bit,
    // fall back to the dense words and let the hard cap below trim them.
    if fp.iter().all(|&w| w == 0) {
        fp = dense;
    }

    // Hard cap: clear the lowest set bit of each word in round-robin order
    // until the budget is met. `excess > 0` implies at least one non-zero
    // word remains, so the loop always makes progress and terminates.
    let thinned_popcount: u32 = fp.iter().map(|w| w.count_ones()).sum();
    let mut excess = thinned_popcount.saturating_sub(MAX_SET_BITS);
    let mut i = 0;
    while excess > 0 {
        if fp[i] != 0 {
            fp[i] &= fp[i] - 1;
            excess -= 1;
        }
        i = (i + 1) % FINGERPRINT_WORDS;
    }

    fp
}

/// Compute the u64 address of a DN (distinguished name) path.
///
/// Runs 64-bit FNV-1a over the UTF-8 bytes of `dn`: starting from the offset
/// basis, each byte is XOR-ed in and the state is multiplied (wrapping) by the
/// FNV prime. An empty `dn` therefore hashes to the offset basis itself.
///
/// Used for keying records in the SPO store.
pub fn dn_hash(dn: &str) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const PRIME: u64 = 0x100000001b3;

    dn.bytes().fold(OFFSET_BASIS, |state, byte| {
        (state ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}

/// Hamming distance between two fingerprints.
///
/// Walks both fingerprints word by word, XORs each pair and adds up the set
/// bits, i.e. counts the bit positions at which `a` and `b` disagree.
pub fn hamming_distance(a: &Fingerprint, b: &Fingerprint) -> u32 {
    let mut differing = 0u32;
    for (lhs, rhs) in a.iter().zip(b.iter()) {
        differing += (lhs ^ rhs).count_ones();
    }
    differing
}

/// Zero fingerprint constant: every word is `0`, so no bit is set.
pub const ZERO_FP: Fingerprint = [0u64; FINGERPRINT_WORDS];

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_label_fp_deterministic() {
        // Hashing the same label twice must give the same fingerprint.
        assert_eq!(label_fp("Jan"), label_fp("Jan"));
    }

    #[test]
    fn test_label_fp_different_labels() {
        // Distinct labels are expected to map to distinct fingerprints.
        assert_ne!(label_fp("Jan"), label_fp("Ada"));
    }

    #[test]
    fn test_label_fp_density_bound() {
        // Every sample label must set fewer than half of the fingerprint bits.
        let total = (FINGERPRINT_WORDS * 64) as u32;
        let samples = ["Jan", "Ada", "KNOWS", "CREATES", "HELPS", "entity_42"];
        for label in samples.iter() {
            let popcount = label_fp(label)
                .iter()
                .fold(0u32, |bits, word| bits + word.count_ones());
            assert!(
                popcount < total / 2,
                "Label '{}' has density {}/{}",
                label,
                popcount,
                total
            );
        }
    }

    #[test]
    fn test_dn_hash_deterministic() {
        let dn = "edge:jan-knows-ada";
        assert_eq!(dn_hash(dn), dn_hash(dn));
    }

    #[test]
    fn test_hamming_distance_self() {
        // A fingerprint differs from itself in zero bit positions.
        let fp = label_fp("test");
        assert_eq!(hamming_distance(&fp, &fp), 0);
    }

    #[test]
    fn test_hamming_distance_different() {
        // Fingerprints of different labels must differ in at least one bit.
        let distance = hamming_distance(&label_fp("Jan"), &label_fp("Ada"));
        assert!(distance > 0);
    }
}
32 changes: 32 additions & 0 deletions crates/lance-graph/src/graph/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Graph primitives: fingerprinting, sparse bitmaps, and SPO triple store.
//!
//! This module provides the low-level graph data structures that sit beneath
//! the Cypher query engine. While the Cypher layer operates on property graphs
//! via DataFusion, this layer provides direct fingerprint-based graph operations.

pub mod fingerprint;
pub mod sparse;
pub mod spo;

/// Container geometry identifiers for graph storage layouts.
///
/// `#[repr(u8)]` together with the explicit discriminants pins every variant
/// to a fixed one-byte value (0 through 6), so `variant as u8` does not depend
/// on declaration order.
// NOTE(review): the pinned values look intended for persistence or interop —
// confirm before renumbering or removing a variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum ContainerGeometry {
/// Flat record batch (default).
Flat = 0,
/// Adjacency list.
AdjList = 1,
/// CSR (Compressed Sparse Row).
Csr = 2,
/// CSC (Compressed Sparse Column).
Csc = 3,
/// COO (Coordinate list).
Coo = 4,
/// Hybrid (mixed format).
Hybrid = 5,
/// SPO (Subject-Predicate-Object triple store).
Spo = 6,
}
128 changes: 128 additions & 0 deletions crates/lance-graph/src/graph/sparse.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Sparse bitmap operations for SPO fingerprint packing.
//!
//! Uses `[u64; BITMAP_WORDS]` for fixed-width bitmaps that can be
//! packed into Lance vector columns for ANN search.

/// Number of u64 words in a bitmap.
///
/// Previously hardcoded as `[u64; 2]` which truncated fingerprints.
/// Now matches the fingerprint width for full coverage.
///
/// Must stay equal to `fingerprint::FINGERPRINT_WORDS`; this is only checked
/// by the `test_bitmap_words_matches_fingerprint` unit test, not by the type
/// system.
pub const BITMAP_WORDS: usize = 8;

/// A fixed-width bitmap for sparse set encoding.
///
/// Bit position `p` lives in word `p / 64` at bit `p % 64`.
pub type Bitmap = [u64; BITMAP_WORDS];

/// Return the empty bitmap: all `BITMAP_WORDS` words are zero.
pub const fn bitmap_zero() -> Bitmap {
    const EMPTY: Bitmap = [0; BITMAP_WORDS];
    EMPTY
}

/// OR two bitmaps together.
pub fn bitmap_or(a: &Bitmap, b: &Bitmap) -> Bitmap {
let mut result = [0u64; BITMAP_WORDS];
for i in 0..BITMAP_WORDS {
result[i] = a[i] | b[i];
}
result
}

/// AND two bitmaps together.
pub fn bitmap_and(a: &Bitmap, b: &Bitmap) -> Bitmap {
let mut result = [0u64; BITMAP_WORDS];
for i in 0..BITMAP_WORDS {
result[i] = a[i] & b[i];
}
result
}

/// XOR two bitmaps (used for Hamming distance).
pub fn bitmap_xor(a: &Bitmap, b: &Bitmap) -> Bitmap {
let mut result = [0u64; BITMAP_WORDS];
for i in 0..BITMAP_WORDS {
result[i] = a[i] ^ b[i];
}
result
}

/// Total number of set bits across all words of a bitmap.
pub fn bitmap_popcount(bm: &Bitmap) -> u32 {
    bm.iter().fold(0, |total, word| total + word.count_ones())
}

/// Hamming distance between two bitmaps.
pub fn bitmap_hamming(a: &Bitmap, b: &Bitmap) -> u32 {
bitmap_popcount(&bitmap_xor(a, b))
}

/// Check if a bitmap is all zeros.
pub fn bitmap_is_zero(bm: &Bitmap) -> bool {
bm.iter().all(|&w| w == 0)
}

/// Set a specific bit position (0..BITMAP_WORDS*64).
pub fn bitmap_set_bit(bm: &mut Bitmap, pos: usize) {
let word = pos / 64;
let bit = pos % 64;
if word < BITMAP_WORDS {
bm[word] |= 1u64 << bit;
}
}

/// Combine subject, predicate and object fingerprints into one SPO bitmap.
///
/// The result is the word-wise OR of the three inputs and serves as the
/// search vector. Every input is a subset of the packed bitmap, so AND-ing the
/// packed value with one of the original fingerprints yields that fingerprint
/// again.
pub fn pack_axes(
    s: &[u64; BITMAP_WORDS],
    p: &[u64; BITMAP_WORDS],
    o: &[u64; BITMAP_WORDS],
) -> Bitmap {
    std::array::from_fn(|word| s[word] | p[word] | o[word])
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bitmap_zero() {
        // The empty bitmap reports as zero and has no set bits.
        let empty = bitmap_zero();
        assert!(bitmap_is_zero(&empty));
        assert_eq!(bitmap_popcount(&empty), 0);
    }

    #[test]
    fn test_bitmap_or() {
        // Bits from different words of both operands show up in the union.
        let first = [1u64, 0, 0, 0, 0, 0, 0, 0];
        let second = [0u64, 1, 0, 0, 0, 0, 0, 0];
        let union = bitmap_or(&first, &second);
        assert_eq!(union[0], 1);
        assert_eq!(union[1], 1);
    }

    #[test]
    fn test_bitmap_hamming() {
        // Eight low bits set versus none set: distance is 8.
        let low_byte = [0xFFu64, 0, 0, 0, 0, 0, 0, 0];
        let nothing = [0x00u64, 0, 0, 0, 0, 0, 0, 0];
        assert_eq!(bitmap_hamming(&low_byte, &nothing), 8);
    }

    #[test]
    fn test_pack_axes() {
        let subject = [1u64, 0, 0, 0, 0, 0, 0, 0];
        let predicate = [2u64, 0, 0, 0, 0, 0, 0, 0];
        let object = [4u64, 0, 0, 0, 0, 0, 0, 0];
        // 1 | 2 | 4 = 7
        assert_eq!(pack_axes(&subject, &predicate, &object)[0], 7);
    }

    #[test]
    fn test_bitmap_words_matches_fingerprint() {
        // Bitmaps and fingerprints must have the same number of words.
        let fingerprint_words = super::super::fingerprint::FINGERPRINT_WORDS;
        assert_eq!(BITMAP_WORDS, fingerprint_words);
    }
}
Loading