Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/tame-times-create.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@tanstack/db-ivm": patch
---

Replace the `JSON.stringify`-based hash function with a structural hashing function.
2 changes: 0 additions & 2 deletions packages/db-ivm/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@
"version": "0.1.2",
"dependencies": {
"fractional-indexing": "^3.2.0",
"murmurhash-js": "^1.0.0",
"sorted-btree": "^1.8.1"
},
"devDependencies": {
"@types/debug": "^4.1.12",
"@types/murmurhash-js": "^1.0.6",
"@vitest/coverage-istanbul": "^3.0.9"
},
"exports": {
Expand Down
11 changes: 6 additions & 5 deletions packages/db-ivm/src/hashIndex.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import { DefaultMap, hash } from "./utils.js"
import { DefaultMap } from "./utils.js"
import { hash } from "./hashing/index.js"
import type { Hash } from "./hashing/index.js"

/**
* A map from a difference collection trace's keys -> (value, multiplicities) that changed.
* Used in operations like join and reduce where the operation needs to
* exploit the key-value structure of the data to run efficiently.
*/
export class HashIndex<K, V> {
#inner: DefaultMap<K, DefaultMap<string, [V, number]>>
#inner: DefaultMap<K, DefaultMap<Hash, [V, number]>>

constructor() {
this.#inner = new DefaultMap<K, DefaultMap<string, [V, number]>>(
() =>
new DefaultMap<string, [V, number]>(() => [undefined as any as V, 0])
this.#inner = new DefaultMap<K, DefaultMap<Hash, [V, number]>>(
() => new DefaultMap<Hash, [V, number]>(() => [undefined as any as V, 0])
)
// #inner is as map of:
// {
Expand Down
157 changes: 157 additions & 0 deletions packages/db-ivm/src/hashing/hash.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import { MurmurHashStream, randomHash } from "./murmur.js"
import type { Hasher } from "./murmur.js"

/*
 * Implementation of structural hashing based on the Composites polyfill implementation:
 * https://github.com/tc39/proposal-composites
 */

// Random per-process markers mixed into the hash stream to disambiguate
// values that would otherwise produce identical byte sequences
// (e.g. `true` vs. `false`, or a Map vs. a plain object with the same entries).
const TRUE = randomHash()
const FALSE = randomHash()
const NULL = randomHash()
const UNDEFINED = randomHash()
// Written before each key of an object while hashing its entries.
const KEY = randomHash()
// XOR-ed with a unique id for values hashed by reference (see cachedReferenceHash).
const FUNCTIONS = randomHash()
// Type markers tagging the container kind before its contents are hashed.
const DATE_MARKER = randomHash()
const OBJECT_MARKER = randomHash()
const ARRAY_MARKER = randomHash()
const MAP_MARKER = randomHash()
const SET_MARKER = randomHash()

// Memoizes the hash of each object by reference.
// NOTE(review): an object mutated after being hashed keeps returning its
// stale cached hash — confirm callers never rehash mutated objects.
const hashCache = new WeakMap<object, number>()

/**
 * Computes a structural hash of an arbitrary value.
 * Structurally equal values hash identically within the current process;
 * seeds are random per run, so hashes are not stable across restarts.
 */
export function hash(input: any): number {
  const stream = new MurmurHashStream()
  updateHasher(stream, input)
  return stream.digest()
}

/**
 * Hashes a non-null object, memoizing the result per reference in `hashCache`.
 *
 * Dates hash by timestamp; arrays, maps and sets are tagged with a type
 * marker and hashed structurally via their entries; binary blobs
 * (Buffer / Uint8Array / File) are hashed by identity instead.
 *
 * NOTE(review): results are cached by reference, so an object mutated after
 * being hashed keeps returning its stale hash.
 */
function hashObject(input: object): number {
  const cachedHash = hashCache.get(input)
  if (cachedHash !== undefined) {
    return cachedHash
  }

  let valueHash: number | undefined
  if (input instanceof Date) {
    valueHash = hashDate(input)
  } else {
    let plainObjectInput = input
    let marker = OBJECT_MARKER

    // Array.isArray also recognizes arrays from other realms,
    // unlike `instanceof Array`.
    if (Array.isArray(input)) {
      marker = ARRAY_MARKER
    }

    if (input instanceof Map) {
      marker = MAP_MARKER
      plainObjectInput = [...input.entries()]
    }

    if (input instanceof Set) {
      marker = SET_MARKER
      plainObjectInput = [...input.entries()]
    }

    // Guard the globals before `instanceof`: `Buffer` does not exist in
    // browsers and `File` does not exist in Node < 20, so an unguarded
    // check would throw a ReferenceError for every object hashed there.
    if (
      (typeof Buffer !== `undefined` && input instanceof Buffer) ||
      input instanceof Uint8Array ||
      (typeof File !== `undefined` && input instanceof File)
    ) {
      // Deeply hashing these objects would be too costly
      // but we also don't want to ignore them
      // so we track them by reference and cache them in a weak map
      return cachedReferenceHash(input)
    }

    valueHash = hashPlainObject(plainObjectInput, marker)
  }

  hashCache.set(input, valueHash)
  return valueHash
}

/**
 * Hashes a Date by its millisecond timestamp, prefixed with DATE_MARKER to
 * distinguish it from the bare timestamp number.
 */
function hashDate(input: Date): number {
  const stream = new MurmurHashStream()
  stream.update(DATE_MARKER)
  stream.update(input.getTime())
  return stream.digest()
}

/**
 * Hashes an object's own enumerable string keys and their values, prefixed
 * with a container-type marker. Keys are visited in sorted order so that
 * insertion order does not affect the resulting hash.
 */
function hashPlainObject(input: object, marker: number): number {
  const hasher = new MurmurHashStream()

  // Tag the stream with the input's container type (object/array/map/set).
  hasher.update(marker)

  const sortedKeys = Object.keys(input).sort(keySort)
  for (const key of sortedKeys) {
    hasher.update(KEY)
    hasher.update(key)
    updateHasher(hasher, input[key as keyof typeof input])
  }

  return hasher.digest()
}

/**
 * Feeds one value of any type into `hasher`, dispatching on its runtime type.
 * Primitives are written directly (with type markers added by the hasher);
 * objects and functions are reduced to a cached numeric hash first.
 */
function updateHasher(hasher: Hasher, input: unknown): void {
  if (input === null) {
    hasher.update(NULL)
    return
  }

  if (typeof input === `undefined`) {
    hasher.update(UNDEFINED)
  } else if (typeof input === `boolean`) {
    hasher.update(input ? TRUE : FALSE)
  } else if (typeof input === `number`) {
    // Normalize NaNs and -0
    if (isNaN(input)) {
      hasher.update(NaN)
    } else if (input === 0) {
      hasher.update(0)
    } else {
      hasher.update(input)
    }
  } else if (
    typeof input === `bigint` ||
    typeof input === `string` ||
    typeof input === `symbol`
  ) {
    hasher.update(input)
  } else if (typeof input === `object`) {
    hasher.update(getCachedHash(input))
  } else if (typeof input === `function`) {
    // Functions are assigned a globally unique ID
    // and that ID is cached in the weak map
    hasher.update(cachedReferenceHash(input))
  } else {
    console.warn(
      `Ignored input during hashing because it is of type ${typeof input} which is not supported`
    )
  }
}

/**
 * Returns the memoized hash for `input`, computing (and caching) it on a miss.
 */
function getCachedHash(input: object): number {
  // `??` is safe here: hashes are numbers, and 0 is a valid non-nullish hash.
  return hashCache.get(input) ?? hashObject(input)
}

// Monotonic id handed out to values that are hashed by reference.
let nextRefId = 1

/**
 * Hashes a value by identity rather than by structure: on first sight the
 * value gets a fresh id XOR-ed with the FUNCTIONS marker, cached in the weak
 * map and returned unchanged on every later call.
 */
function cachedReferenceHash(fn: object): number {
  const existing = hashCache.get(fn)
  if (existing !== undefined) {
    return existing
  }

  const valueHash = nextRefId++ ^ FUNCTIONS
  hashCache.set(fn, valueHash)
  return valueHash
}

/**
 * Strings sorted lexicographically by UTF-16 code unit.
 *
 * Deliberately NOT `localeCompare`: locale-aware comparison can order the
 * same keys differently depending on the host locale/ICU data (making the
 * key order — and thus the hash — environment-dependent) and is far slower
 * in this hot path. For hashing we only need a stable total order.
 */
function keySort(a: string, b: string): number {
  if (a < b) return -1
  if (a > b) return 1
  return 0
}
2 changes: 2 additions & 0 deletions packages/db-ivm/src/hashing/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Public entry point of the hashing module: re-exports the structural
// `hash` function and the low-level `Hash` / `Hasher` types.
export { hash } from "./hash.js"
export type { Hash, Hasher } from "./murmur.js"
128 changes: 128 additions & 0 deletions packages/db-ivm/src/hashing/murmur.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
 * Implementation of murmur hash based on the Composites polyfill implementation:
 * https://github.com/tc39/proposal-composites
 */

// Random per-process seed and type markers. A marker is mixed into the
// stream before a value so that values of different types (e.g. the string
// "1" vs. a symbol described "1") cannot produce the same byte sequence.
const RANDOM_SEED = randomHash()
const STRING_MARKER = randomHash()
const BIG_INT_MARKER = randomHash()
const NEG_BIG_INT_MARKER = randomHash()
const SYMBOL_MARKER = randomHash()

// Scratch view used to read the raw IEEE-754 bytes of a number.
const FLOAT64_SCRATCH = new DataView(new ArrayBuffer(8))

export type Hash = number

/**
 * Returns a random unsigned 31-bit integer.
 */
export function randomHash() {
  return (Math.random() * (2 ** 31 - 1)) >>> 0
}

export interface Hasher {
  update: (val: symbol | string | number | bigint) => void
  digest: () => number
}

/**
 * This implementation of Murmur hash uses a random initial seed and random markers.
 * This means that hashes aren't deterministic across app restarts.
 * This is intentional in the composites polyfill to be resistant to hash-flooding attacks
 * where malicious users would precompute lots of different objects whose hashes collide with each other.
 *
 * Currently, for ts/db-ivm this is fine because we don't persist client state.
 * However, when we will introduce persistence we will either need to store the seeds or remove the randomness
 * to ensure deterministic hashes across app restarts.
 */
export class MurmurHashStream implements Hasher {
  private hash: number = RANDOM_SEED
  // Total number of bytes written so far (folded in during finalization).
  private length = 0
  // Up to 3 pending bytes waiting to form a full 32-bit word.
  private carry = 0
  private carryBytes = 0

  // One murmur3 32-bit mixing round for a full little-endian word.
  private _mix(k1: number): void {
    k1 = Math.imul(k1, 0xcc9e2d51)
    k1 = (k1 << 15) | (k1 >>> 17)
    k1 = Math.imul(k1, 0x1b873593)
    this.hash ^= k1
    this.hash = (this.hash << 13) | (this.hash >>> 19)
    this.hash = Math.imul(this.hash, 5) + 0xe6546b64
  }

  // Buffers a single byte; mixes once 4 bytes have accumulated.
  private _writeByte(byte: number): void {
    this.carry |= (byte & 0xff) << (8 * this.carryBytes)
    this.carryBytes++
    this.length++

    if (this.carryBytes === 4) {
      this._mix(this.carry >>> 0)
      this.carry = 0
      this.carryBytes = 0
    }
  }

  /**
   * Feeds one primitive value into the hash stream.
   * Strings and symbol descriptions are written as little-endian UTF-16
   * code units, numbers as their 8-byte IEEE-754 representation, and
   * bigints as sign marker + little-endian magnitude bytes.
   */
  update(chunk: symbol | string | number | bigint): void {
    switch (typeof chunk) {
      case `symbol`: {
        this.update(SYMBOL_MARKER)
        const description = chunk.description
        if (!description) {
          return
        }

        for (let i = 0; i < description.length; i++) {
          const code = description.charCodeAt(i)
          this._writeByte(code & 0xff)
          this._writeByte((code >>> 8) & 0xff)
        }
        return
      }
      case `string`:
        this.update(STRING_MARKER)
        for (let i = 0; i < chunk.length; i++) {
          const code = chunk.charCodeAt(i)
          this._writeByte(code & 0xff)
          this._writeByte((code >>> 8) & 0xff)
        }
        return
      case `number`: {
        // Hash all 8 bytes of the IEEE-754 float64 representation.
        // Truncating to 32 bits via bitwise ops would make every number
        // collide with its ToInt32 image (e.g. 1 vs 1.5, or timestamps
        // 2^32 ms apart) — and these hashes are used as Map keys upstream,
        // so such collisions would silently merge distinct values.
        // -0 is folded into +0 and NaN into the canonical NaN so that
        // equal-valued numbers always produce identical bytes.
        const value = Number.isNaN(chunk) ? NaN : chunk === 0 ? 0 : chunk
        FLOAT64_SCRATCH.setFloat64(0, value)
        for (let i = 0; i < 8; i++) {
          this._writeByte(FLOAT64_SCRATCH.getUint8(i))
        }
        return
      }
      case `bigint`: {
        let value = chunk
        if (value < 0n) {
          value = -value
          this.update(NEG_BIG_INT_MARKER)
        } else {
          this.update(BIG_INT_MARKER)
        }
        // Little-endian bytes of the magnitude.
        while (value > 0n) {
          this._writeByte(Number(value & 0xffn))
          value >>= 8n
        }
        if (chunk === 0n) this._writeByte(0)
        return
      }
      default:
        throw new TypeError(`Unsupported input type: ${typeof chunk}`)
    }
  }

  /**
   * Finalizes and returns the 32-bit unsigned hash.
   * Finalization runs on locals so digest() is idempotent and does not
   * corrupt the stream state (the previous version mutated `this.hash`,
   * so a second digest() returned a different value).
   */
  digest(): number {
    let h = this.hash
    if (this.carryBytes > 0) {
      // Tail handling: mix pending bytes without the rotate/advance step,
      // mirroring murmur3's treatment of the final partial word.
      let k1 = this.carry >>> 0
      k1 = Math.imul(k1, 0xcc9e2d51)
      k1 = (k1 << 15) | (k1 >>> 17)
      k1 = Math.imul(k1, 0x1b873593)
      h ^= k1
    }

    // fmix32 avalanche.
    h ^= this.length
    h ^= h >>> 16
    h = Math.imul(h, 0x85ebca6b)
    h ^= h >>> 13
    h = Math.imul(h, 0xc2b2ae35)
    h ^= h >>> 16

    return h >>> 0
  }
}
2 changes: 1 addition & 1 deletion packages/db-ivm/src/multiset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ import {
DefaultMap,
chunkedArrayPush,
globalObjectIdGenerator,
hash,
} from "./utils.js"
import { hash } from "./hashing/index.js"

export type MultiSetArray<T> = Array<[T, number]>
export type KeyedData<T> = [key: string, value: T]
Expand Down
8 changes: 4 additions & 4 deletions packages/db-ivm/src/operators/distinct.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
import { DifferenceStreamWriter, UnaryOperator } from "../graph.js"
import { StreamBuilder } from "../d2.js"
import { hash } from "../utils.js"
import { hash } from "../hashing/index.js"
import { MultiSet } from "../multiset.js"
import type { Hash } from "../hashing/index.js"
import type { DifferenceStreamReader } from "../graph.js"
import type { IStreamBuilder } from "../types.js"

type HashedValue = string
type Multiplicity = number

/**
* Operator that removes duplicates
*/
export class DistinctOperator<T> extends UnaryOperator<T> {
#by: (value: T) => any
#values: Map<HashedValue, Multiplicity> // keeps track of the number of times each value has been seen
#values: Map<Hash, Multiplicity> // keeps track of the number of times each value has been seen

constructor(
id: number,
Expand All @@ -27,7 +27,7 @@ export class DistinctOperator<T> extends UnaryOperator<T> {
}

run(): void {
const updatedValues = new Map<HashedValue, [Multiplicity, T]>()
const updatedValues = new Map<Hash, [Multiplicity, T]>()

// Compute the new multiplicity for each value
for (const message of this.inputMessages()) {
Expand Down
Loading
Loading