Skip to content

Commit 79c95a3

Browse files
authored
keep residual where clause in join predicate pushdown (#442)
1 parent aecbcc3 commit 79c95a3

File tree

8 files changed

+566
-32
lines changed

8 files changed

+566
-32
lines changed

.changeset/slimy-lizards-kiss.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@tanstack/db": patch
3+
---
4+
5+
Fix query optimizer to preserve outer join semantics by keeping residual WHERE clauses when pushing predicates to subqueries.

packages/db/src/query/compiler/group-by.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { filter, groupBy, groupByOperators, map } from "@tanstack/db-ivm"
2-
import { Func, PropRef } from "../ir.js"
2+
import { Func, PropRef, getHavingExpression } from "../ir.js"
33
import {
44
AggregateFunctionNotInSelectError,
55
NonAggregateExpressionNotInGroupByError,
@@ -129,8 +129,9 @@ export function processGroupBy(
129129
// Apply HAVING clauses if present
130130
if (havingClauses && havingClauses.length > 0) {
131131
for (const havingClause of havingClauses) {
132+
const havingExpression = getHavingExpression(havingClause)
132133
const transformedHavingClause = transformHavingClause(
133-
havingClause,
134+
havingExpression,
134135
selectClause || {}
135136
)
136137
const compiledHaving = compileExpression(transformedHavingClause)
@@ -263,8 +264,9 @@ export function processGroupBy(
263264
// Apply HAVING clauses if present
264265
if (havingClauses && havingClauses.length > 0) {
265266
for (const havingClause of havingClauses) {
267+
const havingExpression = getHavingExpression(havingClause)
266268
const transformedHavingClause = transformHavingClause(
267-
havingClause,
269+
havingExpression,
268270
selectClause || {}
269271
)
270272
const compiledHaving = compileExpression(transformedHavingClause)

packages/db/src/query/compiler/index.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import {
77
LimitOffsetRequireOrderByError,
88
UnsupportedFromTypeError,
99
} from "../../errors.js"
10-
import { PropRef } from "../ir.js"
10+
import { PropRef, getWhereExpression } from "../ir.js"
1111
import { compileExpression } from "./evaluators.js"
1212
import { processJoins } from "./joins.js"
1313
import { processGroupBy } from "./group-by.js"
@@ -131,7 +131,8 @@ export function compileQuery(
131131
if (query.where && query.where.length > 0) {
132132
// Apply each WHERE condition as a filter (they are ANDed together)
133133
for (const where of query.where) {
134-
const compiledWhere = compileExpression(where)
134+
const whereExpression = getWhereExpression(where)
135+
const compiledWhere = compileExpression(whereExpression)
135136
pipeline = pipeline.pipe(
136137
filter(([_key, namespacedRow]) => {
137138
return compiledWhere(namespacedRow)

packages/db/src/query/ir.ts

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ export interface JoinClause {
3939
right: BasicExpression
4040
}
4141

42-
export type Where = BasicExpression<boolean>
42+
export type Where =
43+
| BasicExpression<boolean>
44+
| { expression: BasicExpression<boolean>; residual?: boolean }
4345

4446
export type GroupBy = Array<BasicExpression>
4547

@@ -128,3 +130,48 @@ export class Aggregate<T = any> extends BaseExpression<T> {
128130
super()
129131
}
130132
}
133+
134+
/**
135+
* Helper functions for working with Where clauses
136+
*/
137+
138+
/**
139+
* Extract the underlying expression from a Where clause, whether it is a bare expression or an `{ expression, residual }` wrapper
140+
*/
141+
export function getWhereExpression(where: Where): BasicExpression<boolean> {
142+
return typeof where === `object` && `expression` in where
143+
? where.expression
144+
: where
145+
}
146+
147+
/**
148+
* Extract the expression from a HAVING clause
149+
* HAVING clauses can contain aggregates, unlike regular WHERE clauses
150+
*/
151+
export function getHavingExpression(
152+
having: Having
153+
): BasicExpression | Aggregate {
154+
return typeof having === `object` && `expression` in having
155+
? having.expression
156+
: having
157+
}
158+
159+
/**
160+
* Check if a Where clause is marked as residual
161+
*/
162+
export function isResidualWhere(where: Where): boolean {
163+
return (
164+
typeof where === `object` &&
165+
`expression` in where &&
166+
where.residual === true
167+
)
168+
}
169+
170+
/**
171+
* Create a residual Where clause from an expression
172+
*/
173+
export function createResidualWhere(
174+
expression: BasicExpression<boolean>
175+
): Where {
176+
return { expression, residual: true }
177+
}

packages/db/src/query/optimizer.ts

Lines changed: 66 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@
4545
* - **Ordering + Limits**: ORDER BY combined with LIMIT/OFFSET (would change result set)
4646
* - **Functional Operations**: fnSelect, fnWhere, fnHaving (potential side effects)
4747
*
48+
* ### Residual WHERE Clauses
49+
* For outer joins (LEFT, RIGHT, FULL), WHERE clauses are copied to subqueries for optimization
50+
* but also kept as "residual" clauses in the main query to preserve semantics. This ensures
51+
* that the original predicate is still applied in the main query after the join, so outer join semantics are preserved.
52+
*
4853
* The optimizer tracks which clauses were actually optimized and only removes those from the
4954
* main query. Subquery reuse is handled safely through immutable query copies.
5055
*
@@ -121,9 +126,12 @@ import {
121126
CollectionRef as CollectionRefClass,
122127
Func,
123128
QueryRef as QueryRefClass,
129+
createResidualWhere,
130+
getWhereExpression,
131+
isResidualWhere,
124132
} from "./ir.js"
125133
import { isConvertibleToCollectionFilter } from "./compiler/expressions.js"
126-
import type { BasicExpression, From, QueryIR } from "./ir.js"
134+
import type { BasicExpression, From, QueryIR, Where } from "./ir.js"
127135

128136
/**
129137
* Represents a WHERE clause after source analysis
@@ -325,8 +333,13 @@ function applySingleLevelOptimization(query: QueryIR): QueryIR {
325333
return query
326334
}
327335

336+
// Filter out residual WHERE clauses to prevent them from being optimized again
337+
const nonResidualWhereClauses = query.where.filter(
338+
(where) => !isResidualWhere(where)
339+
)
340+
328341
// Step 1: Split all AND clauses at the root level for granular optimization
329-
const splitWhereClauses = splitAndClauses(query.where)
342+
const splitWhereClauses = splitAndClauses(nonResidualWhereClauses)
330343

331344
// Step 2: Analyze each WHERE clause to determine which sources it touches
332345
const analyzedClauses = splitWhereClauses.map((clause) =>
@@ -337,7 +350,20 @@ function applySingleLevelOptimization(query: QueryIR): QueryIR {
337350
const groupedClauses = groupWhereClauses(analyzedClauses)
338351

339352
// Step 4: Apply optimizations by lifting single-source clauses into subqueries
340-
return applyOptimizations(query, groupedClauses)
353+
const optimizedQuery = applyOptimizations(query, groupedClauses)
354+
355+
// Add back any residual WHERE clauses that were filtered out
356+
const residualWhereClauses = query.where.filter((where) =>
357+
isResidualWhere(where)
358+
)
359+
if (residualWhereClauses.length > 0) {
360+
optimizedQuery.where = [
361+
...(optimizedQuery.where || []),
362+
...residualWhereClauses,
363+
]
364+
}
365+
366+
return optimizedQuery
341367
}
342368

343369
/**
@@ -424,26 +450,35 @@ function isRedundantSubquery(query: QueryIR): boolean {
424450
* ```
425451
*/
426452
function splitAndClauses(
427-
whereClauses: Array<BasicExpression<boolean>>
453+
whereClauses: Array<Where>
428454
): Array<BasicExpression<boolean>> {
429455
const result: Array<BasicExpression<boolean>> = []
430456

431-
for (const clause of whereClauses) {
432-
if (clause.type === `func` && clause.name === `and`) {
433-
// Recursively split nested AND clauses to handle complex expressions
434-
const splitArgs = splitAndClauses(
435-
clause.args as Array<BasicExpression<boolean>>
436-
)
437-
result.push(...splitArgs)
438-
} else {
439-
// Preserve non-AND clauses as-is (including OR clauses)
440-
result.push(clause)
441-
}
457+
for (const whereClause of whereClauses) {
458+
const clause = getWhereExpression(whereClause)
459+
result.push(...splitAndClausesRecursive(clause))
442460
}
443461

444462
return result
445463
}
446464

465+
// Helper function that recursively flattens a single (possibly nested) AND expression into its non-AND operands
466+
function splitAndClausesRecursive(
467+
clause: BasicExpression<boolean>
468+
): Array<BasicExpression<boolean>> {
469+
if (clause.type === `func` && clause.name === `and`) {
470+
// Recursively split nested AND clauses to handle complex expressions
471+
const result: Array<BasicExpression<boolean>> = []
472+
for (const arg of clause.args as Array<BasicExpression<boolean>>) {
473+
result.push(...splitAndClausesRecursive(arg))
474+
}
475+
return result
476+
} else {
477+
// Preserve non-AND clauses as-is (including OR clauses)
478+
return [clause]
479+
}
480+
}
481+
447482
/**
448483
* Step 2: Analyze which table sources a WHERE clause touches.
449484
*
@@ -588,19 +623,32 @@ function applyOptimizations(
588623
}))
589624
: undefined
590625

591-
// Build the remaining WHERE clauses: multi-source + any single-source that weren't optimized
592-
const remainingWhereClauses: Array<BasicExpression<boolean>> = []
626+
// Build the remaining WHERE clauses: multi-source + single-source clauses that weren't optimized + residual copies of optimized single-source clauses (outer JOINs only)
627+
const remainingWhereClauses: Array<Where> = []
593628

594629
// Add multi-source clauses
595630
if (groupedClauses.multiSource) {
596631
remainingWhereClauses.push(groupedClauses.multiSource)
597632
}
598633

599-
// Add single-source clauses that weren't actually optimized
634+
// Determine if we need residual clauses (when query has outer JOINs)
635+
const hasOuterJoins =
636+
query.join &&
637+
query.join.some(
638+
(join) =>
639+
join.type === `left` || join.type === `right` || join.type === `full`
640+
)
641+
642+
// Add single-source clauses
600643
for (const [source, clause] of groupedClauses.singleSource) {
601644
if (!actuallyOptimized.has(source)) {
645+
// Wasn't optimized at all - keep as regular WHERE clause
602646
remainingWhereClauses.push(clause)
647+
} else if (hasOuterJoins) {
648+
// Was optimized AND query has outer JOINs - keep as residual WHERE clause
649+
remainingWhereClauses.push(createResidualWhere(clause))
603650
}
651+
// If optimized and no outer JOINs - don't keep (original behavior)
604652
}
605653

606654
// Create a completely new query object to ensure immutability

packages/db/tests/query/indexes.test.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ describe(`Query Index Optimization`, () => {
631631
write({
632632
type: `insert`,
633633
value: {
634-
id: `other1`,
634+
id: `1`, // Matches Alice from main collection
635635
name: `Other Active Item`,
636636
age: 40,
637637
status: `active`,
@@ -641,7 +641,7 @@ describe(`Query Index Optimization`, () => {
641641
write({
642642
type: `insert`,
643643
value: {
644-
id: `other2`,
644+
id: `2`, // Matches Bob from main collection
645645
name: `Other Inactive Item`,
646646
age: 35,
647647
status: `inactive`,
@@ -970,11 +970,11 @@ describe(`Query Index Optimization`, () => {
970970

971971
await liveQuery.stateWhenReady()
972972

973-
// Should include all results from the first collection
973+
// Should only include results where both sides match the WHERE condition
974+
// Charlie and Eve are filtered out because they have no matching 'other' records
975+
// and the WHERE clause requires other.status = 'active' (can't be NULL)
974976
expect(liveQuery.toArray).toEqual([
975977
{ id: `1`, name: `Alice`, otherName: `Other Active Item` },
976-
{ id: `3`, name: `Charlie` },
977-
{ id: `5`, name: `Eve` },
978978
])
979979

980980
// Combine stats from both collections
@@ -1100,11 +1100,11 @@ describe(`Query Index Optimization`, () => {
11001100

11011101
await liveQuery.stateWhenReady()
11021102

1103-
// Should have found results where both items are active
1103+
// Should only include results where both sides match the WHERE condition
1104+
// Charlie and Eve are filtered out because they have no matching 'other' records
1105+
// and the WHERE clause requires other.status = 'active' (can't be NULL)
11041106
expect(liveQuery.toArray).toEqual([
11051107
{ id: `1`, name: `Alice`, otherName: `Other Active Item` },
1106-
{ id: `3`, name: `Charlie` },
1107-
{ id: `5`, name: `Eve` },
11081108
])
11091109

11101110
// We should have done an index lookup on the left collection to find active items

0 commit comments

Comments
 (0)