diff --git a/misc/python/materialize/parallel_workload/settings.py b/misc/python/materialize/parallel_workload/settings.py index 39894cd0dc536..fe50ba74fc8fe 100644 --- a/misc/python/materialize/parallel_workload/settings.py +++ b/misc/python/materialize/parallel_workload/settings.py @@ -41,8 +41,6 @@ def _missing_(cls, value): ADDITIONAL_SYSTEM_PARAMETER_DEFAULTS = { # Uses a lot of memory, hard to predict how much "memory_limiter_interval": "0", - # TODO: Remove when https://github.com/MaterializeInc/database-issues/issues/9656 is fixed - "persist_stats_filter_enabled": "false", # See https://materializeinc.slack.com/archives/CTESPM7FU/p1758195280629909, should reenable when it performs better "enable_compute_logical_backpressure": "false", # Allows the `Scenario.RepeatRow` scenario to call `repeat_row`. Having diff --git a/src/expr/src/interpret.rs b/src/expr/src/interpret.rs index 737464a9407a6..8ce2f61411586 100644 --- a/src/expr/src/interpret.rs +++ b/src/expr/src/interpret.rs @@ -1763,6 +1763,357 @@ mod tests { assert!(range_out.may_contain(Datum::Null)); } + /// Regression test for database-issues#9656. + /// + /// Adding an `Interval` to a `Timestamp` is non-monotone in the interval + /// argument: the lex order of intervals (months, days, micros) does not + /// respect calendar-month arithmetic with day-clamping. The interpreter + /// must therefore not assume monotonicity, otherwise persist filter + /// pushdown can incorrectly conclude that a part has no matching rows. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] + fn test_add_timestamp_interval_non_monotone() { + use chrono::NaiveDateTime; + use mz_repr::adt::interval::Interval; + use mz_repr::adt::timestamp::CheckedTimestamp; + use mz_repr::{Datum, Row}; + + let arena = RowArena::new(); + + // The setup: a timestamp literal `t = 2024-01-31 00:00:00`, and an + // interval column whose stats-range spans + // `[{0 months, 31 days, 0 us}, {1 month, 0 days, 0 us}]`. In lex order, + // the 31-day interval is the lower bound and the 1-month interval is + // the upper bound. The function values at the endpoints are: + // t + {0,31,0} = 2024-03-02 + // t + {1, 0,0} = 2024-02-29 + // But an *interior* interval like {0, 60, 0} maps to 2024-03-31, which + // lies far outside `[Feb 29, Mar 2]`. Under the (incorrect) monotone + // assumption, the interpreter would conclude the output is in that + // narrow window, and rule out predicates like `>= 2024-03-15`. + let ts_lit = |s: &str| { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(), + )); + MirScalarExpr::Literal(Ok(row), ReprScalarType::Timestamp.nullable(false)) + }; + let interval = |months: i32, days: i32, micros: i64| { + Datum::Interval(Interval { + months, + days, + micros, + }) + }; + + // Expression: `(timestamp_lit + interval_col) >= 2024-03-15`. + let expr = ts_lit("2024-01-31T00:00:00") + .call_binary(MirScalarExpr::column(0), AddTimestampInterval) + .call_binary(ts_lit("2024-03-15T00:00:00"), Gte); + + let relation = ReprRelationType::new(vec![ReprScalarType::Interval.nullable(false)]); + let mut interpreter = ColumnSpecs::new(&relation, &arena); + interpreter.push_column( + 0, + ResultSpec::value_between(interval(0, 31, 0), interval(1, 0, 0)), + ); + + let range_out = interpreter.expr(&expr).range; + // The actual data may include e.g. `{0, 60, 0}` → 2024-03-31, which + // satisfies `>= 2024-03-15`. The interpreter must admit `True` so that + // filter pushdown does not skip the part. Under the buggy + // `(true, true)` annotation, the output range would be + // `[Feb 29, Mar 2]`, all of which is `< Mar 15`, and the interpreter + // would (wrongly) admit only `False`. + assert!( + range_out.may_contain(Datum::True), + "interpreter incorrectly ruled out matching rows; \ + add_timestamp_interval is not monotone in the interval argument", + ); + } + + /// Regression test for `date_bin_timestamp`, which is non-monotone in the + /// `stride` argument: a larger stride can bin a source timestamp to an + /// *earlier* result than a smaller stride, because the bin alignment to + /// the unix epoch depends on the stride magnitude rather than on lex order. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] + fn test_date_bin_timestamp_non_monotone() { + use chrono::NaiveDateTime; + use mz_repr::adt::interval::Interval; + use mz_repr::adt::timestamp::CheckedTimestamp; + use mz_repr::{Datum, Row}; + + let arena = RowArena::new(); + + let ts_lit = |s: &str| { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(), + )); + MirScalarExpr::Literal(Ok(row), ReprScalarType::Timestamp.nullable(false).into()) + }; + let interval = |months: i32, days: i32, micros: i64| { + Datum::Interval(Interval { + months, + days, + micros, + }) + }; + + // Expression: `date_bin(stride_col, 2024-01-01 12:00:00) >= 2024-01-01`. + // stride_col ranges over `[1 day, 2 days]`. + // - 1 day stride → bins to 2024-01-01 00:00:00 (satisfies `>=`). + // - 2 day stride → bins to 2023-12-31 00:00:00 (does not). + // Under the buggy `(true, true)` annotation, both endpoint evaluations + // give the same boolean answer (`>=` is monotone), so depending on which + // way the comparisons fall the interpreter could end up wrongly ruling + // out one outcome. The non-monotone fix forces a conservative + // `anything()` for the binned value. + let expr = MirScalarExpr::column(0) + .call_binary(ts_lit("2024-01-01T12:00:00"), DateBinTimestamp) + .call_binary(ts_lit("2024-01-01T00:00:00"), Gte); + + let relation = ReprRelationType::new(vec![ReprScalarType::Interval.nullable(false)]); + let mut interpreter = ColumnSpecs::new(&relation, &arena); + interpreter.push_column( + 0, + ResultSpec::value_between(interval(0, 1, 0), interval(0, 2, 0)), + ); + + let range_out = interpreter.expr(&expr).range; + // The actual data may include a 1-day stride for which the predicate + // holds, and a 2-day stride for which it doesn't, so both outcomes are + // reachable. + assert!( + range_out.may_contain(Datum::True), + "date_bin is not monotone in the stride argument; \ + interpreter must not rule out matching rows", + ); + assert!( + range_out.may_contain(Datum::False), + "interpreter must not rule out non-matching rows either", + ); + } + + /// Regression test for `age_timestamp`, which is non-monotone in the first + /// argument: the Postgres-style calendar-aware age computation does not + /// respect the lex order of `Interval` results when the input crosses a + /// month boundary. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] + fn test_age_timestamp_non_monotone_in_first_arg() { + use chrono::NaiveDateTime; + use mz_repr::adt::interval::Interval; + use mz_repr::adt::timestamp::CheckedTimestamp; + use mz_repr::{Datum, Row}; + + let arena = RowArena::new(); + + let ts_lit = |s: &str| { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(), + )); + MirScalarExpr::Literal(Ok(row), ReprScalarType::Timestamp.nullable(false).into()) + }; + let interval = |months: i32, days: i32, micros: i64| { + Datum::Interval(Interval { + months, + days, + micros, + }) + }; + + // Expression: `age(a_col, 2024-02-15) >= '1 month 16 days'`. + // a_col stats: [2024-03-31, 2024-05-01]. + // age(2024-03-31, 2024-02-15) = {1 month, 16 days} ← satisfies `>=` + // age(2024-05-01, 2024-02-15) = {2 months, 15 days} ← satisfies `>=` + // Interior a = 2024-04-01: + // age(2024-04-01, 2024-02-15) = {1 month, 15 days} ← does NOT satisfy. + // Under the buggy `(true, true)` annotation, the interpreter would + // bound the output of `age` by its endpoint values — lex-range + // `[(1m,16d), (2m,15d)]` — and conclude the `>=` predicate is always + // True, ruling out parts whose true rows would evaluate False. + let expr = MirScalarExpr::column(0) + .call_binary(ts_lit("2024-02-15T00:00:00"), AgeTimestamp) + .call_binary( + MirScalarExpr::Literal( + Ok({ + let mut row = Row::default(); + row.packer().push(interval(1, 16, 0)); + row + }), + ReprScalarType::Interval.nullable(false).into(), + ), + Gte, + ); + + let relation = ReprRelationType::new(vec![ReprScalarType::Timestamp.nullable(false)]); + let mut interpreter = ColumnSpecs::new(&relation, &arena); + let lo = { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-03-31T00:00:00", "%Y-%m-%dT%H:%M:%S") + .unwrap(), + ) + .unwrap(), + )); + row + }; + let hi = { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-05-01T00:00:00", "%Y-%m-%dT%H:%M:%S") + .unwrap(), + ) + .unwrap(), + )); + row + }; + interpreter.push_column( + 0, + ResultSpec::value_between(lo.unpack_first(), hi.unpack_first()), + ); + + let range_out = interpreter.expr(&expr).range; + assert!( + range_out.may_contain(Datum::False), + "age_timestamp is not monotone in the first argument; \ + interpreter must not rule out non-matching rows", + ); + } + + /// Regression test for `age_timestamp`, which is non-monotone in the + /// second argument: the result has a V-shape at `a == b` because the + /// algorithm flips sign when `a < b`, so the same interval can be produced + /// by `b` values on either side of `a`. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] + fn test_age_timestamp_non_monotone_in_second_arg() { + use chrono::NaiveDateTime; + use mz_repr::adt::interval::Interval; + use mz_repr::adt::timestamp::CheckedTimestamp; + use mz_repr::{Datum, Row}; + + let arena = RowArena::new(); + + let ts_lit = |s: &str| { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(), + )); + MirScalarExpr::Literal(Ok(row), ReprScalarType::Timestamp.nullable(false).into()) + }; + let interval = |months: i32, days: i32, micros: i64| { + Datum::Interval(Interval { + months, + days, + micros, + }) + }; + + // Expression: `age(2024-02-29, b_col) >= '-1 month, -1 day'`. + // b_col stats: [2024-03-30, 2024-04-01]. + // age(2024-02-29, 2024-03-30) = {-1 month, -1 day, 0} ← satisfies `>=` + // age(2024-02-29, 2024-04-01) = {-1 month, -1 day, 0} ← satisfies `>=` + // Interior b = 2024-03-31: + // age(2024-02-29, 2024-03-31) = {-1 month, -2 days, 0} ← does NOT satisfy. + // The carry logic in `age` borrows a *whole month worth of days* when + // the day field goes negative, which (after the sign-revert at the + // end) makes the result dip non-monotonically as `b` crosses a month + // boundary. Both endpoints lex-coincide above the interior dip, so + // under the buggy `(true, true)` annotation the interpreter would + // conclude the predicate is always True. + let expr = ts_lit("2024-02-29T00:00:00") + .call_binary(MirScalarExpr::column(0), AgeTimestamp) + .call_binary( + MirScalarExpr::Literal( + Ok({ + let mut row = Row::default(); + row.packer().push(interval(-1, -1, 0)); + row + }), + ReprScalarType::Interval.nullable(false).into(), + ), + Gte, + ); + + let relation = ReprRelationType::new(vec![ReprScalarType::Timestamp.nullable(false)]); + let mut interpreter = ColumnSpecs::new(&relation, &arena); + let lo = { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-03-30T00:00:00", "%Y-%m-%dT%H:%M:%S") + .unwrap(), + ) + .unwrap(), + )); + row + }; + let hi = { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-04-01T00:00:00", "%Y-%m-%dT%H:%M:%S") + .unwrap(), + ) + .unwrap(), + )); + row + }; + interpreter.push_column( + 0, + ResultSpec::value_between(lo.unpack_first(), hi.unpack_first()), + ); + + // Sanity-check the counterexample: endpoint and interior values. + let a = CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-02-29T00:00:00", "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(); + let b_lo = CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-03-30T00:00:00", "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(); + let b_mid = CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-03-31T00:00:00", "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(); + let b_hi = CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str("2024-04-01T00:00:00", "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(); + let age_lo = a.age(&b_lo).unwrap(); + let age_mid = a.age(&b_mid).unwrap(); + let age_hi = a.age(&b_hi).unwrap(); + assert_eq!((age_lo.months, age_lo.days, age_lo.micros), (-1, -1, 0)); + assert_eq!((age_mid.months, age_mid.days, age_mid.micros), (-1, -2, 0)); + assert_eq!((age_hi.months, age_hi.days, age_hi.micros), (-1, -1, 0)); + + let range_out = interpreter.expr(&expr).range; + assert!( + range_out.may_contain(Datum::False), + "age_timestamp is not monotone in the second argument; \ + interpreter must not rule out non-matching rows", + ); + } + #[mz_ore::test] fn test_trace() { use super::Trace; diff --git a/src/expr/src/scalar/func.rs b/src/expr/src/scalar/func.rs index 71c9c7bf6f5ec..2b00f497adb79 100644 --- a/src/expr/src/scalar/func.rs +++ b/src/expr/src/scalar/func.rs @@ -181,7 +181,13 @@ fn add_float64(a: f64, b: f64) -> Result { } } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "+")] +// `Interval` is lex-ordered (months, days, micros), but adding an interval to a +// timestamp adds *calendar* months (with day-clamping) which does not respect +// that ordering: e.g. `i1 = {0 months, 31 days}` is lex-less than +// `i2 = {1 month, 0 days}`, but `2024-01-31 + i1 = 2024-03-02` is greater than +// `2024-01-31 + i2 = 2024-02-29`. Day-clamping plus preserved sub-day time also +// breaks monotonicity in the first argument near month boundaries. +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "+")] fn add_timestamp_interval( a: CheckedTimestamp, b: Interval, @@ -189,7 +195,7 @@ fn add_timestamp_interval( add_timestamplike_interval(a, b) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "+")] +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "+")] fn add_timestamp_tz_interval( a: CheckedTimestamp>, b: Interval, @@ -212,7 +218,8 @@ where Ok(CheckedTimestamp::from_timestamplike(T::from_date_time(dt))?) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "-")] +// See `add_timestamp_interval` for why this is not monotone. +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "-")] fn sub_timestamp_interval( a: CheckedTimestamp, b: Interval, @@ -220,7 +227,7 @@ fn sub_timestamp_interval( sub_timestamplike_interval(a, b) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "-")] +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "-")] fn sub_timestamp_tz_interval( a: CheckedTimestamp>, b: Interval, @@ -249,7 +256,12 @@ fn add_date_time( Ok(CheckedTimestamp::from_timestamplike(dt)?) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "+")] +// Monotone in `date` (dates have no sub-day component, so day-clamping at month +// boundaries only causes results to collapse, never to reverse), but not in +// `interval`: e.g. `{0 months, 31 days}` is lex-less than `{1 month, 0 days}`, +// but adding the former to `2024-01-31` gives `2024-03-02` while the latter +// gives `2024-02-29`. +#[sqlfunc(is_monotone = "(true, false)", is_infix_op = true, sqlname = "+")] fn add_date_interval( date: Date, interval: Interval, @@ -784,8 +796,25 @@ fn sub_numeric( } } +// `age(a, b)` is non-monotone in *both* arguments: +// +// * Lex order on `Interval` is `(months, days, micros)`, but the Postgres +// `age` algorithm independently subtracts year/month/day/... fields and +// then *borrows* across boundaries when a lower field goes negative. With +// `b = 2024-02-15` fixed: +// a = 2024-03-31 → age = {1 month, 16 days} +// a = 2024-04-01 → age = {1 month, 15 days} +// a = 2024-05-01 → age = {2 months, 15 days} +// As `a` increases past a month boundary, `months` jumps by 1 and `days` +// drops, producing a lex-smaller interval than the previous step. +// +// * Holding `a` fixed and varying `b`, the result has a V-shape at `a == b` +// (sign is flipped when `a < b`): +// a = 2024-02-15, b = 2024-02-14 → age = {0 months, 1 day} +// a = 2024-02-15, b = 2024-02-15 → age = {0 months, 0 days} +// a = 2024-02-15, b = 2024-02-16 → age = {0 months, 1 day} #[sqlfunc( - is_monotone = "(true, true)", + is_monotone = "(false, false)", output_type = "Interval", sqlname = "age", propagates_nulls = true @@ -797,7 +826,12 @@ fn age_timestamp( Ok(a.age(&b)?) } -#[sqlfunc(is_monotone = "(true, true)", sqlname = "age", propagates_nulls = true)] +// See `age_timestamp` for why this is not monotone in either argument. +#[sqlfunc( + is_monotone = "(false, false)", + sqlname = "age", + propagates_nulls = true +)] fn age_timestamp_tz( a: CheckedTimestamp>, b: CheckedTimestamp>, @@ -852,8 +886,9 @@ fn sub_interval(a: Interval, b: Interval) -> Result { .ok_or_else(|| EvalError::IntervalOutOfRange(format!("{a} - {b}").into())) } +// See `add_date_interval` for why this is not monotone in `interval`. #[sqlfunc( - is_monotone = "(true, true)", + is_monotone = "(true, false)", is_infix_op = true, sqlname = "-", propagates_nulls = true @@ -2036,7 +2071,12 @@ where Ok(CheckedTimestamp::from_timestamplike(res)?) } -#[sqlfunc(is_monotone = "(true, true)", sqlname = "bin_unix_epoch_timestamp")] +// Non-monotone in `stride`: the result is `origin + floor((source - origin) / +// stride) * stride`. For a fixed source like `2024-01-01 12:00:00`, a 1-day +// stride bins to `2024-01-01 00:00:00`, but a 2-day stride bins to +// `2023-12-31 00:00:00` — i.e. the lex-larger interval produces an earlier +// timestamp. Monotone in `source`. +#[sqlfunc(is_monotone = "(false, true)", sqlname = "bin_unix_epoch_timestamp")] fn date_bin_timestamp( stride: Interval, source: CheckedTimestamp, @@ -2047,7 +2087,8 @@ fn date_bin_timestamp( date_bin(stride, source, origin) } -#[sqlfunc(is_monotone = "(true, true)", sqlname = "bin_unix_epoch_timestamptz")] +// See `date_bin_timestamp` for why this is not monotone in `stride`. +#[sqlfunc(is_monotone = "(false, true)", sqlname = "bin_unix_epoch_timestamptz")] fn date_bin_timestamp_tz( stride: Interval, source: CheckedTimestamp>, diff --git a/test/sqllogictest/filter-pushdown.slt b/test/sqllogictest/filter-pushdown.slt index 6020ef88894ac..17418f65b5f0b 100644 --- a/test/sqllogictest/filter-pushdown.slt +++ b/test/sqllogictest/filter-pushdown.slt @@ -326,7 +326,6 @@ Explained Query: Source materialize.public.t filter=(((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end) < 2023-10-02 15:55:31.918)) - pushdown=(((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end) < 2023-10-02 15:55:31.918)) Target cluster: quickstart @@ -349,7 +348,6 @@ materialize.public.mv9: Source materialize.public.t filter=((mz_now() > timestamp_to_mz_timestamp((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end)))) - pushdown=((mz_now() > timestamp_to_mz_timestamp((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end)))) Target cluster: quickstart