diff --git a/misc/python/materialize/parallel_workload/settings.py b/misc/python/materialize/parallel_workload/settings.py index 39894cd0dc536..fe50ba74fc8fe 100644 --- a/misc/python/materialize/parallel_workload/settings.py +++ b/misc/python/materialize/parallel_workload/settings.py @@ -41,8 +41,6 @@ def _missing_(cls, value): ADDITIONAL_SYSTEM_PARAMETER_DEFAULTS = { # Uses a lot of memory, hard to predict how much "memory_limiter_interval": "0", - # TODO: Remove when https://github.com/MaterializeInc/database-issues/issues/9656 is fixed - "persist_stats_filter_enabled": "false", # See https://materializeinc.slack.com/archives/CTESPM7FU/p1758195280629909, should reenable when it performs better "enable_compute_logical_backpressure": "false", # Allows the `Scenario.RepeatRow` scenario to call `repeat_row`. Having diff --git a/src/expr/src/interpret.rs b/src/expr/src/interpret.rs index 737464a9407a6..7e9480500586b 100644 --- a/src/expr/src/interpret.rs +++ b/src/expr/src/interpret.rs @@ -1763,6 +1763,146 @@ mod tests { assert!(range_out.may_contain(Datum::Null)); } + /// Regression test for database-issues#9656. + /// + /// Adding an `Interval` to a `Timestamp` is non-monotone in the interval + /// argument: the lex order of intervals (months, days, micros) does not + /// respect calendar-month arithmetic with day-clamping. The interpreter + /// must therefore not assume monotonicity, otherwise persist filter + /// pushdown can incorrectly conclude that a part has no matching rows. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] + fn test_add_timestamp_interval_non_monotone() { + use chrono::NaiveDateTime; + use mz_repr::adt::interval::Interval; + use mz_repr::adt::timestamp::CheckedTimestamp; + use mz_repr::{Datum, Row}; + + let arena = RowArena::new(); + + // The setup: a timestamp literal `t = 2024-01-31 00:00:00`, and an + // interval column whose stats-range spans + // `[{0 months, 31 days, 0 us}, {1 month, 0 days, 0 us}]`. In lex order, + // the 31-day interval is the lower bound and the 1-month interval is + // the upper bound. The function values at the endpoints are: + // t + {0,31,0} = 2024-03-02 + // t + {1, 0,0} = 2024-02-29 + // But an *interior* interval like {0, 60, 0} maps to 2024-03-31, which + // lies far outside `[Feb 29, Mar 2]`. Under the (incorrect) monotone + // assumption, the interpreter would conclude the output is in that + // narrow window, and rule out predicates like `>= 2024-03-15`. + let ts_lit = |s: &str| { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(), + )); + MirScalarExpr::Literal(Ok(row), ReprScalarType::Timestamp.nullable(false)) + }; + let interval = |months: i32, days: i32, micros: i64| { + Datum::Interval(Interval { + months, + days, + micros, + }) + }; + + // Expression: `(timestamp_lit + interval_col) >= 2024-03-15`. + let expr = ts_lit("2024-01-31T00:00:00") + .call_binary(MirScalarExpr::column(0), AddTimestampInterval) + .call_binary(ts_lit("2024-03-15T00:00:00"), Gte); + + let relation = ReprRelationType::new(vec![ReprScalarType::Interval.nullable(false)]); + let mut interpreter = ColumnSpecs::new(&relation, &arena); + interpreter.push_column( + 0, + ResultSpec::value_between(interval(0, 31, 0), interval(1, 0, 0)), + ); + + let range_out = interpreter.expr(&expr).range; + // The actual data may include e.g. `{0, 60, 0}` → 2024-03-31, which + // satisfies `>= 2024-03-15`. The interpreter must admit `True` so that + // filter pushdown does not skip the part. Under the buggy + // `(true, true)` annotation, the output range would be + // `[Feb 29, Mar 2]`, all of which is `< Mar 15`, and the interpreter + // would (wrongly) admit only `False`. + assert!( + range_out.may_contain(Datum::True), + "interpreter incorrectly ruled out matching rows; \ + add_timestamp_interval is not monotone in the interval argument", + ); + } + + /// Regression test for `date_bin_timestamp`, which is non-monotone in the + /// `stride` argument: a larger stride can bin a source timestamp to an + /// *earlier* result than a smaller stride, because the bin alignment to + /// the unix epoch depends on the stride magnitude rather than on lex order. + #[mz_ore::test] + #[cfg_attr(miri, ignore)] + fn test_date_bin_timestamp_non_monotone() { + use chrono::NaiveDateTime; + use mz_repr::adt::interval::Interval; + use mz_repr::adt::timestamp::CheckedTimestamp; + use mz_repr::{Datum, Row}; + + let arena = RowArena::new(); + + let ts_lit = |s: &str| { + let mut row = Row::default(); + row.packer().push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike( + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").unwrap(), + ) + .unwrap(), + )); + MirScalarExpr::Literal(Ok(row), ReprScalarType::Timestamp.nullable(false)) + }; + let interval = |months: i32, days: i32, micros: i64| { + Datum::Interval(Interval { + months, + days, + micros, + }) + }; + + // Expression: `date_bin(stride_col, 2024-01-01 12:00:00) > 2024-01-01 06:00:00`. + // stride_col ranges over `[1 day, 2 days]`. + // + // Endpoint evaluations: + // 1 day stride → bins to 2024-01-01 00:00:00 + // 2 day stride → bins to 2023-12-31 00:00:00 + // + // Interior strides produce results *outside* that endpoint box. For + // example, a 1.5-day stride (i.e. `{0 months, 1 day, 12 h micros}`, + // which sorts between the two endpoints in lex order) bins + // 2024-01-01 12:00:00 to exactly 2024-01-01 12:00:00 — well above the + // endpoint maximum of 2024-01-01 00:00:00. With the buggy + // `(true, true)` annotation, the interpreter narrows the output to + // `[Dec 31 00:00, Jan 1 00:00]`, both of which are `<= Jan 1 06:00`, + // so the predicate is wrongly proved `False`. With the non-monotone + // fix the output is `anything()`, so `True` is correctly admitted. + let expr = MirScalarExpr::column(0) + .call_binary(ts_lit("2024-01-01T12:00:00"), DateBinTimestamp) + .call_binary(ts_lit("2024-01-01T06:00:00"), Gt); + + let relation = ReprRelationType::new(vec![ReprScalarType::Interval.nullable(false)]); + let mut interpreter = ColumnSpecs::new(&relation, &arena); + interpreter.push_column( + 0, + ResultSpec::value_between(interval(0, 1, 0), interval(0, 2, 0)), + ); + + let range_out = interpreter.expr(&expr).range; + assert!( + range_out.may_contain(Datum::True), + "date_bin is not monotone in the stride argument; \ + interior strides can produce outputs outside the endpoint-bounded \ + box, so the interpreter must admit True for `>`-style predicates", + ); + } + #[mz_ore::test] fn test_trace() { use super::Trace; diff --git a/src/expr/src/scalar/func.rs b/src/expr/src/scalar/func.rs index 71c9c7bf6f5ec..09705eff50d15 100644 --- a/src/expr/src/scalar/func.rs +++ b/src/expr/src/scalar/func.rs @@ -181,7 +181,13 @@ fn add_float64(a: f64, b: f64) -> Result { } } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "+")] +// `Interval` is lex-ordered (months, days, micros), but adding an interval to a +// timestamp adds *calendar* months (with day-clamping) which does not respect +// that ordering: e.g. `i1 = {0 months, 31 days}` is lex-less than +// `i2 = {1 month, 0 days}`, but `2024-01-31 + i1 = 2024-03-02` is greater than +// `2024-01-31 + i2 = 2024-02-29`. Day-clamping plus preserved sub-day time also +// breaks monotonicity in the first argument near month boundaries. +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "+")] fn add_timestamp_interval( a: CheckedTimestamp, b: Interval, @@ -189,7 +195,7 @@ fn add_timestamp_interval( add_timestamplike_interval(a, b) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "+")] +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "+")] fn add_timestamp_tz_interval( a: CheckedTimestamp>, b: Interval, @@ -212,7 +218,8 @@ where Ok(CheckedTimestamp::from_timestamplike(T::from_date_time(dt))?) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "-")] +// See `add_timestamp_interval` for why this is not monotone. +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "-")] fn sub_timestamp_interval( a: CheckedTimestamp, b: Interval, @@ -220,7 +227,7 @@ fn sub_timestamp_interval( sub_timestamplike_interval(a, b) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "-")] +#[sqlfunc(is_monotone = "(false, false)", is_infix_op = true, sqlname = "-")] fn sub_timestamp_tz_interval( a: CheckedTimestamp>, b: Interval, @@ -249,7 +256,12 @@ fn add_date_time( Ok(CheckedTimestamp::from_timestamplike(dt)?) } -#[sqlfunc(is_monotone = "(true, true)", is_infix_op = true, sqlname = "+")] +// Monotone in `date` (dates have no sub-day component, so day-clamping at month +// boundaries only causes results to collapse, never to reverse), but not in +// `interval`: e.g. `{0 months, 31 days}` is lex-less than `{1 month, 0 days}`, +// but adding the former to `2024-01-31` gives `2024-03-02` while the latter +// gives `2024-02-29`. +#[sqlfunc(is_monotone = "(true, false)", is_infix_op = true, sqlname = "+")] fn add_date_interval( date: Date, interval: Interval, @@ -852,8 +864,9 @@ fn sub_interval(a: Interval, b: Interval) -> Result { .ok_or_else(|| EvalError::IntervalOutOfRange(format!("{a} - {b}").into())) } +// See `add_date_interval` for why this is not monotone in `interval`. #[sqlfunc( - is_monotone = "(true, true)", + is_monotone = "(true, false)", is_infix_op = true, sqlname = "-", propagates_nulls = true @@ -2036,7 +2049,12 @@ where Ok(CheckedTimestamp::from_timestamplike(res)?) } -#[sqlfunc(is_monotone = "(true, true)", sqlname = "bin_unix_epoch_timestamp")] +// Non-monotone in `stride`: the result is `origin + floor((source - origin) / +// stride) * stride`. For a fixed source like `2024-01-01 12:00:00`, a 1-day +// stride bins to `2024-01-01 00:00:00`, but a 2-day stride bins to +// `2023-12-31 00:00:00` — i.e. the lex-larger interval produces an earlier +// timestamp. Monotone in `source`. +#[sqlfunc(is_monotone = "(false, true)", sqlname = "bin_unix_epoch_timestamp")] fn date_bin_timestamp( stride: Interval, source: CheckedTimestamp, @@ -2047,7 +2065,8 @@ fn date_bin_timestamp( date_bin(stride, source, origin) } -#[sqlfunc(is_monotone = "(true, true)", sqlname = "bin_unix_epoch_timestamptz")] +// See `date_bin_timestamp` for why this is not monotone in `stride`. +#[sqlfunc(is_monotone = "(false, true)", sqlname = "bin_unix_epoch_timestamptz")] fn date_bin_timestamp_tz( stride: Interval, source: CheckedTimestamp>, diff --git a/test/sqllogictest/filter-pushdown.slt b/test/sqllogictest/filter-pushdown.slt index 6020ef88894ac..17418f65b5f0b 100644 --- a/test/sqllogictest/filter-pushdown.slt +++ b/test/sqllogictest/filter-pushdown.slt @@ -326,7 +326,6 @@ Explained Query: Source materialize.public.t filter=(((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end) < 2023-10-02 15:55:31.918)) - pushdown=(((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end) < 2023-10-02 15:55:31.918)) Target cluster: quickstart @@ -349,7 +348,6 @@ materialize.public.mv9: Source materialize.public.t filter=((mz_now() > timestamp_to_mz_timestamp((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end)))) - pushdown=((mz_now() > timestamp_to_mz_timestamp((#1{t} - case when (#0{x} = 0) then 1 day else 2 days end)))) Target cluster: quickstart