From 9a76be123f3c4fa4ed74168d7161d95bc6f4c3cb Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 00:25:26 +0000 Subject: [PATCH 01/13] implement AutoCDC SCD1 flow execution --- .../resources/error/error-conditions.json | 19 + .../autocdc/Scd1BatchProcessor.scala | 38 +- .../sql/pipelines/graph/DatasetManager.scala | 16 + .../sql/pipelines/graph/FlowExecution.scala | 276 ++++++ .../sql/pipelines/graph/FlowPlanner.scala | 13 + .../AutoCdcGraphExecutionTestMixin.scala | 213 +++++ ...CdcScd1AuxiliaryTableDurabilitySuite.scala | 254 +++++ .../graph/AutoCdcScd1FullRefreshSuite.scala | 241 +++++ .../graph/AutoCdcScd1MultiPipelineSuite.scala | 208 +++++ .../AutoCdcScd1SchemaEvolutionSuite.scala | 880 ++++++++++++++++++ .../AutoCdcScd1SinglePipelineSuite.scala | 216 +++++ ...utoCdcScd1TargetTableDurabilitySuite.scala | 159 ++++ .../graph/ConnectInvalidPipelineSuite.scala | 2 +- 13 files changed, 2523 insertions(+), 12 deletions(-) create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 9c9a657bc6e9f..8b301243ad3b3 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -203,6 +203,19 @@ ], "sqlState" : "22023" }, + "AUTOCDC_INVALID_STATE" : { + "message" : [ + "AutoCDC flow detected an invalid state:" + ], + "subClass" : { + "KEY_SCHEMA_DRIFT" : { + "message" : [ + "The AutoCDC flow's current key columns do not match the keys recorded in the auxiliary table (recorded keys ). AutoCDC does not support changing key columns or their types across incremental pipeline runs. To change keys, perform a full refresh of the target table." + ] + } + }, + "sqlState" : "42000" + }, "AUTOCDC_KEY_NOT_IN_SELECTED_SCHEMA" : { "message" : [ "Using column name comparison, the AutoCDC key column `` is not present in the flow's selected source schema. AutoCDC requires every key column to be present in the source change-data feed and retained by any configured column selection." @@ -256,6 +269,12 @@ ], "sqlState" : "0A000" }, + "AUTOCDC_TARGET_DOES_NOT_SUPPORT_MERGE" : { + "message" : [ + "Cannot start AutoCDC flow because the target table (format: ) does not support row-level MERGE operations. AutoCDC requires a target table whose format implements `SupportsRowLevelOperations`." + ], + "sqlState" : "0A000" + }, "AVRO_CANNOT_WRITE_NULL_FIELD" : { "message" : [ "Cannot write null value for field defined as non-null Avro data type .", diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala index aaea3b63e9ef3..15537d4173316 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala @@ -367,19 +367,29 @@ case class Scd1BatchProcessor( val incomingWinsDelete = microbatchDeleteVersionField.isNotNull && microbatchDeleteVersionField > destinationUpsertVersionField - // When the incoming upsert wins against an existing record, the entire row (all columns) - // will be overwritten, including the CDC metadata column. We only exclude keys because - // most merge implementations require that join columns are not being mutated, even if - // the mutation is a no-op. val resolver = microbatchDf.sparkSession.sessionState.conf.resolver val keyNames = changeArgs.keys.map(_.name) + + def constructTargetColumnAssignmentsFromMicrobatch(columnName: String): (String, Column) = { + // Map a column in the target table to its direct equivalent in the microbatch. Note that due + // to target table schema evolution during SDP dataset materialization, the microbatch's + // schema must always be a non-strict subset of the target table's schema. + val quotedCol = QuotingUtils.quoteIdentifier(columnName) + s"$destinationTableStr.$quotedCol" -> F.col(s"microbatch.$quotedCol") + } + + // Most merge implementations require that join columns are not mutated, even when the + // mutation would be a no-op. The remaining microbatch columns (including the CDC metadata + // column) are overwritten outright when the incoming upsert wins. val columnsToUpdateWhenIncomingWinsUpsert: Map[String, Column] = microbatchDf.columns .filterNot(c => keyNames.exists(resolver(_, c))) - .map { c => - val quotedCol = QuotingUtils.quoteIdentifier(c) - s"$destinationTableStr.$quotedCol" -> F.col(s"microbatch.$quotedCol") - } + .map(constructTargetColumnAssignmentsFromMicrobatch) + .toMap + + val columnsToInsertOnNewKey: Map[String, Column] = + microbatchDf.columns + .map(constructTargetColumnAssignmentsFromMicrobatch) .toMap microbatchDf @@ -391,7 +401,13 @@ case class Scd1BatchProcessor( // New key: only insert upserts; deletes for absent keys are no-ops for the target table // merge, and instead would have been inserted as tombstones into the auxiliary table. .whenNotMatched(microbatchDeleteVersionField.isNull) - .insertAll() + // When inserting a brand new row for a new key, construct column mappings from microbatch. + // It's possible the microbatch columns are a subset of the columns currently in the target + // table, due to changing and more restrictive column selection across runs, the source + // dropping a column, etc. + // It is not possible for the microbatch's schema to ever be a superset of the target table + // however, due to SDP's schema evolution always unioning old and new schemas. + .insert(columnsToInsertOnNewKey) .merge() } @@ -426,8 +442,8 @@ object Scd1BatchProcessor { private[autocdc] val winningRowColName: String = s"${reservedColumnNamePrefix}winning_row" private[pipelines] val cdcMetadataColName: String = s"${reservedColumnNamePrefix}metadata" - private[autocdc] val cdcDeleteSequenceFieldName: String = "deleteSequence" - private[autocdc] val cdcUpsertSequenceFieldName: String = "upsertSequence" + private[pipelines] val cdcDeleteSequenceFieldName: String = "deleteSequence" + private[pipelines] val cdcUpsertSequenceFieldName: String = "upsertSequence" /** Project the delete sequence out of the CDC metadata column. */ private[autocdc] def deleteSequenceOf(cdcMetadataCol: Column): Column = diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala index 4affbe4637dba..a59f7e5d614ee 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala @@ -303,6 +303,22 @@ object DatasetManager extends Logging { context.spark.sql(s"TRUNCATE TABLE ${table.identifier.quotedString}") } + if (isFullRefresh) { + // On full refresh, drop the AutoCDC auxiliary state associated with this table (if any) so + // that stale delete-tracking data and table properties are not carried forward into the new + // table generation. + + // Intentionally DROP and not TRUNCATE for two reasons; First, the auxiliary table may + // contain table properties that represent stateful information (ex. SCD key count) that + // should not be carried forward on a full refresh. Second, the auxiliary table is an + // internal table and not part of the dataflow graph. That means it does not go through + // schema evolution like other tables and hence on a full refresh, we should explicitly be + // drop the existing auxiliary table schema so it can be recomputed. + + val auxiliaryTableId = AutoCdcAuxiliaryTable.identifier(table.identifier) + context.spark.sql(s"DROP TABLE IF EXISTS ${auxiliaryTableId.quotedString}") + } + // Alter the table if we need to existingTableOpt.foreach { existingTable => val existingSchema = v2ColumnsToStructType(existingTable.columns()) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index 13a5621947d57..4e3d8ff486a24 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -23,12 +23,23 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.concurrent.{ExecutionContext, Future} import scala.util.control.NonFatal +import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, LogKeys} +import org.apache.spark.sql.{AnalysisException, Dataset, Row} import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.classic.SparkSession +import org.apache.spark.sql.connector.catalog.{ + CatalogV2Util, + Identifier, + SupportsRowLevelOperations, + TableCatalog +} +import org.apache.spark.sql.pipelines.autocdc.{ChangeArgs, Scd1BatchProcessor, Scd1ForeachBatchHandler} import org.apache.spark.sql.pipelines.graph.QueryOrigin.ExceptionHelpers import org.apache.spark.sql.pipelines.util.SparkSessionUtils import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger} +import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.util.ThreadUtils /** @@ -301,3 +312,268 @@ class SinkWrite( .start() } } + +/** + * Helper mixin for AutoCDC merge-based write flows. + */ +trait AutoCdcMergeWriteMixin { + /** The spark session the AutoCDC flow is going to be planned in. */ + protected def spark: SparkSession + + /** The destination (target) table entity the AutoCDC flow will be writing to. */ + protected def destination: Table + + /** The AutoCDC flow's identifier, used as `flowName` in error messages emitted by this mixin. */ + protected def identifier: TableIdentifier + + /** The AutoCDC flow's [[ChangeArgs]] (keys, sequencing, columnSelection, ...). */ + protected def changeArgs: ChangeArgs + + /** + * Full schema of the auxiliary table for this SCD type. The first `changeArgs.keys.length` + * fields MUST be the AutoCDC key columns (in `changeArgs.keys` declaration order, with + * fully-resolved dataType and nullability) + */ + protected def auxiliaryTableSchema(): StructType + + // Immediately validate that the destination table supports row level operations. + requireDestinationSupportsRowLevelOps() + + /** + * The AutoCDC key columns for this flow (column names + types), derived by slicing the + * front of [[auxiliaryTableSchema]] using the keys-first invariant. This is the subset + * that must remain invariant across incremental pipeline runs; users who want to change + * keys must full-refresh the target. + */ + private def autoCdcKeyColumns(): StructType = + StructType(auxiliaryTableSchema().fields.take(changeArgs.keys.length)) + + /** + * Idempotently bring the auxiliary table for [[destination]] into a state consistent with the + * flow's current [[changeArgs]] and return its [[TableIdentifier]]. + */ + protected def createOrValidateAuxiliaryTable(spark: SparkSession): TableIdentifier = { + val auxIdent = AutoCdcAuxiliaryTable.identifier(destination.identifier) + val (catalog, v2Identifier) = resolveAuxiliaryTableCatalog(spark, auxIdent) + + if (!catalog.tableExists(v2Identifier)) { + // The auxiliary table inherits the target's format so MERGE semantics line up. When the + // target's format is unspecified (None), omit the USING clause and fall back to the + // session's default source provider. + val usingClause = destination.format.map(fmt => s"USING $fmt").getOrElse("") + val numKeyColumns = changeArgs.keys.length + spark.sql( + s"""CREATE TABLE IF NOT EXISTS + |${auxIdent.quotedString} + |(${auxiliaryTableSchema().toDDL}) $usingClause + |TBLPROPERTIES ( + | '${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = '$numKeyColumns' + |)""".stripMargin + ) + } else { + validateNoAutoCdcKeyDrift(spark, auxIdent) + } + auxIdent + } + + /** + * Validate that the AutoCDC key columns the flow expects exactly match the keys recorded + * in the existing auxiliary table at [[auxIdent]]: same number of key columns, same names + * (per the session resolver), same dataTypes. + */ + private def validateNoAutoCdcKeyDrift( + spark: SparkSession, + auxIdent: TableIdentifier): Unit = { + val (catalog, v2Identifier) = resolveAuxiliaryTableCatalog(spark, auxIdent) + val existingAuxTable = catalog.loadTable(v2Identifier) + val existingAuxSchema = CatalogV2Util.v2ColumnsToStructType(existingAuxTable.columns()) + val expectedKeySchema = autoCdcKeyColumns() + val resolver = spark.sessionState.conf.resolver + + val numRecordedKeys = Option( + existingAuxTable.properties().get(AutoCdcAuxiliaryTable.numKeyColumnsProperty) + ).map { raw => + try raw.toInt catch { + case _: NumberFormatException => + throw SparkException.internalError( + s"Auxiliary table ${auxIdent.quotedString} has a malformed " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property: '$raw'." + ) + } + }.getOrElse { + throw SparkException.internalError( + s"Auxiliary table ${auxIdent.quotedString} is missing the " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} table property; cannot validate " + + s"AutoCDC key columns. Full-refresh the target table to recreate the auxiliary table." + ) + } + + val recordedKeyFields = existingAuxSchema.fields.take(numRecordedKeys) + val drifted = + recordedKeyFields.length != expectedKeySchema.length || + recordedKeyFields.length != numRecordedKeys || + recordedKeyFields.zip(expectedKeySchema.fields).exists { case (recorded, expected) => + !resolver(recorded.name, expected.name) || + recorded.dataType != expected.dataType + } + + if (drifted) { + throw new AnalysisException( + errorClass = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + messageParameters = Map( + "flowName" -> identifier.unquotedString, + "auxTableName" -> auxIdent.unquotedString, + "expectedKeySchema" -> expectedKeySchema.toDDL, + "recordedKeySchema" -> StructType(recordedKeyFields).toDDL + ) + ) + } + } + + /** + * Validate that the target table's underlying connector implements + * [[SupportsRowLevelOperations]], which is the V2 connector contract for MERGE/UPDATE/DELETE + * with rewrite - all operations that the AutoCDC transformation executes. + */ + private def requireDestinationSupportsRowLevelOps(): Unit = { + val catalogManager = spark.sessionState.catalogManager + val catalog = destination.identifier.catalog + .map(catalogManager.catalog) + .getOrElse(catalogManager.currentCatalog) + .asInstanceOf[TableCatalog] + + val destinationTable = catalog.loadTable( + Identifier.of( + Array(destination.identifier.database.get), + destination.identifier.identifier + ) + ) + + if (!destinationTable.isInstanceOf[SupportsRowLevelOperations]) { + throw new AnalysisException( + errorClass = "AUTOCDC_TARGET_DOES_NOT_SUPPORT_MERGE", + messageParameters = Map( + "tableName" -> destination.identifier.quotedString, + "format" -> destination.format.getOrElse("") + ) + ) + } + } + + private def resolveAuxiliaryTableCatalog( + spark: SparkSession, + auxIdent: TableIdentifier): (TableCatalog, Identifier) = { + val catalogManager = spark.sessionState.catalogManager + val catalog = (auxIdent.catalog match { + case Some(catalogName) => catalogManager.catalog(catalogName) + case None => catalogManager.currentCatalog + }).asInstanceOf[TableCatalog] + val v2Identifier = Identifier.of(Array(auxIdent.database.get), auxIdent.table) + (catalog, v2Identifier) + } +} + +object AutoCdcAuxiliaryTable { + /** + * Helper for deriving the auxiliary AutoCDC catalog table identifier from a target table. + */ + def identifier(destination: TableIdentifier): TableIdentifier = TableIdentifier( + table = s"__auxiliary_auto_cdc_state_${destination.table}", + database = destination.database, + catalog = destination.catalog + ) + + /** + * Table property recording the number of AutoCDC key columns persisted at the front of an + * auxiliary table when it was first created. The number can only change after a full refresh of + * the target, which drops and recreates the auxiliary table. + */ + val numKeyColumnsProperty: String = + PipelinesTableProperties.pipelinesPrefix + "autoCdc.numKeyColumns" +} + +/** + * A [[StreamingFlowExecution]] that applies a CDC event stream to a target [[Table]] via + * SCD Type 1 MERGE semantics. + */ +class Scd1MergeStreamingWrite( + val identifier: TableIdentifier, + val flow: AutoCdcMergeFlow, + val graph: DataflowGraph, + val updateContext: PipelineUpdateContext, + val checkpointPath: String, + val trigger: Trigger, + val destination: Table, + val sqlConf: Map[String, String] +) extends StreamingFlowExecution with AutoCdcMergeWriteMixin { + + override def getOrigin: QueryOrigin = flow.origin + + override protected def changeArgs: ChangeArgs = flow.changeArgs + + /** + * Resolved Spark [[DataType]] of the sequencing expression. + */ + private def sequencingType: DataType = + flow.df.select(flow.changeArgs.sequencing).schema.head.dataType + + override def startStream(): StreamingQuery = { + val sourceChangeDataFeed = graph.reanalyzeFlow(flow).df + val auxiliaryTableIdentifier = createOrValidateAuxiliaryTable(spark = updateContext.spark) + + sourceChangeDataFeed.writeStream + .queryName(displayName) + .option("checkpointLocation", checkpointPath) + .trigger(trigger) + .foreachBatch((batch: Dataset[Row], batchId: Long) => { + val foreachBatchHandler = Scd1ForeachBatchHandler( + batchProcessor = Scd1BatchProcessor( + changeArgs = flow.changeArgs, + resolvedSequencingType = sequencingType + ), + auxiliaryTableIdentifier = auxiliaryTableIdentifier, + targetTableIdentifier = destination.identifier + ) + foreachBatchHandler.execute(batch, batchId) + }) + .start() + } + + override protected def auxiliaryTableSchema(): StructType = + // SCD1's auxiliary table is just keys + the CDC metadata struct; no user data columns. + // Keys come first, in `changeArgs.keys` declaration order, to satisfy the keys-first + // invariant that [[AutoCdcMergeWriteMixin]] relies on for drift detection. + StructType(autoCdcKeyFields :+ cdcMetadataField) + + /** + * AutoCDC key columns resolved out of the flow's augmented schema, in + * `changeArgs.keys` declaration order. Keys are guaranteed to be present in the schema + * because [[AutoCdcMergeFlow.schema]] validates that. + */ + private def autoCdcKeyFields: Seq[StructField] = { + val resolver = updateContext.spark.sessionState.conf.resolver + val targetTableSchema = flow.schema + flow.changeArgs.keys.map { key => + targetTableSchema.fields + .find(field => resolver(field.name, key.name)) + .getOrElse( + throw SparkException.internalError( + s"Key column '${key.name}' was not found in the AutoCDC flow's selected schema." + ) + ) + } + } + + /** CDC metadata field resolved out of the flow's augmented schema. */ + private def cdcMetadataField: StructField = { + val resolver = updateContext.spark.sessionState.conf.resolver + flow.schema.fields + .find(field => resolver(field.name, Scd1BatchProcessor.cdcMetadataColName)) + .getOrElse( + throw SparkException.internalError( + s"CDC metadata column '${Scd1BatchProcessor.cdcMetadataColName}' was not found in the " + + s"AutoCDC flow's target table schema." + ) + ) + } +} diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala index 29e2da4a5e13f..5860ca7389c8e 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.pipelines.graph +import org.apache.spark.sql.pipelines.autocdc.ScdType import org.apache.spark.sql.streaming.Trigger /** @@ -79,6 +80,18 @@ class FlowPlanner( s"streaming flow ${sf.identifier} (${flow.destinationIdentifier})" ) } + case acmf: AutoCdcMergeFlow if acmf.changeArgs.storedAsScdType == ScdType.Type1 => + val flowMetadata = FlowSystemMetadata(updateContext, acmf, graph) + new Scd1MergeStreamingWrite( + identifier = acmf.identifier, + flow = acmf, + graph = graph, + updateContext = updateContext, + checkpointPath = flowMetadata.latestCheckpointLocation, + trigger = triggerFor(acmf), + destination = output.asInstanceOf[Table], + sqlConf = acmf.sqlConf + ) case _ => throw new UnsupportedOperationException( s"Unable to plan flow of type ${flow.getClass.getSimpleName}" diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala new file mode 100644 index 0000000000000..5ebdb4b4c86d2 --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import org.scalatest.{BeforeAndAfterEach, Suite} + +import org.apache.spark.SparkThrowable +import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.connector.catalog.SharedTablesInMemoryRowLevelOperationTableCatalog +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.pipelines.autocdc.{ + ChangeArgs, + ColumnSelection, + Scd1BatchProcessor, + ScdType, + UnqualifiedColumnName +} +import org.apache.spark.sql.pipelines.common.RunState +import org.apache.spark.sql.pipelines.logging.RunProgress +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Shared helpers for AutoCDC end-to-end graph-execution test suites. + */ +trait AutoCdcGraphExecutionTestMixin extends BeforeAndAfterEach { + self: Suite with ExecutionTest with SharedSparkSession => + + /** v2 catalog name registered for AutoCDC E2E tests. Tests qualify tables as `cat.ns1.t`. */ + protected val catalog: String = "cat" + + /** Namespace under [[catalog]] used by AutoCDC E2E tests. */ + protected val namespace: String = "ns1" + + override protected def beforeEach(): Unit = { + super.beforeEach() + spark.conf.set( + s"spark.sql.catalog.$catalog", + classOf[SharedTablesInMemoryRowLevelOperationTableCatalog].getName + ) + // Disable per-flow retries so failure-path tests (e.g. KEY_SCHEMA_DRIFT, INCOMPATIBLE_DATA) + // surface the AnalysisException after the first attempt instead of going through the default + // 2 retries, which would otherwise emit duplicate FAILED events and inflate test runtime + // without changing the asserted outcome. + spark.conf.set(SQLConf.PIPELINES_MAX_FLOW_RETRY_ATTEMPTS.key, "0") + spark.sql(s"CREATE NAMESPACE IF NOT EXISTS $catalog.$namespace") + } + + override protected def afterEach(): Unit = { + SharedTablesInMemoryRowLevelOperationTableCatalog.reset() + spark.sessionState.catalogManager.reset() + spark.sessionState.conf.unsetConf(s"spark.sql.catalog.$catalog") + spark.sessionState.conf.unsetConf(SQLConf.PIPELINES_MAX_FLOW_RETRY_ATTEMPTS.key) + super.afterEach() + } + + /** + * Run a pipeline to completion. If any flow emitted a [[RunProgress]] event with state + * [[RunState.FAILED]], collect every error from the event buffer and throw a single + * exception listing them, so that test failures surface meaningful stack traces instead of + * generic "test exited normally but flow failed" errors. + */ + protected def runPipeline(ctx: TestGraphRegistrationContext): Unit = { + val updateCtx = TestPipelineUpdateContext(spark, ctx.toDataflowGraph, storageRoot) + updateCtx.pipelineExecution.runPipeline() + updateCtx.pipelineExecution.awaitCompletion() + + if (updateCtx.eventBuffer.getEvents.exists(_.details == RunProgress(RunState.FAILED))) { + val errors = updateCtx.eventBuffer.getEvents.flatMap(_.error) + val ex = new RuntimeException( + s"Pipeline run failed with ${errors.size} error(s):\n" + + errors.map { e => + val stackSnippet = e.getStackTrace + .map(f => s" at $f") + .mkString("\n") + s" ${e.getClass.getSimpleName}: ${e.getMessage}\n$stackSnippet" + }.mkString("\n") + ) + errors.foreach(ex.addSuppressed) + throw ex + } + } + + /** + * Walk every [[Throwable]] reachable from `failure` via [[Throwable#getSuppressed]] and + * [[Throwable#getCause]] for the first [[SparkThrowable]] whose + * [[SparkThrowable#getCondition]] equals `condition`, then run [[checkError]] against that + * exception with all of its other arguments propagated through. + */ + protected def checkErrorInPipelineFailure( + failure: Throwable, + condition: String, + sqlState: Option[String] = None, + parameters: Map[String, String] = Map.empty, + matchPVals: Boolean = false, + queryContext: Array[ExpectedContext] = Array.empty): Unit = { + + def causeChain(t: Throwable): Iterator[Throwable] = + Iterator.iterate[Throwable](t)(_.getCause).takeWhile(_ != null) + + def reachable: Iterator[Throwable] = + (Iterator(failure) ++ failure.getSuppressed.iterator).flatMap(causeChain) + + val matched = reachable.collectFirst { + case t: SparkThrowable if t.getCondition == condition => t + } + assert( + matched.isDefined, + s"Expected a SparkThrowable with condition '$condition' reachable from the runPipeline " + + s"failure chain, got top-level: ${failure.getMessage}; chain:\n" + + reachable + .map(t => s" ${t.getClass.getSimpleName}: ${t.getMessage}") + .mkString("\n") + ) + checkError( + exception = matched.get, + condition = condition, + sqlState = sqlState, + parameters = parameters, + matchPVals = matchPVals, + queryContext = queryContext + ) + } + + /** + * DDL fragment for the AutoCDC metadata column appended to every SCD1 target table. Use + * inside a `CREATE TABLE` statement, for example: + * `CREATE TABLE t (id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)` + * + * Assumes sequence type is BIGINT (Long). + */ + protected val cdcMetadataDdl: String = { + val col = Scd1BatchProcessor.cdcMetadataColName + val del = Scd1BatchProcessor.cdcDeleteSequenceFieldName + val ups = Scd1BatchProcessor.cdcUpsertSequenceFieldName + s"$col STRUCT<$del:BIGINT,$ups:BIGINT> NOT NULL" + } + + /** + * Insert a pre-existing row into a target table, populating the CDC metadata struct so the + * row looks as if a previous AutoCDC run upserted it at sequencing version [[sequence]]. + * + * @param table Fully-qualified table name (catalog.schema.table). + * @param colValues Comma-separated SQL literals for the user-defined columns, in declared + * order, excluding the trailing CDC metadata column. + * @param sequence Value to seed `_cdc_metadata.upsertSequence` with. The + * `deleteSequence` field is left NULL. + */ + protected def insertPreloadedRow(table: String, colValues: String, sequence: Long): Unit = { + val del = Scd1BatchProcessor.cdcDeleteSequenceFieldName + val ups = Scd1BatchProcessor.cdcUpsertSequenceFieldName + spark.sql( + s"INSERT INTO $table SELECT $colValues, " + + s"named_struct('$del', CAST(NULL AS BIGINT), '$ups', CAST($sequence AS BIGINT))" + ) + } + + /** Catalog identifier of the AutoCDC auxiliary table for [[targetTableName]]. */ + protected def auxTableNameFor(targetTableName: String): String = { + val targetIdent = fullyQualifiedIdentifier(targetTableName, Some(catalog), Some(namespace)) + AutoCdcAuxiliaryTable.identifier(targetIdent).unquotedString + } + + /** + * Construct an [[AutoCdcFlow]] targeting `catalog.namespace.${target}` from the given + * query and CDC knobs. + */ + protected def autoCdcFlow( + name: String, + target: String, + query: FlowFunction, + keys: Seq[String], + sequencing: Column, + columnSelection: Option[ColumnSelection] = None, + deleteCondition: Option[Column] = None, + scdType: ScdType = ScdType.Type1 + ): AutoCdcFlow = AutoCdcFlow( + identifier = fullyQualifiedIdentifier(name, Some(catalog), Some(namespace)), + destinationIdentifier = fullyQualifiedIdentifier(target, Some(catalog), Some(namespace)), + func = query, + queryContext = QueryContext( + currentCatalog = Some(catalog), + currentDatabase = Some(namespace) + ), + origin = QueryOrigin.empty, + changeArgs = ChangeArgs( + keys = keys.map(UnqualifiedColumnName(_)), + sequencing = sequencing, + columnSelection = columnSelection, + deleteCondition = deleteCondition, + storedAsScdType = scdType + ) + ) + + /** Build a target row's `_cdc_metadata` struct value. */ + protected def cdcMeta(deleteSeq: Option[Long], upsertSeq: Option[Long]): Row = + Row(deleteSeq.orNull, upsertSeq.orNull) +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala new file mode 100644 index 0000000000000..b72cc4bd6e8e9 --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions +import org.apache.spark.sql.pipelines.autocdc.{ + ColumnSelection, + Scd1BatchProcessor, + UnqualifiedColumnName +} +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Tests covering the durability of AutoCDC's auxiliary table across pipeline runs: + * the per-key sequence watermarks recorded in the auxiliary table must persist between + * incremental runs, and the auxiliary table must be transparently recreated if it is + * deleted out-of-band. + */ +class AutoCdcScd1AuxiliaryTableDurabilitySuite + extends ExecutionTest + with SharedSparkSession + with AutoCdcGraphExecutionTestMixin { + + test("a higher-sequence event in a later pipeline run correctly upserts the row") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Single MemoryStream reused across both pipeline runs so the streaming checkpoint can + // resume cleanly. + val changeDataFeedStream = MemoryStream[(Int, String, Long)] + def buildGraphRegistrationContext(): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc( + changeDataFeedStream.toDF().toDF("id", "name", "version") + ), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + // Run #1: insert id=1 at seq=1. + changeDataFeedStream.addData((1, "alice", 1L)) + runPipeline(buildGraphRegistrationContext()) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", 1L, cdcMeta(None, Some(1L)))) + ) + + // Run #2: upsert id=1 at seq=2 (must replace) and insert id=2 at seq=1 (new key). + // The auxiliary table from run #1 persists and continues to gate seq comparisons. + changeDataFeedStream.addData((1, "alice2", 2L), (2, "bob", 1L)) + runPipeline(buildGraphRegistrationContext()) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq( + Row(1, "alice2", 2L, cdcMeta(None, Some(2L))), + Row(2, "bob", 1L, cdcMeta(None, Some(1L))) + ) + ) + } + + test("an event with a sequence lower than what was applied in a prior pipeline run " + + "is suppressed") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Single MemoryStream reused across both runs so the streaming checkpoint can resume. + val stream = MemoryStream[(Int, String, Long, Boolean)] + def buildCtx(): TestGraphRegistrationContext = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version", "is_delete")), + keys = Seq("id"), + sequencing = functions.col("version"), + deleteCondition = Some(functions.col("is_delete") === true), + columnSelection = Some(ColumnSelection.ExcludeColumns( + Seq(UnqualifiedColumnName("is_delete")) + )) + )) + } + + // Run #1: delete id=1 at seq=10. Auxiliary table records seq=10 as the watermark. + stream.addData((1, "alice", 10L, true)) + runPipeline(buildCtx()) + checkAnswer(spark.table(s"$catalog.$namespace.target"), Seq.empty) + + // Run #2: late upsert at seq=5 (< the persisted seq=10 watermark). Must be rejected. + stream.addData((1, "stale", 5L, false)) + runPipeline(buildCtx()) + + // Auxiliary table watermark from run #1 (seq=10) should keep rejecting the seq=5 event. + checkAnswer(spark.table(s"$catalog.$namespace.target"), Seq.empty) + } + + test("the auxiliary table places the AutoCDC key column first, ahead of any non-key " + + "source columns") { + val session = spark + import session.implicits._ + + // Source DF column order is (name, id, version): the AutoCDC key column `id` does NOT + // appear first in the source DF. The auxiliary table must still write `id` as its + // leading column. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(name STRING, id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(String, Int, Long)] + stream.addData(("alice", 1, 1L)) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("name", "id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + val auxSchema = spark.table(auxTableNameFor("target")).schema + + // The auxiliary table only contains keys and the metadata column, hence "name" should not be + // included. + assert(auxSchema.fieldNames.toSeq == Seq("id", Scd1BatchProcessor.cdcMetadataColName)) + assert(getAuxTableNumKeyColumns(target = "target") == 1) + } + + test("the auxiliary table preserves the user's declared key order, independent of the " + + "source DataFrame and target table column orders") { + val session = spark + import session.implicits._ + + // Source DF: (value, id, region, version). Target table: (value, id, region, version, + // _cdc_metadata) -- same ordering as the source. The user, however, declares + // `keys = Seq("region", "id")` -- the OPPOSITE order from how those columns appear in + // both the source DF and the target. The auxiliary table should honor the user's + // declared key order, not either physical column ordering, so subsequent runs + // positionally compare keys against the same recorded layout. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(value STRING, id INT NOT NULL, region STRING NOT NULL, " + + s"version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(String, Int, String, Long)] + stream.addData(("v", 1, "us", 1L)) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("value", "id", "region", "version")), + keys = Seq("region", "id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + val auxSchema = spark.table(auxTableNameFor("target")).schema + assert(auxSchema.fieldNames.toSeq == + Seq("region", "id", Scd1BatchProcessor.cdcMetadataColName)) + assert(getAuxTableNumKeyColumns(target = "target") == 2) + } + + test("if the AutoCDC auxiliary table is dropped between runs, it is transparently " + + "recreated") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Single MemoryStream reused across both runs so the streaming checkpoint can resume. + val stream = MemoryStream[(Int, Long)] + def buildCtx(): TestGraphRegistrationContext = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + stream.addData((1, 1L)) + runPipeline(buildCtx()) + assert(spark.catalog.tableExists(auxTableNameFor("target"))) + + // Manually drop the auxiliary table. + spark.sql(s"DROP TABLE ${auxTableNameFor("target")}") + assert(!spark.catalog.tableExists(auxTableNameFor("target"))) + + stream.addData((1, 2L)) + runPipeline(buildCtx()) + + // The dropped auxiliary table must be transparently recreated. + assert(spark.catalog.tableExists(auxTableNameFor("target"))) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, 2L, cdcMeta(None, Some(2L)))) + ) + } + + private def getAuxTableNumKeyColumns(target: String): Int = { + val auxName = auxTableNameFor(target) + val rows = spark.sql(s"SHOW TBLPROPERTIES $auxName").collect() + val prop = rows + .find(_.getString(0) == AutoCdcAuxiliaryTable.numKeyColumnsProperty) + .getOrElse(throw new AssertionError( + s"auxiliary table $auxName is missing the " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property; got: ${rows.toSeq}" + )) + prop.getString(1).toInt + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala new file mode 100644 index 0000000000000..bb5645e573d42 --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions +import org.apache.spark.sql.pipelines.autocdc.{ + ColumnSelection, + UnqualifiedColumnName +} +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Tests covering AutoCDC's full-refresh semantics: full refresh must wipe both the + * target rows and the AutoCDC auxiliary table for the refreshed targets, and must leave + * non-refreshed targets untouched in selective-refresh mode. + */ +class AutoCdcScd1FullRefreshSuite + extends ExecutionTest + with SharedSparkSession + with AutoCdcGraphExecutionTestMixin { + + test("full refresh wipes target rows and the auxiliary table for the refreshed flow") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Run #1: populate target + auxiliary table. + val stream1 = MemoryStream[(Int, String, Long)] + stream1.addData((1, "alice", 5L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + assert( + spark.catalog.tableExists(auxTableNameFor("target")), + "Auxiliary table should exist after first run" + ) + + // Run #2 (full refresh): auxiliary table should be dropped by DatasetManager, target + // truncated. The new run brings only id=2 at seq=1. + val stream2 = MemoryStream[(Int, String, Long)] + stream2.addData((2, "bob", 1L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + val updateCtx = TestPipelineUpdateContext( + spark, + ctx2.toDataflowGraph, + storageRoot, + fullRefreshTables = AllTables + ) + updateCtx.pipelineExecution.runPipeline() + updateCtx.pipelineExecution.awaitCompletion() + + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(2, "bob", 1L, cdcMeta(None, Some(1L)))) + ) + } + + test("after a full refresh, an event with a sequence below the previous run's " + + "watermark now lands") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Run #1: delete at seq=10 sets a high watermark in the auxiliary table. + val stream1 = MemoryStream[(Int, String, Long, Boolean)] + stream1.addData((1, "alice", 10L, true)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("id", "name", "version", "is_delete")), + keys = Seq("id"), + sequencing = functions.col("version"), + deleteCondition = Some(functions.col("is_delete") === true), + columnSelection = Some(ColumnSelection.ExcludeColumns( + Seq(UnqualifiedColumnName("is_delete")) + )) + )) + } + runPipeline(ctx1) + + // Run #2 (full refresh): auxiliary table is dropped, watermark reset. seq=5 should + // now land. + val stream2 = MemoryStream[(Int, String, Long, Boolean)] + stream2.addData((1, "fresh", 5L, false)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("id", "name", "version", "is_delete")), + keys = Seq("id"), + sequencing = functions.col("version"), + deleteCondition = Some(functions.col("is_delete") === true), + columnSelection = Some(ColumnSelection.ExcludeColumns( + Seq(UnqualifiedColumnName("is_delete")) + )) + )) + } + val updateCtx = TestPipelineUpdateContext( + spark, + ctx2.toDataflowGraph, + storageRoot, + fullRefreshTables = AllTables + ) + updateCtx.pipelineExecution.runPipeline() + updateCtx.pipelineExecution.awaitCompletion() + + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "fresh", 5L, cdcMeta(None, Some(5L)))) + ) + } + + test("selective full refresh wipes only the requested target's auxiliary state") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.t_a " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + spark.sql( + s"CREATE TABLE $catalog.$namespace.t_b " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Run #1: populate both targets at seq=10. + val streamA1 = MemoryStream[(Int, Long)] + val streamB1 = MemoryStream[(Int, Long)] + streamA1.addData((1, 10L)) + streamB1.addData((1, 10L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("t_a", catalog = Some(catalog), database = Some(namespace)) + registerTable("t_b", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_a", + target = "t_a", + query = dfFlowFunc(streamA1.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + registerFlow(autoCdcFlow( + name = "flow_b", + target = "t_b", + query = dfFlowFunc(streamB1.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: full refresh ONLY on t_a; t_b's auxiliary state must persist. + val streamA2 = MemoryStream[(Int, Long)] + val streamB2 = MemoryStream[(Int, Long)] + streamA2.addData((1, 5L)) // would have been suppressed pre-refresh; now wins + streamB2.addData((1, 5L)) // must be suppressed (auxiliary table retains seq=10) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("t_a", catalog = Some(catalog), database = Some(namespace)) + registerTable("t_b", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_a", + target = "t_a", + query = dfFlowFunc(streamA2.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + registerFlow(autoCdcFlow( + name = "flow_b", + target = "t_b", + query = dfFlowFunc(streamB2.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + val updateCtx = TestPipelineUpdateContext( + spark, + ctx2.toDataflowGraph, + storageRoot, + fullRefreshTables = SomeTables(Set( + fullyQualifiedIdentifier("t_a", Some(catalog), Some(namespace)) + )) + ) + updateCtx.pipelineExecution.runPipeline() + updateCtx.pipelineExecution.awaitCompletion() + + checkAnswer( + spark.table(s"$catalog.$namespace.t_a"), + Seq(Row(1, 5L, cdcMeta(None, Some(5L)))) + ) + // t_b: pre-existing seq=10 row still wins; the seq=5 event is dropped. + checkAnswer( + spark.table(s"$catalog.$namespace.t_b"), + Seq(Row(1, 10L, cdcMeta(None, Some(10L)))) + ) + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala new file mode 100644 index 0000000000000..ed740db045371 --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * End-to-end tests that exercise interactions between separate AutoCDC pipelines (i.e. + * distinct [[DataflowGraph]] / [[TestPipelineUpdateContext]] invocations) sharing the same + * v2 catalog. These complement the single-pipeline AutoCDC suites by validating the + * boundary semantics between independently-deployed pipelines. + * + * Each test constructs two graphs and runs them sequentially. In real deployments these + * could be two different pipeline definitions writing into the same metastore; the tests + * here verify that AutoCDC's per-target catalog state (target table, auxiliary table, + * schema invariants) behaves correctly across these pipeline boundaries. + */ +class AutoCdcScd1MultiPipelineSuite + extends ExecutionTest + with SharedSparkSession + with AutoCdcGraphExecutionTestMixin { + + test("two AutoCDC pipelines targeting separate tables maintain independent target and " + + "auxiliary tables") { + val session = spark + import session.implicits._ + + // Two distinct target tables created up-front. + spark.sql( + s"CREATE TABLE $catalog.$namespace.t_a " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + spark.sql( + s"CREATE TABLE $catalog.$namespace.t_b " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Pipeline #1 only knows about `t_a`. Its auxiliary table cat.ns1.__auxiliary_..._t_a + // must not affect pipeline #2's `t_b`. + val streamA = MemoryStream[(Int, String, Long)] + streamA.addData((1, "alice", 100L)) + val ctxA = new TestGraphRegistrationContext(spark) { + registerTable("t_a", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_a", + target = "t_a", + query = dfFlowFunc(streamA.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctxA) + + // Pipeline #2 only knows about `t_b`. Uses a deliberately *lower* sequence to verify + // the watermark from pipeline #1's auxiliary table (seq=100) does not leak into + // pipeline #2. + val streamB = MemoryStream[(Int, String, Long)] + streamB.addData((9, "bob", 1L)) + val ctxB = new TestGraphRegistrationContext(spark) { + registerTable("t_b", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_b", + target = "t_b", + query = dfFlowFunc(streamB.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctxB) + + checkAnswer( + spark.table(s"$catalog.$namespace.t_a"), + Seq(Row(1, "alice", 100L, cdcMeta(None, Some(100L)))) + ) + checkAnswer( + spark.table(s"$catalog.$namespace.t_b"), + Seq(Row(9, "bob", 1L, cdcMeta(None, Some(1L)))) + ) + + // Each target has its own auxiliary table; no cross-contamination. + assert(spark.catalog.tableExists(auxTableNameFor("t_a"))) + assert(spark.catalog.tableExists(auxTableNameFor("t_b"))) + } + + test("a downstream pipeline can read an AutoCDC target written by a different pipeline " + + "without observing the CDC metadata column") { + val session = spark + import session.implicits._ + + // Pipeline #1 writes into target `src` via AutoCDC. + spark.sql( + s"CREATE TABLE $catalog.$namespace.src " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + val stream = MemoryStream[(Int, String, Long)] + stream.addData((1, "alice", 1L), (2, "bob", 1L)) + val ctxWriter = new TestGraphRegistrationContext(spark) { + registerTable("src", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "writer", + target = "src", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctxWriter) + + // Pipeline #2 is a regular materialized view that selects the user-data columns from + // `src` (a different graph entirely). It must observe the merged AutoCDC rows and be + // able to ignore the metadata column without it polluting downstream consumers. + val ctxReader = new TestGraphRegistrationContext(spark) { + registerMaterializedView( + "downstream_mv", + query = dfFlowFunc( + spark.read.table(s"$catalog.$namespace.src").select("id", "name", "version") + ) + ) + } + runPipeline(ctxReader) + + checkAnswer( + spark.table(fullyQualifiedIdentifier("downstream_mv").toString), + Seq(Row(1, "alice", 1L), Row(2, "bob", 1L)) + ) + } + + test("a second pipeline targeting an existing AutoCDC table with different keys " + + "fails with KEY_SCHEMA_DRIFT") { + val session = spark + import session.implicits._ + + // Target table with both candidate keys present so the second pipeline would otherwise + // be schema-compatible with the first; only the AutoCDC `keys` differ between flows. + spark.sql( + s"CREATE TABLE $catalog.$namespace.shared_target " + + s"(id INT NOT NULL, name STRING NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Pipeline #1: AutoCDC flow keyed on `id`. Materializes the auxiliary table with schema + // (id, _cdc_metadata). + val stream1 = MemoryStream[(Int, String, Long)] + stream1.addData((1, "alice", 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_v1", + target = "shared_target", + query = dfFlowFunc(stream1.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Pipeline #2: completely separate graph, but targets the same physical `shared_target` + // table with `keys = Seq("name")`. + val stream2 = MemoryStream[(Int, String, Long)] + stream2.addData((2, "alice", 1L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_v2", + target = "shared_target", + query = dfFlowFunc(stream2.toDF().toDF("id", "name", "version")), + keys = Seq("name"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + sqlState = Some("42000"), + parameters = Map( + "flowName" -> + fullyQualifiedIdentifier("flow_v2", Some(catalog), Some(namespace)).unquotedString, + "auxTableName" -> auxTableNameFor("shared_target"), + // Pipeline #2's AutoCDC key resolves from the source DF, where `MemoryStream[(Int, String, + // Long)]` produces a nullable StringType for `name`. + "expectedKeySchema" -> "name STRING", + // Pipeline #1 persisted the aux table from a source DF whose `id` was a non-null Scala + // primitive (`Int`), so the recorded key carries `NOT NULL`. + "recordedKeySchema" -> "id INT NOT NULL" + ) + ) + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala new file mode 100644 index 0000000000000..e374c2f1e9f8b --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -0,0 +1,880 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import java.sql.Timestamp + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.pipelines.autocdc.{ + ColumnSelection, + UnqualifiedColumnName +} +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Tests covering AutoCDC's interaction with schema evolution and schema drift across + * pipeline runs. The suite documents the supported additive cases (new top-level columns, + * new nested fields in array-of-struct, broadening / narrowing column selection) and the + * cases that fail loudly today (subtractive nested evolution, type-incompatible changes, + * key-set changes, case-only renames). + * + * These behaviors are largely inherited from the lower layers (`SchemaMergingUtils` for + * schema merge, the v2 writer's column-resolution layer for nested-field handling) rather + * than implemented in AutoCDC itself; the tests here serve as the contract for AutoCDC's + * observable behavior on top of those layers. + */ +class AutoCdcScd1SchemaEvolutionSuite + extends ExecutionTest + with SharedSparkSession + with AutoCdcGraphExecutionTestMixin { + + test("a nullable non-key column merges correctly with mixed NULL and non-NULL values") { + val session = spark + import session.implicits._ + + // Single MemoryStream with `email` as nullable from the start. Run #1 emits a row with + // a NULL email; run #2 emits an upsert with a non-NULL email. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, email STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, String, Option[String], Long)] + def buildCtx(): TestGraphRegistrationContext = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "email", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + // Run #1: insert with NULL email. + stream.addData((1, "alice", None, 1L)) + runPipeline(buildCtx()) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", null, 1L, cdcMeta(None, Some(1L)))) + ) + + // Run #2: upsert with non-NULL email at higher seq replaces the row. + stream.addData((1, "alice2", Some("a@x.com"), 2L)) + runPipeline(buildCtx()) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice2", "a@x.com", 2L, cdcMeta(None, Some(2L)))) + ) + } + + test("widening a non-key column's type between runs fails with " + + "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE") { + val session = spark + import session.implicits._ + + // Changing a non-key column's type between pipeline runs is rejected by + // `SchemaMergingUtils` with CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE even when the new type + // is strictly wider. Users must full-refresh the target to change column types. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, age INT, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream1 = MemoryStream[(Int, Int, Long)] + stream1.addData((1, 30, 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("id", "age", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: widen `age` from Int to Long. + val stream2 = MemoryStream[(Int, Long, Long)] + stream2.addData((1, 31L, 2L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("id", "age", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE", + sqlState = Some("42825"), + // `left` is the persisted (run #1) INT type; `right` is run #2's widened BIGINT. + parameters = Map( + "left" -> "\"INT\"", + "right" -> "\"BIGINT\"" + ) + ) + } + + test("narrowing a non-key column's type between runs fails with " + + "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE") { + val session = spark + import session.implicits._ + + // Mirror image of the widening test above: changing a non-key column's type between + // pipeline runs is rejected by SchemaMergingUtils with CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE + // even when the new type is strictly narrower. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, payload BIGINT, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream1 = MemoryStream[(Int, Long, Long)] + stream1.addData((1, 100L, 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("id", "payload", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: narrow `payload` from Long (BIGINT) to Int (INT). + val stream2 = MemoryStream[(Int, Int, Long)] + stream2.addData((1, 5, 2L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("id", "payload", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE", + sqlState = Some("42825"), + // `left` is the persisted (run #1) BIGINT type; `right` is run #2's narrowed INT. + parameters = Map( + "left" -> "\"BIGINT\"", + "right" -> "\"INT\"" + ) + ) + } + + test("expanding the AutoCDC key set between runs fails with KEY_SCHEMA_DRIFT") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(region STRING, id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Run #1: keys = [id]. Auxiliary table is created with schema (id, _cdc_metadata) and + // num_key_columns = 1. + val stream1 = MemoryStream[(String, Int, Long)] + stream1.addData(("us", 1, 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("region", "id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: keys = [region, id]. Recorded num_key_columns = 1, expected 2 -> length + // mismatch -> KEY_SCHEMA_DRIFT. + val stream2 = MemoryStream[(String, Int, Long)] + stream2.addData(("us", 1, 2L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("region", "id", "version")), + keys = Seq("region", "id"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + sqlState = Some("42000"), + parameters = Map( + "flowName" -> + fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, + "auxTableName" -> auxTableNameFor("target"), + "expectedKeySchema" -> "region STRING,id INT NOT NULL", + "recordedKeySchema" -> "id INT NOT NULL" + ) + ) + } + + test("shrinking the AutoCDC key set between runs fails with KEY_SCHEMA_DRIFT") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(region STRING NOT NULL, id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Run #1: keys = [region, id]. Auxiliary table is created with schema + // (region, id, _cdc_metadata) and num_key_columns = 2. + val stream1 = MemoryStream[(String, Int, Long)] + stream1.addData(("us", 1, 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("region", "id", "version")), + keys = Seq("region", "id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: keys = [id]. Recorded num_key_columns = 2, expected 1 -> length mismatch -> + // KEY_SCHEMA_DRIFT. Without the strict-equality check, `id` matches at position 0 of the + // existing aux schema and the dropped `region` key would silently slip through. + val stream2 = MemoryStream[(String, Int, Long)] + stream2.addData(("us", 1, 2L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("region", "id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + sqlState = Some("42000"), + parameters = Map( + "flowName" -> + fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, + "auxTableName" -> auxTableNameFor("target"), + "expectedKeySchema" -> "id INT NOT NULL", + "recordedKeySchema" -> "region STRING,id INT NOT NULL" + ) + ) + } + + test("swapping a key column for a different one of the same arity fails with " + + "KEY_SCHEMA_DRIFT") { + val session = spark + import session.implicits._ + + // Target carries both candidate key columns (`region` and `country`) so the source DF is + // structurally compatible across both runs; only the AutoCDC `keys` declaration changes. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, region STRING, country STRING, version BIGINT NOT NULL, " + + s"$cdcMetadataDdl)" + ) + + // Run #1: keys = [id, region]. Auxiliary table records (id, region, _cdc_metadata) and + // num_key_columns = 2. + val stream1 = MemoryStream[(Int, String, String, Long)] + stream1.addData((1, "us", "USA", 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("id", "region", "country", "version")), + keys = Seq("id", "region"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: same key arity (2), but `country` is swapped in for `region`. Recorded + // num_key_columns matches expected (2), but the second key column's name diverges + // (`country` vs persisted `region`) -> KEY_SCHEMA_DRIFT. This is the case the + // length-mismatch check would silently miss without per-position name equality. + val stream2 = MemoryStream[(Int, String, String, Long)] + stream2.addData((1, "us", "USA", 2L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("id", "region", "country", "version")), + keys = Seq("id", "country"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + sqlState = Some("42000"), + parameters = Map( + "flowName" -> + fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, + "auxTableName" -> auxTableNameFor("target"), + "expectedKeySchema" -> "id INT NOT NULL,country STRING", + "recordedKeySchema" -> "id INT NOT NULL,region STRING" + ) + ) + } + + test("a new top-level nullable column appearing in the source DF between runs is " + + "added to the target") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Single MemoryStream of (id, name, email, version) shared across runs so the streaming + // checkpoint can resume cleanly. Run #1's flow drops `email` so the source's resolved DF + // schema is 3 columns; run #2 keeps all 4. The MemoryStream's underlying tuple schema is + // unchanged (only the downstream projection differs), so the source identity that the + // OffsetSeqLog records is stable across runs. + val stream = MemoryStream[(Int, String, Option[String], Long)] + def buildCtx(includeEmail: Boolean): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + val sourceDf = stream.toDF().toDF("id", "name", "email", "version") + val projectedDf = if (includeEmail) sourceDf else sourceDf.drop("email") + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(projectedDf), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + // Run #1: source projects (id, name, version). Target schema is unchanged. + stream.addData((1, "alice", None, 1L)) + runPipeline(buildCtx(includeEmail = false)) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", 1L, cdcMeta(None, Some(1L)))) + ) + + // Run #2: source projects (id, name, email, version). mergeSchemas appends `email` to + // the target (StructType.merge keeps the left schema's order and appends right-only + // fields); existing rows get NULL for the new column. + stream.addData((2, "bob", Some("b@x.com"), 2L)) + runPipeline(buildCtx(includeEmail = true)) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq( + Row(1, "alice", 1L, cdcMeta(None, Some(1L)), null), + Row(2, "bob", 2L, cdcMeta(None, Some(2L)), "b@x.com") + ) + ) + } + + test("broadening the column selection between runs adds the newly-included column to " + + "the target") { + val session = spark + import session.implicits._ + + // Source DF schema is fixed at (id, name, email, version) across both runs. Only the + // `columnSelection` knob differs: run #1 includes (id, name, version); run #2 selects + // None (= all source columns). mergeSchemas adds `email` to the target via the same + // generic SDP path as the new-source-column case, but driven by the + // [[ColumnSelection]] knob rather than the source DF's own schema. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, String, String, Long)] + def buildCtx(selection: Option[ColumnSelection]): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "email", "version")), + keys = Seq("id"), + sequencing = functions.col("version"), + columnSelection = selection + )) + } + + // Run #1: only (id, name, version) selected; `email` is dropped before the MERGE. + stream.addData((1, "alice", "ignored", 1L)) + runPipeline(buildCtx(selection = Some(ColumnSelection.IncludeColumns( + Seq("id", "name", "version").map(UnqualifiedColumnName(_)) + )))) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", 1L, cdcMeta(None, Some(1L)))) + ) + + // Run #2: broaden to no selection. mergeSchemas adds `email`; existing rows get NULL, + // new rows get the actual value. + stream.addData((2, "bob", "b@x.com", 2L)) + runPipeline(buildCtx(selection = None)) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq( + Row(1, "alice", 1L, cdcMeta(None, Some(1L)), null), + Row(2, "bob", 2L, cdcMeta(None, Some(2L)), "b@x.com") + ) + ) + } + + test("narrowing the column selection between runs preserves the dropped column on " + + "existing rows and leaves it NULL on new rows") { + val session = spark + import session.implicits._ + + // Validates the additive-only column-selection contract on the narrowing side: + // tightening `columnSelection` between runs leaves the dropped column in place at the + // schema level (SDP's `SchemaMergingUtils.mergeSchemas` is a union, never a subtraction). + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, email STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, String, String, Long)] + def buildCtx(selection: Option[ColumnSelection]): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "email", "version")), + keys = Seq("id"), + sequencing = functions.col("version"), + columnSelection = selection + )) + } + + // Run #1: include all columns; populate `email` for key=1. + stream.addData((1, "alice", "a@x.com", 1L)) + runPipeline(buildCtx(selection = None)) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", "a@x.com", 1L, cdcMeta(None, Some(1L)))) + ) + + // Run #2: narrow the selection to drop `email`. The merge omits `email` from both + // INSERT and UPDATE assignment maps; key=1's `email` is preserved at "a@x.com" while + // key=2 is inserted with `email = NULL`. + stream.addData((2, "bob", "ignored", 2L)) + runPipeline(buildCtx(selection = Some(ColumnSelection.IncludeColumns( + Seq("id", "name", "version").map(UnqualifiedColumnName(_)) + )))) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq( + Row(1, "alice", "a@x.com", 1L, cdcMeta(None, Some(1L))), + Row(2, "bob", null, 2L, cdcMeta(None, Some(2L))) + ) + ) + } + + test("a top-level column dropped from the source DF between runs is preserved on " + + "existing rows and left NULL on new rows") { + val session = spark + import session.implicits._ + + // Symmetric to the new-source-column case (which exercises the source DF *gaining* a + // column). Validates that the additive-only column-selection contract holds when the + // narrowing is driven by the source DF's own schema shrinking, rather than by a + // tightening [[ChangeArgs.columnSelection]]. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Same `MemoryStream[(Int, String, Option[String], Long)]` shape across runs; runs + // differ in whether `email` is kept in the projected source DF. + val stream = MemoryStream[(Int, String, Option[String], Long)] + def buildCtx(includeEmail: Boolean): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + val sourceDf = stream.toDF().toDF("id", "name", "email", "version") + val projectedDf = if (includeEmail) sourceDf else sourceDf.drop("email") + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(projectedDf), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + // Run #1: wide source DF (id, name, email, version). mergeSchemas appends `email` to + // the target. + stream.addData((1, "alice", Some("a@x.com"), 1L)) + runPipeline(buildCtx(includeEmail = true)) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", 1L, cdcMeta(None, Some(1L)), "a@x.com")) + ) + + // Run #2: source DF drops `email` upstream of the flow. Target still has `email` + // (`StructType.merge` is additive-only); the merge omits `email` from both INSERT and + // UPDATE assignment maps. Key=1's `email` is preserved at "a@x.com"; key=2 is inserted + // with `email = NULL`. + stream.addData((2, "bob", None, 2L)) + runPipeline(buildCtx(includeEmail = false)) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq( + Row(1, "alice", 1L, cdcMeta(None, Some(1L)), "a@x.com"), + Row(2, "bob", 2L, cdcMeta(None, Some(2L)), null) + ) + ) + } + + test("dropping a nested struct field between runs fails with INCOMPATIBLE_DATA_FOR_TABLE") { + val session = spark + import session.implicits._ + + // The v2 writer's column-resolution layer requires every nested target field to be + // present in the microbatch DF. When run #2's source projection drops `b.c`, the merge + // fails with INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA. Users who want to drop a + // nested field between runs must full-refresh the target. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(key INT NOT NULL, version BIGINT NOT NULL, " + + s"value STRUCT>, $cdcMetadataDdl)" + ) + + // Stream is (key, version, a, b_c, b_d). Each run reshapes into different `value` + // shapes; the underlying tuple shape is unchanged so the streaming source's identity + // is stable across runs. + val stream = MemoryStream[(Int, Long, Int, Int, Int)] + def buildCtx(includeC: Boolean): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + val src = stream.toDF().toDF("key", "version", "a", "b_c", "b_d") + val inner = if (includeC) { + functions.struct(functions.col("b_c").as("c"), functions.col("b_d").as("d")) + } else { + functions.struct(functions.col("b_d").as("d")) + } + val projected = src.select( + functions.col("key"), + functions.col("version"), + functions.struct(functions.col("a"), inner.as("b")).as("value") + ) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(projected), + keys = Seq("key"), + sequencing = functions.col("version") + )) + } + + stream.addData((1, 1L, 1, 1, 1), (2, 1L, 2, 2, 2)) + runPipeline(buildCtx(includeC = true)) + + // Run #2 drops b.c. The v2 writer rejects the merge because it cannot find data for + // the target's `value.b.c` column. + stream.addData((1, 2L, 10, 99, 10), (3, 1L, 3, 99, 3)) + val ex = intercept[RuntimeException] { runPipeline(buildCtx(includeC = false)) } + val all = Iterator(ex) ++ ex.getSuppressed.iterator + assert( + all.exists(t => Option(t.getMessage).exists(m => + m.contains("INCOMPATIBLE_DATA_FOR_TABLE") && m.contains("value") && m.contains("b") && + m.contains("c"))), + s"Expected INCOMPATIBLE_DATA_FOR_TABLE failure for value.b.c, got: ${ex.getMessage}" + ) + } + + test("a new field added inside an array element between runs is added to the " + + "target") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(key INT NOT NULL, version BIGINT NOT NULL, " + + s"vals ARRAY>>, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, Long, Int, Int, Int)] + def buildCtx(includeD: Boolean): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + val src = stream.toDF().toDF("key", "version", "a", "b_c", "b_d") + val inner = if (includeD) { + functions.struct(functions.col("b_c").as("c"), functions.col("b_d").as("d")) + } else { + functions.struct(functions.col("b_c").as("c")) + } + val projected = src.select( + functions.col("key"), + functions.col("version"), + functions.array( + functions.struct(functions.col("a"), inner.as("b")) + ).as("vals") + ) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(projected), + keys = Seq("key"), + sequencing = functions.col("version") + )) + } + + stream.addData((1, 1L, 1, 1, 99)) + runPipeline(buildCtx(includeD = false)) + + // Run #2 widens to include b.d. Existing key=1 row's vals[0].b.d is NULL until the + // upsert at version=2 writes the new value. + stream.addData((1, 2L, 1, 1, 2), (3, 1L, 3, 3, 3)) + runPipeline(buildCtx(includeD = true)) + + // Inline-explode flattens the array for assertion. + checkAnswer( + spark.table(s"$catalog.$namespace.target") + .selectExpr("key", "inline(vals) as (a, b)") + .select("key", "a", "b.c", "b.d"), + Seq( + Row(1, 1, 1, 2), + Row(3, 3, 3, 3) + ) + ) + } + + test("dropping a field inside an array element between runs fails with " + + "INCOMPATIBLE_DATA_FOR_TABLE") { + val session = spark + import session.implicits._ + + // Symmetric to the nested-struct case, but for `array`. The v2 writer rejects + // the merge because it cannot find data for the target's `vals.element.b.d` column + // when run #2's projection drops `d` from the element struct. Users must full-refresh + // the target to drop a nested array-element field. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(key INT NOT NULL, version BIGINT NOT NULL, " + + s"vals ARRAY>>, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, Long, Int, Int, Int)] + def buildCtx(includeD: Boolean): TestGraphRegistrationContext = + new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + val src = stream.toDF().toDF("key", "version", "a", "b_c", "b_d") + val inner = if (includeD) { + functions.struct(functions.col("b_c").as("c"), functions.col("b_d").as("d")) + } else { + functions.struct(functions.col("b_c").as("c")) + } + val projected = src.select( + functions.col("key"), + functions.col("version"), + functions.array( + functions.struct(functions.col("a"), inner.as("b")) + ).as("vals") + ) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(projected), + keys = Seq("key"), + sequencing = functions.col("version") + )) + } + + stream.addData((1, 1L, 1, 1, 1), (2, 1L, 2, 2, 2)) + runPipeline(buildCtx(includeD = true)) + + stream.addData((1, 2L, 10, 10, 99), (3, 1L, 3, 3, 99)) + val ex = intercept[RuntimeException] { runPipeline(buildCtx(includeD = false)) } + val all = Iterator(ex) ++ ex.getSuppressed.iterator + assert( + all.exists(t => Option(t.getMessage).exists(m => + m.contains("INCOMPATIBLE_DATA_FOR_TABLE") && m.contains("vals"))), + s"Expected INCOMPATIBLE_DATA_FOR_TABLE failure for vals element, got: ${ex.getMessage}" + ) + } + + test("a source DF column whose name differs from the target only by case fails with " + + "AMBIGUOUS_REFERENCE under case-insensitive resolution") { + val session = spark + import session.implicits._ + + // `DatasetManager`'s schema-merge compares the existing target schema and the flow's + // output schema *case-sensitively*: `SchemaMergingUtils.mergeSchemas` calls + // `StructType.merge` without forwarding the session-level case-sensitivity. When the + // target has `value` and the source DF emits `Value`, the merged schema ends up with + // both as separate columns. Reference resolution downstream is case-insensitive + // (Spark's default), so the MERGE plan trips on the duplicate and reports + // AMBIGUOUS_REFERENCE. + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(key INT NOT NULL, version BIGINT NOT NULL, value STRING, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, Long, String)] + stream.addData((1, 1L, "alice")) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + // Source DF emits `Value` (capital), differing only in case from the target's + // `value` column. + val df = stream.toDF().toDF("key", "version", "Value") + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(df), + keys = Seq("key"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx) } + val all = Iterator(ex) ++ ex.getSuppressed.iterator + assert( + all.exists(t => Option(t.getMessage).exists(_.contains("AMBIGUOUS_REFERENCE"))), + s"Expected AMBIGUOUS_REFERENCE failure, got: ${ex.getMessage}" + ) + } + } + + test("extra columns on the target that the AutoCDC flow does not emit are preserved " + + "across the merge") { + val session = spark + import session.implicits._ + + // The target is wider than the AutoCDC flow's source DF: column `extra` is present on + // the target but never produced by the flow. AutoCDC must tolerate the extra target + // column -- pre-existing rows keep their `extra` value, and newly-inserted rows + // resolve `extra` to NULL. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, extra INT, $cdcMetadataDdl)" + ) + insertPreloadedRow( + s"$catalog.$namespace.target", + colValues = "1, 'preloaded', 0, 42", + sequence = 0L + ) + + val stream = MemoryStream[(Int, String, Long)] + stream.addData((1, "alice", 1L), (2, "bob", 1L)) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + checkAnswer( + spark.table(s"$catalog.$namespace.target").select("id", "name", "version", "extra"), + Seq( + Row(1, "alice", 1L, 42), // extra preserved on the upsert + Row(2, "bob", 1L, null) // extra is NULL for inserts + ) + ) + } + + test("changing a non-key column type from TIMESTAMP to STRING between runs fails with " + + "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE") { + val session = spark + import session.implicits._ + + // `mergeSchemas` rejects an incompatible type change between TIMESTAMP and STRING. + // Captured alongside the type-widening / type-narrowing tests; users must full-refresh + // the target to change a column's type. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(key INT NOT NULL, version BIGINT NOT NULL, value TIMESTAMP, $cdcMetadataDdl)" + ) + + val stream1 = MemoryStream[(Int, Long, Timestamp)] + stream1.addData((1, 1L, Timestamp.valueOf("2024-01-01 10:00:00"))) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("key", "version", "value")), + keys = Seq("key"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2 emits `value` as STRING. mergeSchemas rejects the type change. + val stream2 = MemoryStream[(Int, Long, String)] + stream2.addData((1, 2L, "2024-01-02 11:00:00")) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("key", "version", "value")), + keys = Seq("key"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + val all = Iterator(ex) ++ ex.getSuppressed.iterator + assert( + all.exists(t => Option(t.getMessage).exists(_.contains("CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE"))), + s"Expected CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE failure, got: ${ex.getMessage}" + ) + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala new file mode 100644 index 0000000000000..992dd89ffc05e --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions +import org.apache.spark.sql.pipelines.autocdc.{ + ChangeArgs, + ColumnSelection, + ScdType, + UnqualifiedColumnName +} +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Smoke tests for AutoCDC SCD type 1 flows running within a single pipeline: one + * [[DataflowGraph]] / [[TestPipelineUpdateContext]] executes one or more AutoCDC flows, + * and the target table contents are asserted at the end. Multi-pipeline scenarios (where + * multiple pipelines write to the same target) live in [[AutoCdcScd1MultiPipelineSuite]]. + */ +class AutoCdcScd1SinglePipelineSuite + extends ExecutionTest + with SharedSparkSession + with AutoCdcGraphExecutionTestMixin { + + test("an upsert event lands a new row in an empty target table") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, String, Long)] + stream.addData((1, "alice", 1L)) + + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + runPipeline(ctx) + + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", 1L, cdcMeta(None, Some(1L)))) + ) + } + + test("consecutive upsert, delete, and re-upsert events for the same key in one run " + + "converge to the latest event") { + val session = spark + import session.implicits._ + + // Target schema deliberately omits `is_delete`: the source carries it as a control + // column, drives the deleteCondition, and is excluded from the target projection. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, String, Long, Boolean)] + stream.addData( + (1, "alice", 1L, false), // initial upsert + (1, "alice", 2L, true), // delete + (1, "alice2", 3L, false) // reinsert + ) + + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version", "is_delete")), + keys = Seq("id"), + sequencing = functions.col("version"), + deleteCondition = Some(functions.col("is_delete") === true), + columnSelection = Some(ColumnSelection.ExcludeColumns( + Seq(UnqualifiedColumnName("is_delete")) + )) + )) + } + + runPipeline(ctx) + + // After all three events at seqs 1, 2, 3: row "alice2" wins as the highest-sequenced + // upsert; the delete at seq=2 is bounded by the seq=3 upsert. + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice2", 3L, cdcMeta(None, Some(3L)))) + ) + } + + test("two AutoCDC flows targeting separate tables in one pipeline produce independent " + + "results") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.t_a " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + spark.sql( + s"CREATE TABLE $catalog.$namespace.t_b " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val streamA = MemoryStream[(Int, Long)] + val streamB = MemoryStream[(Int, Long)] + streamA.addData((1, 1L), (2, 1L)) + streamB.addData((10, 1L)) + + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("t_a", catalog = Some(catalog), database = Some(namespace)) + registerTable("t_b", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_a", + target = "t_a", + query = dfFlowFunc(streamA.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + registerFlow(autoCdcFlow( + name = "flow_b", + target = "t_b", + query = dfFlowFunc(streamB.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + checkAnswer( + spark.table(s"$catalog.$namespace.t_a"), + Seq(Row(1, 1L, cdcMeta(None, Some(1L))), Row(2, 1L, cdcMeta(None, Some(1L)))) + ) + checkAnswer( + spark.table(s"$catalog.$namespace.t_b"), + Seq(Row(10, 1L, cdcMeta(None, Some(1L)))) + ) + assert(spark.catalog.tableExists(auxTableNameFor("t_a"))) + assert(spark.catalog.tableExists(auxTableNameFor("t_b"))) + } + + test("an AutoCDC flow targeting a table whose format does not support row-level " + + "operations fails with AUTOCDC_TARGET_DOES_NOT_SUPPORT_MERGE") { + val session = spark + import session.implicits._ + + // Intentionally use a non-merge compatible catalog, whose default table format is parquet. + val catalog = TestGraphRegistrationContext.DEFAULT_CATALOG + val database = TestGraphRegistrationContext.DEFAULT_DATABASE + + spark.sql( + s"CREATE TABLE $catalog.$database.target_no_merge " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, Long)] + stream.addData((1, 1L)) + + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target_no_merge") + registerFlow(AutoCdcFlow( + identifier = fullyQualifiedIdentifier("auto_cdc_flow"), + destinationIdentifier = fullyQualifiedIdentifier("target_no_merge"), + func = dfFlowFunc(stream.toDF().toDF("id", "version")), + queryContext = QueryContext( + currentCatalog = Some(catalog), + currentDatabase = Some(database) + ), + origin = QueryOrigin.empty, + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = functions.col("version"), + storedAsScdType = ScdType.Type1 + ) + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_TARGET_DOES_NOT_SUPPORT_MERGE", + sqlState = Some("0A000"), + parameters = Map( + "tableName" -> s"`$catalog`.`$database`.`target_no_merge`", + "format" -> "parquet" + ) + ) + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala new file mode 100644 index 0000000000000..46f8ee47db02f --- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions +import org.apache.spark.sql.pipelines.autocdc.Scd1BatchProcessor +import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Tests covering AutoCDC's behavior when the target table is pre-populated by something + * other than a prior AutoCDC run: pre-loaded rows, missing CDC metadata column on the + * target, and rows with NULL CDC metadata. These cases verify that AutoCDC interoperates + * gracefully with users who hand-populate the target table. + */ +class AutoCdcScd1TargetTableDurabilitySuite + extends ExecutionTest + with SharedSparkSession + with AutoCdcGraphExecutionTestMixin { + + test("pre-loaded rows: an event with a lower sequence is suppressed and a higher one " + + "wins") { + val session = spark + import session.implicits._ + + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + insertPreloadedRow(s"$catalog.$namespace.target", "1, 'alice', 5", 5L) + insertPreloadedRow(s"$catalog.$namespace.target", "2, 'bob', 5", 5L) + + val stream = MemoryStream[(Int, String, Long)] + stream.addData( + (1, "stale", 2L), // < pre-existing seq=5 -> ignored + (2, "bob2", 10L) // > pre-existing seq=5 -> upserts + ) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq( + Row(1, "alice", 5L, cdcMeta(None, Some(5L))), + Row(2, "bob2", 10L, cdcMeta(None, Some(10L))) + ) + ) + } + + test("pre-loaded target rows merge correctly on the first AutoCDC run, and the " + + "auxiliary table is created lazily") { + val session = spark + import session.implicits._ + + // Target was populated by some external process; this is the first AutoCDC run. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + insertPreloadedRow(s"$catalog.$namespace.target", "1, 'alice', 1", 1L) + + assert( + !spark.catalog.tableExists(auxTableNameFor("target")), + "Auxiliary table should not exist before the first AutoCDC run" + ) + + val stream = MemoryStream[(Int, String, Long)] + stream.addData((1, "bob", 2L)) + + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + // seq=2 > pre-existing seq=1, so "bob" replaces "alice" via the upsert sequence column. + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "bob", 2L, cdcMeta(None, Some(2L)))) + ) + assert( + spark.catalog.tableExists(auxTableNameFor("target")), + "Auxiliary table should be created lazily on the first AutoCDC run" + ) + } + + test("a target table created without the CDC metadata column gets the column " + + "auto-added on the first AutoCDC run") { + val session = spark + import session.implicits._ + + // User creates the target without the AutoCDC metadata column. DatasetManager evolves + // the existing table schema by merging it with the AutoCdcMergeFlow's output schema, + // which includes the metadata column. The first run therefore proceeds normally, and + // subsequent reads see the metadata struct alongside the user's data columns. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL)" + ) + + val stream = MemoryStream[(Int, String, Long)] + stream.addData((1, "alice", 1L)) + + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx) + + val schema = spark.table(s"$catalog.$namespace.target").schema + assert( + schema.fieldNames.contains(Scd1BatchProcessor.cdcMetadataColName), + s"Target must have ${Scd1BatchProcessor.cdcMetadataColName} after first AutoCDC run; " + + s"got ${schema.fieldNames.toSeq}" + ) + checkAnswer( + spark.table(s"$catalog.$namespace.target"), + Seq(Row(1, "alice", 1L, cdcMeta(None, Some(1L)))) + ) + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/ConnectInvalidPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/ConnectInvalidPipelineSuite.scala index f19fed4e57806..8dad5019c0fe0 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/ConnectInvalidPipelineSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/ConnectInvalidPipelineSuite.scala @@ -660,7 +660,7 @@ class ConnectInvalidPipelineSuite extends PipelineTest with SharedSparkSession { ) { // Temporary views in SDP normally accept either streaming or batch producing flows, but // AutoCDC flows are an explicit exception: SCD reconciliation only runs at the - // streaming-table sink (`Scd1ForeachBatchExec`), so pointing an AutoCDC flow at a view + // streaming-table sink (`Scd1ForeachBatchHandler`), so pointing an AutoCDC flow at a view // would silently drop reconciliation and expose just the projected CDF to consumers. // `validateFlowStreamingness` rejects this case with a dedicated sub-condition under // INVALID_FLOW_QUERY_TYPE. From eeff543b631f89da3ddc379b671d4c1c6145ece6 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 01:59:04 +0000 Subject: [PATCH 02/13] claude self-review --- .../autocdc/AutoCdcReservedNames.scala | 32 ++++ .../autocdc/Scd1BatchProcessor.scala | 25 ++- .../sql/pipelines/graph/DatasetManager.scala | 7 +- .../spark/sql/pipelines/graph/Flow.scala | 3 +- .../sql/pipelines/graph/FlowExecution.scala | 165 ++++++++++-------- .../pipelines/autocdc/AutoCdcFlowSuite.scala | 14 +- ...CdcScd1AuxiliaryTableDurabilitySuite.scala | 93 +++++++++- .../AutoCdcScd1SchemaEvolutionSuite.scala | 112 ++++++++++++ 8 files changed, 351 insertions(+), 100 deletions(-) create mode 100644 sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala new file mode 100644 index 0000000000000..2b0f8e293e76b --- /dev/null +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.autocdc + +/** + * Names that AutoCDC reserves for its own use, both for internal columns it inserts during + * reconciliation (e.g. `${prefix}metadata`, `${prefix}winning_row`) and for internal tables it + * manages alongside user-defined targets (e.g. the per-target auxiliary state table). + * + * A single recognizable prefix gives a single auditable answer to "what does AutoCDC own", and + * lets user-defined columns and tables be unambiguously distinguished from AutoCDC-managed ones. + */ +private[pipelines] object AutoCdcReservedNames { + + /** Common reserved-name prefix shared by AutoCDC internal columns and internal tables. */ + val prefix: String = "__spark_autocdc_" +} diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala index 15537d4173316..3c0d054ca57d5 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala @@ -371,9 +371,9 @@ case class Scd1BatchProcessor( val keyNames = changeArgs.keys.map(_.name) def constructTargetColumnAssignmentsFromMicrobatch(columnName: String): (String, Column) = { - // Map a column in the target table to its direct equivalent in the microbatch. Note that due - // to target table schema evolution during SDP dataset materialization, the microbatch's - // schema must always be a non-strict subset of the target table's schema. + // Map a column in the target table to its direct equivalent in the microbatch. Note that + // because of target-table schema evolution during SDP dataset materialization, the + // microbatch's columns are always a subset of (or equal to) the target's columns. val quotedCol = QuotingUtils.quoteIdentifier(columnName) s"$destinationTableStr.$quotedCol" -> F.col(s"microbatch.$quotedCol") } @@ -402,11 +402,10 @@ case class Scd1BatchProcessor( // merge, and instead would have been inserted as tombstones into the auxiliary table. .whenNotMatched(microbatchDeleteVersionField.isNull) // When inserting a brand new row for a new key, construct column mappings from microbatch. - // It's possible the microbatch columns are a subset of the columns currently in the target - // table, due to changing and more restrictive column selection across runs, the source - // dropping a column, etc. - // It is not possible for the microbatch's schema to ever be a superset of the target table - // however, due to SDP's schema evolution always unioning old and new schemas. + // The microbatch's columns may be a strict subset of the target's columns -- e.g. the user + // narrowed `column_list` between runs, or the source DF dropped a column. The target's + // columns can never be a strict subset of the microbatch's however, because SDP's schema + // evolution always unions old and new schemas onto the target. .insert(columnsToInsertOnNewKey) .merge() } @@ -433,14 +432,12 @@ case class Scd1BatchProcessor( object Scd1BatchProcessor { /** - * Reserved column-name prefix for internal SDP AutoCDC processing. Source change-data-feed - * dataframes must not contain any columns starting with this prefix; the invariant is + * Internal columns inserted by AutoCDC reconciliation. Source change-data-feed dataframes must + * not contain any columns starting with [[AutoCdcReservedNames.prefix]]; the invariant is * enforced at [[org.apache.spark.sql.pipelines.graph.AutoCdcMergeFlow]] construction. */ - private[pipelines] val reservedColumnNamePrefix: String = "__spark_autocdc_" - - private[autocdc] val winningRowColName: String = s"${reservedColumnNamePrefix}winning_row" - private[pipelines] val cdcMetadataColName: String = s"${reservedColumnNamePrefix}metadata" + private[autocdc] val winningRowColName: String = s"${AutoCdcReservedNames.prefix}winning_row" + private[pipelines] val cdcMetadataColName: String = s"${AutoCdcReservedNames.prefix}metadata" private[pipelines] val cdcDeleteSequenceFieldName: String = "deleteSequence" private[pipelines] val cdcUpsertSequenceFieldName: String = "upsertSequence" diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala index a59f7e5d614ee..80c9c6d391c66 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala @@ -306,13 +306,16 @@ object DatasetManager extends Logging { if (isFullRefresh) { // On full refresh, drop the AutoCDC auxiliary state associated with this table (if any) so // that stale delete-tracking data and table properties are not carried forward into the new - // table generation. + // table generation. We unconditionally issue the DROP for every fully-refreshed target; for + // non-AutoCDC tables this is a no-op because [[AutoCdcAuxiliaryTable.identifier]] derives + // its name from [[AutoCdcReservedNames.prefix]], which is reserved across AutoCDC and + // therefore cannot collide with a user-managed table. // Intentionally DROP and not TRUNCATE for two reasons; First, the auxiliary table may // contain table properties that represent stateful information (ex. SCD key count) that // should not be carried forward on a full refresh. Second, the auxiliary table is an // internal table and not part of the dataflow graph. That means it does not go through - // schema evolution like other tables and hence on a full refresh, we should explicitly be + // schema evolution like other tables and hence on a full refresh, we should explicitly // drop the existing auxiliary table schema so it can be recomputed. val auxiliaryTableId = AutoCdcAuxiliaryTable.identifier(table.identifier) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala index 04ef8d3186c5d..6f4f9cfcb0b30 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} import org.apache.spark.sql.classic.DataFrame import org.apache.spark.sql.pipelines.AnalysisWarning import org.apache.spark.sql.pipelines.autocdc.{ + AutoCdcReservedNames, CaseSensitivityLabels, ChangeArgs, ColumnSelection, @@ -335,7 +336,7 @@ class AutoCdcMergeFlow( */ private def requireReservedPrefixAbsentInSourceColumns(): Unit = { val resolver = spark.sessionState.conf.resolver - val reservedPrefix = Scd1BatchProcessor.reservedColumnNamePrefix + val reservedPrefix = AutoCdcReservedNames.prefix def nameContainsReservedPrefix(name: String): Boolean = { name.length >= reservedPrefix.length && resolver( diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index 4e3d8ff486a24..de1de84378507 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -35,7 +35,12 @@ import org.apache.spark.sql.connector.catalog.{ SupportsRowLevelOperations, TableCatalog } -import org.apache.spark.sql.pipelines.autocdc.{ChangeArgs, Scd1BatchProcessor, Scd1ForeachBatchHandler} +import org.apache.spark.sql.pipelines.autocdc.{ + AutoCdcReservedNames, + ChangeArgs, + Scd1BatchProcessor, + Scd1ForeachBatchHandler +} import org.apache.spark.sql.pipelines.graph.QueryOrigin.ExceptionHelpers import org.apache.spark.sql.pipelines.util.SparkSessionUtils import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger} @@ -313,6 +318,27 @@ class SinkWrite( } } +object AutoCdcAuxiliaryTable { + /** + * Helper for deriving the auxiliary AutoCDC catalog table identifier from a target table. The + * derived name is anchored on [[AutoCdcReservedNames.prefix]] so it is unambiguously + * AutoCDC-managed and cannot collide with a user-managed table. + */ + def identifier(destination: TableIdentifier): TableIdentifier = TableIdentifier( + table = s"${AutoCdcReservedNames.prefix}aux_state_${destination.table}", + database = destination.database, + catalog = destination.catalog + ) + + /** + * Table property recording the number of AutoCDC key columns persisted at the front of an + * auxiliary table when it was first created. The number can only change after a full refresh of + * the target, which drops and recreates the auxiliary table. + */ + val numKeyColumnsProperty: String = + PipelinesTableProperties.pipelinesPrefix + "autoCdc.numKeyColumns" +} + /** * Helper mixin for AutoCDC merge-based write flows. */ @@ -332,11 +358,11 @@ trait AutoCdcMergeWriteMixin { /** * Full schema of the auxiliary table for this SCD type. The first `changeArgs.keys.length` * fields MUST be the AutoCDC key columns (in `changeArgs.keys` declaration order, with - * fully-resolved dataType and nullability) + * fully-resolved dataType). */ - protected def auxiliaryTableSchema(): StructType + protected def auxiliaryTableSchema: StructType - // Immediately validate that the destination table supports row level operations. + // Eagerly validate at construction time that the destination supports row-level ops. requireDestinationSupportsRowLevelOps() /** @@ -345,8 +371,8 @@ trait AutoCdcMergeWriteMixin { * that must remain invariant across incremental pipeline runs; users who want to change * keys must full-refresh the target. */ - private def autoCdcKeyColumns(): StructType = - StructType(auxiliaryTableSchema().fields.take(changeArgs.keys.length)) + private lazy val autoCdcKeyColumns: StructType = + StructType(auxiliaryTableSchema.fields.take(changeArgs.keys.length)) /** * Idempotently bring the auxiliary table for [[destination]] into a state consistent with the @@ -354,7 +380,7 @@ trait AutoCdcMergeWriteMixin { */ protected def createOrValidateAuxiliaryTable(spark: SparkSession): TableIdentifier = { val auxIdent = AutoCdcAuxiliaryTable.identifier(destination.identifier) - val (catalog, v2Identifier) = resolveAuxiliaryTableCatalog(spark, auxIdent) + val (catalog, v2Identifier) = resolveTableCatalog(spark, auxIdent) if (!catalog.tableExists(v2Identifier)) { // The auxiliary table inherits the target's format so MERGE semantics line up. When the @@ -365,7 +391,7 @@ trait AutoCdcMergeWriteMixin { spark.sql( s"""CREATE TABLE IF NOT EXISTS |${auxIdent.quotedString} - |(${auxiliaryTableSchema().toDDL}) $usingClause + |(${auxiliaryTableSchema.toDDL}) $usingClause |TBLPROPERTIES ( | '${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = '$numKeyColumns' |)""".stripMargin @@ -379,39 +405,28 @@ trait AutoCdcMergeWriteMixin { /** * Validate that the AutoCDC key columns the flow expects exactly match the keys recorded * in the existing auxiliary table at [[auxIdent]]: same number of key columns, same names - * (per the session resolver), same dataTypes. + * (per the session resolver), same `dataType`s. */ private def validateNoAutoCdcKeyDrift( spark: SparkSession, auxIdent: TableIdentifier): Unit = { - val (catalog, v2Identifier) = resolveAuxiliaryTableCatalog(spark, auxIdent) + val (catalog, v2Identifier) = resolveTableCatalog(spark, auxIdent) val existingAuxTable = catalog.loadTable(v2Identifier) val existingAuxSchema = CatalogV2Util.v2ColumnsToStructType(existingAuxTable.columns()) - val expectedKeySchema = autoCdcKeyColumns() + val expectedKeySchema = autoCdcKeyColumns val resolver = spark.sessionState.conf.resolver - val numRecordedKeys = Option( - existingAuxTable.properties().get(AutoCdcAuxiliaryTable.numKeyColumnsProperty) - ).map { raw => - try raw.toInt catch { - case _: NumberFormatException => - throw SparkException.internalError( - s"Auxiliary table ${auxIdent.quotedString} has a malformed " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property: '$raw'." - ) - } - }.getOrElse { - throw SparkException.internalError( - s"Auxiliary table ${auxIdent.quotedString} is missing the " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} table property; cannot validate " + - s"AutoCDC key columns. Full-refresh the target table to recreate the auxiliary table." - ) - } - + val numRecordedKeys = parseRecordedNumKeyColumns(existingAuxTable, auxIdent) val recordedKeyFields = existingAuxSchema.fields.take(numRecordedKeys) val drifted = + // The key count persisted to table properties should match against the number of keys in the + // current pipeline execution's expected aux table schema. recordedKeyFields.length != expectedKeySchema.length || + // The number of keys in the existing aux table schema should be no less than what was + // recorded in the aux table's properties. recordedKeyFields.length != numRecordedKeys || + // Each key in the existing aux table schema should should also exist in the current pipeline + // execution's expected aux table schema. recordedKeyFields.zip(expectedKeySchema.fields).exists { case (recorded, expected) => !resolver(recorded.name, expected.name) || recorded.dataType != expected.dataType @@ -430,24 +445,41 @@ trait AutoCdcMergeWriteMixin { } } + /** + * Read the integer [[AutoCdcAuxiliaryTable.numKeyColumnsProperty]] off an existing auxiliary + * table. Both "missing property" and "non-integer value" indicate corrupt internal state and + * surface as `internalError`s; the property is written by [[createOrValidateAuxiliaryTable]] on + * first run and is not expected to be missing or malformed on a healthy auxiliary table. + */ + private def parseRecordedNumKeyColumns( + existingAuxTable: org.apache.spark.sql.connector.catalog.Table, + auxIdent: TableIdentifier): Int = { + val rawNumKeyColumns = Option( + existingAuxTable.properties().get(AutoCdcAuxiliaryTable.numKeyColumnsProperty) + ).getOrElse { + throw SparkException.internalError( + s"Auxiliary table ${auxIdent.quotedString} is missing the " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} table property; cannot validate " + + s"AutoCDC key columns. Full-refresh the target table to recreate the auxiliary table." + ) + } + try rawNumKeyColumns.toInt catch { + case _: NumberFormatException => + throw SparkException.internalError( + s"Auxiliary table ${auxIdent.quotedString} has a malformed " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property: '$rawNumKeyColumns'." + ) + } + } + /** * Validate that the target table's underlying connector implements * [[SupportsRowLevelOperations]], which is the V2 connector contract for MERGE/UPDATE/DELETE * with rewrite - all operations that the AutoCDC transformation executes. */ private def requireDestinationSupportsRowLevelOps(): Unit = { - val catalogManager = spark.sessionState.catalogManager - val catalog = destination.identifier.catalog - .map(catalogManager.catalog) - .getOrElse(catalogManager.currentCatalog) - .asInstanceOf[TableCatalog] - - val destinationTable = catalog.loadTable( - Identifier.of( - Array(destination.identifier.database.get), - destination.identifier.identifier - ) - ) + val (catalog, v2Identifier) = resolveTableCatalog(spark, destination.identifier) + val destinationTable = catalog.loadTable(v2Identifier) if (!destinationTable.isInstanceOf[SupportsRowLevelOperations]) { throw new AnalysisException( @@ -460,38 +492,23 @@ trait AutoCdcMergeWriteMixin { } } - private def resolveAuxiliaryTableCatalog( + private def resolveTableCatalog( spark: SparkSession, - auxIdent: TableIdentifier): (TableCatalog, Identifier) = { + ident: TableIdentifier): (TableCatalog, Identifier) = { val catalogManager = spark.sessionState.catalogManager - val catalog = (auxIdent.catalog match { - case Some(catalogName) => catalogManager.catalog(catalogName) - case None => catalogManager.currentCatalog - }).asInstanceOf[TableCatalog] - val v2Identifier = Identifier.of(Array(auxIdent.database.get), auxIdent.table) - (catalog, v2Identifier) + val catalog = ident.catalog + .map(catalogManager.catalog) + .getOrElse(catalogManager.currentCatalog) + .asInstanceOf[TableCatalog] + val namespace = ident.database.getOrElse( + throw SparkException.internalError( + s"Cannot resolve table identifier ${ident.quotedString}: namespace is unspecified." + ) + ) + (catalog, Identifier.of(Array(namespace), ident.table)) } } -object AutoCdcAuxiliaryTable { - /** - * Helper for deriving the auxiliary AutoCDC catalog table identifier from a target table. - */ - def identifier(destination: TableIdentifier): TableIdentifier = TableIdentifier( - table = s"__auxiliary_auto_cdc_state_${destination.table}", - database = destination.database, - catalog = destination.catalog - ) - - /** - * Table property recording the number of AutoCDC key columns persisted at the front of an - * auxiliary table when it was first created. The number can only change after a full refresh of - * the target, which drops and recreates the auxiliary table. - */ - val numKeyColumnsProperty: String = - PipelinesTableProperties.pipelinesPrefix + "autoCdc.numKeyColumns" -} - /** * A [[StreamingFlowExecution]] that applies a CDC event stream to a target [[Table]] via * SCD Type 1 MERGE semantics. @@ -539,10 +556,10 @@ class Scd1MergeStreamingWrite( .start() } - override protected def auxiliaryTableSchema(): StructType = - // SCD1's auxiliary table is just keys + the CDC metadata struct; no user data columns. - // Keys come first, in `changeArgs.keys` declaration order, to satisfy the keys-first - // invariant that [[AutoCdcMergeWriteMixin]] relies on for drift detection. + override protected lazy val auxiliaryTableSchema: StructType = + // SCD1's auxiliary table is just keys + the CDC metadata struct; no user data columns. Keys + // come first, in `changeArgs.keys` declaration order, to satisfy the keys-first invariant that + // [[AutoCdcMergeWriteMixin]] relies on for drift detection. StructType(autoCdcKeyFields :+ cdcMetadataField) /** @@ -550,7 +567,7 @@ class Scd1MergeStreamingWrite( * `changeArgs.keys` declaration order. Keys are guaranteed to be present in the schema * because [[AutoCdcMergeFlow.schema]] validates that. */ - private def autoCdcKeyFields: Seq[StructField] = { + private lazy val autoCdcKeyFields: Seq[StructField] = { val resolver = updateContext.spark.sessionState.conf.resolver val targetTableSchema = flow.schema flow.changeArgs.keys.map { key => @@ -565,7 +582,7 @@ class Scd1MergeStreamingWrite( } /** CDC metadata field resolved out of the flow's augmented schema. */ - private def cdcMetadataField: StructField = { + private lazy val cdcMetadataField: StructField = { val resolver = updateContext.spark.sessionState.conf.resolver flow.schema.fields .find(field => resolver(field.name, Scd1BatchProcessor.cdcMetadataColName)) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala index 8d365906559bc..65eafd6c7dcc2 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala @@ -409,7 +409,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { "Contract: a source df column with the reserved AutoCDC prefix is rejected at flow " + "construction" ) { - val conflictingName = s"${Scd1BatchProcessor.reservedColumnNamePrefix}foo" + val conflictingName = s"${AutoCdcReservedNames.prefix}foo" val sourceDf = sourceDfWithExtraColumns(conflictingName -> StringType) checkError( @@ -422,7 +422,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { "caseSensitivity" -> CaseSensitivityLabels.CaseInsensitive, "columnName" -> conflictingName, "schemaName" -> "changeDataFeed", - "reservedColumnNamePrefix" -> Scd1BatchProcessor.reservedColumnNamePrefix + "reservedColumnNamePrefix" -> AutoCdcReservedNames.prefix ) ) } @@ -435,7 +435,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { // from any ChangeArgs path still fails at construction with a different error. The // reservation is on the name itself, not on its presence in the source feed. val cleanSourceDf = threeColumnSourceDf() - val reservedName = s"${Scd1BatchProcessor.reservedColumnNamePrefix}foo" + val reservedName = s"${AutoCdcReservedNames.prefix}foo" val keysEx = intercept[AnalysisException] { newAutoCdcMergeFlow( @@ -487,7 +487,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { "caseSensitivity" -> CaseSensitivityLabels.CaseInsensitive, "columnName" -> Scd1BatchProcessor.cdcMetadataColName, "schemaName" -> "changeDataFeed", - "reservedColumnNamePrefix" -> Scd1BatchProcessor.reservedColumnNamePrefix + "reservedColumnNamePrefix" -> AutoCdcReservedNames.prefix ) ) } @@ -497,7 +497,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { ) { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { val conflictingName = - s"${Scd1BatchProcessor.reservedColumnNamePrefix}foo".toUpperCase(Locale.ROOT) + s"${AutoCdcReservedNames.prefix}foo".toUpperCase(Locale.ROOT) val sourceDf = sourceDfWithExtraColumns(conflictingName -> StringType) checkError( @@ -510,7 +510,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { "caseSensitivity" -> CaseSensitivityLabels.CaseInsensitive, "columnName" -> conflictingName, "schemaName" -> "changeDataFeed", - "reservedColumnNamePrefix" -> Scd1BatchProcessor.reservedColumnNamePrefix + "reservedColumnNamePrefix" -> AutoCdcReservedNames.prefix ) ) } @@ -524,7 +524,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { // `spark.sql.caseSensitive`, consistent with the schema-augmentation logic in this class. withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { val nonConflictingName = - s"${Scd1BatchProcessor.reservedColumnNamePrefix}foo".toUpperCase(Locale.ROOT) + s"${AutoCdcReservedNames.prefix}foo".toUpperCase(Locale.ROOT) val sourceDf = sourceDfWithExtraColumns(nonConflictingName -> StringType) // No exception expected: construction succeeds. diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala index b72cc4bd6e8e9..cdb85f14ea1ce 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala @@ -154,7 +154,7 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite runPipeline(ctx) val auxSchema = spark.table(auxTableNameFor("target")).schema - + // The auxiliary table only contains keys and the metadata column, hence "name" should not be // included. assert(auxSchema.fieldNames.toSeq == Seq("id", Scd1BatchProcessor.cdcMetadataColName)) @@ -193,7 +193,7 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite runPipeline(ctx) val auxSchema = spark.table(auxTableNameFor("target")).schema - assert(auxSchema.fieldNames.toSeq == + assert(auxSchema.fieldNames.toSeq == Seq("region", "id", Scd1BatchProcessor.cdcMetadataColName)) assert(getAuxTableNumKeyColumns(target = "target") == 2) } @@ -240,6 +240,95 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite ) } + test("an auxiliary table missing the numKeyColumns property fails with INTERNAL_ERROR") { + val session = spark + import session.implicits._ + + val auxIdent = AutoCdcAuxiliaryTable.identifier( + fullyQualifiedIdentifier("target", Some(catalog), Some(namespace)) + ) + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + // Pre-create the auxiliary table without the numKeyColumns property to simulate corrupt + // internal state (e.g. a stale auxiliary table written by an older code path). The pipeline + // is expected to surface this as INTERNAL_ERROR rather than silently mis-validate keys. + spark.sql( + s"CREATE TABLE ${auxIdent.unquotedString} " + + s"(id INT NOT NULL, $cdcMetadataDdl)" + ) + + val stream = MemoryStream[(Int, Long)] + stream.addData((1, 1L)) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx) } + checkErrorInPipelineFailure( + failure = ex, + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> + (s"Auxiliary table ${auxIdent.quotedString} is missing the " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} table property; cannot validate " + + s"AutoCDC key columns. Full-refresh the target table to recreate the auxiliary table.") + ) + ) + } + + test("an auxiliary table with a malformed numKeyColumns property fails with INTERNAL_ERROR") { + val session = spark + import session.implicits._ + + val auxIdent = AutoCdcAuxiliaryTable.identifier( + fullyQualifiedIdentifier("target", Some(catalog), Some(namespace)) + ) + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + // Pre-create the auxiliary table with a non-integer numKeyColumns property; the pipeline + // should surface INTERNAL_ERROR rather than NumberFormatException. + spark.sql( + s"CREATE TABLE ${auxIdent.unquotedString} " + + s"(id INT NOT NULL, $cdcMetadataDdl) " + + s"TBLPROPERTIES ('${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = 'not-an-int')" + ) + + val stream = MemoryStream[(Int, Long)] + stream.addData((1, 1L)) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx) } + checkErrorInPipelineFailure( + failure = ex, + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> + (s"Auxiliary table ${auxIdent.quotedString} has a malformed " + + s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property: 'not-an-int'.") + ) + ) + } + private def getAuxTableNumKeyColumns(target: String): Int = { val auxName = auxTableNameFor(target) val rows = spark.sql(s"SHOW TBLPROPERTIES $auxName").collect() diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala index e374c2f1e9f8b..79bfc95b0eee9 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -366,6 +366,118 @@ class AutoCdcScd1SchemaEvolutionSuite ) } + test("a same-named key whose dataType differs from what the auxiliary table recorded " + + "fails with KEY_SCHEMA_DRIFT") { + val session = spark + import session.implicits._ + + // Synthetic setup: the target carries `id BIGINT NOT NULL` but a stale auxiliary table + // (recorded by a hypothetical earlier code path / earlier source schema) claims the recorded + // key was `id INT NOT NULL`. The flow's expected key columns therefore don't match the + // recorded key columns at the per-position dataType comparison even though the key arity and + // names line up. This is the case where naming-based comparisons alone would silently let + // through a contract change. + val auxIdent = AutoCdcAuxiliaryTable.identifier( + fullyQualifiedIdentifier("target", Some(catalog), Some(namespace)) + ) + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(id BIGINT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + spark.sql( + s"CREATE TABLE ${auxIdent.unquotedString} " + + s"(id INT NOT NULL, $cdcMetadataDdl) " + + s"TBLPROPERTIES ('${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = '1')" + ) + + val stream = MemoryStream[(Long, Long)] + stream.addData((1L, 1L)) + val ctx = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream.toDF().toDF("id", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + sqlState = Some("42000"), + parameters = Map( + "flowName" -> + fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, + "auxTableName" -> auxTableNameFor("target"), + "expectedKeySchema" -> "id BIGINT NOT NULL", + "recordedKeySchema" -> "id INT NOT NULL" + ) + ) + } + + test("reordering a composite key set across runs ([a,b] -> [b,a]) fails with " + + "KEY_SCHEMA_DRIFT") { + val session = spark + import session.implicits._ + + // Target carries both candidate key columns so the source DF is structurally compatible + // across both runs; only the AutoCDC `keys` declaration order changes. The auxiliary table + // records keys positionally in declared order, so a reorder must fail per-position name + // comparison. + spark.sql( + s"CREATE TABLE $catalog.$namespace.target " + + s"(a INT NOT NULL, b STRING NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Run #1: keys = [a, b]. Auxiliary table records positions 0=a, 1=b. + val stream1 = MemoryStream[(Int, String, Long)] + stream1.addData((1, "x", 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream1.toDF().toDF("a", "b", "version")), + keys = Seq("a", "b"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Run #2: same arity (2) and same key set ({a, b}), but reordered. The validator should + // reject because of per-position name comparison: position 0 is `b` expected vs recorded + // `a`. + val stream2 = MemoryStream[(Int, String, Long)] + stream2.addData((1, "x", 2L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "auto_cdc_flow", + target = "target", + query = dfFlowFunc(stream2.toDF().toDF("a", "b", "version")), + keys = Seq("b", "a"), + sequencing = functions.col("version") + )) + } + + val ex = intercept[RuntimeException] { runPipeline(ctx2) } + checkErrorInPipelineFailure( + failure = ex, + condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", + sqlState = Some("42000"), + parameters = Map( + "flowName" -> + fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, + "auxTableName" -> auxTableNameFor("target"), + "expectedKeySchema" -> "b STRING NOT NULL,a INT NOT NULL", + "recordedKeySchema" -> "a INT NOT NULL,b STRING NOT NULL" + ) + ) + } + test("a new top-level nullable column appearing in the source DF between runs is " + "added to the target") { val session = spark From e2d51ba1369050fd89a825137b2e73f6b49be640 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 04:08:16 +0000 Subject: [PATCH 03/13] more multipipeline tests --- .../graph/AutoCdcScd1MultiPipelineSuite.scala | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala index ed740db045371..f5e29f5434051 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala @@ -144,6 +144,155 @@ class AutoCdcScd1MultiPipelineSuite ) } + test("two AutoCDC pipelines targeting the same table with identical key and data " + + "schemas merge into a shared target table") { + val session = spark + import session.implicits._ + + // Target table is created once up-front; both pipelines target it with the same + // AutoCDC `keys` and the same source-DF data schema. The two pipelines have distinct + // flow names ("flow_v1" / "flow_v2") so they own independent streaming checkpoints, + // but share the target table and its auxiliary table. + spark.sql( + s"CREATE TABLE $catalog.$namespace.shared_target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Pipeline #1: inserts rows with id=1 and id=2 at version=1. + val stream1 = MemoryStream[(Int, String, Long)] + stream1.addData((1, "alice", 1L), (2, "bob", 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_v1", + target = "shared_target", + query = dfFlowFunc(stream1.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Sanity-check pipeline #1's effect before pipeline #2 runs. + checkAnswer( + spark.table(s"$catalog.$namespace.shared_target"), + Seq( + Row(1, "alice", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L))), + Row(2, "bob", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L))) + ) + ) + + // Pipeline #2: updates id=2 (existing key) to a higher sequence and inserts id=3 + // (new key). id=1 is untouched and must survive into the final target unchanged. + val stream2 = MemoryStream[(Int, String, Long)] + stream2.addData((2, "bob-v2", 2L), (3, "carol", 1L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_v2", + target = "shared_target", + query = dfFlowFunc(stream2.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx2) + + // Final target: id=1 untouched (pipeline #1's state), id=2 updated by pipeline #2, + // id=3 freshly inserted by pipeline #2. + checkAnswer( + spark.table(s"$catalog.$namespace.shared_target"), + Seq( + Row(1, "alice", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L))), + Row(2, "bob-v2", 2L, cdcMeta(deleteSeq = None, upsertSeq = Some(2L))), + Row(3, "carol", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L))) + ) + ) + + // The auxiliary table for the shared target is itself shared across both pipelines. + assert(spark.catalog.tableExists(auxTableNameFor("shared_target"))) + } + + test("two AutoCDC pipelines targeting the same table with the same key but different " + + "data columns evolve the shared target schema") { + val session = spark + import session.implicits._ + + // Target is created up-front with pipeline #1's schema only; pipeline #2 brings a new + // top-level nullable `age` column that the dataset materialization layer is expected + // to schema-merge into the target. + spark.sql( + s"CREATE TABLE $catalog.$namespace.shared_target " + + s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" + ) + + // Pipeline #1: source DF schema is (id, name, version); inserts id=1 and id=2. + val stream1 = MemoryStream[(Int, String, Long)] + stream1.addData((1, "alice", 1L), (2, "bob", 1L)) + val ctx1 = new TestGraphRegistrationContext(spark) { + registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_v1", + target = "shared_target", + query = dfFlowFunc(stream1.toDF().toDF("id", "name", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx1) + + // Sanity-check pipeline #1's state before schema evolution kicks in. + checkAnswer( + spark.table(s"$catalog.$namespace.shared_target"), + Seq( + Row(1, "alice", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L))), + Row(2, "bob", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L))) + ) + ) + + // Pipeline #2: source DF schema is (id, name, age, version). The new nullable `age` column + // should be added to the target by dataset materialization; pipeline #1's untouched id=1 row + // is backfilled to NULL. + val stream2 = MemoryStream[(Int, String, Option[Int], Long)] + stream2.addData((2, "bob-v2", Some(25), 2L), (3, "carol", Some(30), 1L)) + val ctx2 = new TestGraphRegistrationContext(spark) { + registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) + registerFlow(autoCdcFlow( + name = "flow_v2", + target = "shared_target", + query = dfFlowFunc(stream2.toDF().toDF("id", "name", "age", "version")), + keys = Seq("id"), + sequencing = functions.col("version") + )) + } + runPipeline(ctx2) + + checkAnswer( + spark.table(s"$catalog.$namespace.shared_target"), + Seq( + Row(1, "alice", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L)), null), + Row(2, "bob-v2", 2L, cdcMeta(deleteSeq = None, upsertSeq = Some(2L)), 25), + Row(3, "carol", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L)), 30) + ) + ) + + // Pipeline #1 runs again with its original (id, name, version) schema. The evolved + // target schema with `age` must persist: id=1's update leaves age untouched, id=4 is + // inserted with age=NULL, and pipeline #2's id=2/id=3 rows are unchanged. + stream1.addData((1, "alice-v2", 2L), (4, "dave", 1L)) + runPipeline(ctx1) + + checkAnswer( + spark.table(s"$catalog.$namespace.shared_target"), + Seq( + Row(1, "alice-v2", 2L, cdcMeta(deleteSeq = None, upsertSeq = Some(2L)), null), + Row(2, "bob-v2", 2L, cdcMeta(deleteSeq = None, upsertSeq = Some(2L)), 25), + Row(3, "carol", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L)), 30), + Row(4, "dave", 1L, cdcMeta(deleteSeq = None, upsertSeq = Some(1L)), null) + ) + ) + } + test("a second pipeline targeting an existing AutoCDC table with different keys " + "fails with KEY_SCHEMA_DRIFT") { val session = spark From f17c7d0c61d6bb0af15fbe799e2596fabeb11c0f Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 05:37:55 +0000 Subject: [PATCH 04/13] Drop AutoCDC key-schema-drift detection for SCD1 execution Upstream layers (MERGE planner, schema-merging utils, V2 connector column resolution) already surface organic failures for the only data-corrupting key-schema changes -- adding, dropping, or swapping keys raise UNRESOLVED_COLUMN; incompatible key-type changes raise CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE. Upcastable type changes and key-list reorderings remain (intentionally) silent no-ops. Removing the explicit drift check lets us delete the auxiliary-table numKeyColumns table property, the keys-first invariant on the aux schema, and the AUTOCDC_INVALID_STATE error condition. The auxiliary-table create/load path is now a one-liner that creates the table on first run and otherwise leaves it untouched. A follow-up branch (`prevent-autocdc-key-drift`) revisits drift detection; this commit is the baseline for that exploration. - common/utils/.../error-conditions.json: drop AUTOCDC_INVALID_STATE. - pipelines/.../FlowExecution.scala: drop validateNoAutoCdcKeyDrift, parseRecordedNumKeyColumns, autoCdcKeyColumns, numKeyColumnsProperty, and the now-unused identifier abstract member; rename createOrValidateAuxiliaryTable -> createAuxiliaryTableIfNotExists. - pipelines/.../AutoCdcScd1SchemaEvolutionSuite: drop the five KEY_SCHEMA_DRIFT cases; suite docstring now scopes to schema evolution only. - pipelines/.../AutoCdcScd1AuxiliaryTableDurabilitySuite: drop the two numKeyColumns INTERNAL_ERROR cases and the getAuxTableNumKeyColumns helper; existing structural tests no longer assert on the property. - pipelines/.../AutoCdcScd1MultiPipelineSuite: drop the cross-pipeline KEY_SCHEMA_DRIFT case. - pipelines/.../AutoCdcGraphExecutionTestMixin: drop KEY_SCHEMA_DRIFT reference in the retry-disable comment. --- .../resources/error/error-conditions.json | 13 - .../sql/pipelines/graph/FlowExecution.scala | 143 +-------- .../AutoCdcGraphExecutionTestMixin.scala | 8 +- ...CdcScd1AuxiliaryTableDurabilitySuite.scala | 102 ------ .../graph/AutoCdcScd1MultiPipelineSuite.scala | 61 ---- .../AutoCdcScd1SchemaEvolutionSuite.scala | 294 +----------------- 6 files changed, 26 insertions(+), 595 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 8b301243ad3b3..e72611efdca92 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -203,19 +203,6 @@ ], "sqlState" : "22023" }, - "AUTOCDC_INVALID_STATE" : { - "message" : [ - "AutoCDC flow detected an invalid state:" - ], - "subClass" : { - "KEY_SCHEMA_DRIFT" : { - "message" : [ - "The AutoCDC flow's current key columns do not match the keys recorded in the auxiliary table (recorded keys ). AutoCDC does not support changing key columns or their types across incremental pipeline runs. To change keys, perform a full refresh of the target table." - ] - } - }, - "sqlState" : "42000" - }, "AUTOCDC_KEY_NOT_IN_SELECTED_SCHEMA" : { "message" : [ "Using column name comparison, the AutoCDC key column `` is not present in the flow's selected source schema. AutoCDC requires every key column to be present in the source change-data feed and retained by any configured column selection." diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index de1de84378507..adc17b702a14b 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -29,12 +29,7 @@ import org.apache.spark.sql.{AnalysisException, Dataset, Row} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.classic.SparkSession -import org.apache.spark.sql.connector.catalog.{ - CatalogV2Util, - Identifier, - SupportsRowLevelOperations, - TableCatalog -} +import org.apache.spark.sql.connector.catalog.{Identifier, SupportsRowLevelOperations, TableCatalog} import org.apache.spark.sql.pipelines.autocdc.{ AutoCdcReservedNames, ChangeArgs, @@ -329,14 +324,6 @@ object AutoCdcAuxiliaryTable { database = destination.database, catalog = destination.catalog ) - - /** - * Table property recording the number of AutoCDC key columns persisted at the front of an - * auxiliary table when it was first created. The number can only change after a full refresh of - * the target, which drops and recreates the auxiliary table. - */ - val numKeyColumnsProperty: String = - PipelinesTableProperties.pipelinesPrefix + "autoCdc.numKeyColumns" } /** @@ -349,129 +336,33 @@ trait AutoCdcMergeWriteMixin { /** The destination (target) table entity the AutoCDC flow will be writing to. */ protected def destination: Table - /** The AutoCDC flow's identifier, used as `flowName` in error messages emitted by this mixin. */ - protected def identifier: TableIdentifier - /** The AutoCDC flow's [[ChangeArgs]] (keys, sequencing, columnSelection, ...). */ protected def changeArgs: ChangeArgs - /** - * Full schema of the auxiliary table for this SCD type. The first `changeArgs.keys.length` - * fields MUST be the AutoCDC key columns (in `changeArgs.keys` declaration order, with - * fully-resolved dataType). - */ + /** Full schema of the auxiliary table for this SCD type. */ protected def auxiliaryTableSchema: StructType // Eagerly validate at construction time that the destination supports row-level ops. requireDestinationSupportsRowLevelOps() /** - * The AutoCDC key columns for this flow (column names + types), derived by slicing the - * front of [[auxiliaryTableSchema]] using the keys-first invariant. This is the subset - * that must remain invariant across incremental pipeline runs; users who want to change - * keys must full-refresh the target. - */ - private lazy val autoCdcKeyColumns: StructType = - StructType(auxiliaryTableSchema.fields.take(changeArgs.keys.length)) - - /** - * Idempotently bring the auxiliary table for [[destination]] into a state consistent with the - * flow's current [[changeArgs]] and return its [[TableIdentifier]]. + * Idempotently create the auxiliary table for [[destination]] if it does not already exist + * and return its [[TableIdentifier]]. */ - protected def createOrValidateAuxiliaryTable(spark: SparkSession): TableIdentifier = { + protected def createAuxiliaryTableIfNotExists(spark: SparkSession): TableIdentifier = { val auxIdent = AutoCdcAuxiliaryTable.identifier(destination.identifier) - val (catalog, v2Identifier) = resolveTableCatalog(spark, auxIdent) - - if (!catalog.tableExists(v2Identifier)) { - // The auxiliary table inherits the target's format so MERGE semantics line up. When the - // target's format is unspecified (None), omit the USING clause and fall back to the - // session's default source provider. - val usingClause = destination.format.map(fmt => s"USING $fmt").getOrElse("") - val numKeyColumns = changeArgs.keys.length - spark.sql( - s"""CREATE TABLE IF NOT EXISTS - |${auxIdent.quotedString} - |(${auxiliaryTableSchema.toDDL}) $usingClause - |TBLPROPERTIES ( - | '${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = '$numKeyColumns' - |)""".stripMargin - ) - } else { - validateNoAutoCdcKeyDrift(spark, auxIdent) - } + // The auxiliary table inherits the target's format so MERGE semantics line up. When the + // target's format is unspecified (None), omit the USING clause and fall back to the + // session's default source provider. + val usingClause = destination.format.map(fmt => s"USING $fmt").getOrElse("") + spark.sql( + s"""CREATE TABLE IF NOT EXISTS + |${auxIdent.quotedString} + |(${auxiliaryTableSchema.toDDL}) $usingClause""".stripMargin + ) auxIdent } - /** - * Validate that the AutoCDC key columns the flow expects exactly match the keys recorded - * in the existing auxiliary table at [[auxIdent]]: same number of key columns, same names - * (per the session resolver), same `dataType`s. - */ - private def validateNoAutoCdcKeyDrift( - spark: SparkSession, - auxIdent: TableIdentifier): Unit = { - val (catalog, v2Identifier) = resolveTableCatalog(spark, auxIdent) - val existingAuxTable = catalog.loadTable(v2Identifier) - val existingAuxSchema = CatalogV2Util.v2ColumnsToStructType(existingAuxTable.columns()) - val expectedKeySchema = autoCdcKeyColumns - val resolver = spark.sessionState.conf.resolver - - val numRecordedKeys = parseRecordedNumKeyColumns(existingAuxTable, auxIdent) - val recordedKeyFields = existingAuxSchema.fields.take(numRecordedKeys) - val drifted = - // The key count persisted to table properties should match against the number of keys in the - // current pipeline execution's expected aux table schema. - recordedKeyFields.length != expectedKeySchema.length || - // The number of keys in the existing aux table schema should be no less than what was - // recorded in the aux table's properties. - recordedKeyFields.length != numRecordedKeys || - // Each key in the existing aux table schema should should also exist in the current pipeline - // execution's expected aux table schema. - recordedKeyFields.zip(expectedKeySchema.fields).exists { case (recorded, expected) => - !resolver(recorded.name, expected.name) || - recorded.dataType != expected.dataType - } - - if (drifted) { - throw new AnalysisException( - errorClass = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - messageParameters = Map( - "flowName" -> identifier.unquotedString, - "auxTableName" -> auxIdent.unquotedString, - "expectedKeySchema" -> expectedKeySchema.toDDL, - "recordedKeySchema" -> StructType(recordedKeyFields).toDDL - ) - ) - } - } - - /** - * Read the integer [[AutoCdcAuxiliaryTable.numKeyColumnsProperty]] off an existing auxiliary - * table. Both "missing property" and "non-integer value" indicate corrupt internal state and - * surface as `internalError`s; the property is written by [[createOrValidateAuxiliaryTable]] on - * first run and is not expected to be missing or malformed on a healthy auxiliary table. - */ - private def parseRecordedNumKeyColumns( - existingAuxTable: org.apache.spark.sql.connector.catalog.Table, - auxIdent: TableIdentifier): Int = { - val rawNumKeyColumns = Option( - existingAuxTable.properties().get(AutoCdcAuxiliaryTable.numKeyColumnsProperty) - ).getOrElse { - throw SparkException.internalError( - s"Auxiliary table ${auxIdent.quotedString} is missing the " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} table property; cannot validate " + - s"AutoCDC key columns. Full-refresh the target table to recreate the auxiliary table." - ) - } - try rawNumKeyColumns.toInt catch { - case _: NumberFormatException => - throw SparkException.internalError( - s"Auxiliary table ${auxIdent.quotedString} has a malformed " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property: '$rawNumKeyColumns'." - ) - } - } - /** * Validate that the target table's underlying connector implements * [[SupportsRowLevelOperations]], which is the V2 connector contract for MERGE/UPDATE/DELETE @@ -536,7 +427,7 @@ class Scd1MergeStreamingWrite( override def startStream(): StreamingQuery = { val sourceChangeDataFeed = graph.reanalyzeFlow(flow).df - val auxiliaryTableIdentifier = createOrValidateAuxiliaryTable(spark = updateContext.spark) + val auxiliaryTableIdentifier = createAuxiliaryTableIfNotExists(spark = updateContext.spark) sourceChangeDataFeed.writeStream .queryName(displayName) @@ -558,8 +449,8 @@ class Scd1MergeStreamingWrite( override protected lazy val auxiliaryTableSchema: StructType = // SCD1's auxiliary table is just keys + the CDC metadata struct; no user data columns. Keys - // come first, in `changeArgs.keys` declaration order, to satisfy the keys-first invariant that - // [[AutoCdcMergeWriteMixin]] relies on for drift detection. + // come first, in `changeArgs.keys` declaration order, to anchor the per-key sequence + // watermark used to gate out-of-order events. StructType(autoCdcKeyFields :+ cdcMetadataField) /** diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala index 5ebdb4b4c86d2..e752d460ec84d 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala @@ -53,10 +53,10 @@ trait AutoCdcGraphExecutionTestMixin extends BeforeAndAfterEach { s"spark.sql.catalog.$catalog", classOf[SharedTablesInMemoryRowLevelOperationTableCatalog].getName ) - // Disable per-flow retries so failure-path tests (e.g. KEY_SCHEMA_DRIFT, INCOMPATIBLE_DATA) - // surface the AnalysisException after the first attempt instead of going through the default - // 2 retries, which would otherwise emit duplicate FAILED events and inflate test runtime - // without changing the asserted outcome. + // Disable per-flow retries so failure-path tests (e.g. INCOMPATIBLE_DATA) surface the + // AnalysisException after the first attempt instead of going through the default 2 retries, + // which would otherwise emit duplicate FAILED events and inflate test runtime without + // changing the asserted outcome. spark.conf.set(SQLConf.PIPELINES_MAX_FLOW_RETRY_ATTEMPTS.key, "0") spark.sql(s"CREATE NAMESPACE IF NOT EXISTS $catalog.$namespace") } diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala index cdb85f14ea1ce..50ff60556a73c 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala @@ -158,7 +158,6 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite // The auxiliary table only contains keys and the metadata column, hence "name" should not be // included. assert(auxSchema.fieldNames.toSeq == Seq("id", Scd1BatchProcessor.cdcMetadataColName)) - assert(getAuxTableNumKeyColumns(target = "target") == 1) } test("the auxiliary table preserves the user's declared key order, independent of the " + @@ -195,7 +194,6 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite val auxSchema = spark.table(auxTableNameFor("target")).schema assert(auxSchema.fieldNames.toSeq == Seq("region", "id", Scd1BatchProcessor.cdcMetadataColName)) - assert(getAuxTableNumKeyColumns(target = "target") == 2) } test("if the AutoCDC auxiliary table is dropped between runs, it is transparently " + @@ -240,104 +238,4 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite ) } - test("an auxiliary table missing the numKeyColumns property fails with INTERNAL_ERROR") { - val session = spark - import session.implicits._ - - val auxIdent = AutoCdcAuxiliaryTable.identifier( - fullyQualifiedIdentifier("target", Some(catalog), Some(namespace)) - ) - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - // Pre-create the auxiliary table without the numKeyColumns property to simulate corrupt - // internal state (e.g. a stale auxiliary table written by an older code path). The pipeline - // is expected to surface this as INTERNAL_ERROR rather than silently mis-validate keys. - spark.sql( - s"CREATE TABLE ${auxIdent.unquotedString} " + - s"(id INT NOT NULL, $cdcMetadataDdl)" - ) - - val stream = MemoryStream[(Int, Long)] - stream.addData((1, 1L)) - val ctx = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream.toDF().toDF("id", "version")), - keys = Seq("id"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx) } - checkErrorInPipelineFailure( - failure = ex, - condition = "INTERNAL_ERROR", - parameters = Map( - "message" -> - (s"Auxiliary table ${auxIdent.quotedString} is missing the " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} table property; cannot validate " + - s"AutoCDC key columns. Full-refresh the target table to recreate the auxiliary table.") - ) - ) - } - - test("an auxiliary table with a malformed numKeyColumns property fails with INTERNAL_ERROR") { - val session = spark - import session.implicits._ - - val auxIdent = AutoCdcAuxiliaryTable.identifier( - fullyQualifiedIdentifier("target", Some(catalog), Some(namespace)) - ) - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - // Pre-create the auxiliary table with a non-integer numKeyColumns property; the pipeline - // should surface INTERNAL_ERROR rather than NumberFormatException. - spark.sql( - s"CREATE TABLE ${auxIdent.unquotedString} " + - s"(id INT NOT NULL, $cdcMetadataDdl) " + - s"TBLPROPERTIES ('${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = 'not-an-int')" - ) - - val stream = MemoryStream[(Int, Long)] - stream.addData((1, 1L)) - val ctx = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream.toDF().toDF("id", "version")), - keys = Seq("id"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx) } - checkErrorInPipelineFailure( - failure = ex, - condition = "INTERNAL_ERROR", - parameters = Map( - "message" -> - (s"Auxiliary table ${auxIdent.quotedString} has a malformed " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property: 'not-an-int'.") - ) - ) - } - - private def getAuxTableNumKeyColumns(target: String): Int = { - val auxName = auxTableNameFor(target) - val rows = spark.sql(s"SHOW TBLPROPERTIES $auxName").collect() - val prop = rows - .find(_.getString(0) == AutoCdcAuxiliaryTable.numKeyColumnsProperty) - .getOrElse(throw new AssertionError( - s"auxiliary table $auxName is missing the " + - s"${AutoCdcAuxiliaryTable.numKeyColumnsProperty} property; got: ${rows.toSeq}" - )) - prop.getString(1).toInt - } } diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala index f5e29f5434051..e952b5eefa356 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala @@ -293,65 +293,4 @@ class AutoCdcScd1MultiPipelineSuite ) } - test("a second pipeline targeting an existing AutoCDC table with different keys " + - "fails with KEY_SCHEMA_DRIFT") { - val session = spark - import session.implicits._ - - // Target table with both candidate keys present so the second pipeline would otherwise - // be schema-compatible with the first; only the AutoCDC `keys` differ between flows. - spark.sql( - s"CREATE TABLE $catalog.$namespace.shared_target " + - s"(id INT NOT NULL, name STRING NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - - // Pipeline #1: AutoCDC flow keyed on `id`. Materializes the auxiliary table with schema - // (id, _cdc_metadata). - val stream1 = MemoryStream[(Int, String, Long)] - stream1.addData((1, "alice", 1L)) - val ctx1 = new TestGraphRegistrationContext(spark) { - registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "flow_v1", - target = "shared_target", - query = dfFlowFunc(stream1.toDF().toDF("id", "name", "version")), - keys = Seq("id"), - sequencing = functions.col("version") - )) - } - runPipeline(ctx1) - - // Pipeline #2: completely separate graph, but targets the same physical `shared_target` - // table with `keys = Seq("name")`. - val stream2 = MemoryStream[(Int, String, Long)] - stream2.addData((2, "alice", 1L)) - val ctx2 = new TestGraphRegistrationContext(spark) { - registerTable("shared_target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "flow_v2", - target = "shared_target", - query = dfFlowFunc(stream2.toDF().toDF("id", "name", "version")), - keys = Seq("name"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx2) } - checkErrorInPipelineFailure( - failure = ex, - condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - sqlState = Some("42000"), - parameters = Map( - "flowName" -> - fullyQualifiedIdentifier("flow_v2", Some(catalog), Some(namespace)).unquotedString, - "auxTableName" -> auxTableNameFor("shared_target"), - // Pipeline #2's AutoCDC key resolves from the source DF, where `MemoryStream[(Int, String, - // Long)]` produces a nullable StringType for `name`. - "expectedKeySchema" -> "name STRING", - // Pipeline #1 persisted the aux table from a source DF whose `id` was a non-null Scala - // primitive (`Int`), so the recorded key carries `NOT NULL`. - "recordedKeySchema" -> "id INT NOT NULL" - ) - ) - } } diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala index 79bfc95b0eee9..e8d3c4b21144a 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -31,11 +31,11 @@ import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistratio import org.apache.spark.sql.test.SharedSparkSession /** - * Tests covering AutoCDC's interaction with schema evolution and schema drift across - * pipeline runs. The suite documents the supported additive cases (new top-level columns, - * new nested fields in array-of-struct, broadening / narrowing column selection) and the - * cases that fail loudly today (subtractive nested evolution, type-incompatible changes, - * key-set changes, case-only renames). + * Tests covering AutoCDC's interaction with schema evolution across pipeline runs. The + * suite documents the supported additive cases (new top-level columns, new nested fields + * in array-of-struct, broadening / narrowing column selection) and the cases that fail + * loudly today (subtractive nested evolution, type-incompatible changes, case-only + * renames). * * These behaviors are largely inherited from the lower layers (`SchemaMergingUtils` for * schema merge, the v2 writer's column-resolution layer for nested-field handling) rather @@ -194,290 +194,6 @@ class AutoCdcScd1SchemaEvolutionSuite ) } - test("expanding the AutoCDC key set between runs fails with KEY_SCHEMA_DRIFT") { - val session = spark - import session.implicits._ - - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(region STRING, id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - - // Run #1: keys = [id]. Auxiliary table is created with schema (id, _cdc_metadata) and - // num_key_columns = 1. - val stream1 = MemoryStream[(String, Int, Long)] - stream1.addData(("us", 1, 1L)) - val ctx1 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream1.toDF().toDF("region", "id", "version")), - keys = Seq("id"), - sequencing = functions.col("version") - )) - } - runPipeline(ctx1) - - // Run #2: keys = [region, id]. Recorded num_key_columns = 1, expected 2 -> length - // mismatch -> KEY_SCHEMA_DRIFT. - val stream2 = MemoryStream[(String, Int, Long)] - stream2.addData(("us", 1, 2L)) - val ctx2 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream2.toDF().toDF("region", "id", "version")), - keys = Seq("region", "id"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx2) } - checkErrorInPipelineFailure( - failure = ex, - condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - sqlState = Some("42000"), - parameters = Map( - "flowName" -> - fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, - "auxTableName" -> auxTableNameFor("target"), - "expectedKeySchema" -> "region STRING,id INT NOT NULL", - "recordedKeySchema" -> "id INT NOT NULL" - ) - ) - } - - test("shrinking the AutoCDC key set between runs fails with KEY_SCHEMA_DRIFT") { - val session = spark - import session.implicits._ - - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(region STRING NOT NULL, id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - - // Run #1: keys = [region, id]. Auxiliary table is created with schema - // (region, id, _cdc_metadata) and num_key_columns = 2. - val stream1 = MemoryStream[(String, Int, Long)] - stream1.addData(("us", 1, 1L)) - val ctx1 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream1.toDF().toDF("region", "id", "version")), - keys = Seq("region", "id"), - sequencing = functions.col("version") - )) - } - runPipeline(ctx1) - - // Run #2: keys = [id]. Recorded num_key_columns = 2, expected 1 -> length mismatch -> - // KEY_SCHEMA_DRIFT. Without the strict-equality check, `id` matches at position 0 of the - // existing aux schema and the dropped `region` key would silently slip through. - val stream2 = MemoryStream[(String, Int, Long)] - stream2.addData(("us", 1, 2L)) - val ctx2 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream2.toDF().toDF("region", "id", "version")), - keys = Seq("id"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx2) } - checkErrorInPipelineFailure( - failure = ex, - condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - sqlState = Some("42000"), - parameters = Map( - "flowName" -> - fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, - "auxTableName" -> auxTableNameFor("target"), - "expectedKeySchema" -> "id INT NOT NULL", - "recordedKeySchema" -> "region STRING,id INT NOT NULL" - ) - ) - } - - test("swapping a key column for a different one of the same arity fails with " + - "KEY_SCHEMA_DRIFT") { - val session = spark - import session.implicits._ - - // Target carries both candidate key columns (`region` and `country`) so the source DF is - // structurally compatible across both runs; only the AutoCDC `keys` declaration changes. - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(id INT NOT NULL, region STRING, country STRING, version BIGINT NOT NULL, " + - s"$cdcMetadataDdl)" - ) - - // Run #1: keys = [id, region]. Auxiliary table records (id, region, _cdc_metadata) and - // num_key_columns = 2. - val stream1 = MemoryStream[(Int, String, String, Long)] - stream1.addData((1, "us", "USA", 1L)) - val ctx1 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream1.toDF().toDF("id", "region", "country", "version")), - keys = Seq("id", "region"), - sequencing = functions.col("version") - )) - } - runPipeline(ctx1) - - // Run #2: same key arity (2), but `country` is swapped in for `region`. Recorded - // num_key_columns matches expected (2), but the second key column's name diverges - // (`country` vs persisted `region`) -> KEY_SCHEMA_DRIFT. This is the case the - // length-mismatch check would silently miss without per-position name equality. - val stream2 = MemoryStream[(Int, String, String, Long)] - stream2.addData((1, "us", "USA", 2L)) - val ctx2 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream2.toDF().toDF("id", "region", "country", "version")), - keys = Seq("id", "country"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx2) } - checkErrorInPipelineFailure( - failure = ex, - condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - sqlState = Some("42000"), - parameters = Map( - "flowName" -> - fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, - "auxTableName" -> auxTableNameFor("target"), - "expectedKeySchema" -> "id INT NOT NULL,country STRING", - "recordedKeySchema" -> "id INT NOT NULL,region STRING" - ) - ) - } - - test("a same-named key whose dataType differs from what the auxiliary table recorded " + - "fails with KEY_SCHEMA_DRIFT") { - val session = spark - import session.implicits._ - - // Synthetic setup: the target carries `id BIGINT NOT NULL` but a stale auxiliary table - // (recorded by a hypothetical earlier code path / earlier source schema) claims the recorded - // key was `id INT NOT NULL`. The flow's expected key columns therefore don't match the - // recorded key columns at the per-position dataType comparison even though the key arity and - // names line up. This is the case where naming-based comparisons alone would silently let - // through a contract change. - val auxIdent = AutoCdcAuxiliaryTable.identifier( - fullyQualifiedIdentifier("target", Some(catalog), Some(namespace)) - ) - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(id BIGINT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - spark.sql( - s"CREATE TABLE ${auxIdent.unquotedString} " + - s"(id INT NOT NULL, $cdcMetadataDdl) " + - s"TBLPROPERTIES ('${AutoCdcAuxiliaryTable.numKeyColumnsProperty}' = '1')" - ) - - val stream = MemoryStream[(Long, Long)] - stream.addData((1L, 1L)) - val ctx = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream.toDF().toDF("id", "version")), - keys = Seq("id"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx) } - checkErrorInPipelineFailure( - failure = ex, - condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - sqlState = Some("42000"), - parameters = Map( - "flowName" -> - fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, - "auxTableName" -> auxTableNameFor("target"), - "expectedKeySchema" -> "id BIGINT NOT NULL", - "recordedKeySchema" -> "id INT NOT NULL" - ) - ) - } - - test("reordering a composite key set across runs ([a,b] -> [b,a]) fails with " + - "KEY_SCHEMA_DRIFT") { - val session = spark - import session.implicits._ - - // Target carries both candidate key columns so the source DF is structurally compatible - // across both runs; only the AutoCDC `keys` declaration order changes. The auxiliary table - // records keys positionally in declared order, so a reorder must fail per-position name - // comparison. - spark.sql( - s"CREATE TABLE $catalog.$namespace.target " + - s"(a INT NOT NULL, b STRING NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" - ) - - // Run #1: keys = [a, b]. Auxiliary table records positions 0=a, 1=b. - val stream1 = MemoryStream[(Int, String, Long)] - stream1.addData((1, "x", 1L)) - val ctx1 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream1.toDF().toDF("a", "b", "version")), - keys = Seq("a", "b"), - sequencing = functions.col("version") - )) - } - runPipeline(ctx1) - - // Run #2: same arity (2) and same key set ({a, b}), but reordered. The validator should - // reject because of per-position name comparison: position 0 is `b` expected vs recorded - // `a`. - val stream2 = MemoryStream[(Int, String, Long)] - stream2.addData((1, "x", 2L)) - val ctx2 = new TestGraphRegistrationContext(spark) { - registerTable("target", catalog = Some(catalog), database = Some(namespace)) - registerFlow(autoCdcFlow( - name = "auto_cdc_flow", - target = "target", - query = dfFlowFunc(stream2.toDF().toDF("a", "b", "version")), - keys = Seq("b", "a"), - sequencing = functions.col("version") - )) - } - - val ex = intercept[RuntimeException] { runPipeline(ctx2) } - checkErrorInPipelineFailure( - failure = ex, - condition = "AUTOCDC_INVALID_STATE.KEY_SCHEMA_DRIFT", - sqlState = Some("42000"), - parameters = Map( - "flowName" -> - fullyQualifiedIdentifier("auto_cdc_flow", Some(catalog), Some(namespace)).unquotedString, - "auxTableName" -> auxTableNameFor("target"), - "expectedKeySchema" -> "b STRING NOT NULL,a INT NOT NULL", - "recordedKeySchema" -> "a INT NOT NULL,b STRING NOT NULL" - ) - ) - } - test("a new top-level nullable column appearing in the source DF between runs is " + "added to the target") { val session = spark From 9dacba874b1a03f308c4f07758e4d02fcaa02261 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 19:45:10 +0000 Subject: [PATCH 05/13] linting --- .../pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala index e8d3c4b21144a..bba9bc57fa2fb 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -651,7 +651,7 @@ class AutoCdcScd1SchemaEvolutionSuite checkAnswer( spark.table(s"$catalog.$namespace.target").select("id", "name", "version", "extra"), Seq( - Row(1, "alice", 1L, 42), // extra preserved on the upsert + Row(1, "alice", 1L, 42), // extra preserved on the upsert Row(2, "bob", 1L, null) // extra is NULL for inserts ) ) @@ -701,7 +701,9 @@ class AutoCdcScd1SchemaEvolutionSuite val ex = intercept[RuntimeException] { runPipeline(ctx2) } val all = Iterator(ex) ++ ex.getSuppressed.iterator assert( - all.exists(t => Option(t.getMessage).exists(_.contains("CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE"))), + all.exists(t => Option(t.getMessage).exists( + _.contains("CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE")) + ), s"Expected CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE failure, got: ${ex.getMessage}" ) } From b3ea8f6924791e847104f1e311af6dd9626e003f Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 19:49:06 +0000 Subject: [PATCH 06/13] document auxiliary table creation --- .../apache/spark/sql/pipelines/graph/FlowExecution.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index adc17b702a14b..3fc78e1d220a8 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -427,6 +427,14 @@ class Scd1MergeStreamingWrite( override def startStream(): StreamingQuery = { val sourceChangeDataFeed = graph.reanalyzeFlow(flow).df + + // The auxiliary table is created here (at flow execution) rather than during flow resolution + // or dataset materialization for two reasons: + // 1. It is an internal state store: we deliberately keep it out of the graph registration + // context's table set so that it is invisible to other flows and the [[DatasetManager]] + // will never materialize it. + // 2. Its format must match the target table's, which only exists after the target is + // materialized. Flow resolution must also stay side-effect free (e.g. for dry runs). val auxiliaryTableIdentifier = createAuxiliaryTableIfNotExists(spark = updateContext.spark) sourceChangeDataFeed.writeStream From fa6104bab183ae82778660d98631e7e664a974c7 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 20:30:26 +0000 Subject: [PATCH 07/13] PR feedback --- .../resources/error/error-conditions.json | 2 +- python/pyspark/pipelines/api.py | 5 ++ .../sql/pipelines/graph/DatasetManager.scala | 10 ++-- .../sql/pipelines/graph/FlowExecution.scala | 44 +++++++++------ .../graph/AutoCdcScd1MultiPipelineSuite.scala | 4 +- .../AutoCdcScd1SchemaEvolutionSuite.scala | 54 +++++++++++-------- 6 files changed, 72 insertions(+), 47 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e72611efdca92..9d84d638e6567 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -258,7 +258,7 @@ }, "AUTOCDC_TARGET_DOES_NOT_SUPPORT_MERGE" : { "message" : [ - "Cannot start AutoCDC flow because the target table (format: ) does not support row-level MERGE operations. AutoCDC requires a target table whose format implements `SupportsRowLevelOperations`." + "Cannot start AutoCDC flow: the target table (format: ) does not support row-level operations. AutoCDC requires a target backed by a connector that supports MERGE." ], "sqlState" : "0A000" }, diff --git a/python/pyspark/pipelines/api.py b/python/pyspark/pipelines/api.py index 578b28ec3793d..084547f4c2b19 100644 --- a/python/pyspark/pipelines/api.py +++ b/python/pyspark/pipelines/api.py @@ -556,6 +556,11 @@ def create_auto_cdc_flow( Note that for keys, sequence_by, column_list, and except_column_list the arguments have to be column identifiers without qualifiers, e.g. they cannot be col("sourceTable.keyId"). + The set and types of `keys` are part of the Auto CDC flow's persisted state. Changing keys + across incremental runs (renaming, swapping, growing, shrinking, or changing the type of a + key column) is not supported and will produce undefined behavior. To change the key set, + fully refresh the target table. + :param target: The name of the target table that receives the Auto CDC flow. :param source: The name of the CDC source to stream from. :param keys: The column or combination of columns that uniquely identify a row in the source \ diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala index 80c9c6d391c66..da2907eacae81 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala @@ -311,12 +311,10 @@ object DatasetManager extends Logging { // its name from [[AutoCdcReservedNames.prefix]], which is reserved across AutoCDC and // therefore cannot collide with a user-managed table. - // Intentionally DROP and not TRUNCATE for two reasons; First, the auxiliary table may - // contain table properties that represent stateful information (ex. SCD key count) that - // should not be carried forward on a full refresh. Second, the auxiliary table is an - // internal table and not part of the dataflow graph. That means it does not go through - // schema evolution like other tables and hence on a full refresh, we should explicitly - // drop the existing auxiliary table schema so it can be recomputed. + // Intentionally DROP and not TRUNCATE: the auxiliary table is an internal state store + // that is not part of the dataflow graph, so it does not participate in regular schema + // evolution like user tables do. On a full refresh we want a clean recreation against + // the new target schema rather than carrying forward the previous generation's layout. val auxiliaryTableId = AutoCdcAuxiliaryTable.identifier(table.identifier) context.spark.sql(s"DROP TABLE IF EXISTS ${auxiliaryTableId.quotedString}") diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index 3fc78e1d220a8..c9ab932d0edb1 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -327,9 +327,9 @@ object AutoCdcAuxiliaryTable { } /** - * Helper mixin for AutoCDC merge-based write flows. + * Base trait for AutoCDC merge-based write flows. */ -trait AutoCdcMergeWriteMixin { +trait AutoCdcMergeWriteBase { /** The spark session the AutoCDC flow is going to be planned in. */ protected def spark: SparkSession @@ -342,12 +342,19 @@ trait AutoCdcMergeWriteMixin { /** Full schema of the auxiliary table for this SCD type. */ protected def auxiliaryTableSchema: StructType - // Eagerly validate at construction time that the destination supports row-level ops. - requireDestinationSupportsRowLevelOps() - /** * Idempotently create the auxiliary table for [[destination]] if it does not already exist * and return its [[TableIdentifier]]. + * + * Note that this is `CREATE TABLE IF NOT EXISTS`: when the aux table already exists, its + * schema is left untouched and `auxiliaryTableSchema` is ignored. This is safe for the + * additive evolution we support today (new non-key columns on the target), but it means we + * do not catch user-side AutoCDC key drift here - if a subsequent run renames, swaps, + * grows, shrinks, or changes the type of a key column, the aux table keeps its old key + * layout and the resulting MERGE will fail downstream with a confusing error rather than a + * targeted "keys changed, full-refresh required" error. The contract is therefore that + * changing the AutoCDC key set requires fully refreshing the target table; see the + * [[create_auto_cdc_flow]] Python docstring for the user-facing version of this rule. */ protected def createAuxiliaryTableIfNotExists(spark: SparkSession): TableIdentifier = { val auxIdent = AutoCdcAuxiliaryTable.identifier(destination.identifier) @@ -368,7 +375,7 @@ trait AutoCdcMergeWriteMixin { * [[SupportsRowLevelOperations]], which is the V2 connector contract for MERGE/UPDATE/DELETE * with rewrite - all operations that the AutoCDC transformation executes. */ - private def requireDestinationSupportsRowLevelOps(): Unit = { + protected def requireDestinationSupportsRowLevelOps(): Unit = { val (catalog, v2Identifier) = resolveTableCatalog(spark, destination.identifier) val destinationTable = catalog.loadTable(v2Identifier) @@ -413,7 +420,7 @@ class Scd1MergeStreamingWrite( val trigger: Trigger, val destination: Table, val sqlConf: Map[String, String] -) extends StreamingFlowExecution with AutoCdcMergeWriteMixin { +) extends StreamingFlowExecution with AutoCdcMergeWriteBase { override def getOrigin: QueryOrigin = flow.origin @@ -422,12 +429,14 @@ class Scd1MergeStreamingWrite( /** * Resolved Spark [[DataType]] of the sequencing expression. */ - private def sequencingType: DataType = + private lazy val sequencingType: DataType = flow.df.select(flow.changeArgs.sequencing).schema.head.dataType override def startStream(): StreamingQuery = { + requireDestinationSupportsRowLevelOps() + val sourceChangeDataFeed = graph.reanalyzeFlow(flow).df - + // The auxiliary table is created here (at flow execution) rather than during flow resolution // or dataset materialization for two reasons: // 1. It is an internal state store: we deliberately keep it out of the graph registration @@ -437,19 +446,20 @@ class Scd1MergeStreamingWrite( // materialized. Flow resolution must also stay side-effect free (e.g. for dry runs). val auxiliaryTableIdentifier = createAuxiliaryTableIfNotExists(spark = updateContext.spark) + val foreachBatchHandler = Scd1ForeachBatchHandler( + batchProcessor = Scd1BatchProcessor( + changeArgs = flow.changeArgs, + resolvedSequencingType = sequencingType + ), + auxiliaryTableIdentifier = auxiliaryTableIdentifier, + targetTableIdentifier = destination.identifier + ) + sourceChangeDataFeed.writeStream .queryName(displayName) .option("checkpointLocation", checkpointPath) .trigger(trigger) .foreachBatch((batch: Dataset[Row], batchId: Long) => { - val foreachBatchHandler = Scd1ForeachBatchHandler( - batchProcessor = Scd1BatchProcessor( - changeArgs = flow.changeArgs, - resolvedSequencingType = sequencingType - ), - auxiliaryTableIdentifier = auxiliaryTableIdentifier, - targetTableIdentifier = destination.identifier - ) foreachBatchHandler.execute(batch, batchId) }) .start() diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala index e952b5eefa356..32f34923c480e 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1MultiPipelineSuite.scala @@ -54,8 +54,8 @@ class AutoCdcScd1MultiPipelineSuite s"(id INT NOT NULL, name STRING, version BIGINT NOT NULL, $cdcMetadataDdl)" ) - // Pipeline #1 only knows about `t_a`. Its auxiliary table cat.ns1.__auxiliary_..._t_a - // must not affect pipeline #2's `t_b`. + // Pipeline #1 only knows about `t_a`. Its auxiliary table + // cat.ns1.__spark_autocdc_aux_state_t_a must not affect pipeline #2's `t_b`. val streamA = MemoryStream[(Int, String, Long)] streamA.addData((1, "alice", 100L)) val ctxA = new TestGraphRegistrationContext(spark) { diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala index bba9bc57fa2fb..6df09cabfcacc 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -452,12 +452,13 @@ class AutoCdcScd1SchemaEvolutionSuite // the target's `value.b.c` column. stream.addData((1, 2L, 10, 99, 10), (3, 1L, 3, 99, 3)) val ex = intercept[RuntimeException] { runPipeline(buildCtx(includeC = false)) } - val all = Iterator(ex) ++ ex.getSuppressed.iterator - assert( - all.exists(t => Option(t.getMessage).exists(m => - m.contains("INCOMPATIBLE_DATA_FOR_TABLE") && m.contains("value") && m.contains("b") && - m.contains("c"))), - s"Expected INCOMPATIBLE_DATA_FOR_TABLE failure for value.b.c, got: ${ex.getMessage}" + checkErrorInPipelineFailure( + failure = ex, + condition = "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA", + parameters = Map( + "tableName" -> s"`$catalog`.`$namespace`.`target`", + "colName" -> "`value`.`b`.`c`" + ) ) } @@ -564,11 +565,13 @@ class AutoCdcScd1SchemaEvolutionSuite stream.addData((1, 2L, 10, 10, 99), (3, 1L, 3, 3, 99)) val ex = intercept[RuntimeException] { runPipeline(buildCtx(includeD = false)) } - val all = Iterator(ex) ++ ex.getSuppressed.iterator - assert( - all.exists(t => Option(t.getMessage).exists(m => - m.contains("INCOMPATIBLE_DATA_FOR_TABLE") && m.contains("vals"))), - s"Expected INCOMPATIBLE_DATA_FOR_TABLE failure for vals element, got: ${ex.getMessage}" + checkErrorInPipelineFailure( + failure = ex, + condition = "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA", + parameters = Map( + "tableName" -> s"`$catalog`.`$namespace`.`target`", + "colName" -> "`vals`.`element`.`b`.`d`" + ) ) } @@ -607,10 +610,16 @@ class AutoCdcScd1SchemaEvolutionSuite } val ex = intercept[RuntimeException] { runPipeline(ctx) } - val all = Iterator(ex) ++ ex.getSuppressed.iterator - assert( - all.exists(t => Option(t.getMessage).exists(_.contains("AMBIGUOUS_REFERENCE"))), - s"Expected AMBIGUOUS_REFERENCE failure, got: ${ex.getMessage}" + // The exact `name` and `referenceNames` parameters depend on internal merge-plan + // synthesis; the condition match is the meaningful invariant for this test. + checkErrorInPipelineFailure( + failure = ex, + condition = "AMBIGUOUS_REFERENCE", + parameters = Map( + "name" -> ".*", + "referenceNames" -> ".*" + ), + matchPVals = true ) } } @@ -699,12 +708,15 @@ class AutoCdcScd1SchemaEvolutionSuite } val ex = intercept[RuntimeException] { runPipeline(ctx2) } - val all = Iterator(ex) ++ ex.getSuppressed.iterator - assert( - all.exists(t => Option(t.getMessage).exists( - _.contains("CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE")) - ), - s"Expected CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE failure, got: ${ex.getMessage}" + checkErrorInPipelineFailure( + failure = ex, + condition = "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE", + sqlState = Some("42825"), + // `left` is the persisted (run #1) TIMESTAMP type; `right` is run #2's STRING. + parameters = Map( + "left" -> "\"TIMESTAMP\"", + "right" -> "\"STRING\"" + ) ) } } From 5e56a963bf471fcd28dbf9d951a92ae9e4a15e0e Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 21:00:45 +0000 Subject: [PATCH 08/13] linting and doc cleanup --- .../spark/sql/pipelines/graph/FlowExecution.scala | 10 ++-------- .../graph/AutoCdcScd1SchemaEvolutionSuite.scala | 2 +- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index c9ab932d0edb1..a45abda5cf478 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -347,14 +347,8 @@ trait AutoCdcMergeWriteBase { * and return its [[TableIdentifier]]. * * Note that this is `CREATE TABLE IF NOT EXISTS`: when the aux table already exists, its - * schema is left untouched and `auxiliaryTableSchema` is ignored. This is safe for the - * additive evolution we support today (new non-key columns on the target), but it means we - * do not catch user-side AutoCDC key drift here - if a subsequent run renames, swaps, - * grows, shrinks, or changes the type of a key column, the aux table keeps its old key - * layout and the resulting MERGE will fail downstream with a confusing error rather than a - * targeted "keys changed, full-refresh required" error. The contract is therefore that - * changing the AutoCDC key set requires fully refreshing the target table; see the - * [[create_auto_cdc_flow]] Python docstring for the user-facing version of this rule. + * schema is left untouched and `auxiliaryTableSchema` is ignored. For SCD1, they keys must be + * invariant across executions and the CDC metadata will always be present, so this is correct. */ protected def createAuxiliaryTableIfNotExists(spark: SparkSession): TableIdentifier = { val auxIdent = AutoCdcAuxiliaryTable.identifier(destination.identifier) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala index 6df09cabfcacc..766b66726cad9 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -661,7 +661,7 @@ class AutoCdcScd1SchemaEvolutionSuite spark.table(s"$catalog.$namespace.target").select("id", "name", "version", "extra"), Seq( Row(1, "alice", 1L, 42), // extra preserved on the upsert - Row(2, "bob", 1L, null) // extra is NULL for inserts + Row(2, "bob", 1L, null) // extra is NULL for inserts ) ) } From e4b562ecae71b719ebba4b6ab29b07d7cdcd95ab Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 21:26:58 +0000 Subject: [PATCH 09/13] PR feedbavk --- .../autocdc/Scd1BatchProcessor.scala | 2 +- .../sql/pipelines/graph/DatasetManager.scala | 2 +- .../spark/sql/pipelines/graph/Flow.scala | 2 +- .../sql/pipelines/graph/FlowExecution.scala | 18 ++++----- .../sql/pipelines/graph/FlowPlanner.scala | 37 +++++++++++-------- .../AutoCdcGraphExecutionTestMixin.scala | 2 +- .../AutoCdcScd1SinglePipelineSuite.scala | 4 +- 7 files changed, 36 insertions(+), 31 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala index 3c0d054ca57d5..0035f442fb00a 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala @@ -404,7 +404,7 @@ case class Scd1BatchProcessor( // When inserting a brand new row for a new key, construct column mappings from microbatch. // The microbatch's columns may be a strict subset of the target's columns -- e.g. the user // narrowed `column_list` between runs, or the source DF dropped a column. The target's - // columns can never be a strict subset of the microbatch's however, because SDP's schema + // columns can never be a strict subset of the microbatch's, however, because SDP's schema // evolution always unions old and new schemas onto the target. .insert(columnsToInsertOnNewKey) .merge() diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala index da2907eacae81..67948242a552b 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala @@ -308,7 +308,7 @@ object DatasetManager extends Logging { // that stale delete-tracking data and table properties are not carried forward into the new // table generation. We unconditionally issue the DROP for every fully-refreshed target; for // non-AutoCDC tables this is a no-op because [[AutoCdcAuxiliaryTable.identifier]] derives - // its name from [[AutoCdcReservedNames.prefix]], which is reserved across AutoCDC and + // its name from [[AutoCdcReservedNames.prefix]], which is reserved by AutoCDC and // therefore cannot collide with a user-managed table. // Intentionally DROP and not TRUNCATE: the auxiliary table is an internal state store diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala index 6f4f9cfcb0b30..9f357ef026b0f 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala @@ -272,7 +272,7 @@ class AutoCdcMergeFlow( } /** The DataType of the sequencing expression, derived once from the source change feed. */ - private val sequencingType: DataType = + private[graph] val sequencingType: DataType = df.select(changeArgs.sequencing).schema.head.dataType /** diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index a45abda5cf478..b920ee64a57c5 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.classic.SparkSession import org.apache.spark.sql.connector.catalog.{Identifier, SupportsRowLevelOperations, TableCatalog} +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.pipelines.autocdc.{ AutoCdcReservedNames, ChangeArgs, @@ -39,7 +40,7 @@ import org.apache.spark.sql.pipelines.autocdc.{ import org.apache.spark.sql.pipelines.graph.QueryOrigin.ExceptionHelpers import org.apache.spark.sql.pipelines.util.SparkSessionUtils import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger} -import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.util.ThreadUtils /** @@ -388,10 +389,13 @@ trait AutoCdcMergeWriteBase { spark: SparkSession, ident: TableIdentifier): (TableCatalog, Identifier) = { val catalogManager = spark.sessionState.catalogManager - val catalog = ident.catalog + val catalogPlugin = ident.catalog .map(catalogManager.catalog) .getOrElse(catalogManager.currentCatalog) - .asInstanceOf[TableCatalog] + val catalog = catalogPlugin match { + case t: TableCatalog => t + case _ => throw QueryCompilationErrors.missingCatalogTablesAbilityError(catalogPlugin) + } val namespace = ident.database.getOrElse( throw SparkException.internalError( s"Cannot resolve table identifier ${ident.quotedString}: namespace is unspecified." @@ -420,12 +424,6 @@ class Scd1MergeStreamingWrite( override protected def changeArgs: ChangeArgs = flow.changeArgs - /** - * Resolved Spark [[DataType]] of the sequencing expression. - */ - private lazy val sequencingType: DataType = - flow.df.select(flow.changeArgs.sequencing).schema.head.dataType - override def startStream(): StreamingQuery = { requireDestinationSupportsRowLevelOps() @@ -443,7 +441,7 @@ class Scd1MergeStreamingWrite( val foreachBatchHandler = Scd1ForeachBatchHandler( batchProcessor = Scd1BatchProcessor( changeArgs = flow.changeArgs, - resolvedSequencingType = sequencingType + resolvedSequencingType = flow.sequencingType ), auxiliaryTableIdentifier = auxiliaryTableIdentifier, targetTableIdentifier = destination.identifier diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala index 5860ca7389c8e..6fa9a0c06a391 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala @@ -74,28 +74,35 @@ class FlowPlanner( trigger = triggerFor(sf), checkpointPath = flowMetadata.latestCheckpointLocation ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported destination type: ${output.getClass.getSimpleName} for " + - s"streaming flow ${sf.identifier} (${flow.destinationIdentifier})" - ) + case _ => unsupportedDestinationType(sf, output) } case acmf: AutoCdcMergeFlow if acmf.changeArgs.storedAsScdType == ScdType.Type1 => val flowMetadata = FlowSystemMetadata(updateContext, acmf, graph) - new Scd1MergeStreamingWrite( - identifier = acmf.identifier, - flow = acmf, - graph = graph, - updateContext = updateContext, - checkpointPath = flowMetadata.latestCheckpointLocation, - trigger = triggerFor(acmf), - destination = output.asInstanceOf[Table], - sqlConf = acmf.sqlConf - ) + output match { + case o: Table => + new Scd1MergeStreamingWrite( + identifier = acmf.identifier, + flow = acmf, + graph = graph, + updateContext = updateContext, + checkpointPath = flowMetadata.latestCheckpointLocation, + trigger = triggerFor(acmf), + destination = o, + sqlConf = acmf.sqlConf + ) + case _ => unsupportedDestinationType(acmf, output) + } case _ => throw new UnsupportedOperationException( s"Unable to plan flow of type ${flow.getClass.getSimpleName}" ) } } + + private def unsupportedDestinationType(flow: ResolvedFlow, output: Output): Nothing = { + throw new UnsupportedOperationException( + s"Unsupported destination type: ${output.getClass.getSimpleName} for " + + s"flow ${flow.identifier} writing to ${flow.destinationIdentifier}" + ) + } } diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala index e752d460ec84d..5e2286a4fd56d 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala @@ -98,7 +98,7 @@ trait AutoCdcGraphExecutionTestMixin extends BeforeAndAfterEach { /** * Walk every [[Throwable]] reachable from `failure` via [[Throwable#getSuppressed]] and - * [[Throwable#getCause]] for the first [[SparkThrowable]] whose + * [[Throwable#getCause]], searching for the first [[SparkThrowable]] whose * [[SparkThrowable#getCondition]] equals `condition`, then run [[checkError]] against that * exception with all of its other arguments propagated through. */ diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala index 992dd89ffc05e..f06b8c4615339 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SinglePipelineSuite.scala @@ -108,7 +108,7 @@ class AutoCdcScd1SinglePipelineSuite runPipeline(ctx) // After all three events at seqs 1, 2, 3: row "alice2" wins as the highest-sequenced - // upsert; the delete at seq=2 is bounded by the seq=3 upsert. + // upsert; the delete at seq=2 is superseded by the seq=3 upsert. checkAnswer( spark.table(s"$catalog.$namespace.target"), Seq(Row(1, "alice2", 3L, cdcMeta(None, Some(3L)))) @@ -171,7 +171,7 @@ class AutoCdcScd1SinglePipelineSuite val session = spark import session.implicits._ - // Intentionally use a non-merge compatible catalog, whose default table format is parquet. + // Intentionally use a non-merge-compatible catalog, whose default table format is parquet. val catalog = TestGraphRegistrationContext.DEFAULT_CATALOG val database = TestGraphRegistrationContext.DEFAULT_DATABASE From fe77a7cbac642412f5e60253f8ea1cf0ac5f88d6 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 26 May 2026 23:45:57 +0000 Subject: [PATCH 10/13] fix schema evolution tests for empty tableName and queryContext The V2 writer's TableOutputResolver produces INCOMPATIBLE_DATA_FOR_TABLE errors with an empty `tableName` during plan analysis (the merge plan does not carry the target's catalog identifier through to the resolver call site), and AMBIGUOUS_REFERENCE now carries a SQL query context. Update the SCD1 schema-evolution test assertions to match. --- .../graph/AutoCdcScd1SchemaEvolutionSuite.scala | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala index 766b66726cad9..4c20b21ad57a5 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1SchemaEvolutionSuite.scala @@ -452,11 +452,14 @@ class AutoCdcScd1SchemaEvolutionSuite // the target's `value.b.c` column. stream.addData((1, 2L, 10, 99, 10), (3, 1L, 3, 99, 3)) val ex = intercept[RuntimeException] { runPipeline(buildCtx(includeC = false)) } + // The V2 writer's `TableOutputResolver` produces this error during plan analysis with + // an empty `tableName` because the merge plan it analyzes does not carry the target's + // catalog identifier through to the resolver call site. checkErrorInPipelineFailure( failure = ex, condition = "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA", parameters = Map( - "tableName" -> s"`$catalog`.`$namespace`.`target`", + "tableName" -> "``", "colName" -> "`value`.`b`.`c`" ) ) @@ -565,11 +568,12 @@ class AutoCdcScd1SchemaEvolutionSuite stream.addData((1, 2L, 10, 10, 99), (3, 1L, 3, 3, 99)) val ex = intercept[RuntimeException] { runPipeline(buildCtx(includeD = false)) } + // See the nested-struct test above for why `tableName` is empty here. checkErrorInPipelineFailure( failure = ex, condition = "INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA", parameters = Map( - "tableName" -> s"`$catalog`.`$namespace`.`target`", + "tableName" -> "``", "colName" -> "`vals`.`element`.`b`.`d`" ) ) @@ -619,7 +623,14 @@ class AutoCdcScd1SchemaEvolutionSuite "name" -> ".*", "referenceNames" -> ".*" ), - matchPVals = true + matchPVals = true, + queryContext = Array( + ExpectedContext( + fragment = s"`$catalog`.`$namespace`.`target`.`Value`", + start = 0, + stop = 27 + ) + ) ) } } From c850e09c41b51e9e467ecc1d195fe6814a8d7080 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Wed, 27 May 2026 01:28:39 +0000 Subject: [PATCH 11/13] PR feedback --- .../sql/pipelines/autocdc/ChangeArgs.scala | 16 +++++++++++--- .../sql/pipelines/graph/FlowExecution.scala | 22 +++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala index b975e06807f57..c475377ba5060 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala @@ -129,13 +129,23 @@ private[pipelines] object CaseSensitivityLabels { } /** The SCD (Slowly Changing Dimension) strategy for a CDC flow. */ -sealed trait ScdType +sealed trait ScdType { + /** + * Short, stable label for this SCD type. Persisted as table property on AutoCDC flow auxiliary + * tables. + */ + def label: String +} object ScdType { /** Representation for the standard SCD1 strategy. */ - case object Type1 extends ScdType + case object Type1 extends ScdType { + override val label: String = "SCD1" + } /** Representation for the standard SCD2 strategy. */ - case object Type2 extends ScdType + case object Type2 extends ScdType { + override val label: String = "SCD2" + } } /** diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index b920ee64a57c5..f427e6fbadc0f 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -325,6 +325,12 @@ object AutoCdcAuxiliaryTable { database = destination.database, catalog = destination.catalog ) + + /** + * Reserved table property key set on the auxiliary table to record which SCD strategy it + * serves. + */ + val scdTypePropertyKey: String = s"${PipelinesTableProperties.pipelinesPrefix}autocdc.scd_type" } /** @@ -357,10 +363,13 @@ trait AutoCdcMergeWriteBase { // target's format is unspecified (None), omit the USING clause and fall back to the // session's default source provider. val usingClause = destination.format.map(fmt => s"USING $fmt").getOrElse("") + val tblPropertiesClause = + s"TBLPROPERTIES ('${AutoCdcAuxiliaryTable.scdTypePropertyKey}' = " + + s"'${changeArgs.storedAsScdType.label}')" spark.sql( s"""CREATE TABLE IF NOT EXISTS |${auxIdent.quotedString} - |(${auxiliaryTableSchema.toDDL}) $usingClause""".stripMargin + |(${auxiliaryTableSchema.toDDL}) $usingClause $tblPropertiesClause""".stripMargin ) auxIdent } @@ -379,7 +388,12 @@ trait AutoCdcMergeWriteBase { errorClass = "AUTOCDC_TARGET_DOES_NOT_SUPPORT_MERGE", messageParameters = Map( "tableName" -> destination.identifier.quotedString, - "format" -> destination.format.getOrElse("") + "format" -> destination.format.orElse( + Option( + destinationTable.properties.get(TableCatalog.PROP_PROVIDER) + ) + ) + .getOrElse("") ) ) } @@ -420,13 +434,13 @@ class Scd1MergeStreamingWrite( val sqlConf: Map[String, String] ) extends StreamingFlowExecution with AutoCdcMergeWriteBase { + requireDestinationSupportsRowLevelOps() + override def getOrigin: QueryOrigin = flow.origin override protected def changeArgs: ChangeArgs = flow.changeArgs override def startStream(): StreamingQuery = { - requireDestinationSupportsRowLevelOps() - val sourceChangeDataFeed = graph.reanalyzeFlow(flow).df // The auxiliary table is created here (at flow execution) rather than during flow resolution From f99660bcd8dcb4e4a6a7dd6c639217e31d0f29ad Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Wed, 27 May 2026 04:39:43 +0000 Subject: [PATCH 12/13] PR feedback --- .../sql/pipelines/graph/DatasetManager.scala | 5 +-- .../sql/pipelines/graph/FlowExecution.scala | 6 +-- .../sql/pipelines/graph/FlowPlanner.scala | 37 ++++++++++++------- .../graph/AutoCdcScd1FullRefreshSuite.scala | 18 +++++---- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala index 67948242a552b..456edca8d1e22 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala @@ -306,10 +306,7 @@ object DatasetManager extends Logging { if (isFullRefresh) { // On full refresh, drop the AutoCDC auxiliary state associated with this table (if any) so // that stale delete-tracking data and table properties are not carried forward into the new - // table generation. We unconditionally issue the DROP for every fully-refreshed target; for - // non-AutoCDC tables this is a no-op because [[AutoCdcAuxiliaryTable.identifier]] derives - // its name from [[AutoCdcReservedNames.prefix]], which is reserved by AutoCDC and - // therefore cannot collide with a user-managed table. + // table generation. We unconditionally issue the DROP for every fully-refreshed target. // Intentionally DROP and not TRUNCATE: the auxiliary table is an internal state store // that is not part of the dataflow graph, so it does not participate in regular schema diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index f427e6fbadc0f..ea151830f5441 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -316,9 +316,9 @@ class SinkWrite( object AutoCdcAuxiliaryTable { /** - * Helper for deriving the auxiliary AutoCDC catalog table identifier from a target table. The - * derived name is anchored on [[AutoCdcReservedNames.prefix]] so it is unambiguously - * AutoCDC-managed and cannot collide with a user-managed table. + * Helper for deriving the auxiliary AutoCDC catalog table identifier from a target table. If a + * table exists with a name matching the name derived here, it is assumed to be an AutoCDC + * auxiliary table that should be managed by the pipeline. */ def identifier(destination: TableIdentifier): TableIdentifier = TableIdentifier( table = s"${AutoCdcReservedNames.prefix}aux_state_${destination.table}", diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala index 6fa9a0c06a391..8251780524a2d 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowPlanner.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.pipelines.graph +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.pipelines.autocdc.ScdType import org.apache.spark.sql.streaming.Trigger @@ -76,21 +77,29 @@ class FlowPlanner( ) case _ => unsupportedDestinationType(sf, output) } - case acmf: AutoCdcMergeFlow if acmf.changeArgs.storedAsScdType == ScdType.Type1 => - val flowMetadata = FlowSystemMetadata(updateContext, acmf, graph) - output match { - case o: Table => - new Scd1MergeStreamingWrite( - identifier = acmf.identifier, - flow = acmf, - graph = graph, - updateContext = updateContext, - checkpointPath = flowMetadata.latestCheckpointLocation, - trigger = triggerFor(acmf), - destination = o, - sqlConf = acmf.sqlConf + case acmf: AutoCdcMergeFlow => + acmf.changeArgs.storedAsScdType match { + case ScdType.Type1 => + val flowMetadata = FlowSystemMetadata(updateContext, acmf, graph) + output match { + case o: Table => + new Scd1MergeStreamingWrite( + identifier = acmf.identifier, + flow = acmf, + graph = graph, + updateContext = updateContext, + checkpointPath = flowMetadata.latestCheckpointLocation, + trigger = triggerFor(acmf), + destination = o, + sqlConf = acmf.sqlConf + ) + case _ => unsupportedDestinationType(acmf, output) + } + case ScdType.Type2 => + throw new AnalysisException( + errorClass = "AUTOCDC_SCD2_NOT_SUPPORTED", + messageParameters = Map.empty ) - case _ => unsupportedDestinationType(acmf, output) } case _ => throw new UnsupportedOperationException( diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala index bb5645e573d42..94ba7e20aed1f 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1FullRefreshSuite.scala @@ -169,11 +169,16 @@ class AutoCdcScd1FullRefreshSuite s"(id INT NOT NULL, version BIGINT NOT NULL, $cdcMetadataDdl)" ) - // Run #1: populate both targets at seq=10. + // streamA is replaced across runs because t_a is full-refreshed in run #2 (its streaming + // checkpoint is reset by full-refresh, so a fresh source is fine and matches the user-visible + // semantics). streamB is reused across runs because t_b is NOT full-refreshed -- its + // streaming checkpoint must resume against the same MemoryStream instance, otherwise the + // seq=5 assertion below could pass for the wrong reason (the source never produced seq=5 + // in run #2 instead of the aux watermark suppressing it). val streamA1 = MemoryStream[(Int, Long)] - val streamB1 = MemoryStream[(Int, Long)] + val streamB = MemoryStream[(Int, Long)] streamA1.addData((1, 10L)) - streamB1.addData((1, 10L)) + streamB.addData((1, 10L)) val ctx1 = new TestGraphRegistrationContext(spark) { registerTable("t_a", catalog = Some(catalog), database = Some(namespace)) registerTable("t_b", catalog = Some(catalog), database = Some(namespace)) @@ -187,7 +192,7 @@ class AutoCdcScd1FullRefreshSuite registerFlow(autoCdcFlow( name = "flow_b", target = "t_b", - query = dfFlowFunc(streamB1.toDF().toDF("id", "version")), + query = dfFlowFunc(streamB.toDF().toDF("id", "version")), keys = Seq("id"), sequencing = functions.col("version") )) @@ -196,9 +201,8 @@ class AutoCdcScd1FullRefreshSuite // Run #2: full refresh ONLY on t_a; t_b's auxiliary state must persist. val streamA2 = MemoryStream[(Int, Long)] - val streamB2 = MemoryStream[(Int, Long)] streamA2.addData((1, 5L)) // would have been suppressed pre-refresh; now wins - streamB2.addData((1, 5L)) // must be suppressed (auxiliary table retains seq=10) + streamB.addData((1, 5L)) // must be suppressed (auxiliary table retains seq=10) val ctx2 = new TestGraphRegistrationContext(spark) { registerTable("t_a", catalog = Some(catalog), database = Some(namespace)) registerTable("t_b", catalog = Some(catalog), database = Some(namespace)) @@ -212,7 +216,7 @@ class AutoCdcScd1FullRefreshSuite registerFlow(autoCdcFlow( name = "flow_b", target = "t_b", - query = dfFlowFunc(streamB2.toDF().toDF("id", "version")), + query = dfFlowFunc(streamB.toDF().toDF("id", "version")), keys = Seq("id"), sequencing = functions.col("version") )) From eed7d91f282f570cf0021b988a6bb85b452a28de Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 27 May 2026 11:22:56 +0000 Subject: [PATCH 13/13] Trigger CI