-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-55855][SQL][DML] DSv2 Transaction Management #55278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d048934
f44c3c0
934f20f
f20be52
6cf1365
0ea7e05
30e887b
0fc3049
dc9992c
3ed327b
38b6e31
6e33fa0
9f163a9
a46d78f
bd1efb8
21989de
f93daf9
391a1f3
6002929
dfbbc0e
32a591e
6094689
2faf237
5854cd0
097fa7c
76c5670
951ad43
dfd5f24
3262808
0cffb4b
cb6c361
009a604
367c66a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.connector.catalog; | ||
|
|
||
| import org.apache.spark.annotation.Evolving; | ||
| import org.apache.spark.sql.connector.catalog.transactions.Transaction; | ||
| import org.apache.spark.sql.connector.catalog.transactions.TransactionInfo; | ||
|
|
||
| /** | ||
| * A {@link CatalogPlugin} that supports transactions. | ||
| * <p> | ||
| * Catalogs that implement this interface opt in to transactional query execution. A catalog | ||
| * implementing this interface is responsible for starting transactions. | ||
| * | ||
| * @since 4.2.0 | ||
| */ | ||
| @Evolving | ||
| public interface TransactionalCatalogPlugin extends CatalogPlugin { | ||
|
|
||
| /** | ||
| * Begins a new transaction and returns a {@link Transaction} representing it. | ||
| * | ||
| * @param info metadata about the transaction being started. | ||
| */ | ||
| Transaction beginTransaction(TransactionInfo info); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.connector.catalog.transactions; | ||
|
|
||
| import org.apache.spark.annotation.Evolving; | ||
| import org.apache.spark.sql.connector.catalog.CatalogPlugin; | ||
| import org.apache.spark.sql.connector.catalog.TransactionalCatalogPlugin; | ||
|
|
||
| import java.io.Closeable; | ||
|
|
||
| /** | ||
| * Represents a transaction. | ||
| * <p> | ||
| * Spark begins a transaction with {@link TransactionalCatalogPlugin#beginTransaction} and | ||
| * executes read/write operations against the transaction's catalog. On success, Spark | ||
| * calls {@link #commit()}; on failure, Spark calls {@link #abort()}. In both cases Spark | ||
| * subsequently calls {@link #close()} to release resources. | ||
| * | ||
| * @since 4.2.0 | ||
| */ | ||
| @Evolving | ||
| public interface Transaction extends Closeable { | ||
|
|
||
| /** | ||
| * Returns the catalog associated with this transaction. This catalog is responsible for tracking | ||
| * read/write operations that occur within the boundaries of a transaction. This allows | ||
| * connectors to perform conflict resolution at commit time. | ||
| */ | ||
| CatalogPlugin catalog(); | ||
|
|
||
| /** | ||
| * Commits the transaction. All writes performed under it become visible to other readers. | ||
| * <p> | ||
| * The connector is responsible for detecting and resolving conflicting commits or throwing | ||
| * an exception if resolution is not possible. | ||
| * <p> | ||
| * This method will be called exactly once per transaction. Spark calls {@link #close()} | ||
| * immediately after this method returns. | ||
| * | ||
| * @throws IllegalStateException if the transaction has already been committed or aborted. | ||
| */ | ||
| void commit(); | ||
|
|
||
| /** | ||
| * Aborts the transaction, discarding any staged changes. | ||
| * <p> | ||
| * This method must be idempotent. If the transaction has already been committed or aborted, | ||
| * invoking it must have no effect. | ||
| * <p> | ||
| * Spark calls {@link #close()} immediately after this method returns. | ||
| */ | ||
| void abort(); | ||
|
|
||
| /** | ||
| * Releases any resources held by this transaction. | ||
| * <p> | ||
| * Spark always calls this method after {@link #commit()} or {@link #abort()}, regardless of | ||
| * whether those methods succeed or not. | ||
| * <p> | ||
| * This method must be idempotent. If the transaction has already been closed, | ||
| * invoking it must have no effect. | ||
| */ | ||
| @Override | ||
| void close(); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.connector.catalog.transactions; | ||
|
|
||
| import org.apache.spark.annotation.Evolving; | ||
|
|
||
| /** | ||
| * Metadata about a transaction. | ||
| * | ||
| * @since 4.2.0 | ||
| */ | ||
| @Evolving | ||
| public interface TransactionInfo { | ||
| /** | ||
| * Returns a unique identifier for this transaction. | ||
| */ | ||
| String id(); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -351,6 +351,33 @@ class Analyzer( | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns a copy of this analyzer that uses the given [[CatalogManager]] for all catalog | ||
| * lookups. All other configuration (extended rules, checks, etc.) is preserved. Used by | ||
| * [[QueryExecution]] to create a per-query analyzer for transactional operations for | ||
| * transaction-aware catalog resolution. | ||
| * | ||
| * IMPORTANT: any new extension point added to Analyzer must also be copied here, otherwise | ||
| * transaction-aware analyzer clones (created by QueryExecution) will silently miss those rules. | ||
| */ | ||
| def withCatalogManager(newCatalogManager: CatalogManager): Analyzer = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a fragile place. Am I right it is a hard requirement to delegate to the original analyzer here? @andreaschat-db @juliuszsompolski, thoughts?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How can we ensure once we add more state to the analyzer it is used correctly here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it is a hard requirement because otherwise we will silently drop all registered extensions. In general, anyone who adds a new extension needs to also amend this method. The method is defined in the Analyzer itself so it should not be that easy to miss. In any case, just to be sure, I added a new suite/test, i.e. AnalyzerExtensionPropagationSuite, where I use reflection to verify all known extensions. If anyone adds a new extension the test will fail and point to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd tighten the test a bit more, left some comments there. |
||
| val self = this | ||
| new Analyzer(newCatalogManager, sharedRelationCache) { | ||
| override val hintResolutionRules: Seq[Rule[LogicalPlan]] = self.hintResolutionRules | ||
| override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = self.extendedResolutionRules | ||
| override val postHocResolutionRules: Seq[Rule[LogicalPlan]] = self.postHocResolutionRules | ||
| override val extendedCheckRules: Seq[LogicalPlan => Unit] = self.extendedCheckRules | ||
| override val singlePassResolverExtensions: Seq[ResolverExtension] = | ||
| self.singlePassResolverExtensions | ||
| override val singlePassMetadataResolverExtensions: Seq[ResolverExtension] = | ||
| self.singlePassMetadataResolverExtensions | ||
| override val singlePassPostHocResolutionRules: Seq[Rule[LogicalPlan]] = | ||
| self.singlePassPostHocResolutionRules | ||
| override val singlePassExtendedResolutionChecks: Seq[LogicalPlan => Unit] = | ||
| self.singlePassExtendedResolutionChecks | ||
| } | ||
| } | ||
|
|
||
| override def execute(plan: LogicalPlan): LogicalPlan = { | ||
| AnalysisContext.withNewAnalysisContext { | ||
| executeSameContext(plan) | ||
|
|
@@ -458,7 +485,9 @@ class Analyzer( | |
| Batch("Simple Sanity Check", Once, | ||
| LookupFunctions), | ||
| Batch("Keep Legacy Outputs", Once, | ||
| KeepLegacyOutputs) | ||
| KeepLegacyOutputs), | ||
| Batch("Unresolve Relations", Once, | ||
| new UnresolveRelationsInTransaction(catalogManager)) | ||
| ) | ||
|
|
||
| override def batches: Seq[Batch] = earlyBatches ++ Seq( | ||
|
|
@@ -1015,7 +1044,7 @@ class Analyzer( | |
| // DataSourceV2Relation on each view access. Only dataframe temp view may contain it | ||
| // as it stores resolved plans directly. | ||
| case view: View if view.isTempViewStoringAnalyzedPlan => | ||
| view.copy(child = resolveTableReferences(view.child)) | ||
| view.copy(child = resolveTableReferencesInTempView(view.child)) | ||
| case p @ SubqueryAlias(_, view: View) => | ||
| p.copy(child = resolveViews(view, options)) | ||
| case _ => plan | ||
|
|
@@ -1024,17 +1053,43 @@ class Analyzer( | |
| // Unwrap temp views storing analyzed plans and resolve V2TableReference nodes in the child. | ||
| private def unwrapRelationPlan(plan: LogicalPlan): LogicalPlan = { | ||
| EliminateSubqueryAliases(plan) match { | ||
| case v: View if v.isTempViewStoringAnalyzedPlan => resolveTableReferences(v.child) | ||
| case v: View if v.isTempViewStoringAnalyzedPlan => resolveTableReferencesInTempView(v.child) | ||
| case other => other | ||
| } | ||
| } | ||
|
|
||
| // Resolve V2TableReference nodes in a plan. V2TableReference is only created for temp views | ||
| // (via V2TableReference.createForTempView), so we only need to resolve it when returning | ||
| // Resolve the write target of a V2 write command (batch or streaming). | ||
| private def resolveWriteTarget( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure I understand the purpose of this method.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The The alternative here would be to set to streaming in IIUC, streaming is a read side concept. For the write perspective, we write batches which are not streams any more. This is I think the reason we originally create a Thoughts? |
||
| write: LogicalPlan, | ||
| table: NamedRelation, | ||
| withNewTable: NamedRelation => LogicalPlan): LogicalPlan = { | ||
| table match { | ||
| case u: UnresolvedRelation if !u.isStreaming => | ||
| resolveRelation(u).map(unwrapRelationPlan).map { | ||
| case v: View => throw QueryCompilationErrors.writeIntoViewNotAllowedError( | ||
| v.desc.identifier, write) | ||
| case u: UnresolvedCatalogRelation => | ||
| throw QueryCompilationErrors.writeIntoV1TableNotAllowedError( | ||
| u.tableMeta.identifier, write) | ||
| case r: DataSourceV2Relation => withNewTable(r) | ||
| case _ => | ||
| throw QueryCompilationErrors.writeIntoTempViewNotAllowedError( | ||
| u.multipartIdentifier.quoted) | ||
| }.getOrElse(write) | ||
| case _ => write | ||
| } | ||
| } | ||
|
|
||
| // Resolve V2TableReference nodes inside temp view plans. These are created by | ||
| // V2TableReference.createForTempView. We only need to resolve it when returning | ||
| // the plan of temp views (in resolveViews and unwrapRelationPlan). | ||
| private def resolveTableReferences(plan: LogicalPlan): LogicalPlan = { | ||
| private def resolveTableReferencesInTempView(plan: LogicalPlan): LogicalPlan = { | ||
| plan.resolveOperatorsUp { | ||
| case r: V2TableReference => relationResolution.resolveReference(r) | ||
| case r: V2TableReference => | ||
| assert(r.context.isInstanceOf[V2TableReference.TemporaryViewContext], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I agree with this, it feels risky to enforce this. What about something like below? |
||
| s"""Expected TemporaryViewContext in temp view but got | ||
| |${r.context.getClass.getSimpleName}""".stripMargin) | ||
| relationResolution.resolveReference(r) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1057,23 +1112,11 @@ class Analyzer( | |
| case other => i.copy(table = other) | ||
| } | ||
|
|
||
| // TODO (SPARK-27484): handle streaming write commands when we have them. | ||
| case write: StreamingV2WriteCommand => | ||
| resolveWriteTarget(write, write.table, write.withNewTable) | ||
|
|
||
| case write: V2WriteCommand => | ||
| write.table match { | ||
| case u: UnresolvedRelation if !u.isStreaming => | ||
| resolveRelation(u).map(unwrapRelationPlan).map { | ||
| case v: View => throw QueryCompilationErrors.writeIntoViewNotAllowedError( | ||
| v.desc.identifier, write) | ||
| case u: UnresolvedCatalogRelation => | ||
| throw QueryCompilationErrors.writeIntoV1TableNotAllowedError( | ||
| u.tableMeta.identifier, write) | ||
| case r: DataSourceV2Relation => write.withNewTable(r) | ||
| case _ => | ||
| throw QueryCompilationErrors.writeIntoTempViewNotAllowedError( | ||
| u.multipartIdentifier.quoted) | ||
| }.getOrElse(write) | ||
| case _ => write | ||
| } | ||
| resolveWriteTarget(write, write.table, write.withNewTable) | ||
|
|
||
| case u: UnresolvedRelation => | ||
| resolveRelation(u).map(resolveViews(_, u.options)).getOrElse(u) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This statement can be misleading. It could be a transactional commit from the connector point view. I think a better message would be to highlight that if called within Transaction, this should stage changes but not propagate them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated both this and the one at BatchWrite.