From 384497bd28f14f125287eac9fd89b55e152acded Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Tue, 14 Apr 2026 19:02:39 +0000 Subject: [PATCH 01/21] slam Co-authored-by: Isaac --- .../org/apache/spark/internal/LogKeys.java | 4 + .../org/apache/spark/internal/Logging.scala | 5 + .../scala/org/apache/spark/SparkContext.scala | 2 + .../apache/spark/internal/config/Tests.scala | 7 + .../apache/spark/rdd/RDDOperationScope.scala | 22 +- .../apache/spark/scheduler/DAGScheduler.scala | 18 + .../spark/util/LastAttemptAccumulator.scala | 857 ++++++++++++++++++ .../apache/spark/sql/classic/Dataset.scala | 8 +- .../spark/sql/execution/QueryExecution.scala | 27 +- .../spark/sql/execution/SparkPlan.scala | 25 +- .../execution/adaptive/AQETestHelper.scala | 78 ++ .../adaptive/AdaptiveSparkPlanExec.scala | 20 +- .../adaptive/AdaptiveSparkPlanHelper.scala | 17 + .../exchange/ShuffleExchangeExec.scala | 27 +- .../metric/SQLLastAttemptAccumulator.scala | 412 +++++++++ .../metric/SQLLastAttemptMetric.scala | 113 +++ .../sql/execution/metric/SQLMetrics.scala | 8 +- .../metric/MetricsFailureInjectionSuite.scala | 364 ++++++++ ...SQLLastAttemptMetricIntegrationSuite.scala | 711 +++++++++++++++ .../SQLLastAttemptMetricPlanShapesSuite.scala | 490 ++++++++++ .../SQLLastAttemptMetricUnitSuite.scala | 189 ++++ .../metric/SQLMetricsTestUtils.scala | 24 + .../apache/spark/sql/test/SQLTestUtils.scala | 22 + 23 files changed, 3424 insertions(+), 26 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/metric/MetricsFailureInjectionSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricPlanShapesSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala diff --git a/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java b/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java index a13153429541c..977a6e3d7ed74 100644 --- a/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java +++ b/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java @@ -728,6 +728,10 @@ public enum LogKeys implements LogKey { SIGNAL, SINK, SIZE, + SLAM_INVALIDATE, + SLAM_STATE_SYSTEM_METRIC, + SLAM_STATE_USER_METRIC, + SLAM_UNEXPECTED_REASON, SLEEP_TIME, SLIDE_DURATION, SMALLEST_CLUSTER_INDEX, diff --git a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala index 810bdabebb38a..3fac57dbe5dda 100644 --- a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala @@ -95,6 +95,11 @@ class LogEntry(messageWithContext: => MessageWithContext) { def message: String = cachedMessageWithContext.message def context: java.util.Map[String, String] = cachedMessageWithContext.context + + def +(other: LogEntry): LogEntry = { + val combined = cachedMessageWithContext + other.cachedMessageWithContext + new LogEntry(combined) + } } /** diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index fad9bb522ad92..0262144490ce8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -3152,6 +3152,8 @@ object SparkContext extends Logging { private[spark] val RDD_SCOPE_KEY = "spark.rdd.scope" private[spark] val RDD_SCOPE_NO_OVERRIDE_KEY = "spark.rdd.scope.noOverride" private[spark] val SQL_EXECUTION_ID_KEY = "spark.sql.execution.id" + private[spark] val DATASET_QUERY_EXECUTION_ID_KEY = + "spark.sql.dataset.queryExecution.id" /** * Executor id for the driver. In earlier versions of Spark, this was ``, but this was diff --git a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala index 98b80317db982..0f23566ad7681 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala @@ -85,6 +85,13 @@ private[spark] object Tests { .booleanConf .createWithDefault(false) + val INJECT_SHUFFLE_FETCH_FAILURES = + ConfigBuilder("spark.testing.injectShuffleFetchFailures") + .doc("Injecting fetch failures for shuffle stages by providing an invalid BlockManager " + + "location for the first stage attempt. Testing only flag!") + .booleanConf + .createWithDefault(false) + val TEST_SKIP_ESS_REGISTER = ConfigBuilder("spark.testing.skipESSRegister") .version("4.0.0") .doc("None of Spark testing modes (local, local-cluster) enables shuffle service. So it is " + diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala index 49c259999a471..675c44153cd4d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala @@ -130,6 +130,22 @@ private[spark] object RDDOperationScope extends Logging { name: String, allowNesting: Boolean, ignoreParent: Boolean)(body: => T): T = { + withScope(sc, name, allowNesting, ignoreParent, + nextScopeId().toString)(body) + } + + /** + * Execute the given body such that all RDDs created in this body + * will have the same scope, with an explicit scope ID. + * + * Note: Return statements are NOT allowed in body. + */ + private[spark] def withScope[T]( + sc: SparkContext, + name: String, + allowNesting: Boolean, + ignoreParent: Boolean, + rddScopeId: String)(body: => T): T = { // Save the old scope to restore it later val scopeKey = SparkContext.RDD_SCOPE_KEY val noOverrideKey = SparkContext.RDD_SCOPE_NO_OVERRIDE_KEY @@ -139,10 +155,12 @@ private[spark] object RDDOperationScope extends Logging { try { if (ignoreParent) { // Ignore all parent settings and scopes and start afresh with our own root scope - sc.setLocalProperty(scopeKey, new RDDOperationScope(name).toJson) + sc.setLocalProperty(scopeKey, + new RDDOperationScope(name, None, rddScopeId).toJson) } else if (sc.getLocalProperty(noOverrideKey) == null) { // Otherwise, set the scope only if the higher level caller allows us to do so - sc.setLocalProperty(scopeKey, new RDDOperationScope(name, oldScope).toJson) + sc.setLocalProperty(scopeKey, + new RDDOperationScope(name, oldScope, rddScopeId).toJson) } // Optionally disallow the child body to override our scope if (!allowNesting) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 0c8d437c98bd9..e73bf558dc0c0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1830,6 +1830,11 @@ private[spark] class DAGScheduler( throw SparkCoreErrors.accessNonExistentAccumulatorError(id) } acc.merge(updates.asInstanceOf[AccumulatorV2[Any, Any]]) + if (acc.isInstanceOf[LastAttemptAccumulator[_, _, _]]) { + acc.asInstanceOf[LastAttemptAccumulator[_, _, _]].mergeLastAttempt( + updates, stage.rdd, event.taskInfo, + task.stageId, task.stageAttemptId, task.localProperties) + } // To avoid UI cruft, ignore cases where value wasn't updated if (acc.name.isDefined && !updates.isZero) { stage.latestInfo.accumulables(id) = acc.toInfo(None, Some(acc.value)) @@ -2311,6 +2316,19 @@ private[spark] class DAGScheduler( // The epoch of the task is acceptable (i.e., the task was launched after the most // recent failure we're aware of for the executor), so mark the task's output as // available. + // For testing purposes, inject fetch failures controlled from the driver-side by + // supplying an invalid location. + if (Utils.isTesting && + sc.conf.get(config.Tests.INJECT_SHUFFLE_FETCH_FAILURES) && + task.stageAttemptId == 0) { + val currentLocation = status.location + val invalidLocation = BlockManagerId( + execId = BlockManagerId.INVALID_EXECUTOR_ID, + host = currentLocation.host, + port = currentLocation.port, + topologyInfo = currentLocation.topologyInfo) + status.updateLocation(invalidLocation) + } val isChecksumMismatched = mapOutputTracker.registerMapOutput( shuffleStage.shuffleDep.shuffleId, smt.partitionId, status) if (isChecksumMismatched) { diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala new file mode 100644 index 0000000000000..b078e40e00634 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -0,0 +1,857 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +import scala.math.Ordering.Implicits._ +import scala.reflect.ClassTag +import scala.util.control.NonFatal + +import org.apache.spark.SparkContext +import org.apache.spark.internal.{LogEntry, Logging, LogKey, LogKeys} +import org.apache.spark.rdd.RDD +import org.apache.spark.scheduler.TaskInfo + +/* + * Last Attempt Accumulators are Accumulators that track the value of a metric aggregated across + * the "last execution" that produced the values. "Last execution" can be defined as: + * - For RDDs: the last execution of a given RDD partition, in the latest Stage and Stage attempt + * that recomputed it. + * - Across RDDs: lastAttemptValueForRDDId, lastAttemptValueForRDDIds, lastAttemptValueForAllRDDs, + * lastAttemptValueForHighestRDDId let specify that only values from specific RDDs should be + * aggregated. + * - For Spark SQL Execution: In SQLLastAttemptAccumulator, lastAttemptValueForDataset, + * lastAttemptValueForQueryExecution let specify that only values from the last SQL execution of + * a specific Dataset (or QueryExecution) should be aggregated. + * + * In specific situations the last attempt value cannot be computed. This is both because of known + * specific user actions (e.g. mixing driver updates with task updates), and because the + * accumulator performs (and logs) various internal sanity checks and bails out if it detects an + * unexpected situation. Therefore, all the lastAttempt methods return an Option[OUT], where None + * means that it has bailed out. + * + * Updates to the accumulator from completed Tasks are merged in mergeLastAttempt, called from + * DAGScheduler.updateAccumulators, called from DAGScheduler.handleTaskCompletion in the single + * threaded DAGScheduler event loop. Therefore, we don't need to worry about concurrency control + * when updating the accumulator values. However, reading of the last attempt value can potentially + * be done concurrently, so we use synchronization. When there is normally no contention, JVM + * synchronization should be very low overhead. + * + * In order to be able to provide last attempt value, we need to keep track of partial metric + * values, so that after a partial re-attempt the partial value can be updated, and then + * re-aggregated. + * There are various sources of re-attempts that we have to track: + * + * 1. Spark Core. + * ============== + * - Updates from failed tasks are filtered in Task.collectAccumulatorUpdates before they are + * even passed back to the driver. We don't need to worry about them here. + * - We should not get results from two successful attempts of a Task in the same Stage attempt. + * TaskSetManager.handleSuccessfulTask ensures that. + * - Therefore we only need to track Stage retries. The Last Attempt Metric will aggregate the + * metric value of a given RDD partition from the last attempt of the Stage with the highest + * stageId. + * Normally recomputation creates a new stageAttemptId in the same Stage, but there can also + * be multiple new Stages due to: + * - In AQE, a materialized QueryStage is submitted as a new Stage, which would normally get + * skipped, as it is already materialized. However, if results of that stage have been lost, + * the recomputation will happen in that Stage. + * - If the same Dataset with the same QueryExecution and same executedPlan is reused for + * another execution (e.g. again calling collect()). All map stages should be materialized, + * so like with AQE, they should be skipped, unless the results have been lost. Then, + * recomputation will happen in that Stage. The result stage computing the action will be + * fully re-executed. + * - Due to the async nature of cancellation, there can be tasks from previous attempts that + * arrive later than the last attempt. Therefore, we need to track and compare stageId and + * stageAttemptId of every computed RDD partition, in order to discard late comers. + * + * 2. Spark SQL. + * ============= + * LastAttemptAccumulator offers simple tracking of the last SQL execution, by assuming that + * the last execution will be in the scope of an RDD with the highestid, and using + * [[lastAttemptValueForHighestRDDId]. See [[SQLLastAttemptAccumulator]] for more possibilities + * of tracking SQL execution. + * + * Simple last SQL execution tracking + * ---------------------------------- + * Whenever an AQE replan happens, or a repeated execution is submitted, there will be a new + * RDD created for that execution. If AQE creates a new plan, it always uses it and cancels + * the previous one. So, aggregating the metric updates from the RDD with the highest id + * should correspond to the last execution and the latest AQE plan. + * This has some limitations, e.g. doesn't work if the same metric is used in multiple places + * in the query plan, and we want all occurrences to be aggregated together. + * It also wouldn't work if a SparkPlan splits it execution into multiple RDDs. This for example + * happens in BroadcastNestedLoopJoinExec with matchedStreamRows and notMatchedBroadcastRows. + * One can use this simple last attempt tracking by using lastAttemptLastRDDValue. + * + * 3. Driver only updates. + * ======================= + * Sometimes the metric is manipulated directly from the driver, not from within a Task. + * It can be either explicit by user code, or implicit by Catalyst Optimizer, for example + * ConvertToLocalRelation rule, folding a piece of the plan by evaluating it manually on the + * driver. + * When this happens, LastAttemptAccumulator has no information to reason about what was the + * last execution. If the only metric updates are coming from the driver, it assumes that these are + * the "last attempt". If there are both updates from executors and from the driver, it bails out. + * + * Implementation + * ============== + * To track the last attempts, we track a map of metric values per RDD id: + * - Map[RddId, LastAttemptRDDVals[PARTIAL]] + * + * In LastAttemptRDDVals we track an Array of per RDD partition partial merge values, together with + * the stageId and stageAttemptId and taskAttemptNumber to record task execution. + * We also track the RDD id, RDDScope id and last SQL execution id updating that RDD. + * + * Normally to merge partial values, two full Accumulators are used. However, accumulator classes + * that support Last Attempt have to implement partialMerge which merges PARTIAL type. + * This is used to have more compact representation, as PARTIAL can be e.g. a primitive type as + * opposed to a full AccumulatorV2 object instance. + */ + + +private class LastAttemptRDDVals[@specialized T]( + val rddId: Int, + val rddScopeId: Option[String], + // Arrays of partial metric values, and the corresponding stage, stage attempt and task attempt, + // with index representing RDD partition id. + // Metric updates to a given RDD partition can come from different stageAttempts if a retry + // happens while a Job with the Stage is running (a downstream Stage within a Job detects + // missing blocks and triggers recompute), or from different Stages, if a retry happens later + // (a new Job is submitted that depends on data from the RDD, if it finds it's missing it will + // recompute it in a new Stage). + // If a missing output is detected in a Stage while the stage is still running (e.g. executor + // is lost or decomissioned while the stage is running, and loses the output of some already + // finished tasks), a new Task with new taskAttemptNumber will be started for that Task. + // There may be multiple Tasks with different taskAttemptNumbers running in parallel due to + // speculation, but DAGScheduler guarantees that only one of them will reach metrics reporting, + // so it doesn't have to be dealt with here. + // + // There may be partitions that are either not computed at all (for example, due to early stop + // in take/limit), or AQE task coalescing may be visible as an update of the partition id of + // the first partition of the coalesced range. AQE guarantees that if these are retried, they + // will be coalesced in the same ranges, so update the same values. + // Not computed partitions should have EMPTY_ID in all the Int Arrays. + // + // Arrays of primitive types are more memory efficient than an array of objects due to + // references, object headers and paddings overheads. + // The `@specialized` annotation should make scala specialize it to use primitive array instead + // of boxed objects. + val partitionPartialVals: Array[T], + val stageIds: Array[Int], + val stageAttemptIds: Array[Int], + val taskAttemptNumbers: Array[Int]) + { + + // In a case of repeated execution of the same QueryExecution and reuse of the SparkPlan + // (for example multiple `collect()` on the same Dataset), a new RDD may be executed in the same + // RDDOperationScope for the new execution. Hence, we can have multiple RDDs with the same + // RDDOperationScope, coming from different SQL executions and we should only count the last one. + // However, it may also be an old RDD that is reused in the new execution, but needs to be + // partially recomputed because part of it is missing. In that case, the last attempt value needs + // to still be aggregated over the whole RDD, because the whole RDD is used in the new execution. + // Note that this only applies per RDDOperationScope/SparkPlan, because other plans in the same + // new execution may have reused their RDD in whole, and hence have the last SQL executionId + // come from an earlier execution. + // Note: This doesn't work in case a user concurrently executed multiple actions on the same + // Dataset, resulting in multiple concurrent executions trying to compute the same RDD. This + // however should not happen in practice and would likely produce other unexpected effects. + var lastSqlExecutionId: Option[Long] = None + + def numPartitions: Int = stageIds.length + + def isEmptyAt(partitionId: Int): Boolean = { + if (stageIds(partitionId) == LastAttemptRDDVals.EMPTY_ID) { + assert(stageAttemptIds(partitionId) == LastAttemptRDDVals.EMPTY_ID) + assert(taskAttemptNumbers(partitionId) == LastAttemptRDDVals.EMPTY_ID) + true + } else { + false + } + } + + def update(partialValue: AccumulatorPartialVal[T]): Unit = { + partitionPartialVals(partialValue.rddPartitionId) = partialValue.partialMergeVal + stageIds(partialValue.rddPartitionId) = partialValue.stageId + stageAttemptIds(partialValue.rddPartitionId) = partialValue.stageAttemptId + taskAttemptNumbers(partialValue.rddPartitionId) = partialValue.taskAttemptNumber + lastSqlExecutionId = partialValue.sqlExecutionId + } + + def partialValueAt(partId: Int): AccumulatorPartialVal[T] = { + AccumulatorPartialVal( + partialMergeVal = partitionPartialVals(partId), + rddId = rddId, + rddPartitionId = partId, + rddNumPartitions = stageIds.length, + rddScopeId = rddScopeId, + stageId = stageIds(partId), + stageAttemptId = stageAttemptIds(partId), + taskAttemptNumber = taskAttemptNumbers(partId), + sqlExecutionId = lastSqlExecutionId) + } + + override def toString: String = { + s"""LastAttemptVal( + | rddId=$rddId, + | rddScopeId=$rddScopeId, + | lastSqlExecutionId=$lastSqlExecutionId, + | partitionPartialVals=${partitionPartialVals.mkString("[", ",", "]")}, + | stageIds=${stageIds.mkString("[", ",", "]")}, + | stageAttemptIds=${stageAttemptIds.mkString("[", ",", "]")}, + | taskAttemptNumbers=${taskAttemptNumbers.mkString("[", ",", "]")} + |)""".stripMargin + } +} + +private object LastAttemptRDDVals { + // EMPTY_ID in stageId means that the partition was not computed. + val EMPTY_ID: Int = -1 + + def apply[@specialized T]( + rddId: Int, + rddScopeId: Option[String], + numPartitions: Int)(implicit ct: ClassTag[T]): LastAttemptRDDVals[T] = { + new LastAttemptRDDVals[T]( + rddId, + rddScopeId, + new Array[T](numPartitions), + Array.fill(numPartitions)(LastAttemptRDDVals.EMPTY_ID), + Array.fill(numPartitions)(LastAttemptRDDVals.EMPTY_ID), + Array.fill(numPartitions)(LastAttemptRDDVals.EMPTY_ID)) + } + + def createFromFirstUpdate[@specialized T]( + update: AccumulatorPartialVal[T])(implicit ct: ClassTag[T]): LastAttemptRDDVals[T] = { + val newVal = LastAttemptRDDVals[T]( + rddId = update.rddId, + rddScopeId = update.rddScopeId, + update.rddNumPartitions) + newVal.update(update) + newVal + } +} + +private class LastAttemptMap[K, V] { + // Map used to keep metric updates, keyed by RDD id or RDD scope id, backed by a List. + // In the majority of cases (when there are no stage retries and no AQE replanning + // cancelling already running stages), there will be only one key, so a list backed map + // should have less overhead. + // + // Accumulators are modified only from DAGScheduler.updateAccumulators -> mergeLastAttempt, + // which is running from a single thread (scheduling loop), so no concurrency control is needed + // for updates. Read accesses to an immutable list should use a consistent state without extra + // synchronization. + + @volatile private var map: List[(K, V)] = Nil + + def contains(key: K): Boolean = map.exists(_._1 == key) + + def get(key: K): Option[V] = map.collectFirst { case (k, v) if k == key => v } + + def put(key: K, value: V): Unit = synchronized { + map = (key, value) :: map.filterNot(_._1 == key) + } + + def keys: Iterable[K] = map.map(_._1) + def values: Iterable[V] = map.map(_._2) + def isEmpty: Boolean = map.isEmpty + def nonEmpty: Boolean = map.nonEmpty + def clear(): Unit = synchronized { map = Nil } + + override def toString: String = map + .map(elem => s"${elem._1} -> ${elem._2}").mkString("LastAttemptMap {\n", ",\n", "\n}") +} + +private case class AccumulatorPartialVal[PARTIAL]( + partialMergeVal: PARTIAL, + rddId: Int, + rddPartitionId: Int, + rddNumPartitions: Int, + rddScopeId: Option[String], + stageId: Int, + stageAttemptId: Int, + taskAttemptNumber: Int, + sqlExecutionId: Option[Long] +) { + override def toString: String = { + s"""AccumulatorPartialVal( + | partialMergeVal=$partialMergeVal, + | rddId=$rddId, + | rddPartitionId=$rddPartitionId, + | rddNumPartitions=$rddNumPartitions, + | rddScopeId=$rddScopeId, + | stageId=$stageId, + | stageAttemptId=$stageAttemptId, + | taskAttemptNumber=$taskAttemptNumber, + | sqlExecutionId=$sqlExecutionId + |)""".stripMargin + } + + /** Tuple of stage id, stage attempt id and taskAttemptNumber, defining the order of attempts. */ + val attempt: (Int, Int, Int) = (stageId, stageAttemptId, taskAttemptNumber) +} + +/** + * A trait that can be mixed-in into a subclass an [[AccumulatorV2]] to make it track the "logical" + * value of the "last attempt" of the execution using the accumulator - aggregated from the last + * attempts of any Task that calculated some RDD partitions and used this accumulator, and + * discarding any values coming from earlier attempts that have been recomputed. + * If the accumulator is used by multiple RDDs, the last attempt value is tracked separately for + * each, and can be retrieved for each or all of them separately, see lastAttemptValueForX methods. + * If the accumulator is used directly on the Spark Driver using the [[AccumulatorV2.set]] or + * [[AccumulatorV2.add]], that value is considered the last attempt value. + * If the accumulator was both used in Tasks and updated directly on the driver, it can't determine + * what should be considered the last attempt, and lastAttemptValueForX methods will return None. + * + * [[LastAttemptAccumulator]] is not reset by the [[AccumulatorV2.reset]] method implementation, + * and its state is not copied by the [[AccumulatorV2.copy]] method implementation, and it should + * not be serialized to the Executors. The internal state should only be initialized by the + * [[initializeLastAttemptAccumulator]] method on the "main" instance of the accumulator, that was + * created and registered with [[AccumulatorContext]] with [[AccumulatorV2.register()]]. All the + * interfaces of [[LastAttemptAccumulator]]: [[mergeLastAttempt]] (used only by DAGScheduler) and + * lastAttemptValueForX, [[logAccumulatorState]] (used by the using code) should only be invoked on + * that instance, on the Spark Driver. + * + * The [[LastAttemptAccumulator]] is not thread-safe. [[mergeLastAttempt]] should only be used by + * DAGScheduler, by the scheduler thread. Retrieving the value using lastAttemptValueForXXX while + * it is concurrently updated (execution is running) can produce some inconsistencies, but should + * not crash. + * If an RDD using the [[LastAttemptAccumulator]] is used concurrently by multiple actions that + * all try to recompute it, it may produce unexpected results and the semantics of what is "last + * attempt" becomes ambiguous. This should not be done in practice, and will likely result in more + * unexpected behaviours in Spark. + * + * Implementations must implement [[partialMergeVal]] and [[partialMerge]] methods operating on + * [[PARTIAL]] type. In regular [[AccumulatorV2]] implementations, the [[AccumulatorV2]] object + * itself hold the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is used + * to merge these objects together. [[LastAttemptAccumulator]] needs to keep track of partial + * values of every partition of every RDD that used the accumulator, and holding a full + * [[AccumulatorV2]] object for each would have a high overhead. Therefore, an implementation should + * be able to return [[PARTIAL]] value from [[partialMergeVal]] that represents an intermediate + * mergeable value. and a [[partialMerge]] method that can merge that value into the accumulator. + * Implementations must also implement an [[isMergeable]] method that checks if the other + * [[AccumulatorV2]] is of a compatible type to be merged with this using [[partialMergeVal]]. In + * regular [[AccumulatorV2]] implementations, this check is normally done inside the + * [[AccumulatorV2.merge]] method, which is not used here. + * + * If an implementation is used to keep user data in the accumulator, it should override + * [[accumulatorStoresUserData]] to return true, to ensure correct structured logging annotation. + * Otherwise it should override it to false. + */ +trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { + this: AccumulatorV2[IN, OUT] => + + // For every RDD that participated in the computation of this accumulator, keep the partial + // value of the accumulator for the latest stage and stage attempt that computed it. + // Keyed by rdd.id. + // Only kept and accessed on the driver, in the instance of the LastAttemptAccumulator that was + // created and registered with AccumulatorContext with AccumulatorV2.register(). + // Should not be copied / reset by the implementation of copy() / reset() functions. + // Transient: only needed on the driver and doesn't need to be serialized. + @transient + private var lastAttemptRddsMap: LastAttemptMap[Int, LastAttemptRDDVals[PARTIAL]] = _ + + // ClassTag for PARTIAL, captured at initialization time. + @transient private var partialClassTag: ClassTag[PARTIAL] = _ + + // Metric value set directly on the driver, not from within a task. + // Only kept and accessed on the driver, in the instance of the LastAttemptAccumulator that was + // created and registered with AccumulatorContext with AccumulatorV2.register(). + // Should not be copied / reset by the implementation of copy() / reset() functions. + // Transient: only needed on the driver and doesn't need to be serialized. + @transient + private var lastAttemptDirectDriverValue: Option[OUT] = _ + + // Flipped to true if unexpected metrics updates are received and we can no longer reason + // about the last attempt. + // Should not be copied / reset by the implementation of copy() / reset() functions. + // Transient: only needed on the driver and doesn't need to be serialized. + @transient + protected var lastAttemptAccumulatorInvalid: Boolean = false + + // Indicates that the LastAttemptAccumulator has been initialized. + // It is initialized in assertValid(). + // Should not be copied / reset by the implementation of copy() / reset() functions. + // Transient: only needed on the driver and doesn't need to be serialized. + @transient + protected var lastAttemptAccumulatorInitialized: Boolean = false + + /** Reset the state of the last attempt accumulator, discarding all the past attempts, and + * making it valid again if it was invalidated. */ + def resetLastAttemptAccumulator(): Unit = try { + lastAttemptRddsMap.clear() + lastAttemptDirectDriverValue = None + lastAttemptAccumulatorInvalid = false + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in resetLastAttemptAccumulator", + exception = Some(e)) + } + + def initializeLastAttemptAccumulator()(implicit ct: ClassTag[PARTIAL]): Unit = try { + assert(isAtDriverSide) + assert(!lastAttemptAccumulatorInitialized) + assert(!lastAttemptAccumulatorInvalid) + assert(lastAttemptRddsMap == null) + assert(lastAttemptDirectDriverValue == null) + partialClassTag = ct + lastAttemptRddsMap = new LastAttemptMap[Int, LastAttemptRDDVals[PARTIAL]] + lastAttemptDirectDriverValue = None + lastAttemptAccumulatorInitialized = true + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in initializeLastAttemptAccumulator", + exception = Some(e)) + } + + private def accumulatorId: Long = { + // This can throw if this is a copy/serialized accumulator, + // not the instance registered with AccumulatorContext. + // Catch it so we can safely use it for logging in unexpected situations. + try { + this.id + } catch { + case NonFatal(e) => + logWarning(log"Unexpected exception in getting accumulator id", e) + -1L // needs to be a long for LogKeys.ACCUMULATOR_ID + } + } + + /** Log entry to log debug information about the internal state of the accumulator. */ + def logAccumulatorState: LogEntry = try { + log"""LastAttemptAccumulator id=${MDC(LogKeys.ACCUMULATOR_ID, accumulatorId)}: + |Invalidated: ${MDC(LogKeys.SLAM_INVALIDATE, lastAttemptAccumulatorInvalid)}. + |Direct driver value: ${MDC(logKeyAccumulatorState, lastAttemptDirectDriverValue)}. + |Value: ${MDC(logKeyAccumulatorState, value)}. + |lastAttemptRddsMap: + |${MDC(logKeyAccumulatorState, lastAttemptRddsMap)}.""" + .stripMargin + } catch { + case NonFatal(e) => + logWarning(log"Unexpected exception in logAccumulatorState", e) + log"" + } + + private def logAccumulatorUpdate( + newAccumPartialValue: Option[AccumulatorPartialVal[PARTIAL]] = None, + oldAccumPartialValue: Option[AccumulatorPartialVal[PARTIAL]] = None): LogEntry = try { + log"""Old partial RDD value: ${MDC(logKeyAccumulatorState, oldAccumPartialValue)}. + |New partial RDD value: ${MDC(logKeyAccumulatorState, newAccumPartialValue)}.""" + .stripMargin + } catch { + case NonFatal(e) => + logWarning(log"Unexpected exception in logAccumulatorUpdate", e) + log"" + } + + private def unexpectedLastAttemptMetricUpdate( + invalidate: Boolean, + reason: String, + exception: Option[Throwable] = None, + newAccumPartialValue: Option[AccumulatorPartialVal[PARTIAL]] = None, + oldAccumPartialValue: Option[AccumulatorPartialVal[PARTIAL]] = None): Unit = { + // TODO(LC-6501): Add usage logging events with structured logging. + val logEntry = + log"""Unexpected last attempt tracking for accumulator ${ + MDC(LogKeys.ACCUMULATOR_ID, accumulatorId)}. + |Invalidate: ${MDC(LogKeys.SLAM_INVALIDATE, invalidate)}. + |Reason: ${MDC(LogKeys.SLAM_UNEXPECTED_REASON, reason)}. + |""".stripMargin + + log"State:\n" + logAccumulatorState + + log"Update:\n" + logAccumulatorUpdate(newAccumPartialValue, oldAccumPartialValue) + exception match { + case Some(e) => logWarning(logEntry, e) + case None => logWarning(logEntry) + } + if (invalidate) { + lastAttemptAccumulatorInvalid = true + } + if (Utils.isTesting && lastAttemptAccumulatorInitialized && exception.isDefined) { + // If this is a test, rethrow the exception. + // (Rethrow only if lastAttemptAccumulatorInitialized. In some tests, we check for proper + // graceful handling of unexpected exceptions in accumulators that are not properly + // initialized, so we don't want to throw there.) + throw exception.get + } + } + + protected def unexpectedLastAttemptMetricOperation( + invalidate: Boolean, + reason: String, + exception: Option[Throwable] = None): Unit = { + // subclasses don't have visibility of private class AccumulatorPartialVal. + unexpectedLastAttemptMetricUpdate( + invalidate = invalidate, + reason = reason, + exception = exception, + newAccumPartialValue = None, + oldAccumPartialValue = None) + } + + /** Set of assertions that should always hold for a valid [[LastAttemptAccumulator]]. */ + protected def assertValid(): Unit = { + assert(lastAttemptAccumulatorInitialized) + assert(!lastAttemptAccumulatorInvalid) + assert(isAtDriverSide) + assert(metadata != null) + assert(!metadata.countFailedValues) + assert(lastAttemptDirectDriverValue.isEmpty || lastAttemptRddsMap.isEmpty) + } + + /** + * Accumulator subclasses where metric values can contain user data (for example, maximum of + * processed values, observable metrics) as opposed to system measurements (for example, count + * of processed rows) should return true to ensure correct structured logging annotation. + */ + protected def accumulatorStoresUserData: Boolean + + protected def logKeyAccumulatorState: LogKey = { + if (accumulatorStoresUserData) { + LogKeys.SLAM_STATE_USER_METRIC + } else { + LogKeys.SLAM_STATE_SYSTEM_METRIC + } + } + + /** Return intermediate value of PARTIAL type that can be merged together by partialMerge. */ + protected def partialMergeVal: PARTIAL + + /** Merge together partial values of PARTIAl type returned by partialMergeVal. */ + protected def partialMerge(otherVal: PARTIAL): Unit + + /** Check if the other accumulator is mergeable with this one. */ + protected def isMergeable(other: AccumulatorV2[_, _]): Boolean + + /** + * Check if the value is set on the driver side, not from within a task. + * This must be called from `add` and `set` methods of any AccumulatorV2 subclass supporting + * last attempt metrics to set what the `value` of the metric is after the operation. + */ + protected def setValueIfOnDriverSide(value: OUT): Unit = try { + if (isAtDriverSide && lastAttemptAccumulatorInitialized && !lastAttemptAccumulatorInvalid) { + // Direct update on the driver, not from within a task. + // This gives little information about the source of the update, so we can't reason about + // "last attempt" if it's mixed with non-driver updates. + lastAttemptDirectDriverValue = Some(value) + if (lastAttemptRddsMap.nonEmpty) { + unexpectedLastAttemptMetricUpdate( + invalidate = true, + reason = "Incoming direct driver value while task updates exist") + } + } + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in setValueIfOnDriverSide", + exception = Some(e)) + } + + /** + * It needs Task and Stage information to reason about the last attempt. + * + * Called from a single thread in DAGScheduler, no synchronization needed. + * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + */ + private[spark] def mergeLastAttempt( + other: AccumulatorV2[_, _], + rdd: RDD[_], + taskInfo: TaskInfo, + stageId: Int, + stageAttemptId: Int, + localProperties: java.util.Properties): Unit = try { + implicit val ct: ClassTag[PARTIAL] = partialClassTag + if (lastAttemptAccumulatorInvalid) return + // Skip zero-value updates. They contribute nothing to the aggregate and can come + // from stages where the accumulator was present in the task closure but never incremented. + if (other.isZero) return + assertValid() + + if (!isMergeable(other)) { + // This should never happen. + unexpectedLastAttemptMetricUpdate( + invalidate = true, + "Merging accumulators of different types") + return + } + + if (!other.isInstanceOf[LastAttemptAccumulator[_, _, _]]) { + // This should never happen. + unexpectedLastAttemptMetricUpdate( + invalidate = true, + "Merging with accumulator which is not SLAM") + return + } + val lastAttemptOther = other + .asInstanceOf[LastAttemptAccumulator[IN, OUT, PARTIAL]] + + val update = AccumulatorPartialVal( + partialMergeVal = lastAttemptOther.partialMergeVal, + rddId = rdd.id, + rddPartitionId = taskInfo.partitionId, + rddNumPartitions = rdd.getNumPartitions, + rddScopeId = rdd.scope.map(_.id), + stageId = stageId, + stageAttemptId = stageAttemptId, + taskAttemptNumber = taskInfo.attemptNumber, + sqlExecutionId = + Option(localProperties.getProperty(SparkContext.SQL_EXECUTION_ID_KEY)).map(_.toLong)) + + if (lastAttemptDirectDriverValue.nonEmpty) { + unexpectedLastAttemptMetricUpdate(invalidate = true, + "Incoming task updates while direct driver value exists", + newAccumPartialValue = Some(update)) + return + } + + lastAttemptRddsMap.get(update.rddId) match { + case Some(oldRDDValue) => // This RDD was already seen. + val oldValue = oldRDDValue.partialValueAt(update.rddPartitionId) + + logTrace(log"mergeLastAttempt existing RDD update:\n" + + log"${MDC(logKeyAccumulatorState, oldRDDValue)}\n" + + logAccumulatorUpdate( + newAccumPartialValue = Some(update), oldAccumPartialValue = Some(oldValue))) + + // Check basic consistency + if (oldValue.rddNumPartitions != update.rddNumPartitions) { + unexpectedLastAttemptMetricUpdate( + invalidate = true, + reason = "RDD with changing number of partitions", + newAccumPartialValue = Some(update), + oldAccumPartialValue = Some(oldValue)) + return + } + if (oldValue.rddScopeId != update.rddScopeId) { + unexpectedLastAttemptMetricUpdate( + invalidate = true, + reason = "RDD with changing RDDOperationScope", + newAccumPartialValue = Some(update), + oldAccumPartialValue = Some(oldValue)) + return + } + + if (oldRDDValue.isEmptyAt(update.rddPartitionId)) { + // No previous attempt for this RDD partition. + oldRDDValue.update(update) + } else { + if (update.attempt > oldValue.attempt) { + // New last attempt for this RDD partition. + oldRDDValue.update(update) + } else if (update.attempt == oldValue.attempt) { + // Same attempt, should not happen. + unexpectedLastAttemptMetricUpdate( + invalidate = true, + reason = "Same stage, stageAttemptId and taskAttemptNumber reported multiple times", + newAccumPartialValue = Some(update), + oldAccumPartialValue = Some(oldValue)) + } + // else: Older attempt reported after newer attempt. Not fatal, discard it. + } + + case None => // First time we see this RDD. + logTrace(log"mergeLastAttempt new RDD update:\n" + logAccumulatorUpdate( + newAccumPartialValue = Some(update), oldAccumPartialValue = None)) + val newVal = LastAttemptRDDVals.createFromFirstUpdate(update) + lastAttemptRddsMap.put(update.rddId, newVal) + } + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricUpdate( + invalidate = true, + reason = "Unexpected exception in mergeLastAttempt", + exception = Some(e)) + } + + /** Accumulates last attempt values from given RDD into an acc. */ + private def lastAttemptValueAggregateInternal(rddId: Int, acc: this.type) = { + // Note: even if the given RDD is not present, we can't tell if it executed but just never + // updated this accumulator, so we still report the zero value back. + for { + lastAttemptVal <- lastAttemptRddsMap.get(rddId) + partitionId <- lastAttemptVal.partitionPartialVals.indices + } { + // Some partitions may not be computed. + // May be because of operations like take. + // May be because of AQE coalescing executing tasks covering multiple partitions. + if (!lastAttemptVal.isEmptyAt(partitionId)) { + acc.partialMerge(lastAttemptVal.partitionPartialVals(partitionId)) + } + } + } + + /** + * Returns the last attempt value of this accumulator, aggregated from a set of RDDs. + * + * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForRDDIds(rddIds: Seq[Int]): Option[OUT] = try { + if (lastAttemptAccumulatorInvalid) return None + assertValid() + if (lastAttemptDirectDriverValue.isDefined) { + // return zero value if there is no RDD execution recorded. + return Some(copyAndReset().asInstanceOf[this.type].value) + } + + val acc = copyAndReset().asInstanceOf[this.type] + rddIds.distinct.foreach(lastAttemptValueAggregateInternal(_, acc)) + Some(acc.value) + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in lastAttemptValueForRDDs", + exception = Some(e)) + None + } + + /** + * Returns the last attempt value of this accumulator, aggregated from a specific RDD. + * + * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForRDDId(rddId: Int): Option[OUT] = try { + lastAttemptValueForRDDIds(Seq(rddId)) + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in lastAttemptValueForRDD", + exception = Some(e)) + None + } + + /** + * Returns the last attempt value of this accumulator, aggregated from all RDDs that ever + * returned any values for it. + * + * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForAllRDDs(): Option[OUT] = try { + if (lastAttemptAccumulatorInvalid) return None + assertValid() + if (lastAttemptDirectDriverValue.isDefined) return lastAttemptDirectDriverValue + lastAttemptValueForRDDIds(lastAttemptRddsMap.keys.toSeq) + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in lastAttemptValueForAllRDDs", + exception = Some(e)) + None + } + + /** + * Returns the last attempt value of this accumulator, aggregated from the RDD with the highest + * id that ever returned any values for it. + * + * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForHighestRDDId(): Option[OUT] = try { + if (lastAttemptAccumulatorInvalid) return None + assertValid() + if (lastAttemptDirectDriverValue.isDefined) return lastAttemptDirectDriverValue + + if (lastAttemptRddsMap.nonEmpty) { + lastAttemptValueForRDDId(lastAttemptRddsMap.keys.max) + } else { + // return zero value if there is no RDD execution recorded. + Some(copyAndReset().asInstanceOf[this.type].value) + } + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in lastAttemptValueForHighestRDDId", + exception = Some(e)) + None + } + + /** + * Returns the last attempt value of this accumulator, aggregated from RDDs with given scope ids. + * + * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForRDDScopes(rddScopeIds: Seq[String]): Option[OUT] = try { + if (lastAttemptAccumulatorInvalid) return None + assertValid() + if (lastAttemptDirectDriverValue.isDefined) { + // Return zero value if there is no RDD execution recorded. + return Some(copyAndReset().asInstanceOf[this.type].value) + } + val scopesLookup = rddScopeIds.toSet + val matchingRDDs = lastAttemptRddsMap.values.filter { rddVal => + rddVal.rddScopeId.exists(scopesLookup.contains) + }.toSeq + // When multiple RDDs share the same scope from different SQL executions (e.g. repeated + // Dataset.collect() calls), only aggregate the ones from the latest execution per scope. + val rddIds = matchingRDDs.groupBy(_.rddScopeId).values.flatMap { rddsInScope => + val maxExecId = rddsInScope.flatMap(_.lastSqlExecutionId).maxOption + rddsInScope.filter { r => + maxExecId.isEmpty || r.lastSqlExecutionId == maxExecId + } + }.map(_.rddId).toSeq + lastAttemptValueForRDDIds(rddIds) + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in lastAttemptValueForRDDScopes", + exception = Some(e)) + None + } + + /** Visible for testing. */ + def getDirectDriverValue: Option[OUT] = { + lastAttemptDirectDriverValue + } + + /** Visible for testing */ + def getHighestRDDId: Option[Int] = { + if (lastAttemptRddsMap.nonEmpty) Some(lastAttemptRddsMap.keys.max) else None + } + + /** Visible for testing */ + def getNumRDDs: Int = { + lastAttemptRddsMap.keys.size + } + + /** Visible for testing */ + def getValid: Boolean = { + !lastAttemptAccumulatorInvalid + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala index 2070873f96579..1ece669ab8133 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala @@ -2259,9 +2259,11 @@ class Dataset[T] private[sql]( */ private def withAction[U](name: String, qe: QueryExecution)(action: SparkPlan => U) = { SQLExecution.withNewExecutionId(qe, Some(name)) { - QueryExecution.withInternalError(s"""The "$name" action failed.""") { - qe.executedPlan.resetMetrics() - action(qe.executedPlan) + qe.withQueryExecutionId(sparkSession) { + QueryExecution.withInternalError(s"""The "$name" action failed.""") { + qe.executedPlan.resetMetrics() + action(qe.executedPlan) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index f08b561d6ef9a..1a88316340a19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -26,7 +26,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException +import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.internal.LogKeys.EXTENDED_EXPLAIN_GENERATOR import org.apache.spark.rdd.RDD @@ -327,7 +327,30 @@ class QueryExecution( protected def executePhase[T](phase: String)(block: => T): T = sparkSession.withActive { QueryExecution.withInternalError(s"The Spark SQL phase $phase failed with an internal error.") { - tracker.measurePhase(phase)(block) + withQueryExecutionId(sparkSession) { + tracker.measurePhase(phase)(block) + } + } + } + + /** + * Set the query execution id in thread-local properties while + * executing the block. This is used by + * [[LastAttemptAccumulator]] to associate driver-side metric + * updates with a specific QueryExecution. + */ + private[sql] def withQueryExecutionId[T]( + session: SparkSession)(block: => T): T = { + val sc = session.sparkContext + val oldId = sc.getLocalProperty( + SparkContext.DATASET_QUERY_EXECUTION_ID_KEY) + sc.setLocalProperty( + SparkContext.DATASET_QUERY_EXECUTION_ID_KEY, id.toString) + try { + block + } finally { + sc.setLocalProperty( + SparkContext.DATASET_QUERY_EXECUTION_ID_KEY, oldId) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 7f94cc77f3454..35769eca9af13 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -249,12 +249,21 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ doExecuteWrite(writeFilesSpec) } + /** + * A deterministic scope ID for RDDs created by this SparkPlan, + * used by [[LastAttemptAccumulator]] to track which RDD belongs + * to which SparkPlan node. + */ + private[spark] def rddScopeId: String = + "spark_plan_" + id.toString + /** * Executes a query after preparing the query and adding query plan information to created RDDs * for visualization. */ protected final def executeQuery[T](query: => T): T = { - RDDOperationScope.withScope(sparkContext, nodeName, false, true) { + RDDOperationScope.withScope( + sparkContext, nodeName, false, true, rddScopeId) { prepare() waitForSubqueries() query @@ -375,6 +384,11 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ */ private def getByteArrayRdd( n: Int = -1, takeFromEnd: Boolean = false): RDD[(Long, ChunkedByteBuffer)] = { + // Wrap in the plan's RDD scope so that the wrapper RDD created by mapPartitionsInternal + // inherits this plan's deterministic scope ID rather than getting an anonymous auto-generated + // one. + val rdd = RDDOperationScope.withScope( + sparkContext, nodeName, false, true, rddScopeId) { execute().mapPartitionsInternal { iter => var count = 0 val buffer = new Array[Byte](4 << 10) // 4K @@ -409,10 +423,17 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ out.writeInt(-1) out.flush() out.close() - Iterator((count, cbbos.toChunkedByteBuffer)) + Iterator((count.toLong, cbbos.toChunkedByteBuffer)) } + } + _lastByteArrayRddId = Some(rdd.id) + rdd } + /** The RDD id of the last wrapper RDD created by [[getByteArrayRdd]]. Visible for testing. */ + @transient private var _lastByteArrayRddId: Option[Int] = None + private[spark] def lastByteArrayRddId: Option[Int] = _lastByteArrayRddId + /** * Decodes the byte arrays back to UnsafeRows and put them into buffer. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala new file mode 100644 index 0000000000000..2fffdf29cd312 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable + +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.util.{AccumulatorContext, Utils} + +/** Testing only helpers for AQE. */ +object AQETestHelper { + // See [withForcedCancellation]. + @volatile private var metricIdsForForcedCancellation: Set[Long] = Set.empty + + /** + * Set `triggerMetrics` to induce a forced cancellation into the execution when any of the + * metrics is non-empty. + * In this case the results will be discarded and the stage re-run, causing the metrics to be + * incremented again. + */ + def withForcedCancellation[T](triggerMetrics: SQLMetric*)(thunk: => T): T = { + metricIdsForForcedCancellation = triggerMetrics.map(_.id).toSet + val res = try { + thunk + } finally { + metricIdsForForcedCancellation = Set.empty + } + res + } + + /* + * Track for which plans we have already triggered the forced replanning so we only do it once. + */ + private val forcedCancellationTriggeredForPlans = mutable.HashSet.empty[Int] + + /** Return `true` if forced cancellation mechanism is enabled. */ + def isForcedCancellationEnabled: Boolean = + Utils.isTesting && metricIdsForForcedCancellation.nonEmpty + + /** Return `true` if forced cancellation has already been triggered for `plan`. */ + def wasForcedCancellationTriggeredForPlan(plan: SparkPlan): Boolean = synchronized { + forcedCancellationTriggeredForPlans.contains(plan.id) + } + + /** Mark that force cancellation was successfully triggered for `plan`. */ + def markForcedCancellationTriggeredForPlan(plan: SparkPlan): Unit = synchronized { + assert(!forcedCancellationTriggeredForPlans.contains(plan.id), + "A plan was forced to cancel a second time.") + forcedCancellationTriggeredForPlans += plan.id + } + + /** Return `true` if we should try to force cancellation for `plan` at this point. */ + def shouldForceCancellation(plan: SparkPlan): Boolean = { + // Trigger the forced cancellation only we are in testing + Utils.isTesting && + // ...and if we haven't triggered it yet + !wasForcedCancellationTriggeredForPlan(plan) && + // ...and if any of the trigger metrics > 0. + metricIdsForForcedCancellation.exists { id => + AccumulatorContext.get(id).map(!_.isZero).getOrElse(false) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 4840016bf745d..f346961a277d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -340,6 +340,8 @@ case class AdaptiveSparkPlanExec( if (errors.nonEmpty) { cleanUpAndThrowException(errors.toSeq, None) } + val testTriggerForceCancellation = AQETestHelper.shouldForceCancellation(this) + if (!currentPhysicalPlan.isInstanceOf[ResultQueryStageExec]) { // Try re-optimizing and re-planning. Adopt the new plan if its cost is equal to or less // than that of the current plan; otherwise keep the current physical plan together with @@ -352,14 +354,21 @@ case class AdaptiveSparkPlanExec( // the current physical plan. Once a new plan is adopted and both logical and physical // plans are updated, we can clear the query stage list because at this point the two // plans are semantically and physically in sync again. - val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) + var logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) + if (testTriggerForceCancellation) { + // Force unwrap all LogicalQueryStage so they get replanned. + logicalPlan = logicalPlan.transformDown { + case LogicalQueryStage(logical, _) => logical + } + } val afterReOptimize = reOptimize(logicalPlan) if (afterReOptimize.isDefined) { val (newPhysicalPlan, newLogicalPlan) = afterReOptimize.get val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) val newCost = costEvaluator.evaluateCost(newPhysicalPlan) if (newCost < origCost || - (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) { + (newCost == origCost && currentPhysicalPlan != newPhysicalPlan) || + testTriggerForceCancellation) { lazy val plans = sideBySide( currentPhysicalPlan.treeString, newPhysicalPlan.treeString).mkString("\n") logOnLevel(log"Plan changed:\n${MDC(QUERY_PLAN, plans)}") @@ -369,6 +378,9 @@ case class AdaptiveSparkPlanExec( stagesToReplace = Seq.empty[QueryStageExec] } } + if (testTriggerForceCancellation) { + AQETestHelper.markForcedCancellationTriggeredForPlan(this) + } } // Now that some stages have finished, we can try creating new stages. result = createQueryStages(fun, currentPhysicalPlan, firstRun = false) @@ -392,6 +404,10 @@ case class AdaptiveSparkPlanExec( logOnLevel(log"Final plan:\n${MDC(QUERY_PLAN, currentPhysicalPlan)}") } + /** Delegates to the inner plan's lastByteArrayRddId. Visible for testing. */ + override private[spark] def lastByteArrayRddId: Option[Int] = + executedPlan.lastByteArrayRddId + override def executeCollect(): Array[InternalRow] = { withFinalPlanUpdate(_.executeCollect()) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala index 2556edee8d02f..14372457c0621 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala @@ -94,6 +94,21 @@ trait AdaptiveSparkPlanHelper { collect(p) { case plan if allChildren(plan).isEmpty => plan } } + /** + * Returns true if the condition specified by `f` is satisfied by any node in this tree. + */ + def exists(p: SparkPlan)(f: SparkPlan => Boolean): Boolean = { + find(p)(f).isDefined + } + + /** + * Like [[exists]], but also considers plan nodes inside subqueries. + */ + def existsWithSubqueries( + p: SparkPlan)(f: SparkPlan => Boolean): Boolean = { + exists(p)(f) || subqueriesAll(p).exists(exists(_)(f)) + } + /** * Finds and returns the first [[SparkPlan]] of the tree for which the given partial function * is defined (pre-order), and applies the partial function to it. @@ -138,3 +153,5 @@ trait AdaptiveSparkPlanHelper { case other => other } } + +object AdaptiveSparkPlanHelper extends AdaptiveSparkPlanHelper diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 7dcbf3779b93d..7f757c651c560 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -241,17 +241,22 @@ case class ShuffleExchangeExec( */ @transient lazy val shuffleDependency : ShuffleDependency[Int, InternalRow, InternalRow] = { - val dep = ShuffleExchangeExec.prepareShuffleDependency( - inputRDD, - child.output, - outputPartitioning, - serializer, - writeMetrics) - metrics("numPartitions").set(dep.partitioner.numPartitions) - val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates( - sparkContext, executionId, metrics("numPartitions") :: Nil) - dep + // Wrap in the exchange's RDD scope so that any wrapper RDDs created during shuffle dependency + // preparation (e.g. by prepareShuffleDependency's mapPartitionsInternal calls) get this + // exchange's scope ID. + RDDOperationScope.withScope(sparkContext, nodeName, false, true, rddScopeId) { + val dep = ShuffleExchangeExec.prepareShuffleDependency( + inputRDD, + child.output, + outputPartitioning, + serializer, + writeMetrics) + metrics("numPartitions").set(dep.partitioner.numPartitions) + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates( + sparkContext, executionId, metrics("numPartitions") :: Nil) + dep + } } protected override def doExecute(): RDD[InternalRow] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala new file mode 100644 index 0000000000000..2d9e352d27f7d --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala @@ -0,0 +1,412 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.metric + +import scala.collection.mutable +import scala.reflect.ClassTag +import scala.util.control.NonFatal + +import org.apache.spark.SparkContext +import org.apache.spark.internal.{LogEntry, Logging} +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.execution.{BaseSubqueryExec, QueryExecution, SparkPlan, SubqueryAdaptiveBroadcastExec, SubqueryBroadcastExec, SubqueryExec, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, BroadcastExchangeLike, ReusedExchangeExec, ShuffleExchangeExec, ShuffleExchangeLike} +import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} + +/* + * SQLLastAttemptAccumulator is a LastAttemptAccumulator that allows to track the last attempt + * updates that happened in the scope of execution of a plan created by a specific Dataset's + * QueryExecution. + * + * Tracking RDDs belonging to a Dataset execution. + * ----------------------------------------------- + * Dataset executes executedPlan from it's QueryExecution. Each SparkPlan node in the + * executedPlan saves the RDD with its execution (executeRDD or executeColumnarRDD). However, + * the root RDD of a Spark Stage that actually gets submitted and executed is not necessarily + * that RDD. It may be an ephemeral RDD created on the fly when submitting the job, e.g.: + * - for result stages, there may be additional transformations to format the results, like + * apply an Encoder (e.g. turn InternalRows into Rows) + * or transformations for Arrow + * - for map stages, there may be some additional transformations to prepare the shuffle + * data in correct format. + * - operations like dataframe caching may wrap the plan results to format and write to cache. + * We therefore cannot track the metrics updates just by RDD id. However, each SparkPlan also + * creates an RDDOperationScope, and wraps the execution it submits by that scope. + * The completed Tasks should have RDDOperationScope of the SparkPlan that submitted the + * Stage. We need to extract the RDDOperationScopes from Dataset.queryExecution.executedPlan + * to track last attempt metric updates coming from that execution. + * + * Additionally, it is possible that the same queryExecution.executedPlan is reused. For + * example, when collect() is called multiple times on the same Dataset. + * - Part of the execution (e.g. the shuffles) should then be reused. Accumulator should still + * keep their partial values associated to it's RDDOperationScope, and return it for this + * new attempt. + * - Some of the execution (e.g. the result stage) may be recomputed. Since the SparkPlan will + * be the same, RDDOperationScope will be the same, and this should become a newer execution + * of the same RDD, which should replace the previous one. + * + * AQE plan changes + * ---------------- + * AQE re-optimizes LogicalPlan and creates new SparkPlan. If the new plan doesn't contain + * some of the QueryStages from the previous plan, they can be cancelled while they already + * started running and accumulated some metric results. + * If the metric is part of SparkPlan.metrics, then the newly created plan will have new + * metrics and the old metrics would have been discarded; so nothing needs to be tracked here. + * But if the metric is coming from outside, it can be reused by the new SparkPlan. + * A new plan will have a new RDD and a new RDDOperationScope, so by tracking these for the + * final AQE plan, only values from the final plan and execution should be aggregated. + * + * It can also happen that the new AQE plan reuses SparkPlan instances from the old plan, + * see CancelShuffleStageInBroadcastJoin. However, in that cases, the old plan will be put + * under some new plan in a new submitted Stages. Since we only truly track the plans that + * submit Stages, these should be different and enough to disambiguate. + * + * Driver only updates + * ------------------- + * The metric can be updated directly on the driver side, during the execution of catalyst + * optimizer. One example is [[ConvertToLocalRelation]] optimization rule, which constant folds + * pieces of the plan, including potentially folding [[IncrementMetric]] expressions. + * Execution in this scope is tagged with [[QueryExecution.id]] using + * [[SparkContext.DATASET_QUERY_EXECUTION_ID_KEY]] property, and this metric is tracking + * the metric value separately for each QueryExecution. + * Like with LastAttemptAccumulator, the metric will bail out if it's updated both from the driver + * and from executor Tasks. + * + * Cached / Checkpointed plans + * --------------------------- + * If the metric was used inside a cached (df.cache, df.persist) or checkpointed (df.checkpoint, + * df.localCheckpoint, which is then turned into an RDDScanExec or InMemoryTableScanExec in the + * Dataset's executedPlan, we deem it undefined behavior for [[lastAttemptValueForDataset]] and + * [[lastAttemptValueForQueryExecution]]. [[lastAttemptValueForHighestRDDId()]] should return the + * value from when the execution in which the plan was cached/checkpointed and should be used + * instead. + * + * The main issue is if the metric is in the top stage of the cached plan. When that plan is + * executed in some Dataset (as lazy execution), the metric will be executed in the scope of the + * stage that contains the InMemoryTableScanExec / RDDScanExec, which will be some parent of that + * plan, and not plan of the cached plan. So if the cached plan is then used in another Dataset, + * that Dataset will not have information about that parent. + * There could be some hacks done to fix it by record in the InMemoryRelation the scopes in which + * it was materialized. There are also other issues, like that checkpoint throws away the plan, so + * it would also have to record the RDD scopes used during checkpointing. This gets further + * complicated if recomputations are involved, and are done in yet another scope. + * It was declared undefined behavior instead of pursuing this. + */ + +/** + * A trait that can be mixed-in into a subclass an [[AccumulatorV2]] to make it track the "logical" + * value of the "last attempt" of the execution using the accumulator. + * In addition to what [[LastAttemptAccumulator]] does, it allows to track the last attempt + * executed in the scope of a Dataset's QueryExecution, via + * [[lastAttempValueForDataset]] and [[lastAttemptValueForQueryExecution]] methods. + */ +trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator[IN, OUT, PARTIAL] { + this: AccumulatorV2[IN, OUT] => + + @transient + private var lastAttemptDirectDriverQueryExecutionValues: mutable.Map[String, this.type] = _ + + override def initializeLastAttemptAccumulator()(implicit ct: ClassTag[PARTIAL]): Unit = try { + lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, this.type]() + super.initializeLastAttemptAccumulator()(ct) + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in initializeLastAttemptAccumulator", + exception = Some(e)) + } + + override def resetLastAttemptAccumulator(): Unit = try { + lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, this.type]() + super.resetLastAttemptAccumulator() + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in resetLastAttemptAccumulator", + exception = Some(e)) + } + + override protected def assertValid() = { + super.assertValid() + assert(lastAttemptDirectDriverQueryExecutionValues != null) + } + + protected def getOrCreateDirectDriverQueryExecutionValue(queryExecutionId: String): this.type = { + lastAttemptDirectDriverQueryExecutionValues.synchronized { + if (!lastAttemptDirectDriverQueryExecutionValues.contains(queryExecutionId)) { + lastAttemptDirectDriverQueryExecutionValues.put( + queryExecutionId, + this.copyAndReset().asInstanceOf[this.type]) + } + lastAttemptDirectDriverQueryExecutionValues(queryExecutionId) + } + } + + protected def getActiveDatasetQueryExecutionId: Option[String] = { + SparkContext + .getActive + .flatMap(sc => Option(sc.getLocalProperty(SparkContext.DATASET_QUERY_EXECUTION_ID_KEY))) + } + + /** + * Check if the value is added on the driver side, not from within a task. + * If it is set in the scope of a Dataset's QueryExecution, associate it with that scope. + * This must be called from `add` methods of any AccumulatorV2 subclass supporting + * SQL last attempt metrics to set what the `value` of the metric is after the operation. + * This should be called there after [[setValueIfOnDriverSide]]. + */ + protected def addQueryExecutionValueIfOnDriverSide(value: IN): Unit = try { + // Note: setValueIfOnDriverSide will already make it invalid if there are also RDD updates. + if (isAtDriverSide && lastAttemptAccumulatorInitialized && !lastAttemptAccumulatorInvalid) { + // Direct update on the driver, not from within a task. + getActiveDatasetQueryExecutionId match { + case Some(qeId) => + val driverMetric = getOrCreateDirectDriverQueryExecutionValue(qeId) + driverMetric.add(value) + case None => // pass + } + } + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in addQueryExecutionValueIfOnDriverSide", + exception = Some(e)) + } + + override def logAccumulatorState: LogEntry = try { + val driverQEVals = Option(lastAttemptDirectDriverQueryExecutionValues) + .map(_.map { case (key, metric) => s"$key -> ${metric.value}" }.mkString("\n")) + .getOrElse("") + super.logAccumulatorState + + log""" + |Direct driver QE values: + |${MDC(logKeyAccumulatorState, driverQEVals)} + """.stripMargin + } catch { + case NonFatal(e) => + logWarning(log"Unexpected exception in logAccumulatorState", e) + log"" + } + + /** + * Returns the last attempt value of this accumulator, aggregated from the last execution of this + * QueryExecution. + * + * @note The output of this method is undefined if this metric was used inside a part of the plan + * which was either checkpointed (e.g. df.localCheckpoint(), df.checkpoint()) or cached + * (e.d. df.cache(), df.persist()). + * [[lastAttemptValueForHighestRDDId()]] should return the value from when the execution in + * which the plan was cached/checkpointed. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForQueryExecution(qe: QueryExecution): Option[OUT] = { + if (lastAttemptAccumulatorInvalid) return None + assertValid() + // If there was a driver set value defined in the scope of this QueryExecution, return that. + lastAttemptDirectDriverQueryExecutionValues.get(qe.id.toString) match { + case Some(metric) => return Some(metric.value) + case None => // pass + } + // Otherwise, gather the RDD scoped from the plan and find metric updates from these scopes. + val scopes = SQLLastAttemptAccumulator.extractStageRDDScopes(qe.executedPlan) + scopes match { + case Left(bailOutReason) => + unexpectedLastAttemptMetricOperation( + invalidate = false, + reason = s"Unable to extract RDD scopes from query execution plan: $bailOutReason") + None + case Right(scopes) => + lastAttemptValueForRDDScopes(scopes) + } + } + + /** + * Returns the last attempt value of this accumulator, aggregated from the last execution of this + * Dataset. + * + * @note The output of this method is undefined if this metric was used inside a part of the plan + * which was either checkpointed (e.g. df.localCheckpoint(), df.checkpoint()) or cached + * (e.d. df.cache(), df.persist()). + * [[lastAttemptValueForHighestRDDId()]] should return the value from when the execution in + * which the plan was cached/checkpointed. + * + * @return None if the last attempt value cannot be established, Some(value) otherwise. + */ + def lastAttemptValueForDataset(ds: Dataset[_]): Option[OUT] = { + lastAttemptValueForQueryExecution(ds.queryExecution) + } + + /** Visible for testing. */ + def getDirectDriverQueryExecutionValue(qeId: String): Option[OUT] = { + lastAttemptDirectDriverQueryExecutionValues.get(qeId).map(_.value) + } +} + +object SQLLastAttemptAccumulator extends Logging { + + private[metric] def extractStageRDDScopes(sparkPlan: SparkPlan): Either[String, Seq[String]] = { + var bailOutReason: Option[String] = None + + // recurse, setting the bailOutReason on failure, or returning the list of scopes on success. + def recurse(sparkPlan: SparkPlan): Seq[String] = { + if (bailOutReason.isDefined) { + Nil + } else { + extractStageRDDScopes(sparkPlan) match { + case Left(reason) => + bailOutReason = Some(reason) + Nil + case Right(scopes) => scopes + } + } + } + + def scopeIds(sparkPlan: SparkPlan): Seq[String] = { + AdaptiveSparkPlanHelper.stripAQEPlan(sparkPlan) match { + case w: WholeStageCodegenExec => + // WholeStageCodegenExec can fallback and execute the child plan without codegen instead, + // we don't know when this happens, so we need to account for both cases. + // It will never be both at the same time as this is a compilation time decision, so + // returning both won't result in duplicates. + Seq(w.rddScopeId, w.child.rddScopeId) + case p => Seq(p.rddScopeId) + } + } + + // The root of the plan is submitted as a result stage. + val resultStageScopes = scopeIds(sparkPlan) + + val stagesScopes = AdaptiveSparkPlanHelper.flatMap(sparkPlan) { + case _ if bailOutReason.isDefined => Nil + + // broadcast exchange stage submitting nodes + case bl: BroadcastExchangeLike => bl match { + case b: BroadcastExchangeExec => + // The job is submitted in scope of child of the broadcast exchange. + // ``` + // val rs = child.executeCollectResult() + // ``` + // <- executeCollectResult() is called on the child, and child executes it in its scope. + scopeIds(b.child) + case p => + // Bail out if future unknown implementation is encountered. + bailOutReason = Some(s"Unsupported BroadcastExchangeLike: ${p.getClass.getName}") + Nil + } + + // shuffle exchange stage submitting nodes + case sl: ShuffleExchangeLike => sl match { + // All shuffle exchange implementation create the ShuffledRowRDD / ShuffledBlockRDD + // with its own scope, and it will be executed in that scope. + case s: ShuffleExchangeExec => scopeIds(s) + case p => + // Bail out if future unknown implementation is encountered. + bailOutReason = Some(s"Unsupported BroadcastExchangeLike: ${p.getClass.getName}") + Nil + } + + // reused exchange + case r: ReusedExchangeExec => + // Reused exchange is going to reuse stuff executed in the scope of its child, + // i.e. the exchange it reuses. + recurse(r.child) + + case sl: BaseSubqueryExec => sl match { + case s: SubqueryExec => + // ``` + // val rows: Array[InternalRow] = if (maxNumRows.isDefined) { + // child.executeTake(maxNumRows.get) + // } else { + // child.executeCollect() + // } + // ``` + // will launch stages in scope of child. + scopeIds(s.child) + case _: SubqueryBroadcastExec => + // Used by DPP filter only, not part of main flow of query execution. + Nil + case _: SubqueryAdaptiveBroadcastExec => + // Used by DPP filter only. + Nil + case p => + // Bail out if future unknown implementation is encountered. + bailOutReason = Some(s"Unsupported BroadcastExchangeLike: ${p.getClass.getName}") + Nil + } + + /* Useful comments for posterity. + // cached table node + case _: InMemoryTableScanLike => + // Do nothing for cached tables. There are many border cases where it wouldn't work. + // Some notes for posterity: + // For [[InMemoryTableScanExec]], we could recursed into the cachedPlan, but: + // - if the metric is in the top stage of that plan, then it would be executed in the scope + // of the stage of whatever execution that InMemoryTableScanExec is part of when the + // plan is cached. [[InMemoryTableScanExec]] is not a stage submitting node by itself, and + // by itself it doesn't have visibility into the parent that submits the stage that + // materializes the cache. If the current executedPlan is not the one that materializes, + // then the metric would return 0 instead of the value from the cached execution. If the + // current executedPlan is the one that materializes the cache, then it would be the + // correct value. + // - if the metrics is in the top stage of the cachedPlan and AQE is enabled, it will be + // wrapped into a [[TableCacheQueryScanExec]], which would submit a stage. + // Currently this will be misattributed, but that could be fixed if + // [[InMemoryRelation.buildBuffers]] wrapped the creation of `val cached` into an + // RDDOperationScope that attributes it to cachedPlan. + // - if the metric is in a map stage of the cachedPlan, then it would be correctly + // annotated with the scope of that stage, and it would work correctly. + // + // Since it's hard to achieve a consistent behavior here, we just do not support it. + Nil + + // RDD node + case _: RDDScanExec => + // Similar as with cached tables, do nothing with RDDs. + // This could be a plan coming from an execution of df.checkpoint(). + // Since checkpointing cuts the references to the original plan, there is no way to descend + // into it to check attribution. + // We could try to make checkpoint collect and store the scopes of the original execution, + // but even then it would face similar inconsistencies as described above for cached plans. + // - if the metric is in the top stage of that plan, then if it was executed in the scope of + // this execution, it w + // - if the metric is in a map stage of the plan that was checkpointed, it requires that + // checkpoint would track these stages and scopes. + // + // Since it's hard to achieve a consistent behavior here, we just do not support it. + Nil + */ + + case _ => Nil // only extract from nodes that submit stages + } + + // also collect the plan scopes of all subqueries, which are executed "on the side". + val subqueriesScopes = AdaptiveSparkPlanHelper.flatMap(sparkPlan) { p => + p.subqueries.flatMap(recurse) + } + + if (bailOutReason.isDefined) { + Left(bailOutReason.get) + } else { + Right(resultStageScopes ++ stagesScopes ++ subqueriesScopes) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala new file mode 100644 index 0000000000000..4499e514504cc --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.metric + +import scala.util.control.NonFatal + +import org.apache.spark.SparkContext +import org.apache.spark.util.AccumulatorV2 + +class SQLLastAttemptMetric( + metricType: String, + initValue: Long = 0L) + extends SQLMetric(metricType, initValue) + with SQLLastAttemptAccumulator[Long, Long, Long] { + + override protected def partialMergeVal: Long = _value + + override protected def partialMerge(value: Long): Unit = { + val acc = this.copyAndReset().asInstanceOf[this.type] + acc.set(value) + merge(acc) + } + + override protected def isMergeable(other: AccumulatorV2[_, _]): Boolean = other match { + case o: SQLLastAttemptMetric => o.metricType == metricType + case _ => false + } + + // SQLLastAttemptMetric is used internally to aggregate system metrics (counters) such as + // number of rows processed, and it should not store user data. + protected def accumulatorStoresUserData: Boolean = false + + /** + * Check if the value is added on the driver side, not from within a task. + * If it is set in the scope of a Dataset's QueryExecution, associate it with that scope. + * This must be called from `set` methods of this SQLMetric subclass. + * This should be called there after setValueIfOnDriverSide. + * See [[addQueryExecutionValueIfOnDriverSide]]. + * This method is directly in this class, because in general AccumulatorV2 does not have a + * direct set method, only SQLMetric has it. + */ + protected def setQueryExecutionValueIfOnDriverSide(value: Long): Unit = try { + // Note: setValueIfOnDriverSide will already make it invalid if there are also RDD updates. + if (isAtDriverSide && lastAttemptAccumulatorInitialized && !lastAttemptAccumulatorInvalid) { + // Direct update on the driver, not from within a task. + getActiveDatasetQueryExecutionId match { + case Some(qeId) => + val driverMetric = getOrCreateDirectDriverQueryExecutionValue(qeId) + driverMetric.set(value) + case None => // pass + } + } + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in addQueryExecutionValueIfOnDriverSide", + exception = Some(e)) + } + + override def copy(): SQLLastAttemptMetric = { + val newAcc = new SQLLastAttemptMetric(metricType, initValue) + newAcc._value = _value + newAcc + } + + override def add(v: Long): Unit = { + super.add(v) + if (v >= 0) { + // set value of SQLMetric after the add. + setValueIfOnDriverSide(value) + addQueryExecutionValueIfOnDriverSide(v) + } + } + + override def set(v: Long): Unit = { + super.set(v) + if (v >= 0) { + // set value of SQLMetric after the set. + setValueIfOnDriverSide(value) + setQueryExecutionValueIfOnDriverSide(value) + } + } + +} + +object SQLLastAttemptMetrics { + /** + * Create a metric to report the value aggregated from the last attempt of each task. These + * would be the values for the tasks that actually contributed to the final output of the + * execution. + */ + def createMetric(sc: SparkContext, name: String): SQLLastAttemptMetric = { + val acc = new SQLLastAttemptMetric(SQLMetrics.SUM_METRIC) + acc.register(sc, name = SQLMetrics.metricsCache.get(name), countFailedValues = false) + acc.initializeLastAttemptAccumulator() + acc + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 13f4d7926bea8..6deff9d5c5eb5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.AccumulatorContext.internOption */ class SQLMetric( val metricType: String, - initValue: Long = 0L) extends AccumulatorV2[Long, Long] { + val initValue: Long = 0L) extends AccumulatorV2[Long, Long] { // initValue defines the initial value of the metric. 0 is the lowest value considered valid. // If a SQLMetric is invalid, it is set to 0 upon receiving any updates, and it also reports // 0 as its value to avoid exposing it to the user programmatically. @@ -45,7 +45,7 @@ class SQLMetric( // for SPARK-11013. assert(initValue <= 0) // _value will always be either initValue or non-negative. - private var _value = initValue + private[metric] var _value = initValue override def copy(): SQLMetric = { val newAcc = new SQLMetric(metricType, initValue) @@ -110,7 +110,7 @@ class SQLMetric( } object SQLMetrics { - private val SUM_METRIC = "sum" + private[metric] val SUM_METRIC = "sum" private val SIZE_METRIC = "size" private val TIMING_METRIC = "timing" private val NS_TIMING_METRIC = "nsTiming" @@ -120,7 +120,7 @@ object SQLMetrics { val cachedSQLAccumIdentifier = Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER) - private val metricsCache: LoadingCache[String, Option[String]] = + private[metric] val metricsCache: LoadingCache[String, Option[String]] = CacheBuilder.newBuilder().maximumSize(10000) .build(new CacheLoader[String, Option[String]] { override def load(name: String): Option[String] = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/MetricsFailureInjectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/MetricsFailureInjectionSuite.scala new file mode 100644 index 0000000000000..847a12f4f305c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/MetricsFailureInjectionSuite.scala @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.metric + +import scala.util.Random + +import org.apache.spark.internal.config +import org.apache.spark.sql.{Column, Dataset} +import org.apache.spark.sql.execution.adaptive.{AQETestHelper, DisableAdaptiveExecutionSuite} +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +class MetricsFailureInjectionSuite + extends SharedSparkSession + with SQLMetricsTestUtils + // Need to control AQE per-test to ensure expected plan shapes. + with DisableAdaptiveExecutionSuite { + + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + // Disable re-use, since it interferes with the forced replanning. + spark.conf.set(SQLConf.EXCHANGE_REUSE_ENABLED, false) + } + + def setUpTestTable(tableName: String): Unit = { + val rand = new Random(1) + val randomPrefix = rand.nextString(30) + spark.range(300).map { id => + (id, (id % 5).toInt, randomPrefix + (id % 111)) + }.toDF("id", "low_cardinality_col", "large_col") + .write.format("parquet").saveAsTable(tableName) + val numRecords = spark.read.table(tableName).count() + assert(numRecords === 300) + } + + for { + useAQE <- BOOLEAN_DOMAIN + } test(s"Two stage metrics AQE cancellation injection - useAQE=$useAQE") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> useAQE.toString) { + val stage1Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 1 counter") + val stage2Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 2 counter") + val stage1SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 1 SLAM") + val stage2SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 2 SLAM") + + def runQueryWithMetrics( + triggerMetrics: SQLMetric*)( + postRunChecks: Dataset[_] => Unit): Unit = { + assert(stage1Metric.value === 0) + assert(stage2Metric.value === 0) + withTable("test_table") { + setUpTestTable("test_table") + AQETestHelper.withForcedCancellation(triggerMetrics: _*) { + val stage1MetricsExpr = incrementMetrics(Seq(stage1Metric, stage1SLAMetric)) + val stage1 = spark.read.table("test_table").filter(Column(stage1MetricsExpr)) + val stage2MetricsExpr = incrementMetrics(Seq(stage2Metric, stage2SLAMetric)) + val stage2 = + stage1.groupBy("low_cardinality_col").count().filter(Column(stage2MetricsExpr)) + val finalDf = stage2.as[(Int, Long)] + val result = finalDf.collect() + + assert(result.toMap === (0 until 5).map(v => (v, 300 / 5)).toMap) + postRunChecks(finalDf) + stage1Metric.reset() + stage2Metric.reset() + } + } + } + + // SLAM values don't change with retries, so we can reuse the same assertions for all cases. + def assertSLAM(finalDf: Dataset[_]): Unit = { + assert(stage1SLAMetric.lastAttemptValueForHighestRDDId() === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForHighestRDDId() === Some(5)) + + assert(stage1SLAMetric.lastAttemptValueForDataset(finalDf) === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForDataset(finalDf) === Some(5)) + } + + // Case 1: No forced replanning. + runQueryWithMetrics() { finalDf => + assert(stage1Metric.value === 300) + assert(stage2Metric.value === 5) + + assertSLAM(finalDf) + } + + // Case 2: Replan on stage1Metric. + runQueryWithMetrics(stage1Metric) { finalDf => + if (useAQE) { + assert(stage1Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + } + assert(stage2Metric.value === 5) + + assertSLAM(finalDf) + } + + // Case 3: Replan on stage2Metric (will be ignored, because this is a result stage). + runQueryWithMetrics(stage2Metric) { finalDf => + assert(stage1Metric.value === 300) + assert(stage2Metric.value === 5) + + assertSLAM(finalDf) + } + + // Case 4: Replan on both metrics (only first will actually trigger). + runQueryWithMetrics(stage1Metric, stage2Metric) { finalDf => + if (useAQE) { + assert(stage1Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + } + assert(stage2Metric.value === 5) + + assertSLAM(finalDf) + } + } + } + + for { + useAQE <- BOOLEAN_DOMAIN + } test(s"Three stage metrics AQE cancellation injection - useAQE=$useAQE") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> useAQE.toString) { + val stage1Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 1 counter") + val stage2Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 2 counter") + val stage3Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 3 counter") + val stage1SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 1 SLAM") + val stage2SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 2 SLAM") + val stage3SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 3 SLAM") + + def runQueryWithMetrics( + triggerMetrics: SQLMetric*)(postRunChecks: Dataset[_] => Unit): Unit = { + assert(stage1Metric.value === 0) + assert(stage2Metric.value === 0) + withTable("primary_table", "secondary_table") { + // Use the same layout for both. Makes the query a non-obvious self-join essentially. + setUpTestTable("primary_table") + setUpTestTable("secondary_table") + AQETestHelper.withForcedCancellation(triggerMetrics: _*) { + val stage1MetricsExpr = incrementMetrics(Seq(stage1Metric, stage1SLAMetric)) + val stage1 = spark.read.table("primary_table") + .filter(Column(stage1MetricsExpr)) + val stage2MetricsExpr = incrementMetrics(Seq(stage2Metric, stage2SLAMetric)) + val stage2 = stage1.join( + spark.read.table("secondary_table"), + usingColumn = "id", + joinType = "fullOuter") + .filter(Column(stage2MetricsExpr)) + val stage3MetricsExpr = incrementMetrics(Seq(stage3Metric, stage3SLAMetric)) + val stage3 = stage2 + .groupBy("primary_table.low_cardinality_col") + .count() + .filter(Column(stage3MetricsExpr)) + val finalDf = stage3.as[(Int, Long)] + val result = finalDf.collect() + assert(result.toMap === (0 until 5).map(v => (v, 300 / 5)).toMap) + postRunChecks(finalDf) + stage1Metric.reset() + stage2Metric.reset() + stage3Metric.reset() + } + } + } + + // SLAM values don't change with retries, so we can reuse the same assertions for all cases. + def assertSLAM(finalDf: Dataset[_]): Unit = { + assert(stage1SLAMetric.lastAttemptValueForHighestRDDId() === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForHighestRDDId() === Some(300)) + assert(stage3SLAMetric.lastAttemptValueForHighestRDDId() === Some(5)) + + assert(stage1SLAMetric.lastAttemptValueForDataset(finalDf) === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForDataset(finalDf) === Some(300)) + assert(stage3SLAMetric.lastAttemptValueForDataset(finalDf) === Some(5)) + } + + // Case 1: No forced replanning. + runQueryWithMetrics() { finalDf => + assert(stage1Metric.value === 300) + assert(stage2Metric.value === 300) + assert(stage3Metric.value === 5) + + assertSLAM(finalDf) + } + + // Case 2: Replan on stage1Metric. + runQueryWithMetrics(stage1Metric) { finalDf => + if (useAQE) { + assert(stage1Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + } + assert(stage2Metric.value === 300) + assert(stage3Metric.value === 5) + + assertSLAM(finalDf) + } + + // Case 3: Replan on stage2Metric (will also re-run the first stage). + runQueryWithMetrics(stage2Metric) { finalDf => + if (useAQE) { + assert(stage1Metric.value > 300) + assert(stage2Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + assert(stage2Metric.value === 300) + } + assert(stage3Metric.value === 5) + + assertSLAM(finalDf) + } + + // Case 4: Replan on all metrics (only first will actually trigger). + runQueryWithMetrics(stage1Metric, stage2Metric, stage3Metric) { finalDf => + if (useAQE) { + assert(stage1Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + } + assert(stage2Metric.value === 300) + assert(stage3Metric.value === 5) + + assertSLAM(finalDf) + } + } + } + + for { + injectFailure <- BOOLEAN_DOMAIN + } test(s"Two stage metrics block failure injection - injectFailure=$injectFailure") { + val stage1Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 1 counter") + val stage2Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 2 counter") + val stage1SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 1 SLAM") + val stage2SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 2 SLAM") + + def runQueryWithMetrics( + triggerMetrics: SQLMetric*)(postRunChecks: Dataset[_] => Unit): Unit = { + assert(stage1Metric.value === 0) + assert(stage2Metric.value === 0) + withTable("test_table") { + setUpTestTable("test_table") + withSparkContextConf( + config.Tests.INJECT_SHUFFLE_FETCH_FAILURES.key -> injectFailure.toString) { + val stage1MetricsExpr = incrementMetrics(Seq(stage1Metric, stage1SLAMetric)) + val stage1 = spark.read.table("test_table").filter(Column(stage1MetricsExpr)) + val stage2MetricsExpr = incrementMetrics(Seq(stage2Metric, stage2SLAMetric)) + val stage2 = + stage1.groupBy("low_cardinality_col").count().filter(Column(stage2MetricsExpr)) + val finalDf = stage2.as[(Int, Long)] + val result = finalDf.collect() + assert(result.toMap === (0 until 5).map(v => (v, 300 / 5)).toMap) + postRunChecks(finalDf) + stage1Metric.reset() + stage2Metric.reset() + } + } + } + + runQueryWithMetrics() { finalDf => + if (injectFailure) { + assert(stage1Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + } + // Stage2 doesn't have a downstream shuffle stage we can fail. + assert(stage2Metric.value === 5) + + assert(stage1SLAMetric.lastAttemptValueForHighestRDDId() === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForHighestRDDId() === Some(5)) + + assert(stage1SLAMetric.lastAttemptValueForDataset(finalDf) === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForDataset(finalDf) === Some(5)) + } + } + + for { + injectFailure <- BOOLEAN_DOMAIN + } test(s"Non-deterministic stage block failure injection - injectFailure=$injectFailure") { + val stage1Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 1 counter") + val stage2Metric = SQLMetrics.createMetric(spark.sparkContext, "stage 2 counter") + val stage1SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 1 SLAM") + val stage2SLAMetric = + SQLLastAttemptMetrics.createMetric(spark.sparkContext, "stage 2 SLAM") + + def runQueryWithMetrics( + triggerMetrics: SQLMetric*)(postRunChecks: Dataset[_] => Unit): Unit = { + assert(stage1Metric.value === 0) + assert(stage2Metric.value === 0) + withTable("test_table") { + setUpTestTable("test_table") + withSparkContextConf( + config.Tests.INJECT_SHUFFLE_FETCH_FAILURES.key -> injectFailure.toString) { + val stage1MetricsExpr = incrementMetrics(Seq(stage1Metric, stage1SLAMetric)) + val udfRand = + udf { + () => { + new Random().nextDouble() + } + }.asNondeterministic().apply().expr + val stage1 = spark.read.table("test_table") + .withColumn("non_deterministic_col", Column(udfRand)) + .filter(Column(stage1MetricsExpr)) + val stage2MetricsExpr = incrementMetrics(Seq(stage2Metric, stage2SLAMetric)) + val stage2 = stage1 + .groupBy("low_cardinality_col") + .avg("non_deterministic_col") + .filter(Column(stage2MetricsExpr)) + // Add an extra stage with a single task to avoid flaky failures. If a ResultTask + // returns non-deterministic results to the client, it forces the query to abort + // instead of retrying the input stages. + val finalDf = stage2.repartition(1).as[(Int, Double)] + val result = finalDf.collect() + // Don't compare the second value, since it's random. + assert(result.map(_._1).toSet === (0 until 5).toSet) + postRunChecks(finalDf) + stage1Metric.reset() + stage2Metric.reset() + } + } + } + + runQueryWithMetrics() { finalDf => + if (injectFailure) { + assert(stage1Metric.value > 300) + } else { + assert(stage1Metric.value === 300) + } + // Stage2 doesn't have a downstream shuffle stage we can fail. + assert(stage2Metric.value === 5) + + assert(stage1SLAMetric.lastAttemptValueForHighestRDDId() === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForHighestRDDId() === Some(5)) + + assert(stage1SLAMetric.lastAttemptValueForDataset(finalDf) === Some(300)) + assert(stage2SLAMetric.lastAttemptValueForDataset(finalDf) === Some(5)) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala new file mode 100644 index 0000000000000..5d6ce6af20c65 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala @@ -0,0 +1,711 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.metric + +import org.apache.spark.internal.config +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.optimizer.BuildRight +import org.apache.spark.sql.catalyst.plans.RightOuter +import org.apache.spark.sql.execution.{CoalescedPartitionSpec, CoalescedShuffleRead, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +/** Tests [[SQLLastAttemptMetric]] used by [[RDD]]s and [[Dataset]]s */ +class SQLLastAttemptMetricIntegrationSuite + extends SharedSparkSession + with SQLMetricsTestUtils { + import testImplicits._ + + protected def withRetries = false + + test("single stage rdd updates with shared slam") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val rdd1 = spark.sparkContext.parallelize(1 to 10, 2).map { x => + slam.add(1) + x + } + + rdd1.count() + assert(withRetries || slam.value === 10) + assert(slam.lastAttemptValueForAllRDDs() === Some(10)) + assert(slam.lastAttemptValueForRDDId(rdd1.id) === Some(10)) + assert(slam.lastAttemptValueForRDDIds(Seq(rdd1.id, rdd1.id)) === Some(10)) + assert(slam.lastAttemptValueForRDDIds(Seq(rdd1.id + 1, rdd1.id + 2)) === Some(0)) + assert(slam.lastAttemptValueForRDDIds(Seq(rdd1.id, rdd1.id + 10, rdd1.id)) === Some(10)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(10)) + + val rdd2 = spark.sparkContext.parallelize(1 to 50, 3).map { x => + slam.add(3) + x + } + rdd2.count() + assert(withRetries || slam.value === 160) // +150 + assert(slam.lastAttemptValueForRDDId(rdd1.id) === Some(10)) // value for first rdd unaffected + assert(slam.lastAttemptValueForRDDId(rdd2.id) === Some(150)) // value for second rdd recorded + assert(slam.lastAttemptValueForAllRDDs() === Some(160)) // value for all rdds summed + assert(slam.getNumRDDs === 2) + assert(slam.lastAttemptValueForHighestRDDId() === Some(150)) // highest RDD id updated. + + // Re-executing rdd1 + rdd1.count() + assert(withRetries || slam.value === 170) // +10 + // Re-execution doesn't produce duplicate last attempt values + assert(slam.lastAttemptValueForRDDId(rdd1.id) === Some(10)) + assert(slam.lastAttemptValueForAllRDDs() === Some(160)) + assert(slam.getNumRDDs === 2) + // Highest RDD id tracks highest rdd.id, not the last RDD to be executed + assert(slam.lastAttemptValueForHighestRDDId() === Some(150)) + + // New RDD on top of rdd1, but in a single stage. + val rdd3 = rdd1.map { x => + slam.add(2) + x + } + rdd3.count() + assert(withRetries || slam.value === 200) // +30 + assert(slam.lastAttemptValueForRDDId(rdd1.id) === Some(10)) // stays the same + // The increment from rdd1 and rdd3 are in the same stage, so they are recorded together. + assert(slam.getNumRDDs === 3) + assert(slam.lastAttemptValueForRDDId(rdd3.id) === Some(30)) + assert(slam.getHighestRDDId === Some(rdd3.id)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(30)) + + // Setting a value directly from the driver makes slam bail out, because it can't reason + // about what is the "last attempt" on driver vs. coming from RDD executions. + slam.set(42) + assert(!slam.getValid) + // Information stays available for logging and debugging. + assert(slam.getNumRDDs === 3) + assert(slam.getHighestRDDId === Some(rdd3.id)) + + logInfo(slam.logAccumulatorState) + } + + test("multi stage rdd updates") { + val slam1 = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM1") + val slam2 = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM2") + + val rdd1 = spark.sparkContext.parallelize(1 to 10, 2).map { x => + slam1.add(1) + x + } + val repartition = rdd1.repartition(10) + val rdd2 = repartition.map { x => + slam2.add(1) + x + } + rdd2.collect() + assert(withRetries || slam1.value === 10) + assert(withRetries || slam2.value === 10) + assert(slam1.lastAttemptValueForAllRDDs() === Some(10)) + assert(slam2.lastAttemptValueForAllRDDs() === Some(10)) + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) + assert(slam2.lastAttemptValueForHighestRDDId() === Some(10)) + // It is executed in a Stage submitted by the repartition. + assert(slam1.lastAttemptValueForRDDId(rdd1.id) === Some(0)) + assert(slam1.lastAttemptValueForRDDId(repartition.id) === Some(0)) // Surprise, nope. + // Repartition creates a number of MapPartitionsRDDs, CoalescedRDDs, ShuffledRDDs... + // The actual stage that submits the map stage is somewhere internal. + assert(slam1.getHighestRDDId.isDefined) + val mapStageRddId = slam1.getHighestRDDId.get + assert(slam1.lastAttemptValueForRDDId(mapStageRddId) === Some(10)) + + // Test passing multiple ids. + assert(slam1.lastAttemptValueForRDDIds(Seq(rdd1.id, repartition.id)) === Some(0)) + assert(slam1.lastAttemptValueForRDDIds( + Seq(rdd1.id, mapStageRddId, repartition.id)) === Some(10)) + assert(slam1.lastAttemptValueForRDDIds(Seq(rdd1.id, rdd2.id)) === Some(0)) + assert(slam1.lastAttemptValueForRDDIds(Seq(-10)) === Some(0)) + + rdd2.collect() + // Repartition stage is reused, but result stage is re-executed. + assert(withRetries || slam1.value === 10) // no change + assert(withRetries || slam2.value === 20) // +10 + // Last attempt value is not duplicated, since result stage is an action on the same RDD. + assert(slam1.lastAttemptValueForAllRDDs() === Some(10)) + assert(slam2.lastAttemptValueForAllRDDs() === Some(10)) + + rdd1.collect() + assert(withRetries || slam1.value === 20) // +10 + // The first time around it was executed in the repartition RDD stage. + // This time around it is executed from action of rdd1. + assert(slam1.getNumRDDs === 2) + assert(slam1.lastAttemptValueForAllRDDs() === Some(20)) + assert(slam1.lastAttemptValueForRDDId(rdd1.id) === Some(10)) // new + assert(slam1.lastAttemptValueForRDDId(mapStageRddId) === Some(10)) // old + // Highest RDD id stays the same. + assert(slam1.getHighestRDDId === Some(mapStageRddId)) + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) + + rdd1.collect() + assert(withRetries || slam1.value === 30) // +10 + // Still the same + assert(slam1.lastAttemptValueForAllRDDs() === Some(20)) + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) + + rdd2.collect() + // Repartition stage is reused (again), but result stage is re-executed (again). + assert(withRetries || slam1.value === 30) // no change + assert(withRetries || slam2.value === 30) // +10 + // Last attempt value is not duplicated, since result stage is an action on the same RDD. + assert(slam1.lastAttemptValueForAllRDDs() === Some(20)) + assert(slam2.lastAttemptValueForAllRDDs() === Some(10)) + + // Executed in different RDDs, but never duplicated. + assert(slam1.lastAttemptValueForRDDId(rdd1.id) === Some(10)) + assert(slam1.lastAttemptValueForRDDId(mapStageRddId) === Some(10)) + assert(slam1.lastAttemptValueForRDDId(repartition.id) === Some(0)) + assert(slam1.lastAttemptValueForRDDId(rdd2.id) === Some(0)) + assert(slam2.lastAttemptValueForRDDId(rdd1.id) === Some(0)) + assert(slam2.lastAttemptValueForRDDId(mapStageRddId) === Some(0)) + assert(slam1.lastAttemptValueForRDDId(repartition.id) === Some(0)) + assert(slam2.lastAttemptValueForRDDId(rdd2.id) === Some(10)) + + val newRepartition = rdd1.repartition(10) + val newRdd2 = newRepartition.map { x => + slam2.add(1) + x + } + newRdd2.collect() + assert(withRetries || slam1.value === 40) // +10 + assert(withRetries || slam2.value === 40) // +10 + // SLAM metrics get re-executed in the new RDDs + // rdd1 is reused, but the shuffle is new, and that is what submits the map stage. + assert(slam1.getNumRDDs === 3) + assert(slam1.lastAttemptValueForAllRDDs() === Some(30)) // +10 + assert(slam2.getNumRDDs === 2) + assert(slam2.lastAttemptValueForAllRDDs() === Some(20)) // +10 + // Values are recorded for the new highest RDD id. + assert(slam1.getHighestRDDId.isDefined) + val newMapStageId = slam1.getHighestRDDId.get + assert(newMapStageId > mapStageRddId) + assert(slam2.getHighestRDDId === Some(newRdd2.id)) + assert(slam1.lastAttemptValueForRDDId(newMapStageId) === Some(10)) + assert(slam2.lastAttemptValueForRDDId(newRdd2.id) === Some(10)) + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) + assert(slam2.lastAttemptValueForHighestRDDId() === Some(10)) + + logInfo(slam1.logAccumulatorState) + logInfo(slam2.logAccumulatorState) + } + + test("rdd take") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val rdd = spark.sparkContext.parallelize(1 to 100, 100).map { x => + slam.add(1) + x + } + withSparkContextConf( + // make it fixed to not be affected by potential changes of default. + config.RDD_LIMIT_INITIAL_NUM_PARTITIONS.key -> "1", + config.RDD_LIMIT_SCALE_UP_FACTOR.key -> "4" + ) { + rdd.take(1) // execute 1 partition + assert(withRetries || slam.value === 1) + assert(slam.lastAttemptValueForAllRDDs() === Some(1)) + + // take(2) scales up from 1 partition; the exact number of partitions scanned + // depends on the scale-up algorithm. + val valueBefore = slam.value + rdd.take(2) + val slamAfterTake2 = slam.lastAttemptValueForAllRDDs() + assert(slamAfterTake2.isDefined) + assert(slamAfterTake2.get >= 2) // at least 2 partitions + assert(slamAfterTake2.get < 100) // but not all partitions. + + // take(100) should execute all 100 partitions + rdd.take(100) + assert(slam.lastAttemptValueForAllRDDs() === Some(100)) + assert(slam.getNumRDDs === 1) + } + + logInfo(slam.logAccumulatorState) + } + + test("rdd coalesce") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val rdd1 = spark.sparkContext.parallelize(1 to 100, 100).map { x => + slam.add(1) + x + } + val rdd2 = rdd1.coalesce(20) + // Test that coalescing that changes partition count doesn't break anything. + rdd2.collect() + assert(slam.lastAttemptValueForRDDId(rdd2.id) === Some(100)) + rdd1.collect() + assert(slam.lastAttemptValueForRDDId(rdd1.id) === Some(100)) + + logInfo(slam.logAccumulatorState) + } + + test("dataset updates") { + val slam1 = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM1") + val df1 = spark.range(10).filter(Column(incrementMetric(slam1))) + + df1.collect() + assert(withRetries || slam1.value === 10) + assert(slam1.getHighestRDDId.isDefined) + val df1HighestId = slam1.getHighestRDDId.get + val df1ExecutedPlanRddId = df1.queryExecution.executedPlan.execute().id + // The RDD id recorded by SLAM is the wrapper RDD from getByteArrayRdd. + assert(df1HighestId == df1.queryExecution.executedPlan.lastByteArrayRddId.get) + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) + assert(slam1.lastAttemptValueForRDDId(df1HighestId) === Some(10)) + // Values retrieved from the Dataset are the same as from the RDD. + assert(slam1.lastAttemptValueForDataset(df1) === Some(10)) + assert(slam1.lastAttemptValueForQueryExecution(df1.queryExecution) === Some(10)) + + df1.collect() + assert(withRetries || slam1.value === 20) // +10 + // The same executedPlan RDD is reused, but getByteArrayRdd creates a new wrapper. + assert(df1.queryExecution.executedPlan.execute().id === df1ExecutedPlanRddId) + assert(slam1.getHighestRDDId === df1.queryExecution.executedPlan.lastByteArrayRddId) + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) + // Both wrapper RDDs are summed in allRDDs. + assert(slam1.lastAttemptValueForAllRDDs() === Some(20)) + assert(slam1.lastAttemptValueForDataset(df1) === Some(10)) + + val df2 = df1.filter("id < 5").filter(Column(incrementMetric(slam1))) + df2.collect() + assert(withRetries || slam1.value === 35) // +15 + assert(slam1.getHighestRDDId.isDefined) + val df2HighestId = slam1.getHighestRDDId.get + assert(df2HighestId === df2.queryExecution.executedPlan.lastByteArrayRddId.get) + // Both incrementMetric expressions are within the same Stage, so they record together. + assert(slam1.lastAttemptValueForHighestRDDId() === Some(15)) + // allRDDs includes wrapper RDDs from repeated df1.collect() calls. + assert(slam1.lastAttemptValueForAllRDDs() === Some(35)) + // New Dataset records only new value. + assert(slam1.lastAttemptValueForDataset(df2) === Some(15)) + // Value df1 is still remembered. + assert(slam1.lastAttemptValueForDataset(df1) === Some(10)) + + val slam2 = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM2") + val df3 = df1.repartition(1).filter(Column(incrementMetric(slam2))) + df3.collect() + assert(withRetries || slam1.value === 45) // +10 + assert(withRetries || slam2.value === 10) // new + // df3 creates a new plan, and the plan / RDD from df1 is not reused. + assert(slam1.getHighestRDDId.isDefined) + val slam1HighestId = slam1.getHighestRDDId.get + assert(slam1HighestId != df1HighestId) // new plan, new RDD id + assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) // from new execution + assert(slam1.lastAttemptValueForRDDId(df1HighestId) === Some(10)) // from first exec of df1 + // allRDDs includes wrapper RDDs from repeated collects. + assert(slam1.lastAttemptValueForAllRDDs() === Some(45)) + assert(slam2.lastAttemptValueForAllRDDs() === Some(10)) + // slam1 and slam2 are both executed in in df3 + assert(slam1.lastAttemptValueForDataset(df3) === Some(10)) + assert(slam2.lastAttemptValueForDataset(df3) === Some(10)) + // slam2 is not executed in df1 and df2. + assert(slam2.lastAttemptValueForDataset(df1) === Some(0)) + assert(slam2.lastAttemptValueForDataset(df2) === Some(0)) + // slam1 value from df1 and df2 are still remembered. + assert(slam1.lastAttemptValueForDataset(df1) === Some(10)) + assert(slam1.lastAttemptValueForDataset(df2) === Some(15)) + + // Plans and RDDs get reused (result stage is re-executed; shuffle stage is purely reused). + df3.collect() + // No change in dataset values. + assert(slam1.lastAttemptValueForDataset(df3) === Some(10)) + assert(slam2.lastAttemptValueForDataset(df3) === Some(10)) + assert(slam1.lastAttemptValueForDataset(df1) === Some(10)) + assert(slam1.lastAttemptValueForDataset(df2) === Some(15)) + + logInfo(slam1.logAccumulatorState) + logInfo(slam2.logAccumulatorState) + } + + test("dataset limit") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM1") + // 10 partitions of 10 elements each + val df = spark.range(0, 1000, 1, 10).filter(Column(incrementMetric(slam))) + var expectedMetricValue = 0 + var expectedSLAMValue = 0 + + // Note: this is sensitive to the internal implementation of LimitExec. + + df.take(5) + // One partition executed, local limit pushed into partition. + expectedMetricValue = 5 + expectedSLAMValue = 5 + assert(withRetries || slam.value === expectedMetricValue) + assert(slam.lastAttemptValueForHighestRDDId() === Some(expectedSLAMValue)) + // take(5) actually inline creates a new Dataset, with new executed plan + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + + df.take(50) + // One partition executed, local limit pushed into partition. + expectedMetricValue += 50 + expectedSLAMValue = 50 + assert(withRetries || slam.value === expectedMetricValue) + // New SQL plan creates new RDDs, so this is seen as new execution. + assert(slam.lastAttemptValueForHighestRDDId() === Some(expectedSLAMValue)) + assert(slam.getNumRDDs === 2) + assert(slam.lastAttemptValueForAllRDDs() === Some(expectedMetricValue)) + // take(50) executes a different inline Dataset and plan. + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + + df.take(220) + // Three partitions executed. + expectedMetricValue += 300 + expectedSLAMValue = 300 + assert(withRetries || slam.value === expectedMetricValue) + assert(slam.lastAttemptValueForHighestRDDId() === Some(expectedSLAMValue)) + assert(slam.getNumRDDs === 3) + assert(slam.lastAttemptValueForAllRDDs() === Some(expectedMetricValue)) + // take(220) executes a different inline Dataset and plan. + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + + df.take(320) + // Five partitions executed. + expectedMetricValue += 500 + expectedSLAMValue = 500 + assert(withRetries || slam.value === expectedMetricValue) + assert(slam.lastAttemptValueForHighestRDDId() === Some(expectedSLAMValue)) + assert(slam.getNumRDDs === 4) + assert(slam.lastAttemptValueForAllRDDs() === Some(expectedMetricValue)) + // take(320) executes a different inline Dataset and plan. + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + + df.take(1) + // One partition scanned, local limit pushed into partition. + expectedMetricValue += 1 + expectedSLAMValue = 1 + assert(withRetries || slam.value === expectedMetricValue) + // New RDD, so the value from new execution is back to 1. + assert(slam.lastAttemptValueForHighestRDDId() === Some(expectedSLAMValue)) + assert(slam.getNumRDDs === 5) + assert(slam.lastAttemptValueForAllRDDs() === Some(expectedMetricValue)) + // take(1) executes a different inline Dataset and plan. + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + + logInfo(slam.logAccumulatorState) + } + + test("driver set value") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + slam.set(10) + + // Regular metric value + assert(withRetries || slam.value === 10) + + assert(slam.getDirectDriverValue === Some(10)) + // "Driver update" is returned under "highest" and "all" RDDs + assert(slam.lastAttemptValueForHighestRDDId() === Some(10)) + assert(slam.lastAttemptValueForAllRDDs() === Some(10)) + assert(slam.getNumRDDs === 0) + // When specific RDDs are requested, driver value is not returned. + assert(slam.lastAttemptValueForRDDId(42) === Some(0)) + assert(slam.lastAttemptValueForRDDIds(Seq(7, 42)) === Some(0)) + + // Incrementing works + slam.add(5) + assert(withRetries || slam.value === 15) + assert(slam.lastAttemptValueForHighestRDDId() === Some(15)) + assert(slam.lastAttemptValueForAllRDDs() === Some(15)) + assert(slam.getDirectDriverValue === Some(15)) + + // Negative increments are ignored by SQLMetric + slam.add(-3) + assert(withRetries || slam.value === 15) + assert(slam.lastAttemptValueForHighestRDDId() === Some(15)) + assert(slam.lastAttemptValueForAllRDDs() === Some(15)) + assert(slam.getDirectDriverValue === Some(15)) + + // Reset does not reset SLAM. + slam.reset() + assert(withRetries || slam.value === 0) + assert(slam.lastAttemptValueForHighestRDDId() === Some(15)) + assert(slam.lastAttemptValueForAllRDDs() === Some(15)) + assert(slam.getDirectDriverValue === Some(15)) + + // Setting it back... + slam.set(20) + assert(withRetries || slam.value === 20) + assert(slam.lastAttemptValueForHighestRDDId() === Some(20)) + assert(slam.lastAttemptValueForAllRDDs() === Some(20)) + assert(slam.getDirectDriverValue === Some(20)) + assert(slam.getNumRDDs === 0) + + val df = spark.range(10).filter(Column(incrementMetric(slam))) + // SLAM was not executed in this Dataset, the driver value set manually + // before should not be returned. + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === None) + df.collect() + assert(withRetries || slam.value === 30) + // SLAM bails out when it sees both driver and executor values + assert(slam.lastAttemptValueForHighestRDDId() === None) + assert(slam.lastAttemptValueForAllRDDs() === None) + assert(slam.lastAttemptValueForRDDId(42) === None) + assert(slam.lastAttemptValueForRDDIds(Seq(7, 42)) === None) + assert(slam.lastAttemptValueForDataset(df) === None) + assert(!slam.getValid) + assert(slam.getNumRDDs === 0) // invalidated before RDD got recorded + assert(slam.getDirectDriverValue === Some(20)) + // Invalidated before QueryExecution value was recorded. + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === None) + + slam.reset() + slam.set(10) + assert(withRetries || slam.value === 10) + // SLAM stays bailed out. + assert(slam.lastAttemptValueForHighestRDDId() === None) + assert(slam.lastAttemptValueForAllRDDs() === None) + assert(slam.lastAttemptValueForRDDId(42) === None) + assert(slam.lastAttemptValueForRDDIds(Seq(7, 42)) === None) + assert(slam.lastAttemptValueForDataset(df) === None) + assert(!slam.getValid) + // SLAM info doesn't get updated anymore when invalid, but stays around for debugging purposes. + assert(slam.getNumRDDs === 0) + assert(slam.getDirectDriverValue === Some(20)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === None) + + logInfo(slam.logAccumulatorState) + + // resetLastAttemptAccumulator resets it and makes it valid to be used again. + slam.resetLastAttemptAccumulator() + assert(slam.getValid) + slam.set(42) + assert(slam.lastAttemptValueForHighestRDDId() === Some(42)) + assert(slam.getDirectDriverValue === Some(42)) + assert(slam.getNumRDDs === 0) + assert(slam.lastAttemptValueForDataset(df) === Some(0)) + } + + test("ConvertToLocalRelation direct driver execution") { + // Normally ConvertToLocalRelation is disabled in tests. + withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> "") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val df = Seq(1, 2, 3).toDF("a").filter(Column(incrementMetric(slam))) + + // SLAM is executed on the driver in the Optimized by ConvertToLocalRelation + df.collect() + assert(slam.lastAttemptValueForAllRDDs() === Some(3)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(3)) + assert(slam.getDirectDriverValue === Some(3)) + // SLAM recognizes it was executed on the driver + // in the scope of the QueryExecution of this Dataset. + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === Some(3)) + + // Second action does not re-execute Optimizer. + df.collect() + assert(slam.lastAttemptValueForAllRDDs() === Some(3)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(3)) + assert(slam.getDirectDriverValue === Some(3)) + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + + // Limitation: When a new Dataset is built and Optimizer reexecutes ConvertToLocalRelation, + // SLAM RDD retrieval cannot reason about re-execution on the driver, + // leading to duplicated metrics. + val df2 = df.withColumn("foo", Column(Literal("foo"))) + df2.collect() + assert(slam.lastAttemptValueForAllRDDs() === Some(6)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(6)) + assert(slam.getDirectDriverValue === Some(6)) + // But it recognizes that it is done in a new QueryExecution and is able to distinguish that + // without duplicates. + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + assert(slam.lastAttemptValueForDataset(df2) === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df2.queryExecution.id.toString) === Some(3)) + + // No RDD executions were recorded. + assert(slam.getNumRDDs === 0) + + logInfo(slam.logAccumulatorState) + } + } + + test("ConvertToLocalRelation manual optimizer triggering") { + // Normally ConvertToLocalRelation is disabled in tests. + withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> "") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val df = Seq(1, 2, 3).toDF("a").filter(Column(incrementMetric(slam))) + // Trigger the optimizer manually, which will trigger ConvertToLocalRelation + df.queryExecution.assertOptimized() + + // SLAM recognizes it was executed on the driver + // in the scope of the QueryExecution of this Dataset. + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === Some(3)) + + // Repeated actions do not re-execute Optimizer. + df.collect() + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + df.collect() + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + + logInfo(slam.logAccumulatorState) + } + } + + test("ConvertToLocalRelation in explain") { + // Normally ConvertToLocalRelation is disabled in tests. + withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> "") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val df = Seq(1, 2, 3).toDF("a").filter(Column(incrementMetric(slam))) + + // EXPLAIN triggers the optimizer and triggered ConvertToLocalRelation to execute + df.explain(true) + assert(withRetries || slam.value === 3) + assert(slam.lastAttemptValueForAllRDDs() === Some(3)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(3)) + assert(slam.getDirectDriverValue === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === Some(3)) + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + // Retriggering EXPLAIN does not cause duplicates + df.explain(true) + assert(withRetries || slam.value === 3) + assert(slam.lastAttemptValueForAllRDDs() === Some(3)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(3)) + assert(slam.getDirectDriverValue === Some(3)) + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === Some(3)) + + // Execution does not re-execute Optimizer and does not duplicate metric. + df.collect() + assert(withRetries || slam.value === 3) + assert(slam.lastAttemptValueForAllRDDs() === Some(3)) + assert(slam.lastAttemptValueForHighestRDDId() === Some(3)) + assert(slam.getDirectDriverValue === Some(3)) + assert(slam.lastAttemptValueForDataset(df) === Some(3)) + assert(slam.getDirectDriverQueryExecutionValue(df.queryExecution.id.toString) === Some(3)) + + // No RDD executions were recorded. + assert(slam.getNumRDDs === 0) + + logInfo(slam.logAccumulatorState) + } + } + + test("SC-168178 BroadcastNestedLoopJoin outer executes probe side twice") { + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val build = + spark.range(5).selectExpr("id as b").hint("broadcast") + val probe = spark.range(100).selectExpr("id as p").filter(Column(incrementMetric(slam))) + val df = probe.join(build, usingColumns = Seq(), joinType = "rightouter") + df.collect() + assert(AdaptiveSparkPlanHelper.exists(df.queryExecution.executedPlan) { + case BroadcastNestedLoopJoinExec(_, _, BuildRight, RightOuter, None) => true + case _ => false + }) + // When build side is outer, probe side gets executed twice by BNLJ: + // once for matches, and once to mark unmatched build rows. + // TODO(SC-168178) This can be a non-determinism correctness issue + assert(slam.getNumRDDs === 2) + assert(slam.lastAttemptValueForAllRDDs() === Some(200)) + // However, the two executions are different RDDs, but only one of them is highest id. + assert(slam.lastAttemptValueForHighestRDDId() === Some(100)) + // Dataset sees both RDDs. + assert(slam.lastAttemptValueForDataset(df) === Some(200)) + } + + test("SLAM with AQE CoalesceShufflePartitions") { + // Adapted from tests in CoalesceShufflePartitionsSuite + + val stage1Slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM2") + val stage2Slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "SLAM2") + val stage1MetricExpr = Column(incrementMetric(stage1Slam)) + val stage2MetricExpr = Column(incrementMetric(stage2Slam)) + + // Dataframe with a SLAM before and after a shuffle. + val df = spark.range(0, 1000, 1, numPartitions = 10) + .selectExpr("id % 20 as key", "id as value") + .filter(stage1MetricExpr) + .groupBy("key").count() + .filter(stage2MetricExpr) + + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "2000") { + df.collect() + } + + // Verify the AQE coalescing happened and coalesced the shuffle into 3 partitions. + // (based on ADVISORY_PARTITION_SIZE_IN_BYTES config) + val finalPlan = AdaptiveSparkPlanHelper.stripAQEPlan(df.queryExecution.executedPlan) + val shuffleReads = finalPlan.collect { + case r @ CoalescedShuffleRead() => r + } + assert(shuffleReads.nonEmpty) + shuffleReads.foreach { read => + // check there is actual coalescing of partitions happening + assert(read.isCoalescedRead) + assert(read.partitionSpecs.exists { + case p: CoalescedPartitionSpec if p.startReducerIndex < p.endReducerIndex - 1 => true + case _ => false + }) + } + + // Verify SLAM metrics. + assert(stage1Slam.lastAttemptValueForHighestRDDId() === Some(1000)) + assert(stage2Slam.lastAttemptValueForHighestRDDId() === Some(20)) + assert(stage1Slam.lastAttemptValueForDataset(df) === Some(1000)) + assert(stage2Slam.lastAttemptValueForDataset(df) === Some(20)) + } + + test("WholeStageCodegenExec fallback to non-codegen") { + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "1" // force fallback due to too large method + ) { + // This test is to verify that SLAM works correctly when WholeStageCodegenExec falls back + // to non-codegen execution. + val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + val df = spark + .range(10) + .filter(Column(incrementMetric(slam))) + // these two operators will be turned into a WholeStageCodegen, + .selectExpr("id + 1 as foo", "id + 2 as bar") + .filter("foo < bar") + df.collect() + assert(slam.lastAttemptValueForDataset(df) === Some(10)) + // Metric is attributed to the child of the WSCG node. + val wscg = df.queryExecution.executedPlan.collectFirst { + case w: WholeStageCodegenExec => w + } + assert(wscg.isDefined) + assert(slam.getHighestRDDId.isDefined) + // The highest tracked RDD is the wrapper RDD from getByteArrayRdd. + assert(slam.getHighestRDDId === df.queryExecution.executedPlan.lastByteArrayRddId) + } + } +} + +class SQLLastAttemptMetricIntegrationSuiteWithStageRetries + extends SQLLastAttemptMetricIntegrationSuite { + override protected def withRetries = true + + override protected def test( + testName: String, + testTags: org.scalatest.Tag*) + (testFun: => Any) + (implicit pos: org.scalactic.source.Position): Unit = { + super.test(testName, testTags : _*) { + withSparkContextConf(config.Tests.INJECT_SHUFFLE_FETCH_FAILURES.key -> "true") { + // Stage retries should not affect SLAM metrics. + testFun + } + }(pos) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricPlanShapesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricPlanShapesSuite.scala new file mode 100644 index 0000000000000..ea8d9568f7e4b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricPlanShapesSuite.scala @@ -0,0 +1,490 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.metric + +import scala.reflect.ClassTag +import scala.util.Random + +import org.scalatest.Tag + +import org.apache.spark.internal.config +import org.apache.spark.sql.execution.{CollectLimitExec, RDDScanExec, SparkPlan} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, AQETestHelper, DisableAdaptiveExecutionSuite} +import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.exchange._ +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +class SQLLastAttemptMetricPlanShapesSuite + extends SharedSparkSession + with SQLMetricsTestUtils + // Need to control AQE per-test to ensure expected plan shapes. + with DisableAdaptiveExecutionSuite { + + import testImplicits._ + + import SQLLastAttemptMetricPlanShapesSuite._ + + // Avoid initialising this before the Spark Context is initialised. + protected var testSLAMetric: SQLLastAttemptMetric = _ + + protected def setUpTestTable(): Unit = { + val rand = new Random(1) + val randomPrefix = rand.nextString(30) + spark + .range(NUM_RECORDS) + .map { id => + (id, (id % LOW_CARDINALITY).toInt, randomPrefix + (id % LARGE_CARDINALITY)) + }.toDF("id", "low_cardinality_col", "large_col") + .write.format("parquet").saveAsTable(TABLE_NAME) + val numRecords = spark.read.table(TABLE_NAME).count() + assert(numRecords === 300) + } + + override protected def beforeAll(): Unit = { + super.beforeAll() + setUpTestTable() + testSLAMetric = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") + // Move this into a local field so the closure doesn't hang on to the whole `this` + // reference as well. + val metric = testSLAMetric + val incrementMetric = () => { metric += 1; true } + val incrementMetricUdf = udf(incrementMetric).asNondeterministic() + spark.udf.register("increment_metric", incrementMetricUdf) + } + + override protected def afterAll(): Unit = { + spark.sql(s"DROP TABLE IF EXISTS $TABLE_NAME") + super.afterAll() + } + + override protected def beforeEach(): Unit = { + super.beforeEach() + // note: reset() does not influence lastAttemptValue, but influences regular value + testSLAMetric.reset() + } + + object MetricValue { + type Check = Option[Long] => Unit + + // Having the asserts in these helpers instead of in testPhysicalPlanShape + // produces better error messages. + def exactly(expectedValue: Long): Check = actualValue => + assert(actualValue === Some(expectedValue)) + + def atLeast(minimumValue: Long): Check = { actualValue => + assert(actualValue.isDefined) + assert(actualValue.get >= minimumValue) + } + } + + object PhysicalPlan { + type Check = SparkPlan => Unit + + val ANY: Check = _ => () // Ignore. + + def contains[T <: SparkPlan: ClassTag](implicit cls: ClassTag[T]): Check = { plan => + val existsSomeNodeOfTypeT = + AdaptiveSparkPlanHelper.existsWithSubqueries(plan)(_.getClass == cls.runtimeClass) + assert( + existsSomeNodeOfTypeT, + s"Expected a node ${cls.runtimeClass.getSimpleName}. Actual Plan:\n${plan.treeString}") + } + + def exists(pf: PartialFunction[SparkPlan, Boolean]): Check = { plan => + val existsMatchingNode = + AdaptiveSparkPlanHelper.existsWithSubqueries(plan)(pf.lift(_).getOrElse(false)) + assert( + existsMatchingNode, + s"Unexpected plan (check match function). Actual Plan:\n${plan.treeString}") + } + + def isAQE: Boolean = SQLConf.get.getConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED) + + def hasStageRetries: Boolean = spark.sparkContext.conf + .getOption(config.Tests.INJECT_SHUFFLE_FETCH_FAILURES.key).contains("true") + + def hasAQEReplans: Boolean = AQETestHelper.isForcedCancellationEnabled + } + + protected def testPhysicalPlanShape( + label: String, + setup: () => Unit = () => (), + extraSQLConfs: Map[String, String] = Map.empty, + sqlQuery: String, + executedPlanCheck: PhysicalPlan.Check, + metricValueCheck: MetricValue.Check + )(testTags: Tag*): Unit = { + for { + useAQE <- BOOLEAN_DOMAIN + stageRetries <- BOOLEAN_DOMAIN + aqeReplans <- if (useAQE) BOOLEAN_DOMAIN else Seq(false) + } test(s"$label - " + + s"useAQE=$useAQE, stageRetries=$stageRetries, aqeReplans=$aqeReplans", + testTags: _*) { + + // There is some special handling for df.cache() / df.persist() / df.localCheckpoint() tests. + val cachedPlanTest = label.startsWith("cache - ") + + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> useAQE.toString) { + setup() + withSQLConf(extraSQLConfs.toSeq: _*) { + val aqeRetryMetrics = if (aqeReplans) Seq(testSLAMetric) else Seq.empty + AQETestHelper.withForcedCancellation(aqeRetryMetrics: _*) { + withSparkContextConf( + config.Tests.INJECT_SHUFFLE_FETCH_FAILURES.key -> stageRetries.toString) { + val resultDf = spark.sql(sqlQuery) + val _ = resultDf.collect() + + // normal value of the metrics shall not work with retries or replans + if (!stageRetries && !aqeReplans) { + metricValueCheck(Some(testSLAMetric.value)) + } + // test LastRDDValue + metricValueCheck(testSLAMetric.lastAttemptValueForHighestRDDId()) + // test Dataset value + if (!cachedPlanTest) { + // SLAM.lastAttemptValueForDataset is undefined when SLAM is inside + // cached or checkpointed plan. + metricValueCheck(testSLAMetric.lastAttemptValueForDataset(resultDf)) + } + // test expected plan shape + val executedPlan = resultDf.queryExecution.executedPlan + executedPlanCheck(executedPlan) + val rddIdExec = testSLAMetric.getHighestRDDId + + // Repeated execution should not affect SLAM metric value + resultDf.collect() + // test LastRDDValue again + metricValueCheck(testSLAMetric.lastAttemptValueForHighestRDDId()) + // test Dataset value again + if (!cachedPlanTest) { + // SLAM.lastAttemptValueForDataset is undefined when SLAM is inside + // cached or checkpointed plan. + metricValueCheck(testSLAMetric.lastAttemptValueForDataset(resultDf)) + } + + // count() transformation creates a new Dataset. + // It should not affect the SLAM metric value of the first Dataset. + resultDf.count() + // test Dataset value again + if (!cachedPlanTest) { + // SLAM.lastAttemptValueForDataset is undefined when SLAM is inside + // cached or checkpointed plan. + metricValueCheck(testSLAMetric.lastAttemptValueForDataset(resultDf)) + } + // This should have created a new plan and executed new RDDs, + // unless it's a test of cached plan. + val rddIdExecCount = testSLAMetric.getHighestRDDId + if (cachedPlanTest) { + assert(rddIdExecCount === rddIdExec) + } else { + // count() creates a new plan with new RDDs. + assert(rddIdExecCount.get > rddIdExec.get) + } + } + } + } + } + } + } + + protected def testPlanShape( + label: String, + sqlQuery: String, + // Assert on the result of the test metric. + metricValueCheck: MetricValue.Check, + testTags: Tag* + ): Unit = { + testPhysicalPlanShape( + label = label, + sqlQuery = sqlQuery, + executedPlanCheck = PhysicalPlan.ANY, + metricValueCheck = metricValueCheck + )(testTags: _*) + } + + testPlanShape( + label = "simple plan", + sqlQuery = s"SELECT * FROM $TABLE_NAME WHERE increment_metric()", + metricValueCheck = MetricValue.exactly(NUM_RECORDS) + ) + + /* ******************** + * Various Subquery Plans + * ********************** */ + testPlanShape( + label = "subquery - IN", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id IN ( + | SELECT low_cardinality_col + | FROM $TABLE_NAME + | WHERE increment_metric())""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS) + ) + + testPlanShape( + label = "subquery - IN - aggregation", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id IN ( + | SELECT DISTINCT(low_cardinality_col) + | FROM $TABLE_NAME + | WHERE increment_metric())""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS) + ) + + testPlanShape( + label = "subquery - IN - TVF", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id IN ( + | SELECT * + | FROM range(5) + | WHERE increment_metric())""".stripMargin, + metricValueCheck = MetricValue.exactly(5) + ) + + testPlanShape( + label = "subquery - IN - explode", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id IN ( + | SELECT explode(array(low_cardinality_col, low_cardinality_col + 1)) + | FROM $TABLE_NAME + | WHERE increment_metric())""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS) + ) + + testPlanShape( + label = "subquery - IN - lateral view explode", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id IN ( + | SELECT new_column + | FROM $TABLE_NAME LATERAL VIEW + | explode(array(low_cardinality_col, low_cardinality_col + 1)) AS new_column + | WHERE increment_metric())""".stripMargin, + metricValueCheck = MetricValue.exactly(2 * NUM_RECORDS) + ) + + testPlanShape( + label = "subquery - scalar", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id == ( + | SELECT MAX(low_cardinality_col) + | FROM $TABLE_NAME + | WHERE increment_metric())""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS) + ) + + testPhysicalPlanShape( + label = "subquery - EXISTS", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE EXISTS ( + | SELECT low_cardinality_col + | FROM $TABLE_NAME + | WHERE increment_metric())""".stripMargin, + // This turns into a LIMIT query. + metricValueCheck = MetricValue.atLeast(1), + executedPlanCheck = PhysicalPlan.contains[CollectLimitExec] + )() + + testPhysicalPlanShape( + label = "subquery - EXISTS (correlated)", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME outer_table + | WHERE EXISTS ( + | SELECT low_cardinality_col + | FROM $TABLE_NAME inner_table + | WHERE increment_metric() + | AND inner_table.low_cardinality_col == outer_table.low_cardinality_col) + | """.stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.exists { + case _: BroadcastExchangeExec => true + case _: ShuffleExchangeExec => true + case _: ReusedExchangeExec => true + } + )() + + /* ***************************** + * Plans with different Exchanges + * ****************************** */ + + /* + * To cover: + * - ShuffleExchangeLike + * - ShuffleExchangeExec: covered by exchange - Shuffle + * - ReusedExchangeExec: covered by exchange - ReusedExchangeExec + * - BroadcastExchangeLike: + * - BroadcastExchangeExec: covered above by subquery - EXISTS (correlated)) + * - InMemoryTableScanLike (InMemoryTableScanExec): covered by exchange - InMemoryTableScanExec + */ + + testPhysicalPlanShape( + label = "exchange - Shuffle", + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME orig + | FULL OUTER JOIN ( + | SELECT * + | FROM $TABLE_NAME + | WHERE increment_metric() + | ) with_metric USING (id)""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.exists { + case _: ShuffleExchangeExec => true + // After forced AQE replans it may use ReusedExchange. + case _: ReusedExchangeExec if PhysicalPlan.hasAQEReplans => true + } + )() + + testPhysicalPlanShape( + label = "exchange - ReusedExchangeExec", + sqlQuery = + s"""WITH subquery_with_metric AS ( + | SELECT * + | FROM $TABLE_NAME + | WHERE increment_metric() + | ) + |SELECT * + | FROM subquery_with_metric a JOIN subquery_with_metric b USING (id)""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.contains[ReusedExchangeExec] + )() + + for (eager <- Seq("true", "false", "manual")) { + // SLAM metric in the top stage of cached query. + testPhysicalPlanShape( + label = s"cache - InMemoryTableScanExec - result stage - eager=$eager", + setup = () => { + spark.sql(s""" + |CREATE OR REPLACE TEMP VIEW table_with_metric AS ( + | SELECT low_cardinality_col + | FROM $TABLE_NAME + | WHERE increment_metric() + |)""".stripMargin) + if (eager == "true") { + spark.sql("CACHE TABLE table_with_metric") + } else { // false or manual + spark.sql("CACHE LAZY TABLE table_with_metric") + } + if (eager == "manual") { + spark.sql("select count(*) from table_with_metric").collect() + } + }, + sqlQuery = + s"""SELECT * + | FROM $TABLE_NAME + | WHERE id IN (SELECT * FROM table_with_metric)""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.contains[InMemoryTableScanExec] + )() + + // SLAM metric in the map stage of cached query. + testPhysicalPlanShape( + label = s"cache - InMemoryTableScanExec - map stage - eager=$eager", + setup = () => { + spark.sql(s""" + |CREATE OR REPLACE TEMP VIEW table_with_metric AS ( + | SELECT id, SUM(low_cardinality_col) + | FROM $TABLE_NAME + | WHERE increment_metric() + | GROUP BY id + |)""".stripMargin) + if (eager == "true") { + spark.sql("CACHE TABLE table_with_metric") + } else { // false or manual + spark.sql("CACHE LAZY TABLE table_with_metric") + } + if (eager == "manual") { + spark.sql("select count(*) from table_with_metric").collect() + } + }, + sqlQuery = + s"""SELECT * + | FROM table_with_metric""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.contains[InMemoryTableScanExec] + )() + + testPhysicalPlanShape( + label = s"cache - localCheckpoint - result stage - eager=$eager", + setup = () => { + val df = spark.sql(s""" + |SELECT low_cardinality_col + |FROM $TABLE_NAME + |WHERE increment_metric()""".stripMargin) + val cpEager = if (eager == "true") true else false + val cpDf = df.localCheckpoint(eager = cpEager) + if (eager == "manual") { + cpDf.count() + } + cpDf.createOrReplaceTempView("cp_table_with_metric") + }, + sqlQuery = + s"""SELECT * + | FROM cp_table_with_metric""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.contains[RDDScanExec] + )() + + testPhysicalPlanShape( + label = s"cache - localCheckpoint - map stage - eager=$eager", + setup = () => { + val df = spark.sql(s""" + |SELECT id, SUM(low_cardinality_col) + |FROM $TABLE_NAME + |WHERE increment_metric() + |GROUP BY id""".stripMargin) + val cpEager = if (eager == "true") true else false + val cpDf = df.localCheckpoint(eager = cpEager) + if (eager == "manual") { + cpDf.count() + } + cpDf.createOrReplaceTempView("cp_table_with_metric") + }, + sqlQuery = + s"""SELECT * + | FROM cp_table_with_metric""".stripMargin, + metricValueCheck = MetricValue.exactly(NUM_RECORDS), + executedPlanCheck = PhysicalPlan.contains[RDDScanExec] + )() + } +} + +object SQLLastAttemptMetricPlanShapesSuite { + val NUM_RECORDS: Long = 300 + val LOW_CARDINALITY: Int = 5 + val LARGE_CARDINALITY: Int = 111 + + val TABLE_NAME: String = "test_table" +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala new file mode 100644 index 0000000000000..b77463d93b235 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.metric + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} +import java.util.Properties + +import org.mockito.Mockito.when +import org.scalatestplus.mockito.MockitoSugar.mock + +import org.apache.spark.{SharedSparkContext, SparkFunSuite} +import org.apache.spark.rdd.RDD +import org.apache.spark.scheduler.TaskInfo + +/** Tests internals of [[SQLLastAttemptMetric]]. */ +class SQLLastAttemptMetricUnitSuite extends SparkFunSuite with SharedSparkContext { + + // scalastyle:off classforname + private val sqlLastAttemptMetricClass = Class + .forName("org.apache.spark.sql.execution.metric.SQLLastAttemptMetric") + // scalastyle:on classforname + + private val lastAttemptInitializedField = + sqlLastAttemptMetricClass.getDeclaredField("lastAttemptAccumulatorInitialized") + + private val lastAttemptRddsMapField = + sqlLastAttemptMetricClass.getDeclaredField( + "org$apache$spark$util$LastAttemptAccumulator$$lastAttemptRddsMap") + + private val directDriverValueField = + sqlLastAttemptMetricClass.getDeclaredField( + "org$apache$spark$util$LastAttemptAccumulator$$lastAttemptDirectDriverValue") + + private val partialMergeValMethod = sqlLastAttemptMetricClass.getMethod("partialMergeVal") + + private val mockRdd = mock[RDD[_]] + private val mockTaskInfo = mock[TaskInfo] + private val mockProperties = new Properties + + // Set mock attempt for mock Task, TaskInfo and RDD + // that can be used with mergeLastAttempt. + def setMockAttempt(rddId: Int, partitionId: Int, stageId: Int, stageAttemptId: Int): Unit = { + // reset to mock defaults + when(mockTaskInfo.attemptNumber).thenReturn(0) + when(mockRdd.scope).thenReturn(None) + when(mockRdd.getNumPartitions).thenReturn(5) + + when(mockRdd.id).thenReturn(rddId) + when(mockTaskInfo.partitionId).thenReturn(partitionId) + } + + // stageId and stageAttemptId are passed directly to mergeLastAttempt + + override def beforeAll(): Unit = { + super.beforeAll() + lastAttemptInitializedField.setAccessible(true) + lastAttemptRddsMapField.setAccessible(true) + directDriverValueField.setAccessible(true) + partialMergeValMethod.setAccessible(true) + } + + override def afterAll(): Unit = { + lastAttemptInitializedField.setAccessible(false) + lastAttemptRddsMapField.setAccessible(false) + directDriverValueField.setAccessible(false) + partialMergeValMethod.setAccessible(false) + super.beforeAll() + } + + test("serialization and deserialization") { + val slam = SQLLastAttemptMetrics.createMetric(sc, "test SLAM") + + assert(lastAttemptInitializedField.getBoolean(slam) === true) + assert(lastAttemptRddsMapField.get(slam) != null) + assert(directDriverValueField.get(slam) != null) + + // Serialize slam to ObjectOutputStream and deserialize it back. + val obs1 = new ByteArrayOutputStream() + val oos1 = new ObjectOutputStream(obs1) + oos1.writeObject(slam) + oos1.close() + val ois1 = new ObjectInputStream(new ByteArrayInputStream(obs1.toByteArray)) + val deser = ois1.readObject().asInstanceOf[SQLLastAttemptMetric] + + // serialized version should not be initialized + assert(lastAttemptInitializedField.getBoolean(deser) === false) + assert(lastAttemptRddsMapField.get(deser) == null) + assert(directDriverValueField.get(deser) == null) + + deser.set(42) + deser.add(7) + assert(deser.value === 49) + // these functions shouldn't be used on the deserialized metric, + // but assertions should be caught and None should be returned. + assert(deser.lastAttemptValueForHighestRDDId() === None) + assert(deser.lastAttemptValueForRDDId(1) === None) + assert(deser.lastAttemptValueForRDDIds(Seq(1, 2, 3)) === None) + assert(deser.lastAttemptValueForAllRDDs() === None) + // mergeLastAttempt shouldn't be used on the deserialized metric, + // but it should catch error and not fail. + deser.mergeLastAttempt(slam, null, null, 0, 0, null) + + // Serialize and deserialize again. + val obs2 = new ByteArrayOutputStream() + val oos2 = new ObjectOutputStream(obs2) + oos2.writeObject(deser) + oos2.close() + val ois2 = new ObjectInputStream(new ByteArrayInputStream(obs2.toByteArray)) + val reser = ois2.readObject().asInstanceOf[SQLLastAttemptMetric] + // Check that the value is brought back and can be used as partialMergeVal. + assert(reser.value === 49L) + assert(partialMergeValMethod.invoke(reser) === 49L) + } + + test("copy and mergeLastAttempt") { + val slam = SQLLastAttemptMetrics.createMetric(sc, "test SLAM") + + assert(lastAttemptInitializedField.getBoolean(slam) == true) + assert(lastAttemptRddsMapField.get(slam) != null) + assert(directDriverValueField.get(slam) != null) + + // copy should not initialize SLAM data. + val acc = slam.copy() + assert(lastAttemptInitializedField.getBoolean(acc) == false) + assert(lastAttemptRddsMapField.get(acc) == null) + assert(directDriverValueField.get(acc) == null) + // these functions shouldn't be used on the copy, + // but assertions should be caught and None should be returned. + assert(acc.lastAttemptValueForHighestRDDId() === None) + assert(acc.lastAttemptValueForRDDId(1) === None) + assert(acc.lastAttemptValueForRDDIds(Seq(1, 2, 3)) === None) + assert(acc.lastAttemptValueForAllRDDs() === None) + // mergeLastAttempt shouldn't be used on the copy, + // but it should catch error and not fail. + acc.mergeLastAttempt(slam, null, null, 0, 0, null) + + // Let's play with merging acc into slam. + setMockAttempt(rddId = 1, partitionId = 0, stageId = 10, stageAttemptId = 10) + acc.set(10) + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 10, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(10)) + + setMockAttempt(rddId = 1, partitionId = 1, stageId = 10, stageAttemptId = 10) + acc.set(10) // new partition id + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 10, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(20)) // 10 + 10, aggregated new partition id + + setMockAttempt(rddId = 1, partitionId = 1, stageId = 10, stageAttemptId = 9) + acc.set(7) // same partition id, older attempt. + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 9, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(20)) // no change + + setMockAttempt(rddId = 1, partitionId = 1, stageId = 9, stageAttemptId = 11) + acc.set(7) // same partition id, older stage. + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 9, 11, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(20)) // no change + + setMockAttempt(rddId = 1, partitionId = 1, stageId = 10, stageAttemptId = 11) + acc.set(7) // same partition id, newer attempt. + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 11, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(17)) // 10 replaced with 7 + + setMockAttempt(rddId = 1, partitionId = 1, stageId = 11, stageAttemptId = 1) + acc.set(8) // same partition id, newer stage. + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 11, 1, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(18)) // 7 replaced with 8 + + setMockAttempt(rddId = 2, partitionId = 2, stageId = 1, stageAttemptId = 1) + acc.set(42) // new RDD + slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 1, 1, mockProperties) + assert(slam.lastAttemptValueForRDDId(1) === Some(18)) // no change for rddId=1 + assert(slam.lastAttemptValueForRDDId(2) === Some(42)) // new RDD added + assert(slam.lastAttemptValueForAllRDDs() === Some(60)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala index 1df5f522b4b8d..463af0618de6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala @@ -25,8 +25,10 @@ import org.apache.spark.TestUtils import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.{SparkPlan, SparkPlanInfo} import org.apache.spark.sql.execution.ui.{SparkPlanGraph, SQLAppStatusStore} +import org.apache.spark.sql.functions.udf import org.apache.spark.sql.internal.SQLConf.WHOLESTAGE_CODEGEN_ENABLED import org.apache.spark.sql.test.SQLTestUtils @@ -34,6 +36,28 @@ import org.apache.spark.sql.test.SQLTestUtils trait SQLMetricsTestUtils extends SQLTestUtils { import testImplicits._ + protected val BOOLEAN_DOMAIN: Seq[Boolean] = Seq(true, false) + + /** + * @return An `Expression` that increments a SQL metric and + * evaluates to true. Can be used in a filter. + */ + protected def incrementMetric( + metric: SQLMetric): Expression = { + udf { () => + { metric += 1; true } + }.asNondeterministic().apply().expr + } + + /** @return An `Expression` to increment multiple SQL metrics */ + protected def incrementMetrics(metrics: Seq[SQLMetric]): Expression = { + metrics.map(incrementMetric(_)).fold( + org.apache.spark.sql.catalyst.expressions.Literal(true): Expression) { + (acc, incrMetric) => + org.apache.spark.sql.catalyst.expressions.And(acc, incrMetric) + } + } + protected def currentExecutionIds(): Set[Long] = { spark.sparkContext.listenerBus.waitUntilEmpty(10000) statusStore.executionsList().map(_.executionId).toSet diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index e330db6b12921..0815ab35d7c63 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -255,6 +255,28 @@ private[sql] trait SQLTestUtilsBase super.withSQLConf(pairs: _*)(f) } + /** + * Temporarily sets SparkContext configuration values for testing. + * This is for configs that must be set on the SparkContext (not + * SQLConf), such as testing flags. + */ + protected def withSparkContextConf[T]( + pairs: (String, String)*)(f: => T): T = { + val sc = spark.sparkContext + val oldValues = pairs.map { case (k, _) => + k -> sc.conf.getOption(k) + } + try { + pairs.foreach { case (k, v) => sc.conf.set(k, v) } + f + } finally { + oldValues.foreach { + case (k, Some(v)) => sc.conf.set(k, v) + case (k, None) => sc.conf.remove(k) + } + } + } + /** * Drops functions after calling `f`. A function is represented by (functionName, isTemporary). */ From 4dd356a0217de432b9d09a0e1e24da9d4a6fc9a4 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Thu, 16 Apr 2026 17:36:56 +0000 Subject: [PATCH 02/21] address review comments - Fix afterAll() calling super.beforeAll() instead of super.afterAll() - Fix copy-paste errors in bailout messages (ShuffleExchangeLike, BaseSubqueryExec) - Fix typos: lastAttempValue, e.d., PARTIAl, highestid, splits it, decomissioned, a subclass an, in in, implementation create - Fix missing ]] in lastAttemptValueForHighestRDDId scaladoc link - Simplify partialMerge to avoid unnecessary copy+merge - Remove unused stageId/stageAttemptId params from setMockAttempt --- .../spark/util/LastAttemptAccumulator.scala | 12 +++++------ .../metric/SQLLastAttemptAccumulator.scala | 21 +++++++------------ .../metric/SQLLastAttemptMetric.scala | 5 ++--- ...SQLLastAttemptMetricIntegrationSuite.scala | 2 +- .../SQLLastAttemptMetricUnitSuite.scala | 21 +++++++++---------- 5 files changed, 27 insertions(+), 34 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index b078e40e00634..bce0e3bcfdb43 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -81,8 +81,8 @@ import org.apache.spark.scheduler.TaskInfo * 2. Spark SQL. * ============= * LastAttemptAccumulator offers simple tracking of the last SQL execution, by assuming that - * the last execution will be in the scope of an RDD with the highestid, and using - * [[lastAttemptValueForHighestRDDId]. See [[SQLLastAttemptAccumulator]] for more possibilities + * the last execution will be in the scope of an RDD with the highest id, and using + * [[lastAttemptValueForHighestRDDId]]. See [[SQLLastAttemptAccumulator]] for more possibilities * of tracking SQL execution. * * Simple last SQL execution tracking @@ -93,7 +93,7 @@ import org.apache.spark.scheduler.TaskInfo * should correspond to the last execution and the latest AQE plan. * This has some limitations, e.g. doesn't work if the same metric is used in multiple places * in the query plan, and we want all occurrences to be aggregated together. - * It also wouldn't work if a SparkPlan splits it execution into multiple RDDs. This for example + * It also wouldn't work if a SparkPlan splits its execution into multiple RDDs. This for example * happens in BroadcastNestedLoopJoinExec with matchedStreamRows and notMatchedBroadcastRows. * One can use this simple last attempt tracking by using lastAttemptLastRDDValue. * @@ -134,7 +134,7 @@ private class LastAttemptRDDVals[@specialized T]( // (a new Job is submitted that depends on data from the RDD, if it finds it's missing it will // recompute it in a new Stage). // If a missing output is detected in a Stage while the stage is still running (e.g. executor - // is lost or decomissioned while the stage is running, and loses the output of some already + // is lost or decommissioned while the stage is running, and loses the output of some already // finished tasks), a new Task with new taskAttemptNumber will be started for that Task. // There may be multiple Tasks with different taskAttemptNumbers running in parallel due to // speculation, but DAGScheduler guarantees that only one of them will reach metrics reporting, @@ -306,7 +306,7 @@ private case class AccumulatorPartialVal[PARTIAL]( } /** - * A trait that can be mixed-in into a subclass an [[AccumulatorV2]] to make it track the "logical" + * A trait that can be mixed into a subclass of [[AccumulatorV2]] to track the "logical" * value of the "last attempt" of the execution using the accumulator - aggregated from the last * attempts of any Task that calculated some RDD partitions and used this accumulator, and * discarding any values coming from earlier attempts that have been recomputed. @@ -534,7 +534,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** Return intermediate value of PARTIAL type that can be merged together by partialMerge. */ protected def partialMergeVal: PARTIAL - /** Merge together partial values of PARTIAl type returned by partialMergeVal. */ + /** Merge together partial values of PARTIAL type returned by partialMergeVal. */ protected def partialMerge(otherVal: PARTIAL): Unit /** Check if the other accumulator is mergeable with this one. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala index 2d9e352d27f7d..7b7b7965c7b6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala @@ -80,7 +80,7 @@ import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} * ------------------- * The metric can be updated directly on the driver side, during the execution of catalyst * optimizer. One example is [[ConvertToLocalRelation]] optimization rule, which constant folds - * pieces of the plan, including potentially folding [[IncrementMetric]] expressions. + * pieces of the plan. * Execution in this scope is tagged with [[QueryExecution.id]] using * [[SparkContext.DATASET_QUERY_EXECUTION_ID_KEY]] property, and this metric is tracking * the metric value separately for each QueryExecution. @@ -109,11 +109,11 @@ import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} */ /** - * A trait that can be mixed-in into a subclass an [[AccumulatorV2]] to make it track the "logical" + * A trait that can be mixed into a subclass of [[AccumulatorV2]] to track the "logical" * value of the "last attempt" of the execution using the accumulator. * In addition to what [[LastAttemptAccumulator]] does, it allows to track the last attempt * executed in the scope of a Dataset's QueryExecution, via - * [[lastAttempValueForDataset]] and [[lastAttemptValueForQueryExecution]] methods. + * [[lastAttemptValueForDataset]] and [[lastAttemptValueForQueryExecution]] methods. */ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator[IN, OUT, PARTIAL] { this: AccumulatorV2[IN, OUT] => @@ -212,7 +212,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator * * @note The output of this method is undefined if this metric was used inside a part of the plan * which was either checkpointed (e.g. df.localCheckpoint(), df.checkpoint()) or cached - * (e.d. df.cache(), df.persist()). + * (e.g. df.cache(), df.persist()). * [[lastAttemptValueForHighestRDDId()]] should return the value from when the execution in * which the plan was cached/checkpointed. * @@ -245,7 +245,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator * * @note The output of this method is undefined if this metric was used inside a part of the plan * which was either checkpointed (e.g. df.localCheckpoint(), df.checkpoint()) or cached - * (e.d. df.cache(), df.persist()). + * (e.g. df.cache(), df.persist()). * [[lastAttemptValueForHighestRDDId()]] should return the value from when the execution in * which the plan was cached/checkpointed. * @@ -315,12 +315,12 @@ object SQLLastAttemptAccumulator extends Logging { // shuffle exchange stage submitting nodes case sl: ShuffleExchangeLike => sl match { - // All shuffle exchange implementation create the ShuffledRowRDD / ShuffledBlockRDD + // All shuffle exchange implementations create the ShuffledRowRDD / ShuffledBlockRDD // with its own scope, and it will be executed in that scope. case s: ShuffleExchangeExec => scopeIds(s) case p => // Bail out if future unknown implementation is encountered. - bailOutReason = Some(s"Unsupported BroadcastExchangeLike: ${p.getClass.getName}") + bailOutReason = Some(s"Unsupported ShuffleExchangeLike: ${p.getClass.getName}") Nil } @@ -349,7 +349,7 @@ object SQLLastAttemptAccumulator extends Logging { Nil case p => // Bail out if future unknown implementation is encountered. - bailOutReason = Some(s"Unsupported BroadcastExchangeLike: ${p.getClass.getName}") + bailOutReason = Some(s"Unsupported BaseSubqueryExec: ${p.getClass.getName}") Nil } @@ -367,11 +367,6 @@ object SQLLastAttemptAccumulator extends Logging { // then the metric would return 0 instead of the value from the cached execution. If the // current executedPlan is the one that materializes the cache, then it would be the // correct value. - // - if the metrics is in the top stage of the cachedPlan and AQE is enabled, it will be - // wrapped into a [[TableCacheQueryScanExec]], which would submit a stage. - // Currently this will be misattributed, but that could be fixed if - // [[InMemoryRelation.buildBuffers]] wrapped the creation of `val cached` into an - // RDDOperationScope that attributes it to cachedPlan. // - if the metric is in a map stage of the cachedPlan, then it would be correctly // annotated with the scope of that stage, and it would work correctly. // diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala index 4499e514504cc..26632f5920d72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala @@ -30,9 +30,8 @@ class SQLLastAttemptMetric( override protected def partialMergeVal: Long = _value override protected def partialMerge(value: Long): Unit = { - val acc = this.copyAndReset().asInstanceOf[this.type] - acc.set(value) - merge(acc) + if (isZero) _value = 0 + _value += value } override protected def isMergeable(other: AccumulatorV2[_, _]): Boolean = other match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala index 5d6ce6af20c65..3a624e1642052 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala @@ -311,7 +311,7 @@ class SQLLastAttemptMetricIntegrationSuite // allRDDs includes wrapper RDDs from repeated collects. assert(slam1.lastAttemptValueForAllRDDs() === Some(45)) assert(slam2.lastAttemptValueForAllRDDs() === Some(10)) - // slam1 and slam2 are both executed in in df3 + // slam1 and slam2 are both executed in df3 assert(slam1.lastAttemptValueForDataset(df3) === Some(10)) assert(slam2.lastAttemptValueForDataset(df3) === Some(10)) // slam2 is not executed in df1 and df2. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala index b77463d93b235..f3a696f81450e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricUnitSuite.scala @@ -53,7 +53,8 @@ class SQLLastAttemptMetricUnitSuite extends SparkFunSuite with SharedSparkContex // Set mock attempt for mock Task, TaskInfo and RDD // that can be used with mergeLastAttempt. - def setMockAttempt(rddId: Int, partitionId: Int, stageId: Int, stageAttemptId: Int): Unit = { + // stageId and stageAttemptId are passed directly to mergeLastAttempt. + def setMockAttempt(rddId: Int, partitionId: Int): Unit = { // reset to mock defaults when(mockTaskInfo.attemptNumber).thenReturn(0) when(mockRdd.scope).thenReturn(None) @@ -63,8 +64,6 @@ class SQLLastAttemptMetricUnitSuite extends SparkFunSuite with SharedSparkContex when(mockTaskInfo.partitionId).thenReturn(partitionId) } - // stageId and stageAttemptId are passed directly to mergeLastAttempt - override def beforeAll(): Unit = { super.beforeAll() lastAttemptInitializedField.setAccessible(true) @@ -78,7 +77,7 @@ class SQLLastAttemptMetricUnitSuite extends SparkFunSuite with SharedSparkContex lastAttemptRddsMapField.setAccessible(false) directDriverValueField.setAccessible(false) partialMergeValMethod.setAccessible(false) - super.beforeAll() + super.afterAll() } test("serialization and deserialization") { @@ -149,37 +148,37 @@ class SQLLastAttemptMetricUnitSuite extends SparkFunSuite with SharedSparkContex acc.mergeLastAttempt(slam, null, null, 0, 0, null) // Let's play with merging acc into slam. - setMockAttempt(rddId = 1, partitionId = 0, stageId = 10, stageAttemptId = 10) + setMockAttempt(rddId = 1, partitionId = 0) acc.set(10) slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 10, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(10)) - setMockAttempt(rddId = 1, partitionId = 1, stageId = 10, stageAttemptId = 10) + setMockAttempt(rddId = 1, partitionId = 1) acc.set(10) // new partition id slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 10, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(20)) // 10 + 10, aggregated new partition id - setMockAttempt(rddId = 1, partitionId = 1, stageId = 10, stageAttemptId = 9) + setMockAttempt(rddId = 1, partitionId = 1) acc.set(7) // same partition id, older attempt. slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 9, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(20)) // no change - setMockAttempt(rddId = 1, partitionId = 1, stageId = 9, stageAttemptId = 11) + setMockAttempt(rddId = 1, partitionId = 1) acc.set(7) // same partition id, older stage. slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 9, 11, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(20)) // no change - setMockAttempt(rddId = 1, partitionId = 1, stageId = 10, stageAttemptId = 11) + setMockAttempt(rddId = 1, partitionId = 1) acc.set(7) // same partition id, newer attempt. slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 10, 11, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(17)) // 10 replaced with 7 - setMockAttempt(rddId = 1, partitionId = 1, stageId = 11, stageAttemptId = 1) + setMockAttempt(rddId = 1, partitionId = 1) acc.set(8) // same partition id, newer stage. slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 11, 1, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(18)) // 7 replaced with 8 - setMockAttempt(rddId = 2, partitionId = 2, stageId = 1, stageAttemptId = 1) + setMockAttempt(rddId = 2, partitionId = 2) acc.set(42) // new RDD slam.mergeLastAttempt(acc, mockRdd, mockTaskInfo, 1, 1, mockProperties) assert(slam.lastAttemptValueForRDDId(1) === Some(18)) // no change for rddId=1 From 8b6dc3420a07f2d0e2b218189a728f03e0879d3a Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 20 Apr 2026 09:36:42 +0000 Subject: [PATCH 03/21] Szehon's review --- .../org/apache/spark/internal/LogKeys.java | 8 +-- .../spark/util/LastAttemptAccumulator.scala | 39 +++++++++----- .../spark/sql/execution/SparkPlan.scala | 2 +- .../execution/adaptive/AQETestHelper.scala | 3 +- .../metric/SQLLastAttemptAccumulator.scala | 51 ++++++++++++++----- .../metric/SQLLastAttemptMetric.scala | 40 +++------------ ...SQLLastAttemptMetricIntegrationSuite.scala | 11 ++-- 7 files changed, 85 insertions(+), 69 deletions(-) diff --git a/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java b/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java index 977a6e3d7ed74..14aa5667f498c 100644 --- a/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java +++ b/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java @@ -728,10 +728,10 @@ public enum LogKeys implements LogKey { SIGNAL, SINK, SIZE, - SLAM_INVALIDATE, - SLAM_STATE_SYSTEM_METRIC, - SLAM_STATE_USER_METRIC, - SLAM_UNEXPECTED_REASON, + LAST_ATTEMPT_ACC_INVALIDATE, + LAST_ATTEMPT_ACC_SYSTEM_METRIC, + LAST_ATTEMPT_ACC_USER_METRIC, + LAST_ATTEMPT_ACC_UNEXPECTED_REASON, SLEEP_TIME, SLIDE_DURATION, SMALLEST_CLUSTER_INDEX, diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index bce0e3bcfdb43..44e04dfab4bcd 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -438,7 +438,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** Log entry to log debug information about the internal state of the accumulator. */ def logAccumulatorState: LogEntry = try { log"""LastAttemptAccumulator id=${MDC(LogKeys.ACCUMULATOR_ID, accumulatorId)}: - |Invalidated: ${MDC(LogKeys.SLAM_INVALIDATE, lastAttemptAccumulatorInvalid)}. + |Invalidated: ${MDC(LogKeys.LAST_ATTEMPT_ACC_INVALIDATE, lastAttemptAccumulatorInvalid)}. |Direct driver value: ${MDC(logKeyAccumulatorState, lastAttemptDirectDriverValue)}. |Value: ${MDC(logKeyAccumulatorState, value)}. |lastAttemptRddsMap: @@ -468,12 +468,11 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { exception: Option[Throwable] = None, newAccumPartialValue: Option[AccumulatorPartialVal[PARTIAL]] = None, oldAccumPartialValue: Option[AccumulatorPartialVal[PARTIAL]] = None): Unit = { - // TODO(LC-6501): Add usage logging events with structured logging. val logEntry = log"""Unexpected last attempt tracking for accumulator ${ MDC(LogKeys.ACCUMULATOR_ID, accumulatorId)}. - |Invalidate: ${MDC(LogKeys.SLAM_INVALIDATE, invalidate)}. - |Reason: ${MDC(LogKeys.SLAM_UNEXPECTED_REASON, reason)}. + |Invalidate: ${MDC(LogKeys.LAST_ATTEMPT_ACC_INVALIDATE, invalidate)}. + |Reason: ${MDC(LogKeys.LAST_ATTEMPT_ACC_UNEXPECTED_REASON, reason)}. |""".stripMargin + log"State:\n" + logAccumulatorState + log"Update:\n" + logAccumulatorUpdate(newAccumPartialValue, oldAccumPartialValue) @@ -525,9 +524,9 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { protected def logKeyAccumulatorState: LogKey = { if (accumulatorStoresUserData) { - LogKeys.SLAM_STATE_USER_METRIC + LogKeys.LAST_ATTEMPT_ACC_USER_METRIC } else { - LogKeys.SLAM_STATE_SYSTEM_METRIC + LogKeys.LAST_ATTEMPT_ACC_SYSTEM_METRIC } } @@ -702,6 +701,9 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** * Returns the last attempt value of this accumulator, aggregated from a set of RDDs. * + * If the metric was used directly on the driver, and wa not used in any RDD execution, + * the driver value will be used instead. + * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -730,6 +732,9 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** * Returns the last attempt value of this accumulator, aggregated from a specific RDD. * + * If the metric was used directly on the driver, and wa not used in any RDD execution, + * the driver value will be used instead. + * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -750,6 +755,9 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from all RDDs that ever * returned any values for it. * + * If the metric was used directly on the driver, and wa not used in any RDD execution, + * the driver value will be used instead. + * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -773,6 +781,9 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from the RDD with the highest * id that ever returned any values for it. * + * If the metric was used directly on the driver, and wa not used in any RDD execution, + * the driver value will be used instead. + * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -801,6 +812,9 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** * Returns the last attempt value of this accumulator, aggregated from RDDs with given scope ids. * + * If the metric was used directly on the driver, and wa not used in any RDD execution, + * the driver value will be used instead. + * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -817,14 +831,11 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { val matchingRDDs = lastAttemptRddsMap.values.filter { rddVal => rddVal.rddScopeId.exists(scopesLookup.contains) }.toSeq - // When multiple RDDs share the same scope from different SQL executions (e.g. repeated - // Dataset.collect() calls), only aggregate the ones from the latest execution per scope. - val rddIds = matchingRDDs.groupBy(_.rddScopeId).values.flatMap { rddsInScope => - val maxExecId = rddsInScope.flatMap(_.lastSqlExecutionId).maxOption - rddsInScope.filter { r => - maxExecId.isEmpty || r.lastSqlExecutionId == maxExecId - } - }.map(_.rddId).toSeq + // When multiple RDDs share the same scope (e.g. repeated Dataset.collect() calls create + // new wrapper RDDs in the same scope, or BroadcastNestedLoopJoin executing the probe side + // twice), only aggregate the latest one per scope, identified by the highest RDD id. + // RDD ids are globally monotonic, so the highest id is the latest. + val rddIds = matchingRDDs.groupBy(_.rddScopeId).values.map(_.maxBy(_.rddId).rddId).toSeq lastAttemptValueForRDDIds(rddIds) } catch { case NonFatal(e) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 35769eca9af13..7b62ea73b50ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -431,7 +431,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ } /** The RDD id of the last wrapper RDD created by [[getByteArrayRdd]]. Visible for testing. */ - @transient private var _lastByteArrayRddId: Option[Int] = None + @transient @volatile private var _lastByteArrayRddId: Option[Int] = None private[spark] def lastByteArrayRddId: Option[Int] = _lastByteArrayRddId /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala index 2fffdf29cd312..60bca314734f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala @@ -39,6 +39,7 @@ object AQETestHelper { thunk } finally { metricIdsForForcedCancellation = Set.empty + forcedCancellationTriggeredForPlans.clear() } res } @@ -53,7 +54,7 @@ object AQETestHelper { Utils.isTesting && metricIdsForForcedCancellation.nonEmpty /** Return `true` if forced cancellation has already been triggered for `plan`. */ - def wasForcedCancellationTriggeredForPlan(plan: SparkPlan): Boolean = synchronized { + private def wasForcedCancellationTriggeredForPlan(plan: SparkPlan): Boolean = synchronized { forcedCancellationTriggeredForPlans.contains(plan.id) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala index 7b7b7965c7b6e..e710b5d10f844 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala @@ -115,14 +115,24 @@ import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} * executed in the scope of a Dataset's QueryExecution, via * [[lastAttemptValueForDataset]] and [[lastAttemptValueForQueryExecution]] methods. */ -trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator[IN, OUT, PARTIAL] { +trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL, DRIVER_ACC] + extends LastAttemptAccumulator[IN, OUT, PARTIAL] { this: AccumulatorV2[IN, OUT] => + /** Create a fresh accumulator to hold driver-side values for one QueryExecution. */ + protected def newDriverQueryExecutionAcc(): DRIVER_ACC + /** Add a value to a driver-side per-QueryExecution accumulator. */ + protected def addToDriverAcc(acc: DRIVER_ACC, value: IN): Unit + /** Set the value of a driver-side per-QueryExecution accumulator. */ + protected def setDriverAcc(acc: DRIVER_ACC, value: OUT): Unit + /** Read the value of a driver-side per-QueryExecution accumulator. */ + protected def driverAccValue(acc: DRIVER_ACC): OUT + @transient - private var lastAttemptDirectDriverQueryExecutionValues: mutable.Map[String, this.type] = _ + private var lastAttemptDirectDriverQueryExecutionValues: mutable.Map[String, DRIVER_ACC] = _ override def initializeLastAttemptAccumulator()(implicit ct: ClassTag[PARTIAL]): Unit = try { - lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, this.type]() + lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, DRIVER_ACC]() super.initializeLastAttemptAccumulator()(ct) } catch { case NonFatal(e) => @@ -133,7 +143,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator } override def resetLastAttemptAccumulator(): Unit = try { - lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, this.type]() + lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, DRIVER_ACC]() super.resetLastAttemptAccumulator() } catch { case NonFatal(e) => @@ -148,12 +158,11 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator assert(lastAttemptDirectDriverQueryExecutionValues != null) } - protected def getOrCreateDirectDriverQueryExecutionValue(queryExecutionId: String): this.type = { + protected def getOrCreateDirectDriverQueryExecutionValue(queryExecutionId: String): DRIVER_ACC = { lastAttemptDirectDriverQueryExecutionValues.synchronized { if (!lastAttemptDirectDriverQueryExecutionValues.contains(queryExecutionId)) { lastAttemptDirectDriverQueryExecutionValues.put( - queryExecutionId, - this.copyAndReset().asInstanceOf[this.type]) + queryExecutionId, newDriverQueryExecutionAcc()) } lastAttemptDirectDriverQueryExecutionValues(queryExecutionId) } @@ -178,8 +187,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator // Direct update on the driver, not from within a task. getActiveDatasetQueryExecutionId match { case Some(qeId) => - val driverMetric = getOrCreateDirectDriverQueryExecutionValue(qeId) - driverMetric.add(value) + addToDriverAcc(getOrCreateDirectDriverQueryExecutionValue(qeId), value) case None => // pass } } @@ -191,9 +199,28 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator exception = Some(e)) } + /** + * Like [[addQueryExecutionValueIfOnDriverSide]], but for set operations. + */ + protected def setQueryExecutionValueIfOnDriverSide(value: OUT): Unit = try { + if (isAtDriverSide && lastAttemptAccumulatorInitialized && !lastAttemptAccumulatorInvalid) { + getActiveDatasetQueryExecutionId match { + case Some(qeId) => + setDriverAcc(getOrCreateDirectDriverQueryExecutionValue(qeId), value) + case None => // pass + } + } + } catch { + case NonFatal(e) => + unexpectedLastAttemptMetricOperation( + invalidate = true, + reason = "Unexpected exception in setQueryExecutionValueIfOnDriverSide", + exception = Some(e)) + } + override def logAccumulatorState: LogEntry = try { val driverQEVals = Option(lastAttemptDirectDriverQueryExecutionValues) - .map(_.map { case (key, metric) => s"$key -> ${metric.value}" }.mkString("\n")) + .map(_.map { case (key, acc) => s"$key -> ${driverAccValue(acc)}" }.mkString("\n")) .getOrElse("") super.logAccumulatorState + log""" @@ -223,7 +250,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator assertValid() // If there was a driver set value defined in the scope of this QueryExecution, return that. lastAttemptDirectDriverQueryExecutionValues.get(qe.id.toString) match { - case Some(metric) => return Some(metric.value) + case Some(acc) => return Some(driverAccValue(acc)) case None => // pass } // Otherwise, gather the RDD scoped from the plan and find metric updates from these scopes. @@ -257,7 +284,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL] extends LastAttemptAccumulator /** Visible for testing. */ def getDirectDriverQueryExecutionValue(qeId: String): Option[OUT] = { - lastAttemptDirectDriverQueryExecutionValues.get(qeId).map(_.value) + lastAttemptDirectDriverQueryExecutionValues.get(qeId).map(driverAccValue) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala index 26632f5920d72..33326fb8e5bc4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetric.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.sql.execution.metric -import scala.util.control.NonFatal - import org.apache.spark.SparkContext import org.apache.spark.util.AccumulatorV2 @@ -25,13 +23,13 @@ class SQLLastAttemptMetric( metricType: String, initValue: Long = 0L) extends SQLMetric(metricType, initValue) - with SQLLastAttemptAccumulator[Long, Long, Long] { + with SQLLastAttemptAccumulator[Long, Long, Long, SQLMetric] { override protected def partialMergeVal: Long = _value override protected def partialMerge(value: Long): Unit = { - if (isZero) _value = 0 - _value += value + // For SQLLastAttemptMetric, this is just add to the underlying SQLMetric. + super.add(value) } override protected def isMergeable(other: AccumulatorV2[_, _]): Boolean = other match { @@ -43,33 +41,11 @@ class SQLLastAttemptMetric( // number of rows processed, and it should not store user data. protected def accumulatorStoresUserData: Boolean = false - /** - * Check if the value is added on the driver side, not from within a task. - * If it is set in the scope of a Dataset's QueryExecution, associate it with that scope. - * This must be called from `set` methods of this SQLMetric subclass. - * This should be called there after setValueIfOnDriverSide. - * See [[addQueryExecutionValueIfOnDriverSide]]. - * This method is directly in this class, because in general AccumulatorV2 does not have a - * direct set method, only SQLMetric has it. - */ - protected def setQueryExecutionValueIfOnDriverSide(value: Long): Unit = try { - // Note: setValueIfOnDriverSide will already make it invalid if there are also RDD updates. - if (isAtDriverSide && lastAttemptAccumulatorInitialized && !lastAttemptAccumulatorInvalid) { - // Direct update on the driver, not from within a task. - getActiveDatasetQueryExecutionId match { - case Some(qeId) => - val driverMetric = getOrCreateDirectDriverQueryExecutionValue(qeId) - driverMetric.set(value) - case None => // pass - } - } - } catch { - case NonFatal(e) => - unexpectedLastAttemptMetricOperation( - invalidate = true, - reason = "Unexpected exception in addQueryExecutionValueIfOnDriverSide", - exception = Some(e)) - } + override protected def newDriverQueryExecutionAcc(): SQLMetric = + new SQLMetric(metricType, initValue) + override protected def addToDriverAcc(acc: SQLMetric, value: Long): Unit = acc.add(value) + override protected def setDriverAcc(acc: SQLMetric, value: Long): Unit = acc.set(value) + override protected def driverAccValue(acc: SQLMetric): Long = acc.value override def copy(): SQLLastAttemptMetric = { val newAcc = new SQLLastAttemptMetric(metricType, initValue) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala index 3a624e1642052..3bb5e7dc0f09d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala @@ -598,7 +598,7 @@ class SQLLastAttemptMetricIntegrationSuite } } - test("SC-168178 BroadcastNestedLoopJoin outer executes probe side twice") { + test("BroadcastNestedLoopJoin outer executes probe side twice") { val slam = SQLLastAttemptMetrics.createMetric(spark.sparkContext, "test SLAM") val build = spark.range(5).selectExpr("id as b").hint("broadcast") @@ -611,13 +611,14 @@ class SQLLastAttemptMetricIntegrationSuite }) // When build side is outer, probe side gets executed twice by BNLJ: // once for matches, and once to mark unmatched build rows. - // TODO(SC-168178) This can be a non-determinism correctness issue + // This is a non-determinism correctness issue, and the two executions + // should not be double-counted in the last attempt value. assert(slam.getNumRDDs === 2) assert(slam.lastAttemptValueForAllRDDs() === Some(200)) - // However, the two executions are different RDDs, but only one of them is highest id. + // The two executions are different RDDs, but only one of them is highest id. assert(slam.lastAttemptValueForHighestRDDId() === Some(100)) - // Dataset sees both RDDs. - assert(slam.lastAttemptValueForDataset(df) === Some(200)) + // Dataset dedups per scope and returns only the latest RDD's value. + assert(slam.lastAttemptValueForDataset(df) === Some(100)) } test("SLAM with AQE CoalesceShufflePartitions") { From 38437aff1f7bf0fe02517fbcf433bb328c6323c3 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 20 Apr 2026 12:51:55 +0000 Subject: [PATCH 04/21] test failures --- .../java/org/apache/spark/internal/LogKeys.java | 8 ++++---- .../org/apache/spark/internal/config/Tests.scala | 14 +++++++------- .../configs-without-binding-policy-exceptions | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java b/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java index 14aa5667f498c..f0467c7d2d7a8 100644 --- a/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java +++ b/common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java @@ -331,6 +331,10 @@ public enum LogKeys implements LogKey { LABEL_COLUMN, LARGEST_CLUSTER_INDEX, LAST_ACCESS_TIME, + LAST_ATTEMPT_ACC_INVALIDATE, + LAST_ATTEMPT_ACC_SYSTEM_METRIC, + LAST_ATTEMPT_ACC_UNEXPECTED_REASON, + LAST_ATTEMPT_ACC_USER_METRIC, LAST_COMMITTED_CHECKPOINT_ID, LAST_COMMIT_BASED_CHECKPOINT_ID, LAST_SCAN_TIME, @@ -728,10 +732,6 @@ public enum LogKeys implements LogKey { SIGNAL, SINK, SIZE, - LAST_ATTEMPT_ACC_INVALIDATE, - LAST_ATTEMPT_ACC_SYSTEM_METRIC, - LAST_ATTEMPT_ACC_USER_METRIC, - LAST_ATTEMPT_ACC_UNEXPECTED_REASON, SLEEP_TIME, SLIDE_DURATION, SMALLEST_CLUSTER_INDEX, diff --git a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala index 0f23566ad7681..8ecb14be1dfb8 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala @@ -39,6 +39,13 @@ private[spark] object Tests { .booleanConf .createOptional + val INJECT_SHUFFLE_FETCH_FAILURES = + ConfigBuilder("spark.testing.injectShuffleFetchFailures") + .doc("Injecting fetch failures for shuffle stages by providing an invalid BlockManager " + + "location for the first stage attempt. Testing only flag!") + .booleanConf + .createWithDefault(false) + val TEST_NO_STAGE_RETRY = ConfigBuilder("spark.test.noStageRetry") .version("1.2.0") .booleanConf @@ -85,13 +92,6 @@ private[spark] object Tests { .booleanConf .createWithDefault(false) - val INJECT_SHUFFLE_FETCH_FAILURES = - ConfigBuilder("spark.testing.injectShuffleFetchFailures") - .doc("Injecting fetch failures for shuffle stages by providing an invalid BlockManager " + - "location for the first stage attempt. Testing only flag!") - .booleanConf - .createWithDefault(false) - val TEST_SKIP_ESS_REGISTER = ConfigBuilder("spark.testing.skipESSRegister") .version("4.0.0") .doc("None of Spark testing modes (local, local-cluster) enables shuffle service. So it is " + diff --git a/sql/hive/src/test/resources/conf/binding-policy-exceptions/configs-without-binding-policy-exceptions b/sql/hive/src/test/resources/conf/binding-policy-exceptions/configs-without-binding-policy-exceptions index a84b0fc3b6c77..2873557363bc1 100644 --- a/sql/hive/src/test/resources/conf/binding-policy-exceptions/configs-without-binding-policy-exceptions +++ b/sql/hive/src/test/resources/conf/binding-policy-exceptions/configs-without-binding-policy-exceptions @@ -1171,6 +1171,7 @@ spark.taskMetrics.trackUpdatedBlockStatuses spark.test.noStageRetry spark.testing spark.testing.dynamicAllocation.schedule.enabled +spark.testing.injectShuffleFetchFailures spark.testing.memory spark.testing.nCoresPerExecutor spark.testing.nExecutorsPerHost From f04541010a6ce3742322e85288f6381eba20f849 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Tue, 21 Apr 2026 08:37:14 +0000 Subject: [PATCH 05/21] address review comments - Fix Scaladoc typos and grammar in LastAttemptAccumulator and SQLLastAttemptAccumulator. - Rewrite the "Cached / Checkpointed plans" paragraph and complete the truncated fragment in SQLLastAttemptAccumulator. - Add trait-level contract for driver-value handling; remove incorrect per-method mentions on scope-narrowed methods (they return zero when a driver value is present). - Fix QueryExecution.withQueryExecutionId Scaladoc to reference SQLLastAttemptAccumulator, the actual consumer. - Fix missing "if" in AQETestHelper.shouldForceCancellation comment. - Call super first in SQLLastAttemptAccumulator.initializeLastAttemptAccumulator and resetLastAttemptAccumulator. - Scope AdaptiveSparkPlanHelper singleton object to private[sql]. - Remove _lastByteArrayRddId field/accessor from SparkPlan and the override in AdaptiveSparkPlanExec; the redundant test assertions in SQLLastAttemptMetricIntegrationSuite are also removed, since per-scope maxBy(rddId) already implies the wrapper RDD is the one being tracked. --- .../spark/util/LastAttemptAccumulator.scala | 29 +++++++------- .../spark/sql/execution/QueryExecution.scala | 4 +- .../spark/sql/execution/SparkPlan.scala | 5 --- .../execution/adaptive/AQETestHelper.scala | 2 +- .../adaptive/AdaptiveSparkPlanExec.scala | 4 -- .../adaptive/AdaptiveSparkPlanHelper.scala | 2 +- .../metric/SQLLastAttemptAccumulator.scala | 39 ++++++++++--------- ...SQLLastAttemptMetricIntegrationSuite.scala | 7 ---- 8 files changed, 38 insertions(+), 54 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index 44e04dfab4bcd..1dcba89892869 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -312,11 +312,19 @@ private case class AccumulatorPartialVal[PARTIAL]( * discarding any values coming from earlier attempts that have been recomputed. * If the accumulator is used by multiple RDDs, the last attempt value is tracked separately for * each, and can be retrieved for each or all of them separately, see lastAttemptValueForX methods. - * If the accumulator is used directly on the Spark Driver using the [[AccumulatorV2.set]] or - * [[AccumulatorV2.add]], that value is considered the last attempt value. + * If the accumulator is used directly on the Spark Driver using [[AccumulatorV2.add]], + * that value is considered the last attempt value. * If the accumulator was both used in Tasks and updated directly on the driver, it can't determine * what should be considered the last attempt, and lastAttemptValueForX methods will return None. * + * Contract for driver-only updates: + * A driver-side value (set via [[AccumulatorV2.add]] on the driver, outside any Task) is only + * returned by methods that do not narrow by RDD, namely [[lastAttemptValueForAllRDDs]] and + * [[lastAttemptValueForHighestRDDId]]. Methods that narrow to specific RDDs or RDD scopes + * ([[lastAttemptValueForRDDId]], [[lastAttemptValueForRDDIds]], [[lastAttemptValueForRDDScopes]]) + * return the zero value when a driver-only value is present, because a driver-side update cannot + * be attributed to any particular RDD or scope. + * * [[LastAttemptAccumulator]] is not reset by the [[AccumulatorV2.reset]] method implementation, * and its state is not copied by the [[AccumulatorV2.copy]] method implementation, and it should * not be serialized to the Executors. The internal state should only be initialized by the @@ -337,12 +345,12 @@ private case class AccumulatorPartialVal[PARTIAL]( * * Implementations must implement [[partialMergeVal]] and [[partialMerge]] methods operating on * [[PARTIAL]] type. In regular [[AccumulatorV2]] implementations, the [[AccumulatorV2]] object - * itself hold the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is used + * itself holds the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is used * to merge these objects together. [[LastAttemptAccumulator]] needs to keep track of partial * values of every partition of every RDD that used the accumulator, and holding a full * [[AccumulatorV2]] object for each would have a high overhead. Therefore, an implementation should * be able to return [[PARTIAL]] value from [[partialMergeVal]] that represents an intermediate - * mergeable value. and a [[partialMerge]] method that can merge that value into the accumulator. + * mergeable value, and a [[partialMerge]] method that can merge that value into the accumulator. * Implementations must also implement an [[isMergeable]] method that checks if the other * [[AccumulatorV2]] is of a compatible type to be merged with this using [[partialMergeVal]]. In * regular [[AccumulatorV2]] implementations, this check is normally done inside the @@ -701,9 +709,6 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** * Returns the last attempt value of this accumulator, aggregated from a set of RDDs. * - * If the metric was used directly on the driver, and wa not used in any RDD execution, - * the driver value will be used instead. - * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -732,9 +737,6 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** * Returns the last attempt value of this accumulator, aggregated from a specific RDD. * - * If the metric was used directly on the driver, and wa not used in any RDD execution, - * the driver value will be used instead. - * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * @@ -755,7 +757,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from all RDDs that ever * returned any values for it. * - * If the metric was used directly on the driver, and wa not used in any RDD execution, + * If the metric was used directly on the driver, and was not used in any RDD execution, * the driver value will be used instead. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that @@ -781,7 +783,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from the RDD with the highest * id that ever returned any values for it. * - * If the metric was used directly on the driver, and wa not used in any RDD execution, + * If the metric was used directly on the driver, and was not used in any RDD execution, * the driver value will be used instead. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that @@ -812,9 +814,6 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { /** * Returns the last attempt value of this accumulator, aggregated from RDDs with given scope ids. * - * If the metric was used directly on the driver, and wa not used in any RDD execution, - * the driver value will be used instead. - * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 1a88316340a19..c0ab906de4841 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -336,8 +336,8 @@ class QueryExecution( /** * Set the query execution id in thread-local properties while * executing the block. This is used by - * [[LastAttemptAccumulator]] to associate driver-side metric - * updates with a specific QueryExecution. + * [[org.apache.spark.sql.execution.metric.SQLLastAttemptAccumulator]] to associate + * driver-side metric updates with a specific QueryExecution. */ private[sql] def withQueryExecutionId[T]( session: SparkSession)(block: => T): T = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 7b62ea73b50ab..a26c11ae8f052 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -426,14 +426,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ Iterator((count.toLong, cbbos.toChunkedByteBuffer)) } } - _lastByteArrayRddId = Some(rdd.id) rdd } - /** The RDD id of the last wrapper RDD created by [[getByteArrayRdd]]. Visible for testing. */ - @transient @volatile private var _lastByteArrayRddId: Option[Int] = None - private[spark] def lastByteArrayRddId: Option[Int] = _lastByteArrayRddId - /** * Decodes the byte arrays back to UnsafeRows and put them into buffer. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala index 60bca314734f9..7e78ee41900ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQETestHelper.scala @@ -67,7 +67,7 @@ object AQETestHelper { /** Return `true` if we should try to force cancellation for `plan` at this point. */ def shouldForceCancellation(plan: SparkPlan): Boolean = { - // Trigger the forced cancellation only we are in testing + // Trigger the forced cancellation only if we are in testing Utils.isTesting && // ...and if we haven't triggered it yet !wasForcedCancellationTriggeredForPlan(plan) && diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index f346961a277d7..112ee82314c4b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -404,10 +404,6 @@ case class AdaptiveSparkPlanExec( logOnLevel(log"Final plan:\n${MDC(QUERY_PLAN, currentPhysicalPlan)}") } - /** Delegates to the inner plan's lastByteArrayRddId. Visible for testing. */ - override private[spark] def lastByteArrayRddId: Option[Int] = - executedPlan.lastByteArrayRddId - override def executeCollect(): Array[InternalRow] = { withFinalPlanUpdate(_.executeCollect()) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala index 14372457c0621..eea664b29fd52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala @@ -154,4 +154,4 @@ trait AdaptiveSparkPlanHelper { } } -object AdaptiveSparkPlanHelper extends AdaptiveSparkPlanHelper +private[sql] object AdaptiveSparkPlanHelper extends AdaptiveSparkPlanHelper diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala index e710b5d10f844..1c3305a805b17 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala @@ -29,13 +29,13 @@ import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Broadcast import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} /* - * SQLLastAttemptAccumulator is a LastAttemptAccumulator that allows to track the last attempt + * SQLLastAttemptAccumulator is a LastAttemptAccumulator that allows tracking the last attempt * updates that happened in the scope of execution of a plan created by a specific Dataset's * QueryExecution. * * Tracking RDDs belonging to a Dataset execution. * ----------------------------------------------- - * Dataset executes executedPlan from it's QueryExecution. Each SparkPlan node in the + * Dataset executes executedPlan from its QueryExecution. Each SparkPlan node in the * executedPlan saves the RDD with its execution (executeRDD or executeColumnarRDD). However, * the root RDD of a Spark Stage that actually gets submitted and executed is not necessarily * that RDD. It may be an ephemeral RDD created on the fly when submitting the job, e.g.: @@ -54,7 +54,7 @@ import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} * Additionally, it is possible that the same queryExecution.executedPlan is reused. For * example, when collect() is called multiple times on the same Dataset. * - Part of the execution (e.g. the shuffles) should then be reused. Accumulator should still - * keep their partial values associated to it's RDDOperationScope, and return it for this + * keep their partial values associated to its RDDOperationScope, and return it for this * new attempt. * - Some of the execution (e.g. the result stage) may be recomputed. Since the SparkPlan will * be the same, RDDOperationScope will be the same, and this should become a newer execution @@ -72,8 +72,8 @@ import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} * final AQE plan, only values from the final plan and execution should be aggregated. * * It can also happen that the new AQE plan reuses SparkPlan instances from the old plan, - * see CancelShuffleStageInBroadcastJoin. However, in that cases, the old plan will be put - * under some new plan in a new submitted Stages. Since we only truly track the plans that + * see CancelShuffleStageInBroadcastJoin. However, in that case, the old plan will be put + * under some new plan in newly submitted Stages. Since we only truly track the plans that * submit Stages, these should be different and enough to disambiguate. * * Driver only updates @@ -90,28 +90,28 @@ import org.apache.spark.util.{AccumulatorV2, LastAttemptAccumulator} * Cached / Checkpointed plans * --------------------------- * If the metric was used inside a cached (df.cache, df.persist) or checkpointed (df.checkpoint, - * df.localCheckpoint, which is then turned into an RDDScanExec or InMemoryTableScanExec in the - * Dataset's executedPlan, we deem it undefined behavior for [[lastAttemptValueForDataset]] and - * [[lastAttemptValueForQueryExecution]]. [[lastAttemptValueForHighestRDDId()]] should return the - * value from when the execution in which the plan was cached/checkpointed and should be used - * instead. + * df.localCheckpoint) plan, which is then turned into an RDDScanExec or InMemoryTableScanExec + * in the Dataset's executedPlan, [[lastAttemptValueForDataset]] and + * [[lastAttemptValueForQueryExecution]] are declared undefined behavior. In this case, + * [[lastAttemptValueForHighestRDDId()]] should be used instead, which returns the value from + * the execution in which the plan was cached/checkpointed. * * The main issue is if the metric is in the top stage of the cached plan. When that plan is * executed in some Dataset (as lazy execution), the metric will be executed in the scope of the * stage that contains the InMemoryTableScanExec / RDDScanExec, which will be some parent of that * plan, and not plan of the cached plan. So if the cached plan is then used in another Dataset, * that Dataset will not have information about that parent. - * There could be some hacks done to fix it by record in the InMemoryRelation the scopes in which - * it was materialized. There are also other issues, like that checkpoint throws away the plan, so - * it would also have to record the RDD scopes used during checkpointing. This gets further - * complicated if recomputations are involved, and are done in yet another scope. + * There could be some hacks done to fix it by recording in the InMemoryRelation the scopes in + * which it was materialized. There are also other issues, like that checkpoint throws away the + * plan, so it would also have to record the RDD scopes used during checkpointing. This gets + * further complicated if recomputations are involved, and are done in yet another scope. * It was declared undefined behavior instead of pursuing this. */ /** * A trait that can be mixed into a subclass of [[AccumulatorV2]] to track the "logical" * value of the "last attempt" of the execution using the accumulator. - * In addition to what [[LastAttemptAccumulator]] does, it allows to track the last attempt + * In addition to what [[LastAttemptAccumulator]] does, it allows tracking the last attempt * executed in the scope of a Dataset's QueryExecution, via * [[lastAttemptValueForDataset]] and [[lastAttemptValueForQueryExecution]] methods. */ @@ -132,8 +132,8 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL, DRIVER_ACC] private var lastAttemptDirectDriverQueryExecutionValues: mutable.Map[String, DRIVER_ACC] = _ override def initializeLastAttemptAccumulator()(implicit ct: ClassTag[PARTIAL]): Unit = try { - lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, DRIVER_ACC]() super.initializeLastAttemptAccumulator()(ct) + lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, DRIVER_ACC]() } catch { case NonFatal(e) => unexpectedLastAttemptMetricOperation( @@ -143,8 +143,8 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL, DRIVER_ACC] } override def resetLastAttemptAccumulator(): Unit = try { - lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, DRIVER_ACC]() super.resetLastAttemptAccumulator() + lastAttemptDirectDriverQueryExecutionValues = new mutable.HashMap[String, DRIVER_ACC]() } catch { case NonFatal(e) => unexpectedLastAttemptMetricOperation( @@ -408,8 +408,9 @@ object SQLLastAttemptAccumulator extends Logging { // into it to check attribution. // We could try to make checkpoint collect and store the scopes of the original execution, // but even then it would face similar inconsistencies as described above for cached plans. - // - if the metric is in the top stage of that plan, then if it was executed in the scope of - // this execution, it w + // - if the metric is in the top stage of that plan, then if it was executed in the scope + // of this execution, it would be attributed to the scope of the parent stage that is + // consuming the checkpointed RDD, not to any scope of the original plan. // - if the metric is in a map stage of the plan that was checkpointed, it requires that // checkpoint would track these stages and scopes. // diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala index 3bb5e7dc0f09d..2e7af075a3e74 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptMetricIntegrationSuite.scala @@ -264,8 +264,6 @@ class SQLLastAttemptMetricIntegrationSuite assert(slam1.getHighestRDDId.isDefined) val df1HighestId = slam1.getHighestRDDId.get val df1ExecutedPlanRddId = df1.queryExecution.executedPlan.execute().id - // The RDD id recorded by SLAM is the wrapper RDD from getByteArrayRdd. - assert(df1HighestId == df1.queryExecution.executedPlan.lastByteArrayRddId.get) assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) assert(slam1.lastAttemptValueForRDDId(df1HighestId) === Some(10)) // Values retrieved from the Dataset are the same as from the RDD. @@ -276,7 +274,6 @@ class SQLLastAttemptMetricIntegrationSuite assert(withRetries || slam1.value === 20) // +10 // The same executedPlan RDD is reused, but getByteArrayRdd creates a new wrapper. assert(df1.queryExecution.executedPlan.execute().id === df1ExecutedPlanRddId) - assert(slam1.getHighestRDDId === df1.queryExecution.executedPlan.lastByteArrayRddId) assert(slam1.lastAttemptValueForHighestRDDId() === Some(10)) // Both wrapper RDDs are summed in allRDDs. assert(slam1.lastAttemptValueForAllRDDs() === Some(20)) @@ -286,8 +283,6 @@ class SQLLastAttemptMetricIntegrationSuite df2.collect() assert(withRetries || slam1.value === 35) // +15 assert(slam1.getHighestRDDId.isDefined) - val df2HighestId = slam1.getHighestRDDId.get - assert(df2HighestId === df2.queryExecution.executedPlan.lastByteArrayRddId.get) // Both incrementMetric expressions are within the same Stage, so they record together. assert(slam1.lastAttemptValueForHighestRDDId() === Some(15)) // allRDDs includes wrapper RDDs from repeated df1.collect() calls. @@ -687,8 +682,6 @@ class SQLLastAttemptMetricIntegrationSuite } assert(wscg.isDefined) assert(slam.getHighestRDDId.isDefined) - // The highest tracked RDD is the wrapper RDD from getByteArrayRdd. - assert(slam.getHighestRDDId === df.queryExecution.executedPlan.lastByteArrayRddId) } } } From c0cb791b48e447af3f3283e8a8291ba2afe29eb4 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Tue, 21 Apr 2026 09:30:33 +0000 Subject: [PATCH 06/21] fix overlong line --- .../scala/org/apache/spark/util/LastAttemptAccumulator.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index 1dcba89892869..82008fad4053b 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -345,8 +345,8 @@ private case class AccumulatorPartialVal[PARTIAL]( * * Implementations must implement [[partialMergeVal]] and [[partialMerge]] methods operating on * [[PARTIAL]] type. In regular [[AccumulatorV2]] implementations, the [[AccumulatorV2]] object - * itself holds the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is used - * to merge these objects together. [[LastAttemptAccumulator]] needs to keep track of partial + * itself holds the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is + * used to merge these objects together. [[LastAttemptAccumulator]] needs to keep track of partial * values of every partition of every RDD that used the accumulator, and holding a full * [[AccumulatorV2]] object for each would have a high overhead. Therefore, an implementation should * be able to return [[PARTIAL]] value from [[partialMergeVal]] that represents an intermediate From 6dcf2ffa11d5c1963ec0bd1b2317bcee7db70be2 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Sun, 26 Apr 2026 20:45:50 +0000 Subject: [PATCH 07/21] address review comments --- .../org/apache/spark/util/LastAttemptAccumulator.scala | 2 +- .../execution/metric/SQLLastAttemptAccumulator.scala | 10 +++++----- .../apache/spark/sql/execution/metric/SQLMetrics.scala | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index 82008fad4053b..a41100910c29e 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -76,7 +76,7 @@ import org.apache.spark.scheduler.TaskInfo * fully re-executed. * - Due to the async nature of cancellation, there can be tasks from previous attempts that * arrive later than the last attempt. Therefore, we need to track and compare stageId and - * stageAttemptId of every computed RDD partition, in order to discard late comers. + * stageAttemptId of every computed RDD partition, in order to discard latecomers. * * 2. Spark SQL. * ============= diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala index 1c3305a805b17..114d3974bb0ee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLLastAttemptAccumulator.scala @@ -240,8 +240,8 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL, DRIVER_ACC] * @note The output of this method is undefined if this metric was used inside a part of the plan * which was either checkpointed (e.g. df.localCheckpoint(), df.checkpoint()) or cached * (e.g. df.cache(), df.persist()). - * [[lastAttemptValueForHighestRDDId()]] should return the value from when the execution in - * which the plan was cached/checkpointed. + * [[lastAttemptValueForHighestRDDId()]] should be used instead, which returns the + * value from the execution in which the plan was cached/checkpointed. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -253,7 +253,7 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL, DRIVER_ACC] case Some(acc) => return Some(driverAccValue(acc)) case None => // pass } - // Otherwise, gather the RDD scoped from the plan and find metric updates from these scopes. + // Otherwise, gather the RDD scopes from the plan and find metric updates from these scopes. val scopes = SQLLastAttemptAccumulator.extractStageRDDScopes(qe.executedPlan) scopes match { case Left(bailOutReason) => @@ -273,8 +273,8 @@ trait SQLLastAttemptAccumulator[IN, OUT, PARTIAL, DRIVER_ACC] * @note The output of this method is undefined if this metric was used inside a part of the plan * which was either checkpointed (e.g. df.localCheckpoint(), df.checkpoint()) or cached * (e.g. df.cache(), df.persist()). - * [[lastAttemptValueForHighestRDDId()]] should return the value from when the execution in - * which the plan was cached/checkpointed. + * [[lastAttemptValueForHighestRDDId()]] should be used instead, which returns the + * value from the execution in which the plan was cached/checkpointed. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 6deff9d5c5eb5..0523df282cda5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.AccumulatorContext.internOption */ class SQLMetric( val metricType: String, - val initValue: Long = 0L) extends AccumulatorV2[Long, Long] { + initValue: Long = 0L) extends AccumulatorV2[Long, Long] { // initValue defines the initial value of the metric. 0 is the lowest value considered valid. // If a SQLMetric is invalid, it is set to 0 upon receiving any updates, and it also reports // 0 as its value to avoid exposing it to the user programmatically. From b0a06461ff0e8fb054da0c9303ec665219d652eb Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 08:02:12 +0000 Subject: [PATCH 08/21] remove cosmetic scaladoc warnings --- .../org/apache/spark/util/LastAttemptAccumulator.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index a41100910c29e..301c3a1c29d4c 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -82,7 +82,7 @@ import org.apache.spark.scheduler.TaskInfo * ============= * LastAttemptAccumulator offers simple tracking of the last SQL execution, by assuming that * the last execution will be in the scope of an RDD with the highest id, and using - * [[lastAttemptValueForHighestRDDId]]. See [[SQLLastAttemptAccumulator]] for more possibilities + * [[lastAttemptValueForHighestRDDId]]. See SQLLastAttemptAccumulator for more possibilities * of tracking SQL execution. * * Simple last SQL execution tracking @@ -344,12 +344,12 @@ private case class AccumulatorPartialVal[PARTIAL]( * unexpected behaviours in Spark. * * Implementations must implement [[partialMergeVal]] and [[partialMerge]] methods operating on - * [[PARTIAL]] type. In regular [[AccumulatorV2]] implementations, the [[AccumulatorV2]] object + * PARTIAL type. In regular [[AccumulatorV2]] implementations, the [[AccumulatorV2]] object * itself holds the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is * used to merge these objects together. [[LastAttemptAccumulator]] needs to keep track of partial * values of every partition of every RDD that used the accumulator, and holding a full * [[AccumulatorV2]] object for each would have a high overhead. Therefore, an implementation should - * be able to return [[PARTIAL]] value from [[partialMergeVal]] that represents an intermediate + * be able to return PARTIAL value from [[partialMergeVal]] that represents an intermediate * mergeable value, and a [[partialMerge]] method that can merge that value into the accumulator. * Implementations must also implement an [[isMergeable]] method that checks if the other * [[AccumulatorV2]] is of a compatible type to be merged with this using [[partialMergeVal]]. In From 6f703dddae2e7abd417b17ae21947af76ec83a34 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 08:32:56 +0000 Subject: [PATCH 09/21] remove cosmetic scaladoc warning --- .../main/scala/org/apache/spark/sql/execution/SparkPlan.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index a26c11ae8f052..cf2d0218d0fdd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -251,7 +251,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ /** * A deterministic scope ID for RDDs created by this SparkPlan, - * used by [[LastAttemptAccumulator]] to track which RDD belongs + * used by LastAttemptAccumulator to track which RDD belongs * to which SparkPlan node. */ private[spark] def rddScopeId: String = From f9c21e2f34034165eaaa8a445838ad8a5e37e706 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 11:14:22 +0000 Subject: [PATCH 10/21] remove [[AccumulatorV2.register()]] wiki links to fix unidoc --- .../apache/spark/util/LastAttemptAccumulator.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index 301c3a1c29d4c..2c115b65d1e56 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -329,7 +329,7 @@ private case class AccumulatorPartialVal[PARTIAL]( * and its state is not copied by the [[AccumulatorV2.copy]] method implementation, and it should * not be serialized to the Executors. The internal state should only be initialized by the * [[initializeLastAttemptAccumulator]] method on the "main" instance of the accumulator, that was - * created and registered with [[AccumulatorContext]] with [[AccumulatorV2.register()]]. All the + * created and registered with [[AccumulatorContext]] with AccumulatorV2.register. All the * interfaces of [[LastAttemptAccumulator]]: [[mergeLastAttempt]] (used only by DAGScheduler) and * lastAttemptValueForX, [[logAccumulatorState]] (used by the using code) should only be invoked on * that instance, on the Spark Driver. @@ -577,7 +577,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * * Called from a single thread in DAGScheduler, no synchronization needed. * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. */ private[spark] def mergeLastAttempt( other: AccumulatorV2[_, _], @@ -710,7 +710,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from a set of RDDs. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -738,7 +738,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from a specific RDD. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -761,7 +761,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * the driver value will be used instead. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -787,7 +787,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * the driver value will be used instead. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -815,7 +815,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from RDDs with given scope ids. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2.register()]]. + * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ From 16c91eee6fec11fb1dcaa7c62d2650d94e8c4e78 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 11:20:52 +0000 Subject: [PATCH 11/21] Escape SQL operators in InMemoryTableWithV2Filter scaladoc to fix javadoc HTML crash Background ---------- The scaladoc on object InMemoryTableWithV2Filter.evalPredicate listed the supported V2 predicates as plain text: Supports =, <=>, IS_NULL, IS_NOT_NULL, and ALWAYS_TRUE. genjavadoc emits this verbatim into the corresponding Java stub doc comment, which becomes: Supports =, <=>, IS_NULL, IS_NOT_NULL, and ALWAYS_TRUE. (genjavadoc HTML-escapes the trailing `>` but not the leading `<=`). javadoc's HTML parser interprets the unescaped `<=` as the start of an HTML tag, fails to find a closing `>`, and aborts during HTML generation of this class -- the mode the diagnostic added in f2fe445e2eb (SPARK-56630) is designed to surface. Why this stayed hidden ---------------------- javadoc's default `-Xmaxerrs` is 100. Every Spark unidoc run accumulates a few dozen "benign noise" errors during source loading -- genjavadoc emits stubs like static public abstract R apply(T1 v1, T2 v2, T3 v3, T4 v4); which trip both `cannot find symbol` (the type variables `R`, `T1`, ... are not in scope at the static-context emission site) and `illegal combination of modifiers: abstract and static`. The commit message for f2fe445e2eb documents that these are inert and that javadoc normally finishes anyway. In practice, however, whenever the per-PR error count crosses the `-Xmaxerrs` ceiling, javadoc bails during source loading and never reaches HTML generation. From the operator's side that looks identical to a genuine crash: a few hundred `[error]` lines on `target/java/...` Java stubs followed by `JavadocGenerationFailed`. The diagnostic in f2fe445e2eb can only identify a real culprit by reading the last `Generating .../.html...` line before `javadoc exited`, which only exists once HTML generation has begun -- so this `<=>` crash was masked by the early bailout for as long as the noise budget happened to fit under 100. A PR that adds a handful of new public Scala classes (each one adds a few more genjavadoc stubs and bumps the error count) flips the build from "succeeds with 100 warnings" to "fails with 100 errors", and the new crash class is suddenly the one being rendered when the limit is hit -- in this case `InMemoryTableWithV2Filter`. Effect of this change --------------------- Wrapping each operator name in backticks and escaping `<=>` as `<=>` produces a clean inline-code rendering and stops javadoc from treating the punctuation as markup. After this fix, javadoc proceeds past InMemoryTableWithV2Filter during HTML generation. --- .../spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala index f2827faf59435..9c310bf3d9fb1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala @@ -164,7 +164,7 @@ object InMemoryTableWithV2Filter { /** * Evaluates a single V2 predicate by resolving column values through the - * given function. Supports =, <=>, IS_NULL, IS_NOT_NULL, and ALWAYS_TRUE. + * given function. Supports `=`, `<=>`, `IS_NULL`, `IS_NOT_NULL`, and `ALWAYS_TRUE`. */ def evalPredicate( pred: Predicate, From c43d8788e7527d83c5e7df1a05b5e917688e6073 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 11:21:16 +0000 Subject: [PATCH 12/21] Raise unidoc -Xmaxerrs / -Xmaxwarns so javadoc surfaces the real crash javadoc's default `-Xmaxerrs` and `-Xmaxwarns` are both 100. Spark's unidoc invocation is driven by genjavadoc-emitted Java stubs in `/target/java/`, and every run accumulates a few dozen benign "errors" against those stubs -- e.g. error: illegal combination of modifiers: abstract and static static public abstract R apply(T1 v1, T2 v2, T3 v3, T4 v4); error: cannot find symbol ^ symbol: class T1 static public abstract R apply(T1 v1, T2 v2, T3 v3, T4 v4); These come from the way genjavadoc lifts type-parameterised methods into static Java emission and were documented in f2fe445e2eb (SPARK-56630) as "every PR produces this, javadoc always complains, normally still finishes". The number of such errors scales with the public API surface that the PR touches. The problem with leaving `-Xmaxerrs 100` in place is that any PR whose total noise count happens to exceed 100 causes javadoc to bail during source loading, before HTML generation begins. The build operator sees the same end-state as a genuine crash: hundreds of `[error]` lines on `target/java/...` followed by `JavadocGenerationFailed`. The diagnostic banner added in f2fe445e2eb cannot help here either -- it identifies the crash class via the last `Generating .../.html...` line before the exit, which only exists once javadoc has reached HTML generation. When the bailout is at the source-loading phase, the banner correctly reports "the crash predates HTML output" but cannot point at any specific source. Concrete example. The unidoc step on PR #55371 was failing deterministically on every CI run with exactly 100 errors. Same noise categories as a recent passing run on a different fork (LuciferYang's PR on the same master), but #55371 added a few public Scala classes and that pushed the cumulative count just over the cliff. Symptoms: [error] /.../target/java/.../ErrorInfo.java:12:29: error: illegal combination of modifiers: abstract and static [error] /.../target/java/.../ErrorInfo.java:12:36: error: cannot find symbol ... [warn] javadoc exited with exit code 1 [error] sbt.inc.Doc$JavadocGenerationFailed Unidoc failed -- diagnostic summary Javadoc exited but no class HTML generation was in progress; the crash predates HTML output -- likely a CLI / classpath / setup issue. See the full sbt output above. In reality there was nothing wrong with the CLI or classpath: the limit was simply being hit before javadoc could begin emitting HTML. Raising both limits to 9999 keeps javadoc producing output past the noise wall, which is what the f2fe445e2eb diagnostic was designed to operate on. With the higher limits, the same run on #55371 produced a useful banner naming the class javadoc was rendering at the moment of the crash, which led directly to the `<=>` scaladoc fix in InMemoryTableWithV2Filter. This change does not silence any real failure: javadoc still exits with a non-zero code when it encounters a genuine HTML-generation crash. It only prevents a noise-driven early bailout from masking the real culprit. --- project/SparkBuild.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 866a535c6d951..4dd391af425e4 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1699,7 +1699,9 @@ object Unidoc { "-tag", "todo:X", "-tag", "groupname:X", "-tag", "inheritdoc", - "--ignore-source-errors", "-notree" + "--ignore-source-errors", "-notree", + "-Xmaxerrs", "9999", + "-Xmaxwarns", "9999" ) }, From 234d8c9acb2f1d7758fbdd12e3b6fdfc198c70f5 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 15:30:47 +0000 Subject: [PATCH 13/21] raise -Xmaxerrs/-Xmaxwarns to 999999 to avoid clipping as warnings overflow into errors --- project/SparkBuild.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 4dd391af425e4..118ba949903b1 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1700,8 +1700,8 @@ object Unidoc { "-tag", "groupname:X", "-tag", "inheritdoc", "--ignore-source-errors", "-notree", - "-Xmaxerrs", "9999", - "-Xmaxwarns", "9999" + "-Xmaxerrs", "999999", + "-Xmaxwarns", "999999" ) }, From 8d4a2a2f2cd946aef76124030d4b32e9b61767ac Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 17:42:59 +0000 Subject: [PATCH 14/21] diagnostic: try -Xdoclint:none to see if the 6 errors are doclint-emitted --- project/SparkBuild.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 118ba949903b1..8171908b51841 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1701,7 +1701,8 @@ object Unidoc { "-tag", "inheritdoc", "--ignore-source-errors", "-notree", "-Xmaxerrs", "999999", - "-Xmaxwarns", "999999" + "-Xmaxwarns", "999999", + "-Xdoclint:none" ) }, From 0beedd54a27e443da23393bf7dd1941afb92d08d Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 20:10:50 +0000 Subject: [PATCH 15/21] diagnostic: -Xdoclint:all,-reference to test if the 6 errors are reference errors --- project/SparkBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 8171908b51841..ba95577a6a2d2 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1702,7 +1702,7 @@ object Unidoc { "--ignore-source-errors", "-notree", "-Xmaxerrs", "999999", "-Xmaxwarns", "999999", - "-Xdoclint:none" + "-Xdoclint:all,-reference" ) }, From 97382e9fe39cd9068e8d138bad282a202a3b8262 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 21:19:40 +0000 Subject: [PATCH 16/21] diagnostic: -Xdoclint:reference only, to surface the 6 reference errors as distinct lines --- project/SparkBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index ba95577a6a2d2..e387817b4a428 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1702,7 +1702,7 @@ object Unidoc { "--ignore-source-errors", "-notree", "-Xmaxerrs", "999999", "-Xmaxwarns", "999999", - "-Xdoclint:all,-reference" + "-Xdoclint:reference" ) }, From 4728d7a0d325de2284aa9971ea0c12d1fc4dd02a Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 21:25:35 +0000 Subject: [PATCH 17/21] retrigger CI From bfc108b4b32b16a876940eda9285a072ff2780b2 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 27 Apr 2026 22:01:14 +0000 Subject: [PATCH 18/21] retrigger CI From 477f8920f2d542f37e91adc972d308d0c3cb40d9 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Tue, 28 Apr 2026 05:20:48 +0000 Subject: [PATCH 19/21] Use [[AccumulatorV2#X]] form for method links and revert SparkBuild.scala experiments The scaladoc on LastAttemptAccumulator referenced AccumulatorV2 instance methods using the [[Class.method]] form, which genjavadoc translates to {@link Class.method}. javadoc reads `.` as the inner-class separator, not as a member separator, so each such reference fails to resolve to a member and is reported as a doclint reference error against the public API. The six occurrences (add, copy, merge, reset, register x2, plus the previously plain-text register) become six `[error]`-tracked failures in javadoc's final summary, which is what was making the docs job fail even after the earlier scaladoc fixes. Switch them to [[AccumulatorV2#X]] (the `#` member-separator form, already used in many places across Spark) so genjavadoc emits {@link AccumulatorV2#X} and javadoc resolves it as the intended method reference. Also reverts the various SparkBuild.scala -Xmaxerrs / -Xmaxwarns / -Xdoclint experiments from earlier in the branch -- those were diagnostic aids only, not appropriate for the unidoc production config. With this fix in place the references resolve cleanly and the docs job no longer needs the increased error/warning ceilings. --- .../spark/util/LastAttemptAccumulator.scala | 26 +++++++++---------- project/SparkBuild.scala | 5 +--- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala index 2c115b65d1e56..cd17476a0cc40 100644 --- a/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/util/LastAttemptAccumulator.scala @@ -312,24 +312,24 @@ private case class AccumulatorPartialVal[PARTIAL]( * discarding any values coming from earlier attempts that have been recomputed. * If the accumulator is used by multiple RDDs, the last attempt value is tracked separately for * each, and can be retrieved for each or all of them separately, see lastAttemptValueForX methods. - * If the accumulator is used directly on the Spark Driver using [[AccumulatorV2.add]], + * If the accumulator is used directly on the Spark Driver using [[AccumulatorV2#add]], * that value is considered the last attempt value. * If the accumulator was both used in Tasks and updated directly on the driver, it can't determine * what should be considered the last attempt, and lastAttemptValueForX methods will return None. * * Contract for driver-only updates: - * A driver-side value (set via [[AccumulatorV2.add]] on the driver, outside any Task) is only + * A driver-side value (set via [[AccumulatorV2#add]] on the driver, outside any Task) is only * returned by methods that do not narrow by RDD, namely [[lastAttemptValueForAllRDDs]] and * [[lastAttemptValueForHighestRDDId]]. Methods that narrow to specific RDDs or RDD scopes * ([[lastAttemptValueForRDDId]], [[lastAttemptValueForRDDIds]], [[lastAttemptValueForRDDScopes]]) * return the zero value when a driver-only value is present, because a driver-side update cannot * be attributed to any particular RDD or scope. * - * [[LastAttemptAccumulator]] is not reset by the [[AccumulatorV2.reset]] method implementation, - * and its state is not copied by the [[AccumulatorV2.copy]] method implementation, and it should + * [[LastAttemptAccumulator]] is not reset by the [[AccumulatorV2#reset]] method implementation, + * and its state is not copied by the [[AccumulatorV2#copy]] method implementation, and it should * not be serialized to the Executors. The internal state should only be initialized by the * [[initializeLastAttemptAccumulator]] method on the "main" instance of the accumulator, that was - * created and registered with [[AccumulatorContext]] with AccumulatorV2.register. All the + * created and registered with [[AccumulatorContext]] with [[AccumulatorV2#register]]. All the * interfaces of [[LastAttemptAccumulator]]: [[mergeLastAttempt]] (used only by DAGScheduler) and * lastAttemptValueForX, [[logAccumulatorState]] (used by the using code) should only be invoked on * that instance, on the Spark Driver. @@ -345,7 +345,7 @@ private case class AccumulatorPartialVal[PARTIAL]( * * Implementations must implement [[partialMergeVal]] and [[partialMerge]] methods operating on * PARTIAL type. In regular [[AccumulatorV2]] implementations, the [[AccumulatorV2]] object - * itself holds the intermediate value of the accumulator, and [[AccumulatorV2.merge]] method is + * itself holds the intermediate value of the accumulator, and [[AccumulatorV2#merge]] method is * used to merge these objects together. [[LastAttemptAccumulator]] needs to keep track of partial * values of every partition of every RDD that used the accumulator, and holding a full * [[AccumulatorV2]] object for each would have a high overhead. Therefore, an implementation should @@ -354,7 +354,7 @@ private case class AccumulatorPartialVal[PARTIAL]( * Implementations must also implement an [[isMergeable]] method that checks if the other * [[AccumulatorV2]] is of a compatible type to be merged with this using [[partialMergeVal]]. In * regular [[AccumulatorV2]] implementations, this check is normally done inside the - * [[AccumulatorV2.merge]] method, which is not used here. + * [[AccumulatorV2#merge]] method, which is not used here. * * If an implementation is used to keep user data in the accumulator, it should override * [[accumulatorStoresUserData]] to return true, to ensure correct structured logging annotation. @@ -577,7 +577,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * * Called from a single thread in DAGScheduler, no synchronization needed. * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2#register]]. */ private[spark] def mergeLastAttempt( other: AccumulatorV2[_, _], @@ -710,7 +710,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from a set of RDDs. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2#register]]. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -738,7 +738,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from a specific RDD. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2#register]]. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -761,7 +761,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * the driver value will be used instead. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2#register]]. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -787,7 +787,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * the driver value will be used instead. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2#register]]. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ @@ -815,7 +815,7 @@ trait LastAttemptAccumulator[IN, OUT, PARTIAL] extends Logging { * Returns the last attempt value of this accumulator, aggregated from RDDs with given scope ids. * * Should be used only on the Spark Driver, on the instance of [[LastAttemptAccumulator]] that - * was created and registered in [[AccumulatorContext]] by AccumulatorV2.register. + * was created and registered in [[AccumulatorContext]] by [[AccumulatorV2#register]]. * * @return None if the last attempt value cannot be established, Some(value) otherwise. */ diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e387817b4a428..866a535c6d951 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1699,10 +1699,7 @@ object Unidoc { "-tag", "todo:X", "-tag", "groupname:X", "-tag", "inheritdoc", - "--ignore-source-errors", "-notree", - "-Xmaxerrs", "999999", - "-Xmaxwarns", "999999", - "-Xdoclint:reference" + "--ignore-source-errors", "-notree" ) }, From b71889b4663664c3390f25870e0ae2297fd12d98 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Tue, 28 Apr 2026 07:03:16 +0000 Subject: [PATCH 20/21] Revert "Escape SQL operators in InMemoryTableWithV2Filter scaladoc to fix javadoc HTML crash" This reverts commit 16c91eee6fe. The fix in 16c91eee6fe was based on a misdiagnosis caused by stale genjavadoc-emitted Java stubs in target/java/ from previous test compilations in the local working tree. CI never compiles tests for the docs job, so the stub for this test source never reaches javadoc, and javadoc never sees the `<=>` -- this PR's docs failure was not caused by it. The actual root-cause fix is in the [[AccumulatorV2#X]] commit on this branch. The `<=>` escape itself remains a real fix worth landing -- it lives on the separate spark2 unidoc-fixes branch where it belongs alongside the SPARK-56630 follow-up. --- .../spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala index 9c310bf3d9fb1..f2827faf59435 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableWithV2Filter.scala @@ -164,7 +164,7 @@ object InMemoryTableWithV2Filter { /** * Evaluates a single V2 predicate by resolving column values through the - * given function. Supports `=`, `<=>`, `IS_NULL`, `IS_NOT_NULL`, and `ALWAYS_TRUE`. + * given function. Supports =, <=>, IS_NULL, IS_NOT_NULL, and ALWAYS_TRUE. */ def evalPredicate( pred: Predicate, From fe8df3ec52283d27562f31b19ae98458399d464b Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Tue, 28 Apr 2026 10:08:39 +0000 Subject: [PATCH 21/21] retrigger CI