apache · sunchao · Dec 6, 2022 · cloud-fan · Dec 7, 2022 · sunchao
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala
@@ -81,18 +81,21 @@ case class BatchScanExec(
 
           val newRows = new InternalRowSet(p.expressions.map(_.dataType))
           newRows ++= newPartitions.map(_.asInstanceOf[HasPartitionKey].partitionKey())
-          val oldRows = p.partitionValuesOpt.get
 
-          if (oldRows.size != newRows.size) {
-            throw new SparkException("Data source must have preserved the original partitioning " +
-                "during runtime filtering: the number of unique partition values obtained " +
-                s"through HasPartitionKey changed: before ${oldRows.size}, after ${newRows.size}")
+          val oldRows = p.partitionValuesOpt.get.toSet
+          // We require the number of new partition keys to be less than or equal to the number
+          // of old partition keys here. If there are fewer, empty partitions will be added for
+          // the old keys that are not present in the new input partitions.
+          if (oldRows.size < newRows.size) {
+            throw new SparkException("During runtime filtering, data source must either report " +
+                "the same number of partition keys, or a subset of partition keys from the " +
+                s"original. Before: ${oldRows.size} partition keys. After: ${newRows.size} " +
+                "partition keys")
           }
 
-          if (!oldRows.forall(newRows.contains)) {
-            throw new SparkException("Data source must have preserved the original partitioning " +
-                "during runtime filtering: the number of unique partition values obtained " +
-                s"through HasPartitionKey remain the same but do not exactly match")
+          if (!newRows.forall(oldRows.contains)) {
+            throw new SparkException("During runtime filtering, data source must not report new " +
+                "partition keys that are not present in the original partitioning.")
           }
 
           groupPartitions(newPartitions).get.map(_._2)
@@ -114,8 +117,21 @@ case class BatchScanExec(
       // return an empty RDD with 1 partition if dynamic filtering removed the only split
       sparkContext.parallelize(Array.empty[InternalRow], 1)
     } else {
+      var finalPartitions = filteredPartitions
+
+      outputPartitioning match {
+        case p: KeyGroupedPartitioning =>
+          val partitionMapping = finalPartitions.map(s =>
+            s.head.asInstanceOf[HasPartitionKey].partitionKey() -> s).toMap
+          finalPartitions = p.partitionValuesOpt.get.map { partKey =>
+            // Use an empty partition for partition keys absent from filteredPartitions
+            partitionMapping.getOrElse(partKey, Seq.empty)
+          }
+        case _ =>
+      }
+
       new DataSourceRDD(
-        sparkContext, filteredPartitions, readerFactory, supportsColumnar, customMetrics)
+        sparkContext, finalPartitions, readerFactory, supportsColumnar, customMetrics)
     }
     postDriverMetrics()
     rdd

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala
@@ -433,11 +433,11 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
           s"(2, 11.0, cast('2020-01-01' as timestamp)), " +
           s"(3, 19.5, cast('2020-02-01' as timestamp))")
 
-      // number of unique partitions changed after dynamic filtering - should throw exception
+      // number of unique partitions changed after dynamic filtering - the gap should be filled
+      // with empty partitions and the job should still succeed
       var df = sql(s"SELECT sum(p.price) from testcat.ns.$items i, testcat.ns.$purchases p WHERE " +
           s"i.id = p.item_id AND i.price > 40.0")
-      val e = intercept[Exception](df.collect())
-      assert(e.getMessage.contains("number of unique partition values"))
+      checkAnswer(df, Seq(Row(131)))
 
       // dynamic filtering doesn't change partitioning so storage-partitioned join should kick in
       df = sql(s"SELECT sum(p.price) from testcat.ns.$items i, testcat.ns.$purchases p WHERE " +